; Copyright © 2021, VideoLAN and dav1d authors
; Copyright © 2021, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

%include "config.asm"
%include "ext/x86/x86inc.asm"

SECTION_RODATA 16

%if ARCH_X86_64
%define PIC_sym(a) a
%else
%define PIC_base $$
%define PIC_sym(a) pic_regq+a-PIC_base
%endif
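; on x86-32 (PIC builds), constants are addressed relative to pic_regq,
; which each entry point loads with the section base ($$) via LEA pic_regq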

pb_4x1_4x5_4x9_4x13: times 4 db 0, 1
                     times 4 db 8, 9

pw_1: times 8 dw 1
pw_2: times 8 dw 2
pw_3: times 8 dw 3
; 4 and 16 need to be next to each other since they are used as alternates
; depending on whether bitdepth is 10 or 12
pw_4: times 8 dw 4
pw_16: times 8 dw 16
pw_8: times 8 dw 8
pw_4096: times 8 dw 4096
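; pmulhrsw with pw_4096 computes (x*4096*2+0x8000)>>16 = (x+4)>>3, i.e. a
; rounded divide by 8, which is how the flat6/flat8 filters average below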

pb_mask: dd 1, 1, 2, 2
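; per-dword bit masks covering the two 4-pixel units handled per loop
; iteration; shifted left by 2 each iteration as the loops walk the block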

SECTION .text

%if ARCH_X86_32
%if STACK_ALIGNMENT < 16
%define extra_stack 2
%else
%define extra_stack 0
%endif
%endif

%macro RELOC_ARGS 2 ; h/v, off
ASSERT ARCH_X86_32
%if STACK_ALIGNMENT < 16
    mov          r5d, [rstk + stack_offset + 4*4 + 4]
%define lstridem [esp+%2+0*gprsize]
    mov     lstridem, r5d
    mov          r5d, [rstk + stack_offset + 4*5 + 4]
%define lutm [esp+%2+1*gprsize]
    mov         lutm, r5d
    mov          r5d, [rstk + stack_offset + 4*6 + 4]
%ifidn %1, v
%define wm [esp+%2+2*gprsize]
    mov           wm, r5d
    mov          r5d, [rstk + stack_offset + 4*3 + 4]
%define lm [esp+%2+3*gprsize]
    mov           lm, r5d
%else ; %1 == h
%define hm [esp+%2+2*gprsize]
    mov           hm, r5d
%endif ; %1==v
    mov          r5d, r7m
%define bdmulm [esp+%2+4*gprsize]
    mov       bdmulm, r5d
%else
%define lstridem r4m
%define lutm r5m
%ifidn %1, v
%define wm r6m
%define lm r3m
%else
%define hm r6m
%endif
%define bdmulm r7m
%endif ; STACK_ALIGNMENT
%endmacro

%macro UNRELOC_ARGS 0
%if ARCH_X86_32
%undef lm
%undef lstridem
%undef wm
%undef hm
%undef lutm
%endif
%endmacro

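; REPX applies the instruction template in %1 (with "x" as the placeholder
; operand) to each of the remaining arguments in turn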
%macro REPX 2-*
    %xdefine %%f(x) %1
%rep %0 - 1
    %rotate 1
    %%f(%1)
%endrep
%endmacro

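; SPLATD/SPLATW broadcast the low dword/word of %2 across all lanes of %1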
%macro SPLATD 2
    movd %1, %2
    pshufd %1, %1, q0000
%endmacro

%macro SPLATW 2
    movd %1, %2
    pshuflw %1, %1, q0000
    punpcklqdq %1, %1
%endmacro

;        in:            out:
; mm%1   a b c d        a e i m
; mm%2   e f g h        b f j n
; mm%3   i j k l   ->   c g k o
; mm%4   m n o p        d h l p
%macro TRANSPOSE4X4W 5
    punpcklwd        m%5, m%1, m%2
    punpckhwd        m%1, m%2
    punpcklwd        m%2, m%3, m%4
    punpckhwd        m%3, m%4
    punpckldq        m%4, m%5, m%2
    punpckhdq        m%5, m%2
    punpckldq        m%2, m%1, m%3
    punpckhdq        m%1, m%3

    SWAP              %1, %4
    SWAP              %2, %5, %3
%endmacro

;         in:                  out:
; m%1   a b c d e f g h      a i q y 6 E M U
; m%2   i j k l m n o p      b j r z 7 F N V
; m%3   q r s t u v w x      c k s 0 8 G O W
; m%4   y z 0 1 2 3 4 5      d l t 1 9 H P X
; m%5   6 7 8 9 A B C D  ->  e m u 2 A I Q Y
; m%6   E F G H I J K L      f n v 3 B J R Z
; m%7   M N O P Q R S T      g o w 4 C K S +
; m%8   U V W X Y Z + =      h p x 5 D L T =
%if ARCH_X86_64
%macro TRANSPOSE8X8W 9
    ; m%1   a b c d e f g h      a i q y b j r z
    ; m%2   i j k l m n o p      c k s 0 d l t 1
    ; m%3   q r s t u v w x  ->  e m u 2 f n v 3
    ; m%4   y z 0 1 2 3 4 5      g o w 4 h p x 5
    TRANSPOSE4X4W     %1, %2, %3, %4, %9

    ; m%5   6 7 8 9 A B C D      6 E M U 7 F N V
    ; m%6   E F G H I J K L      8 G O W 9 H P X
    ; m%7   M N O P Q R S T  ->  A I Q Y B J R Z
    ; m%8   U V W X Y Z + =      C K S + D L T =
    TRANSPOSE4X4W     %5, %6, %7, %8, %9

    ; m%1   a i q y b j r z      a i q y 6 E M U
    ; m%2   c k s 0 d l t 1      b j r z 7 F N V
    ; m%3   e m u 2 f n v 3      c k s 0 8 G O W
    ; m%4   g o w 4 h p x 5      d l t 1 9 H P X
    ; m%5   6 E M U 7 F N V  ->  e m u 2 A I Q Y
    ; m%6   8 G O W 9 H P X      f n v 3 B J R Z
    ; m%7   A I Q Y B J R Z      g o w 4 C K S +
    ; m%8   C K S + D L T =      h p x 5 D L T =
    punpckhqdq       m%9, m%1, m%5
    punpcklqdq       m%1, m%5
    punpckhqdq       m%5, m%2, m%6
    punpcklqdq       m%2, m%6
    punpckhqdq       m%6, m%3, m%7
    punpcklqdq       m%3, m%7
    punpckhqdq       m%7, m%4, m%8
    punpcklqdq       m%4, m%8

    SWAP %8, %7, %4, %5, %3, %2, %9
%endmacro
%else ; x86-32
; input: 1-7 in registers, 8 in first memory [read-only]
; second memory is scratch, and may overlap with first or third memory
; output: 1-5,7-8 in registers, 6 in third memory [write-only]
%macro TRANSPOSE8X8W 13 ; regs [8x], mem [3x], a/u in/out alignment [2x]
    TRANSPOSE4X4W     %1, %2, %3, %4, %8
%ifnidn %9, ""
    mov%12           m%8, %9
%else
    mova             m%8, %10
%endif
    mova             %10, m%4
    TRANSPOSE4X4W     %5, %6, %7, %8, %4
    punpckhqdq       m%4, m%1, m%5
    punpcklqdq       m%1, m%5
    punpckhqdq       m%5, m%2, m%6
    punpcklqdq       m%2, m%6
    punpckhqdq       m%6, m%3, m%7
    punpcklqdq       m%3, m%7
    mova             m%7, %10
%ifnidn %11, ""
    mov%13           %11, m%6
%else
    mova             %10, m%6
%endif
    punpckhqdq       m%6, m%7, m%8
    punpcklqdq       m%7, m%8

    ; 1,4,2,5,3,8,7,6 -> 1,2,3,4,5,6,7,8
    SWAP              %2, %4, %5, %3
    SWAP              %6, %8
%endmacro
%endif ; x86-32/64

; transpose and write m8-11, everything else is scratch
%macro TRANSPOSE_8x4_AND_WRITE_4x8 5 ; p1, p0, q0, q1, tmp
    ; transpose 8x4
    punpcklwd     %5, %1, %2
    punpckhwd     %1, %2
    punpcklwd     %2, %3, %4
    punpckhwd     %3, %4
    punpckldq     %4, %5, %2
    punpckhdq     %5, %2
    punpckldq     %2, %1, %3
    punpckhdq     %1, %3

    ; write out
    movq   [dstq+strideq*0-4], %4
    movhps [dstq+strideq*1-4], %4
    movq   [dstq+strideq*2-4], %5
    movhps [dstq+stride3q -4], %5
    lea         dstq, [dstq+strideq*4]
    movq   [dstq+strideq*0-4], %2
    movhps [dstq+strideq*1-4], %2
    movq   [dstq+strideq*2-4], %1
    movhps [dstq+stride3q -4], %1
    lea         dstq, [dstq+strideq*4]
%endmacro

%macro FILTER 2 ; width [4/6/8/16], dir [h/v]
    ; load data
%ifidn %2, v
%if %1 == 4
%if ARCH_X86_64
%define P1 m8
%define P0 m9
%define Q0 m10
%define Q1 m11
    mova          P1, [dstq+mstrideq*2]         ; p1
    mova          P0, [dstq+mstrideq*1]         ; p0
    mova          Q0, [dstq+strideq*0]          ; q0
    mova          Q1, [dstq+strideq*1]          ; q1
%else ; x86-32
%define P1 [dstq+mstrideq*2]
%define P0 [dstq+mstrideq*1]
%define Q0 [dstq+strideq*0]
%define Q1 [dstq+strideq*1]
%endif ; x86-32/64
%else ; %1 != 4
    ; load 6-8 pixels, remainder (for wd=16) will be read inline
    lea         tmpq, [dstq+mstrideq*4]
%if ARCH_X86_64
    ; we load p3 later
%define P2 m13
%define P1 m8
%define P0 m9
%define Q0 m10
%define Q1 m11
%define Q2 m14
    mova          P2, [tmpq+strideq*1]
    mova          P1, [tmpq+strideq*2]
    mova          P0, [tmpq+stride3q]
    mova          Q0, [dstq+strideq*0]
    mova          Q1, [dstq+strideq*1]
    mova          Q2, [dstq+strideq*2]
%if %1 != 6
%define P3 [tmpq+strideq*0]
%define Q3 m15
    mova          Q3, [dstq+stride3q]
%endif ; %1 != 6
%else ; x86-32
%define P2 [tmpq+strideq*1]
%define P1 [dstq+mstrideq*2]
%define P0 [dstq+mstrideq*1]
%define Q0 [dstq+strideq*0]
%define Q1 [dstq+strideq*1]
%define Q2 [dstq+strideq*2]
%if %1 != 6
%define P3 [dstq+mstrideq*4]
%define Q3 [dstq+stride3q]
%endif ; %1 != 6
%endif ; x86-32/64
%endif ; %1 ==/!= 4
%else ; %2 != v
    ; load lines
%if %1 == 4
    movq          m0, [dstq+strideq*0-4]
    movq          m2, [dstq+strideq*1-4]
    movq          m4, [dstq+strideq*2-4]
    movq          m5, [dstq+stride3q -4]
    lea         tmpq, [dstq+strideq*4]
    movq          m3, [tmpq+strideq*0-4]
    movq          m6, [tmpq+strideq*1-4]
    movq          m1, [tmpq+strideq*2-4]
    movq          m7, [tmpq+stride3q -4]

    ; transpose 4x8
    ; m0: A-D0
    ; m2: A-D1
    ; m4: A-D2
    ; m5: A-D3
    ; m3: A-D4
    ; m6: A-D5
    ; m1: A-D6
    ; m7: A-D7
    punpcklwd     m0, m2
    punpcklwd     m4, m5
    punpcklwd     m3, m6
    punpcklwd     m1, m7
    ; m0: A0-1,B0-1,C0-1,D0-1
    ; m4: A2-3,B2-3,C2-3,D2-3
    ; m3: A4-5,B4-5,C4-5,D4-5
    ; m1: A6-7,B6-7,C6-7,D6-7
    punpckhdq     m2, m0, m4
    punpckldq     m0, m4
    punpckhdq     m4, m3, m1
    punpckldq     m3, m1
    ; m0: A0-3,B0-3
    ; m2: C0-3,D0-3
    ; m3: A4-7,B4-7
    ; m4: C4-7,D4-7
    punpckhqdq    m1, m0, m3
    punpcklqdq    m0, m3
    punpckhqdq    m3, m2, m4
    punpcklqdq    m2, m4
    ; m0: A0-7
    ; m1: B0-7
    ; m2: C0-7
    ; m3: D0-7
%if ARCH_X86_64
    SWAP           0, 8
    SWAP           1, 9
    SWAP           2, 10
    SWAP           3, 11
%define P1 m8
%define P0 m9
%define Q0 m10
%define Q1 m11
%else
%define P1 [esp+3*mmsize]
%define P0 [esp+4*mmsize]
%define Q0 [esp+5*mmsize]
%define Q1 [esp+6*mmsize]
    mova          P1, m0
    mova          P0, m1
    mova          Q0, m2
    mova          Q1, m3
%endif
%elif %1 == 6 || %1 == 8
    movu          m0, [dstq+strideq*0-8]
    movu          m1, [dstq+strideq*1-8]
    movu          m2, [dstq+strideq*2-8]
    movu          m3, [dstq+stride3q -8]
    lea         tmpq, [dstq+strideq*4]
    movu          m4, [tmpq+strideq*0-8]
    movu          m5, [tmpq+strideq*1-8]
    movu          m6, [tmpq+strideq*2-8]
%if ARCH_X86_64
    movu          m7, [tmpq+stride3q -8]
%endif

    ; transpose 8x16
    ; m0: A-H0,A-H8
    ; m1: A-H1,A-H9
    ; m2: A-H2,A-H10
    ; m3: A-H3,A-H11
    ; m4: A-H4,A-H12
    ; m5: A-H5,A-H13
    ; m6: A-H6,A-H14
    ; m7: A-H7,A-H15
%if ARCH_X86_64
    punpcklwd     m8, m0, m1
%else
    punpcklwd     m7, m0, m1
%endif
    punpckhwd     m0, m1
    punpcklwd     m1, m2, m3
    punpckhwd     m2, m3
    punpcklwd     m3, m4, m5
    punpckhwd     m4, m5
%if ARCH_X86_64
    punpcklwd     m5, m6, m7
    punpckhwd     m6, m7
%else
    mova  [rsp+3*16], m4
    movu          m4, [tmpq+stride3q -8]
    punpcklwd     m5, m6, m4
    punpckhwd     m6, m4
%endif
    ; m8: A0-1,B0-1,C0-1,D0-1 [m7 on x86-32]
    ; m0: E0-1,F0-1,G0-1,H0-1
    ; m1: A2-3,B2-3,C2-3,D2-3
    ; m2: E2-3,F2-3,G2-3,H2-3
    ; m3: A4-5,B4-5,C4-5,D4-5
    ; m4: E4-5,F4-5,G4-5,H4-5 [r3 on x86-32]
    ; m5: A6-7,B6-7,C6-7,D6-7
    ; m6: E6-7,F6-7,G6-7,H6-7
%if ARCH_X86_64
    punpckldq     m7, m8, m1
    punpckhdq     m8, m1
%else
    punpckldq     m4, m7, m1
    punpckhdq     m7, m1
%endif
    punpckldq     m1, m0, m2
    punpckhdq     m0, m2
    punpckldq     m2, m3, m5
    punpckhdq     m3, m5
%if ARCH_X86_64
    punpckldq     m5, m4, m6
    punpckhdq     m4, m6
%else
    mova  [rsp+4*16], m3
    mova          m3, [rsp+3*16]
    punpckldq     m5, m3, m6
    punpckhdq     m3, m6
%endif
    ; m7: A0-3,B0-3 [m4 on x86-32]
    ; m8: C0-3,D0-3 [m7 on x86-32]
    ; m1: E0-3,F0-3
    ; m0: G0-3,H0-3
    ; m2: A4-7,B4-7
    ; m3: C4-7,D4-7 [r4 on x86-32]
    ; m5: E4-7,F4-7
    ; m4: G4-7,H4-7 [m3 on x86-32]
%if ARCH_X86_64
%if %1 != 6
    punpcklqdq    m6, m7, m2
%endif
    punpckhqdq    m7, m2
    punpcklqdq    m2, m8, m3
    punpckhqdq    m8, m3
    punpcklqdq    m3, m1, m5
    punpckhqdq    m1, m5
%if %1 != 6
    punpckhqdq    m5, m0, m4
%endif
    punpcklqdq    m0, m4
%if %1 == 8
    mova  [rsp+1*16], m6
%define P3 [rsp+1*16]
%endif
    ; 7,2,8,3,1,0,5 -> 13,8,9,10,11,14,15
    SWAP           7, 13
    SWAP           8, 2, 9
    SWAP           3, 10
    SWAP           1, 11
    SWAP           0, 14
    SWAP           5, 15
%define P2 m13
%define P1 m8
%define P0 m9
%define Q0 m10
%define Q1 m11
%define Q2 m14
%if %1 == 8
%define Q3 m15
%endif
%else ; x86-32
%if %1 == 8
%define P3 [rsp+ 6*16]
    punpcklqdq    m6, m4, m2
    mova          P3, m6
%endif
    mova          m6, [rsp+4*16]
    punpckhqdq    m4, m2
    punpcklqdq    m2, m7, m6
    punpckhqdq    m7, m6
    punpcklqdq    m6, m1, m5
    punpckhqdq    m1, m5
%if %1 == 8
%define Q3 [rsp+24*16]
    punpckhqdq    m5, m0, m3
    mova          Q3, m5
%endif
    punpcklqdq    m0, m3
%if %1 == 8
%define P2 [rsp+18*16]
%define P1 [rsp+19*16]
%define P0 [rsp+20*16]
%define Q0 [rsp+21*16]
%define Q1 [rsp+22*16]
%define Q2 [rsp+23*16]
%else
%define P2 [rsp+3*16]
%define P1 [rsp+4*16]
%define P0 [rsp+5*16]
%define Q0 [rsp+6*16]
%define Q1 [rsp+7*16]
%define Q2 [rsp+8*16]
%endif
    mova          P2, m4
    mova          P1, m2
    mova          P0, m7
    mova          Q0, m6
    mova          Q1, m1
    mova          Q2, m0
%endif ; x86-32/64
%else ; %1 == 16
    ; We only use 14 pixels but we'll need the remainder at the end for
    ; the second transpose
    mova          m0, [dstq+strideq*0-16]
    mova          m1, [dstq+strideq*1-16]
    mova          m2, [dstq+strideq*2-16]
    mova          m3, [dstq+stride3q -16]
    lea         tmpq, [dstq+strideq*4]
    mova          m4, [tmpq+strideq*0-16]
    mova          m5, [tmpq+strideq*1-16]
    mova          m6, [tmpq+strideq*2-16]
%if ARCH_X86_64
    mova          m7, [tmpq+stride3q -16]

    TRANSPOSE8X8W 0, 1, 2, 3, 4, 5, 6, 7, 8
    SWAP           5, 13
    SWAP           6, 8
    SWAP           7, 9
%define P2 m13
%define P1 m8
%define P0 m9
%else ; x86-32
%define P2 [esp+18*16]
%define P1 [esp+19*16]
%define P0 [esp+20*16]
    TRANSPOSE8X8W 0, 1, 2, 3, 4, 5, 6, 7, \
                     [tmpq+stride3q -16], P2, "", a, a
    mova          P1, m6
    mova          P0, m7
%endif ; x86-32/64
    mova [rsp+ 7*16], m0
    mova [rsp+ 8*16], m1
    mova [rsp+ 9*16], m2
    mova [rsp+10*16], m3
%define P3 [rsp+6*16]
    mova          P3, m4

    mova          m0, [dstq+strideq*0]
    mova          m1, [dstq+strideq*1]
    mova          m2, [dstq+strideq*2]
    mova          m3, [dstq+stride3q ]
    lea         tmpq, [dstq+strideq*4]
    mova          m4, [tmpq+strideq*0]
    mova          m5, [tmpq+strideq*1]
    mova          m6, [tmpq+strideq*2]
%if ARCH_X86_64
    mova          m7, [tmpq+stride3q ]

    TRANSPOSE8X8W 0, 1, 2, 3, 4, 5, 6, 7, 10
    SWAP          0, 10
    SWAP          1, 11
    SWAP          2, 14
    SWAP          3, 15
%define Q0 m10
%define Q1 m11
%define Q2 m14
%define Q3 m15
%else ; x86-32
    TRANSPOSE8X8W 0, 1, 2, 3, 4, 5, 6, 7, \
                     [tmpq+stride3q ], [rsp+12*16], "", a, a
%define Q0 [esp+21*16]
%define Q1 [esp+22*16]
%define Q2 [esp+23*16]
%define Q3 [esp+24*16]
    mova         Q0, m0
    mova         Q1, m1
    mova         Q2, m2
    mova         Q3, m3
%endif ; x86-32/64

    mova [rsp+11*16], m4
%if ARCH_X86_64
    mova [rsp+12*16], m5
%endif
    mova [rsp+13*16], m6
    mova [rsp+14*16], m7
%endif ; %1 == 4/6/8/16
%endif ; %2 ==/!= v

    ; load L/E/I/H
%if ARCH_X86_32
%define l_strideq r5
    mov    l_strideq, dword lstridem
%ifidn %2, v
%define lq r3
    mov           lq, dword lm
%endif
%endif
%ifidn %2, v
%if cpuflag(sse4)
    pmovzxbw      m1, [lq]
    pmovzxbw      m0, [lq+l_strideq]
    pxor          m2, m2
%else ; ssse3
    movq          m1, [lq]
    movq          m0, [lq+l_strideq]
    pxor          m2, m2
    REPX {punpcklbw x, m2}, m1, m0
%endif ; ssse3/sse4
%else ; %2 != v
    movq          m0, [lq]                      ; l0, l1
    movq          m1, [lq+l_strideq]            ; l2, l3
    punpckldq     m0, m1                        ; l0, l2, l1, l3
    pxor          m2, m2
    punpcklbw     m1, m0, m2                    ; l0, l2
    punpckhbw     m0, m2                        ; l1, l3
%endif ; %2==/!=v
%if ARCH_X86_32
%ifidn %2, v
%undef lq
    mov     mstrideq, mstridem
%endif
%endif
    pcmpeqw       m5, m2, m0
    pand          m1, m5
    por           m0, m1                        ; l[x][] ? l[x][] : l[x-stride][]
    pshufb        m0, [PIC_sym(pb_4x1_4x5_4x9_4x13)] ; l[x][1]
    pcmpeqw       m5, m2, m0                    ; !L
    psrlw         m5, 1
%if ARCH_X86_64
    psrlw         m2, m0, [lutq+128]
    SPLATW        m1, [lutq+136]
%else ; x86-32
    mov           r5, lutm
    psrlw         m2, m0, [r5+128]
    SPLATW        m1, [r5+136]
%endif ; x86-32/64
    pminsw        m2, m1
    pmaxsw        m2, [PIC_sym(pw_1)]           ; I
    psrlw         m1, m0, 4                     ; H
    paddw         m0, [PIC_sym(pw_2)]
    paddw         m0, m0
    paddw         m0, m2                        ; E
    REPX {pmullw x, [bdmulq]}, m0, m1, m2
%if ARCH_X86_32
%undef l_strideq
    lea    stride3q, [strideq*3]
%endif

    psubw         m3, P1, P0                    ; p1-p0
    psubw         m4, Q0, Q1                    ; q0-q1
    REPX {pabsw x, x}, m3, m4
    pmaxsw        m3, m5
    pmaxsw        m3, m4
    pcmpgtw       m7, m3, m1                    ; hev
%if %1 != 4
    psubw         m4, P2, P0                    ; p2-p0
    pabsw         m4, m4
    pmaxsw        m4, m3
%if %1 != 6
    mova          m6, P3                        ; p3
    psubw         m5, m6, P0                    ; p3-p0
    pabsw         m5, m5
    pmaxsw        m4, m5
%endif ; %1 != 6
    psubw         m5, Q0, Q2                    ; q0-q2
    pabsw         m5, m5
    pmaxsw        m4, m5
%if %1 != 6
    psubw         m5, Q0, Q3                    ; q0-q3
    pabsw         m5, m5
    pmaxsw        m4, m5
%endif ; %1 != 6
    pcmpgtw       m4, [bdmulq]                     ; !flat8in

    psubw         m5, P2, P1                    ; p2-p1
    pabsw         m5, m5
%if %1 != 6
    psubw         m6, P2                        ; p3-p2
    pabsw         m6, m6
    pmaxsw        m5, m6
    psubw         m6, Q2, Q3                    ; q2-q3
    pabsw         m6, m6
    pmaxsw        m5, m6
%endif ; %1 != 6
    psubw         m6, Q2, Q1                    ; q2-q1
    pabsw         m6, m6
    pmaxsw        m5, m6

%if %1 == 16
    SPLATD        m6, [maskq+8]
    SPLATD        m1, [maskq+4]
    por           m6, m1
    pand          m6, m12
    pcmpeqd       m6, m12
    pand          m5, m6
%else ; %1 != 16
    SPLATD        m6, [maskq+4]
    pand          m6, m12
    pcmpeqd       m6, m12
    pand          m5, m6                        ; only apply fm-wide to wd>4 blocks
%endif ; %1==/!=16
    pmaxsw        m3, m5
%endif ; %1 != 4
    pcmpgtw       m3, m2

    psubw         m5, P1, Q1                    ; p1-q1
    psubw         m6, P0, Q0                    ; p0-q0
    REPX {pabsw x, x}, m5, m6
    paddw         m6, m6
    psrlw         m5, 1
    paddw         m5, m6                        ; abs(p0-q0)*2+(abs(p1-q1)>>1)
    pcmpgtw       m5, m0                        ; abs(p0-q0)*2+(abs(p1-q1)>>1) > E
    por           m3, m5

%if %1 == 16

%ifidn %2, v
    lea         tmpq, [dstq+mstrideq*8]
    mova          m0, [tmpq+strideq*1]
    mova          m1, [tmpq+strideq*2]
    mova          m2, [tmpq+stride3q]
%else ; %2 != v
    mova          m0, [rsp+ 8*16]
    mova          m1, [rsp+ 9*16]
    mova          m2, [rsp+10*16]
%endif ; %2==/!=v
    REPX {psubw x, P0}, m0, m1, m2
    REPX {pabsw x, x}, m0, m1, m2
    pmaxsw        m1, m0
    pmaxsw        m1, m2
%ifidn %2, v
    lea         tmpq, [dstq+strideq*4]
    mova          m0, [tmpq+strideq*0]
    mova          m2, [tmpq+strideq*1]
    mova          m5, [tmpq+strideq*2]
%else ; %2 != v
    mova          m0, [rsp+11*16]
    mova          m2, [rsp+12*16]
    mova          m5, [rsp+13*16]
%endif ; %2==/!=v
    REPX {psubw x, Q0}, m0, m2, m5
    REPX {pabsw x, x}, m0, m2, m5
    pmaxsw        m0, m2
    pmaxsw        m1, m5
    pmaxsw        m1, m0
    pcmpgtw       m1, [bdmulq]                  ; !flat8out
    por           m1, m4                        ; !flat8in | !flat8out
    SPLATD        m2, [maskq+8]
    pand          m5, m2, m12
    pcmpeqd       m5, m12
    pandn         m1, m5                        ; flat16
    pandn         m5, m3, m1                    ; flat16 & fm
    SWAP           1, 5

    SPLATD        m5, [maskq+4]
    por           m5, m2
    pand          m2, m5, m12
    pcmpeqd       m2, m12
    pandn         m4, m2                        ; flat8in
    pandn         m2, m3, m4
    SWAP           2, 4
    SPLATD        m2, [maskq+0]
    por           m2, m5
    pand          m2, m12
    pcmpeqd       m2, m12
    pandn         m3, m2
    pandn         m0, m4, m3                    ; fm & !flat8 & !flat16
    SWAP           0, 3
    pandn         m0, m1, m4                    ; flat8 & !flat16
    SWAP           0, 4
%elif %1 != 4
    SPLATD        m0, [maskq+4]
    pand          m2, m0, m12
    pcmpeqd       m2, m12
    pandn         m4, m2
    pandn         m2, m3, m4                    ; flat8 & fm
    SWAP           2, 4
    SPLATD        m2, [maskq+0]
    por           m0, m2
    pand          m0, m12
    pcmpeqd       m0, m12
    pandn         m3, m0
    pandn         m0, m4, m3                    ; fm & !flat8
    SWAP           0, 3
%else ; %1 == 4
    SPLATD        m0, [maskq+0]
    pand          m0, m12
    pcmpeqd       m0, m12
    pandn         m3, m0                        ; fm
%endif ; %1==/!=4

    ; short filter
%if ARCH_X86_64
    SPLATW        m0, r7m
%else
    SPLATW        m0, bdmulm
%endif
    pcmpeqw       m2, m2
    psrlw         m0, 1                         ; 511 or 2047
    pxor          m2, m0                        ; -512 or -2048

    psubw         m5, Q0, P0                    ; q0-p0
    paddw         m6, m5, m5
    paddw         m6, m5                        ; 3*(q0-p0)
    psubw         m5, P1, Q1                    ; iclip_diff(p1-q1)
    pminsw        m5, m0
    pmaxsw        m5, m2
    pand          m5, m7                        ; f=iclip_diff(p1-q1)&hev
    paddw         m5, m6                        ; f=iclip_diff(3*(q0-p0)+f)
    pminsw        m5, m0
    pmaxsw        m5, m2
    pand          m3, m5                        ; f&=fm
    paddw         m5, m3, [PIC_sym(pw_3)]
    paddw         m3, [PIC_sym(pw_4)]
    REPX {pminsw x, m0}, m5, m3
    psraw         m5, 3                         ; f2
    psraw         m3, 3                         ; f1
    psubw         m0, m2                        ; 1023 or 4095
    pxor          m2, m2
%if ARCH_X86_64
    paddw         P0, m5
    psubw         Q0, m3
%else
    paddw          m5, P0
    psubw          m6, Q0, m3
    REPX {pminsw x, m0}, m5, m6
    REPX {pmaxsw x, m2}, m5, m6
%endif

    paddw         m3, [PIC_sym(pw_1)]
    psraw         m3, 1                         ; f=(f1+1)>>1
    pandn         m7, m3                        ; f&=!hev
    SWAP           7, 3
%if ARCH_X86_64
    paddw         P1, m3
    psubw         Q1, m3
    REPX {pminsw x, m0}, P1, P0, Q0, Q1
    REPX {pmaxsw x, m2}, P1, P0, Q0, Q1
%else
    psubw         m7, Q1, m3
    paddw         m3, P1
    REPX {pminsw x, m0}, m7, m3
    REPX {pmaxsw x, m2}, m7, m3
%if %1 > 4
    mova          P1, m3
    mova          P0, m5
    mova          Q0, m6
    mova          Q1, m7
%endif
%endif

%if %1 == 16

; m8-11 = p1/p0/q0/q1, m4=flat8, m1=flat16
; m12=filter bits mask
; m13-15=p2/q2/q3
; m0,2-3,5-7 = free

    ; flat16 filter
%ifidn %2, v
    lea         tmpq, [dstq+mstrideq*8]
    mova          m0, [tmpq+strideq*1]          ; p6
    mova          m2, [tmpq+strideq*2]          ; p5
    mova          m7, [tmpq+stride3q]           ; p4
    mova          m6, [tmpq+strideq*4]          ; p3
    lea         tmpq, [dstq+mstrideq*4]
%else ; %2 != v
    mova          m0, [rsp+ 8*16]
    mova          m2, [rsp+ 9*16]
    mova          m7, [rsp+10*16]
    mova          m6, [rsp+ 6*16]
%endif ; %2==/!=v

    mova [rsp+ 0*16], m4

    ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0
    psllw         m3, m0, 3                     ; p6*8
    paddw         m3, [PIC_sym(pw_8)]
    paddw         m5, m2, m7                    ; p5+p4
    psubw         m3, m0
    paddw         m5, m5                        ; (p5+p4)*2
    paddw         m3, m6                        ; p6*7+p3
    paddw         m5, P2                        ; (p5+p4)*2+p2
    paddw         m3, P1                        ; p6*7+p3+p1
    paddw         m5, P0                        ; (p5+p4)*2+p2+p0
    paddw         m3, Q0                        ; p6*7+p3+p1+q0
    paddw         m3, m5                        ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0
    psrlw         m5, m3, 4
    pand          m5, m1
    pandn         m4, m1, m2
    por           m5, m4
%ifidn %2, v
    mova [tmpq+mstrideq*2], m5                   ; p5
%else ; %2 != v
    mova  [rsp+9*16], m5
%endif ; %2==/!=v

    ; sub p6*2, add p3/q1
    paddw         m3, m6
    paddw         m5, m0, m0
    paddw         m3, Q1
    psubw         m3, m5
    psrlw         m5, m3, 4
    pand          m5, m1
    pandn         m4, m1, m7
    por           m5, m4
%ifidn %2, v
    mova [tmpq+mstrideq*1], m5                   ; p4
%else ; %2 != v
    mova [rsp+10*16], m5
%endif ; %2==/!=v

    ; sub p6/p5, add p2/q2
    psubw         m3, m0
    paddw         m5, P2, Q2
    psubw         m3, m2
    paddw         m3, m5
    psrlw         m5, m3, 4
    pand          m5, m1
    pandn         m4, m1, m6
    por           m5, m4
%ifidn %2, v
    mova [tmpq+strideq*0], m5                  ; p3
%else ; %2 != v
    mova  [rsp+6*16], m5
%endif ; %2==/!=v

%define WRITE_IN_PLACE 0
%ifidn %2, v
%if ARCH_X86_64
%define WRITE_IN_PLACE 1
%endif
%endif

    ; sub p6/p4, add p1/q3
    paddw         m3, P1
    paddw         m5, m0, m7
    paddw         m3, Q3
    psubw         m3, m5
    psrlw         m5, m3, 4
    pand          m5, m1
    pandn         m4, m1, P2
    por           m5, m4
%if WRITE_IN_PLACE
    mova [tmpq+strideq*1], m5
%else
    mova  [rsp+1*16], m5                        ; don't clobber p2/m13
%endif

    ; sub p6/p3, add p0/q4
    paddw         m3, P0
    paddw         m5, m0, m6
%ifidn %2, v
    paddw         m3, [dstq+strideq*4]
%else ; %2 != v
    paddw         m3, [rsp+11*16]
%endif ; %2==/!=v
    psubw         m3, m5
    psrlw         m5, m3, 4
    pand          m5, m1
    pandn         m4, m1, P1
    por           m5, m4
%if WRITE_IN_PLACE
    mova [dstq+mstrideq*2], m5
%else
    mova  [rsp+2*16], m5                        ; don't clobber p1/m3
%endif

    ; sub p6/p2, add q0/q5
    paddw         m3, Q0
    paddw         m5, m0, P2
%ifidn %2, v
%if ARCH_X86_32
    lea           r4, P2
%endif
    lea         tmpq, [dstq+strideq*4]
    paddw         m3, [tmpq+strideq*1]
%else ; %2 != v
    paddw         m3, [rsp+12*16]
%endif ; %2==/!=v
    psubw         m3, m5
    psrlw         m5, m3, 4
    pand          m5, m1
    pandn         m4, m1, P0
    por           m5, m4
%if WRITE_IN_PLACE
    mova [dstq+mstrideq*1], m5
%else
    mova  [rsp+3*16], m5                        ; don't clobber p0/m4
%endif

    ; sub p6/p1, add q1/q6
    paddw         m3, Q1
    paddw         m5, m0, P1
%ifidn %2, v
    mova          m0, [tmpq+strideq*2]          ; q6
%else ; %2 != v
    mova          m0, [rsp+13*16]               ; q6
%endif ; %2==/!=v
    paddw         m3, m0
    psubw         m3, m5
    psrlw         m5, m3, 4
    pand          m5, m1
    pandn         m4, m1, Q0
    por           m5, m4
%if WRITE_IN_PLACE
    mova      [dstq], m5
%else
    mova  [rsp+4*16], m5                        ; don't clobber q0/m5
%endif

    ; sub p5/p0, add q2/q6
    paddw         m3, Q2
    paddw         m5, m2, P0
    paddw         m3, m0
    psubw         m3, m5
    psrlw         m5, m3, 4
    pand          m5, m1
    pandn         m4, m1, Q1
    por           m2, m5, m4                    ; don't clobber q1/m6

    ; sub p4/q0, add q3/q6
    paddw         m3, Q3
    paddw         m7, Q0
    paddw         m3, m0
    psubw         m3, m7
    psrlw         m7, m3, 4
    pand          m7, m1
    pandn         m4, m1, Q2
    por           m7, m4                        ; don't clobber q2/m14

    ; sub p3/q1, add q4/q6
%ifidn %2, v
    paddw         m3, [tmpq+strideq*0]
%else ; %2 != v
    paddw         m3, [rsp+11*16]
%endif ; %2==/!=v
    paddw         m6, Q1
    paddw         m3, m0
    psubw         m3, m6
    psrlw         m6, m3, 4
    pand          m6, m1
    pandn         m4, m1, Q3
    por           m6, m4
%if WRITE_IN_PLACE
    mova [tmpq+mstrideq], m6                    ; q3
%else ; %2 != v
    mova  [rsp+5*16], m6
%endif ; %2==/!=v

    ; sub p2/q2, add q5/q6
%ifidn %2, v
    paddw         m3, [tmpq+strideq*1]
%if ARCH_X86_64
    paddw         m5, P2, Q2
%else
    ; tmpq is clobbered here, so we use a backup pointer for P2 instead
    paddw         m5, [r4], Q2
    mov     pic_regq, pic_regm
%endif
%else ; %2 != v
    paddw         m3, [rsp+12*16]
    paddw         m5, P2, Q2
%endif ; %2==/!=v
    paddw         m3, m0
    psubw         m3, m5
    psrlw         m5, m3, 4
    pand          m5, m1
%ifidn %2, v
    pandn         m4, m1, [tmpq+strideq*0]
%else ; %2 != v
    pandn         m4, m1, [rsp+11*16]
%endif ; %2==/!=v
    por           m5, m4
%ifidn %2, v
    mova [tmpq+strideq*0], m5                   ; q4
%else ; %2 != v
    mova [rsp+11*16], m5
%endif ; %2==/!=v

    ; sub p1/q3, add q6*2
    psubw         m3, P1
    paddw         m0, m0
    psubw         m3, Q3
    paddw         m3, m0
    psrlw         m5, m3, 4
    pand          m5, m1
%ifidn %2, v
    pandn         m4, m1, [tmpq+strideq*1]
%else ; %2 != v
    pandn         m4, m1, [rsp+12*16]
%endif ; %2==/!=v
    por           m5, m4
%ifidn %2, v
    mova [tmpq+strideq*1], m5                   ; q5
%else ; %2 != v
    mova [rsp+12*16], m5
%endif ; %2==/!=v

    mova          m4, [rsp+0*16]
%ifidn %2, v
    lea         tmpq, [dstq+mstrideq*4]
%endif
%if ARCH_X86_64
    SWAP           2, 11
    SWAP           7, 14
    SWAP           6, 15
%else ; x86-32
    mova          Q1, m2
    mova          Q2, m7
%endif ; x86-32/64
%if WRITE_IN_PLACE
    mova          P2, [tmpq+strideq*1]
    mova          P1, [tmpq+strideq*2]
    mova          P0, [tmpq+stride3q]
    mova          Q0, [dstq]
%elif ARCH_X86_64
    mova          P2, [rsp+1*16]
    mova          P1, [rsp+2*16]
    mova          P0, [rsp+3*16]
    mova          Q0, [rsp+4*16]
%else ; !WRITE_IN_PLACE & x86-32
    mova          m0, [rsp+1*16]
    mova          m1, [rsp+2*16]
    mova          m2, [rsp+3*16]
    mova          m3, [rsp+4*16]
    mova          m7, [rsp+5*16]
    mova          P2, m0
    mova          P1, m1
    mova          P0, m2
    mova          Q0, m3
    mova          Q3, m7
%endif ; WRITE_IN_PLACE / x86-32/64
%undef WRITE_IN_PLACE
%endif ; %1 == 16

%if %1 >= 8

    ; flat8 filter
    mova          m0, P3                        ; p3
    paddw         m1, m0, P2                    ; p3+p2
    paddw         m2, P1, P0                    ; p1+p0
    paddw         m3, m1, m1                    ; 2*(p3+p2)
    paddw         m2, m0                        ; p1+p0+p3
    paddw         m3, Q0                        ; 2*(p3+p2)+q0
    paddw         m2, m3                        ; 3*p3+2*p2+p1+p0+q0
    pmulhrsw      m7, m2, [PIC_sym(pw_4096)]
    psubw         m7, P2
    pand          m7, m4

    paddw         m3, P1, Q1                    ; p1+q1
    psubw         m2, m1                        ; 2*p3+p2+p1+p0+q0
    paddw         m2, m3                        ; 2*p3+p2+2*p1+p0+q0+q1
    pmulhrsw      m3, m2, [PIC_sym(pw_4096)]
    psubw         m3, P1
    pand          m3, m4

    paddw         m5, m0, P1                    ; p3+p1
    paddw         m6, P0, Q2                    ; p0+q2
    psubw         m2, m5                        ; p3+p2+p1+p0+q0+q1
    paddw         m2, m6                        ; p3+p2+p1+2*p0+q0+q1+q2
    pmulhrsw      m5, m2, [PIC_sym(pw_4096)]
    psubw         m5, P0
    pand          m5, m4

    paddw         m6, m0, P0                    ; p3+p0
    paddw         m1, Q0, Q3                    ; q0+q3
    psubw         m2, m6                        ; p2+p1+p0+q0+q1+q2
    paddw         m2, m1                        ; p2+p1+p0+2*q0+q1+q2+q3
    pmulhrsw      m6, m2, [PIC_sym(pw_4096)]
    psubw         m6, Q0
    pand          m6, m4

    paddw         m2, Q1                        ; p2+p1+p0+2*q0+2*q1+q2+q3
    paddw         m2, Q3                        ; p2+p1+p0+2*q0+2*q1+q2+2*q3
    paddw         m1, P2, Q0                    ; p2+q0
    psubw         m2, m1                        ; p1+p0+q0+2*q1+q2+2*q3
    pmulhrsw      m1, m2, [PIC_sym(pw_4096)]
    psubw         m1, Q1
    pand          m1, m4

    psubw         m2, P1                        ; p0+q0+2*q1+q2+2*q3
    psubw         m2, Q1                        ; p0+q0+q1+q2+2*q3
    paddw         m0, Q3, Q2                    ; q3+q2
    paddw         m2, m0                        ; p0+q0+q1+2*q2+3*q3
    pmulhrsw      m2, [PIC_sym(pw_4096)]
    psubw         m2, Q2
    pand          m2, m4

    paddw         m7, P2
    paddw         m3, P1
    paddw         m5, P0
    paddw         m6, Q0
    paddw         m1, Q1
    paddw         m2, Q2

%ifidn %2, v
    mova [tmpq+strideq*1], m7                   ; p2
    mova [tmpq+strideq*2], m3                   ; p1
    mova [tmpq+stride3q ], m5                   ; p0
    mova [dstq+strideq*0], m6                   ; q0
    mova [dstq+strideq*1], m1                   ; q1
    mova [dstq+strideq*2], m2                   ; q2
%else ; %2 != v
    mova          m0, P3

%if %1 == 8
    lea         tmpq, [dstq+strideq*4]
%if ARCH_X86_64
    SWAP           4, 15
    TRANSPOSE8X8W  0, 7, 3, 5, 6, 1, 2, 4, 8
%else
    TRANSPOSE8X8W  0, 7, 3, 5, 6, 1, 2, 4, "", \
                      Q3, [tmpq+strideq*1-8], a, u
%endif

    ; write 8x8
    movu   [dstq+strideq*0-8], m0
    movu   [dstq+strideq*1-8], m7
    movu   [dstq+strideq*2-8], m3
    movu   [dstq+stride3q -8], m5
    movu   [tmpq+strideq*0-8], m6
%if ARCH_X86_64
    movu   [tmpq+strideq*1-8], m1
%endif
    movu   [tmpq+strideq*2-8], m2
    movu   [tmpq+stride3q -8], m4
    lea         dstq, [dstq+strideq*8]
%else ; %1 != 8
%if ARCH_X86_64
    SWAP           6, 8
    SWAP           1, 9
    SWAP           2, 10
%else
    mova  [rsp+1*16], m6
    mova  [rsp+2*16], m1
    mova  [rsp+3*16], m2
%endif

    mova          m1, [rsp+ 7*16]
    mova          m2, [rsp+ 8*16]
    mova          m4, [rsp+ 9*16]
    mova          m6, [rsp+10*16]
    lea         tmpq, [dstq+strideq*4]
%if ARCH_X86_64
    TRANSPOSE8X8W  1, 2, 4, 6, 0, 7, 3, 5, 11
%else
    mova  [rsp+7*16],  m5
    TRANSPOSE8X8W  1, 2, 4, 6, 0, 7, 3, 5, "", \
                      [rsp+7*16], [tmpq+strideq*1-16], a, a
%endif

    mova [dstq+strideq*0-16], m1
    mova [dstq+strideq*1-16], m2
    mova [dstq+strideq*2-16], m4
    mova [dstq+stride3q -16], m6
    mova [tmpq+strideq*0-16], m0
%if ARCH_X86_64
    mova [tmpq+strideq*1-16], m7
%endif
    mova [tmpq+strideq*2-16], m3
    mova [tmpq+stride3q -16], m5

%if ARCH_X86_64
    SWAP           6, 8
    SWAP           1, 9
    SWAP           2, 10
    SWAP           4, 15
%else
    mova          m6, [rsp+1*16]
    mova          m1, [rsp+2*16]
    mova          m2, [rsp+3*16]
    mova          m4, Q3
%endif
    mova          m0, [rsp+11*16]
    mova          m3, [rsp+12*16]
    mova          m5, [rsp+13*16]
%if ARCH_X86_64
    mova          m7, [rsp+14*16]
    TRANSPOSE8X8W  6, 1, 2, 4, 0, 3, 5, 7, 8
%else
    TRANSPOSE8X8W  6, 1, 2, 4, 0, 3, 5, 7, "", \
                      [rsp+14*16], [tmpq+strideq*1], a, a
%endif
    mova [dstq+strideq*0], m6
    mova [dstq+strideq*1], m1
    mova [dstq+strideq*2], m2
    mova [dstq+stride3q ], m4
    mova [tmpq+strideq*0], m0
%if ARCH_X86_64
    mova [tmpq+strideq*1], m3
%endif
    mova [tmpq+strideq*2], m5
    mova [tmpq+stride3q ], m7
    lea         dstq, [dstq+strideq*8]
%endif ; %1==/!=8
%endif ; %2==/!=v
%elif %1 == 6
    ; flat6 filter
    paddw         m3, P1, P0                    ; p1+p0
    paddw         m3, P2                        ; p2+p1+p0
    paddw         m6, P2, Q0                    ; p2+q0
    paddw         m3, m3                        ; 2*(p2+p1+p0)
    paddw         m3, m6                        ; p2+2*(p2+p1+p0)+q0
    pmulhrsw      m2, m3, [PIC_sym(pw_4096)]
    psubw         m2, P1
    pand          m2, m4

    paddw         m3, Q0                        ; p2+2*(p2+p1+p0+q0)
    paddw         m6, P2, P2                    ; 2*p2
    paddw         m3, Q1                        ; p2+2*(p2+p1+p0+q0)+q1
    psubw         m3, m6                        ; p2+2*(p1+p0+q0)+q1
    pmulhrsw      m5, m3, [PIC_sym(pw_4096)]
    psubw         m5, P0
    pand          m5, m4

    paddw         m3, Q1                        ; p2+2*(p1+p0+q0+q1)
    paddw         m6, P2, P1                    ; p2+p1
    paddw         m3, Q2                        ; p2+2*(p1+p0+q0+q1)+q2
    psubw         m3, m6                        ; p1+2*(p0+q0+q1)+q2
    pmulhrsw      m6, m3, [PIC_sym(pw_4096)]
    psubw         m6, Q0
    pand          m6, m4

    psubw         m3, P1                        ; 2*(p0+q0+q1)+q2
%if ARCH_X86_64
    paddw         Q2, Q2                        ; q2*2
%else
    mova          m0, Q2
    paddw         m0, m0
%endif
    psubw         m3, P0                        ; p0+2*(q0+q1)+q2
%if ARCH_X86_64
    paddw         m3, Q2                        ; p0+2*(q0+q1+q2)+q2
%else
    paddw         m3, m0
%endif
    pmulhrsw      m3, [PIC_sym(pw_4096)]
    psubw         m3, Q1
    pand          m3, m4

    paddw         m2, P1
    paddw         m5, P0
    paddw         m6, Q0
    paddw         m3, Q1

%ifidn %2, v
    mova [dstq+mstrideq*2], m2                   ; p1
    mova [dstq+mstrideq*1], m5                   ; p0
    mova [dstq+strideq*0], m6                   ; q0
    mova [dstq+strideq*1], m3                   ; q1
%else ; %2 != v
    TRANSPOSE_8x4_AND_WRITE_4x8 m2, m5, m6, m3, m0
%endif ; %2==/!=v
%else ; %1 == 4
%if ARCH_X86_64
%ifidn %2, v
    mova [dstq+mstrideq*2], P1                   ; p1
    mova [dstq+mstrideq*1], P0                   ; p0
    mova [dstq+strideq*0], Q0                   ; q0
    mova [dstq+strideq*1], Q1                   ; q1
%else ; %2 != v
    TRANSPOSE_8x4_AND_WRITE_4x8 P1, P0, Q0, Q1, m0
%endif ; %2==/!=v
%else ; x86-32
%ifidn %2, v
    mova [dstq+mstrideq*2], m3
    mova [dstq+mstrideq*1], m5
    mova [dstq+strideq*0], m6
    mova [dstq+strideq*1], m7
%else ; %2 != v
    TRANSPOSE_8x4_AND_WRITE_4x8 m3, m5, m6, m7, m0
%endif ; %2==/!=v
%endif ; x86-32/64
%endif ; %1
%undef P3
%undef P2
%undef P1
%undef P0
%undef Q0
%undef Q1
%undef Q2
%undef Q3
%endmacro

INIT_XMM ssse3
; stack layout:
; r0 - flat8 backup inside flat16 code
%if ARCH_X86_64
cglobal lpf_v_sb_y_16bpc, 6, 12, 16, -16 * 1, \
                          dst, stride, mask, l, l_stride, lut, \
                          w, stride3, mstride, tmp, mask_bits, bdmul
    mov          r6d, r7m
    sar          r6d, 7
    and          r6d, 16                      ; 0 for 10bpc, 16 for 12bpc
    lea       bdmulq, [pw_4]
    add       bdmulq, r6
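    ; bdmulq now points at pw_4 (10bpc) or pw_16 (12bpc); pw_16 directly
    ; follows pw_4 in rodata, so the +16 offset selects it. This scales the
    ; E/I/H thresholds to the bitdepth.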
    mov           wd, wm
    shl    l_strideq, 2
    sub           lq, l_strideq
%else
; stack layout [32bit only]:
; r1-4 - p2-q0 post-filter16
; r5 - p3
; r6 - q3 post-filter16
; r7 - GPRs [mask_bitsm, mstridem]
; r8 - m12/pb_mask
; r9 - bdmulq
cglobal lpf_v_sb_y_16bpc, 4, 7, 8, -16 * (10 + extra_stack), \
                          dst, stride, mask, mstride, pic_reg, stride3, tmp
    RELOC_ARGS     v, 10*16
%if STACK_ALIGNMENT >= 16
    mov          r5d, r7m
%endif
    sar          r5d, 7
    and          r5d, 16                      ; 0 for 10bpc, 16 for 12bpc
    LEA     pic_regq, PIC_base
%define pic_regm dword [esp+7*16+2*gprsize]
    mov     pic_regm, pic_regq
    mova          m0, [PIC_sym(pw_4)+r5]
%define bdmulq esp+9*16
    mova    [bdmulq], m0
    shl dword lstridem, 2
    sub           r3, dword lstridem
    mov     dword lm, r3
%endif
    mov     mstrideq, strideq
    neg     mstrideq
    lea     stride3q, [strideq*3]
%if ARCH_X86_64
    mov   mask_bitsd, 0x3
    mova         m12, [pb_mask]
%else
%define mstridem dword [esp+7*16+1*gprsize]
    mov     mstridem, mstrideq
%define mask_bitsm dword [esp+7*16+0*gprsize]
    mov   mask_bitsm, 0x3
    mova          m0, [PIC_sym(pb_mask)]
%define m12 [esp+8*16]
    mova         m12, m0
%endif

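    ; each iteration filters an 8-pixel-wide strip (two 4-pixel units); the
    ; vmask bits selected by mask_bits/m12 pick the widest applicable filter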
.loop:
%if ARCH_X86_64
    test   [maskq+8], mask_bitsd              ; vmask[2]
%else
    mov          r6d, mask_bitsm
    test   [maskq+8], r6d
%endif
    jz .no_flat16

    FILTER        16, v
    jmp .end

.no_flat16:
%if ARCH_X86_64
    test   [maskq+4], mask_bitsd              ; vmask[1]
%else
    test   [maskq+4], r6d
%endif
    jz .no_flat

    FILTER         8, v
    jmp .end

.no_flat:
%if ARCH_X86_64
    test   [maskq+0], mask_bitsd              ; vmask[0]
%else
    test   [maskq+0], r6d
%endif
    jz .end

    FILTER         4, v

.end:
%if ARCH_X86_64
    pslld        m12, 2
    add           lq, 8
%else
    mova          m0, m12
    pslld         m0, 2
    mova         m12, m0
    add     dword lm, 8
%endif
    add         dstq, 16
%if ARCH_X86_64
    shl   mask_bitsd, 2
    sub           wd, 2
%else
    shl   mask_bitsm, 2
    sub     dword wm, 2
%endif
    jg .loop
%undef mask_bitsm
%undef bdmulq
    UNRELOC_ARGS
    RET

INIT_XMM ssse3
; stack layout:
; r0 - flat8 backup inside flat16
; r1-4 - p2-q0 post-filter16 backup
; r5 - q3 post-filter16 backup
; r6 - p3
; r7-10 - p7-4
; r11-14 - q4-7
%if ARCH_X86_64
cglobal lpf_h_sb_y_16bpc, 6, 11, 16, -16 * 15, \
                          dst, stride, mask, l, l_stride, lut, \
                          h, stride3, tmp, mask_bits, bdmul
    mov          r6d, r7m
    sar          r6d, 7
    and          r6d, 16                      ; 0 for 10bpc, 16 for 12bpc
    lea       bdmulq, [pw_4]
    add       bdmulq, r6
    mov           hd, hm
    shl    l_strideq, 2
%else
; stack layout [32bit only]:
; r15 - GPRs [mask_bitsm]
; r16 - m12/pb_mask
; r17 - bdmulq
; r18-24 - p2-q3
cglobal lpf_h_sb_y_16bpc, 4, 7, 8, -16 * (25 + extra_stack), \
                          dst, stride, mask, l, pic_reg, stride3, tmp
    RELOC_ARGS     h, 25*16
%if STACK_ALIGNMENT >= 16
    mov          r5d, r7m
%endif
    sar          r5d, 7
    and          r5d, 16                      ; 0 for 10bpc, 16 for 12bpc
    LEA     pic_regq, PIC_base
    mova          m0, [PIC_sym(pw_4)+r5]
%define bdmulq esp+17*16
    mova    [bdmulq], m0
    shl dword lstridem, 2
%endif
    sub           lq, 4
    lea     stride3q, [strideq*3]
%if ARCH_X86_64
    mov   mask_bitsd, 0x3
    mova         m12, [pb_mask]
%else
%define mask_bitsm dword [esp+15*16+0*gprsize]
    mov   mask_bitsm, 0x3
    mova          m0, [PIC_sym(pb_mask)]
%define m12 [esp+16*16]
    mova         m12, m0
%endif

.loop:
%if ARCH_X86_64
    test   [maskq+8], mask_bitsd            ; vmask[2]
%else
    mov         r6d, mask_bitsm
    test   [maskq+8], r6d
%endif
    jz .no_flat16

    FILTER        16, h
    jmp .end

.no_flat16:
%if ARCH_X86_64
    test   [maskq+4], mask_bitsd            ; vmask[1]
%else
    test   [maskq+4], r6d
%endif
    jz .no_flat

    FILTER         8, h
    jmp .end

.no_flat:
%if ARCH_X86_64
    test   [maskq+0], mask_bitsd            ; vmask[0]
%else
    test   [maskq+0], r6d
%endif
    jz .no_filter

    FILTER         4, h
    jmp .end

.no_filter:
    lea         dstq, [dstq+strideq*8]
.end:
%if ARCH_X86_64
    pslld        m12, 2
    lea           lq, [lq+l_strideq*2]
    shl   mask_bitsd, 2
    sub           hd, 2
%else
    mova          m0, m12
    pslld         m0, 2
    mova         m12, m0
    add           lq, dword lstridem
    add           lq, dword lstridem
    shl   mask_bitsm, 2
    sub     dword hm, 2
%endif
    jg .loop
%undef mask_bitsm
%undef bdmulq
    UNRELOC_ARGS
    RET

INIT_XMM ssse3
%if ARCH_X86_64
cglobal lpf_v_sb_uv_16bpc, 6, 12, 16, \
                           dst, stride, mask, l, l_stride, lut, \
                           w, stride3, mstride, tmp, mask_bits, bdmul
    mov          r6d, r7m
    sar          r6d, 7
    and          r6d, 16                      ; 0 for 10bpc, 16 for 12bpc
    lea       bdmulq, [pw_4]
    add       bdmulq, r6
    mov           wd, wm
    shl    l_strideq, 2
    sub           lq, l_strideq
%else
; stack layout [32bit only]:
; r0 - GPRs [mask_bitsm, mstridem]
; r1 - m12/pb_mask
; r2 - bdmulq
cglobal lpf_v_sb_uv_16bpc, 4, 7, 8, -16 * (3 + extra_stack), \
                           dst, stride, mask, mstride, pic_reg, stride3, tmp
    RELOC_ARGS     v, 3*16
%if STACK_ALIGNMENT >= 16
    mov          r5d, r7m
%endif
    sar          r5d, 7
    and          r5d, 16                      ; 0 for 10bpc, 16 for 12bpc
    LEA     pic_regq, PIC_base
    mova          m0, [PIC_sym(pw_4)+r5]
%define bdmulq esp+2*16
    mova    [bdmulq], m0
    shl dword lstridem, 2
    sub           r3, dword lstridem
    mov     dword lm, r3
%endif
    mov     mstrideq, strideq
    neg     mstrideq
    lea     stride3q, [strideq*3]
%if ARCH_X86_64
    mov   mask_bitsd, 0x3
    mova         m12, [pb_mask]
%else
%define mask_bitsm dword [esp+0*gprsize]
%define mstridem dword [esp+1*gprsize]
    mov   mask_bitsm, 0x3
    mov     mstridem, mstrideq
    mova          m0, [PIC_sym(pb_mask)]
%define m12 [esp+1*16]
    mova         m12, m0
%endif

.loop:
%if ARCH_X86_64
    test   [maskq+4], mask_bitsd            ; vmask[1]
%else
    mov          r6d, mask_bitsm
    test   [maskq+4], r6d
%endif
    jz .no_flat

    FILTER         6, v
    jmp .end

.no_flat:
%if ARCH_X86_64
    test   [maskq+0], mask_bitsd            ; vmask[0]
%else
    test   [maskq+0], r6d
%endif
    jz .end

    FILTER         4, v

.end:
%if ARCH_X86_64
    pslld        m12, 2
    add           lq, 8
%else
    mova          m0, m12
    pslld         m0, 2
    mova         m12, m0
    add     dword lm, 8
%endif
    add         dstq, 16
%if ARCH_X86_64
    shl   mask_bitsd, 2
    sub           wd, 2
%else
    shl   mask_bitsm, 2
    sub     dword wm, 2
%endif
    jg .loop
%undef mask_bitsm
%undef bdmulq
    UNRELOC_ARGS
    RET

INIT_XMM ssse3
%if ARCH_X86_64
cglobal lpf_h_sb_uv_16bpc, 6, 11, 16, \
                           dst, stride, mask, l, l_stride, lut, \
                           h, stride3, tmp, mask_bits, bdmul
    mov          r6d, r7m
    sar          r6d, 7
    and          r6d, 16                      ; 0 for 10bpc, 16 for 12bpc
    lea       bdmulq, [pw_4]
    add       bdmulq, r6
    mov           hd, hm
    shl    l_strideq, 2
%else
; stack layout [32bit only]:
; r0 - GPRs [mask_bitsm]
; r1 - m12/pb_mask
; r2 - bdmulq
; r3-8 - p2-q2
cglobal lpf_h_sb_uv_16bpc, 4, 7, 8, -16 * (9 + extra_stack), \
                           dst, stride, mask, l, pic_reg, stride3, tmp
    RELOC_ARGS     h, 9*16
%if STACK_ALIGNMENT >= 16
    mov          r5d, r7m
%endif
    sar          r5d, 7
    and          r5d, 16                      ; 0 for 10bpc, 16 for 12bpc
    LEA     pic_regq, PIC_base
    mova          m0, [PIC_sym(pw_4)+r5]
%define bdmulq esp+2*16
    mova    [bdmulq], m0
    shl dword lstridem, 2
%endif
    sub           lq, 4
    lea     stride3q, [strideq*3]
%if ARCH_X86_64
    mov   mask_bitsd, 0x3
    mova         m12, [pb_mask]
%else
%define mask_bitsm dword [esp+0*gprsize]
    mov   mask_bitsm, 0x3
    mova          m0, [PIC_sym(pb_mask)]
%define m12 [esp+1*16]
    mova         m12, m0
%endif

.loop:
%if ARCH_X86_64
    test   [maskq+4], mask_bitsd            ; vmask[1]
%else
    mov          r6d, mask_bitsm
    test   [maskq+4], r6d
%endif
    jz .no_flat

    FILTER         6, h
    jmp .end

.no_flat:
%if ARCH_X86_64
    test   [maskq+0], mask_bitsd            ; vmask[0]
%else
    test   [maskq+0], r6d
%endif
    jz .no_filter

    FILTER         4, h
    jmp .end

.no_filter:
    lea         dstq, [dstq+strideq*8]
.end:
%if ARCH_X86_64
    pslld        m12, 2
    lea           lq, [lq+l_strideq*2]
    shl   mask_bitsd, 2
    sub           hd, 2
%else
    mova          m0, m12
    pslld         m0, 2
    mova         m12, m0
    add           lq, dword lstridem
    add           lq, dword lstridem
    shl   mask_bitsm, 2
    sub     dword hm, 2
%endif
    jg .loop
%undef mask_bitsm
%undef bdmulq
    UNRELOC_ARGS
    RET