1; Copyright © 2018, VideoLAN and dav1d authors
2; Copyright © 2018, Two Orioles, LLC
3; All rights reserved.
4;
5; Redistribution and use in source and binary forms, with or without
6; modification, are permitted provided that the following conditions are met:
7;
8; 1. Redistributions of source code must retain the above copyright notice, this
9;    list of conditions and the following disclaimer.
10;
11; 2. Redistributions in binary form must reproduce the above copyright notice,
12;    this list of conditions and the following disclaimer in the documentation
13;    and/or other materials provided with the distribution.
14;
15; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
19; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25
%include "config.asm"
%include "ext/x86/x86inc.asm"

SECTION_RODATA 16

; pshufb control: replicates byte 0/4/8/12 of the source into each group of
; four lanes (splats one L-value per 4-pixel unit across its lanes).
pb_4x0_4x4_4x8_4x12: db 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12
; Interleaved byte-pair coefficients for pmaddubsw: each pair (a, b) computes
; a*even_byte + b*odd_byte, used as taps by the wide (flat) filters.
pb_7_1: times 8 db 7, 1
pb_3_1: times 8 db 3, 1
pb_2_1: times 8 db 2, 1
pb_m1_0: times 8 db -1, 0
pb_m1_1: times 8 db -1, 1
pb_m1_2: times 8 db -1, 2
pb_1: times 16 db 1
pb_2: times 16 db 2
pb_3: times 16 db 3       ; short-filter rounding bias for f2
pb_4: times 16 db 4       ; short-filter rounding bias for f1
pb_16: times 16 db 16     ; sign-correction constant applied after the >>3
pb_63: times 16 db 63     ; clamp for the per-block I (inner edge) threshold
pb_64: times 16 db 64
pb_128: times 16 db 0x80  ; sign-flip bias: unsigned bytes <-> signed compares
pb_129: times 16 db 0x81  ; flatness threshold (value 1, sign-biased by 0x80)
pb_240: times 16 db 0xf0  ; isolates the high nibble of L to derive H (L>>4)
pb_248: times 16 db 0xf8  ; clears bits dragged across byte lanes by psrlq 3
pb_254: times 16 db 0xfe  ; clears the bit dragged across byte lanes by psrlq 1

pw_2048: times 8 dw 2048  ; pmulhrsw multiplier: rounded shift right by 4
pw_4096: times 8 dw 4096  ; pmulhrsw multiplier: rounded shift right by 3

; Per-dword bit selectors; presumably tested against the vmask bitfields via
; pand/pcmpeqd (mask0/mask1/mask2) — confirm against the rest of the file.
pd_mask: dd 1, 2, 4, 8

SECTION .text
57
; ABSSUB dst, a, b, tmp
; dst = |a - b| per unsigned byte lane, computed with two saturating
; subtractions: one of max(a-b,0) and max(b-a,0) is always zero, so
; OR-ing them yields the absolute difference.  Clobbers tmp.
%macro ABSSUB 4 ; dst, a, b, tmp
    psubusb       %1, %2, %3            ; max(a - b, 0)
    psubusb       %4, %3, %2            ; max(b - a, 0)
    por           %1, %4                ; |a - b|
%endmacro
63
; TRANSPOSE_16x4_AND_WRITE_4x16 r0, r1, r2, r3, tmp
; Transposes four 16-byte rows (m%1-m%4) into sixteen 4-byte columns and
; stores each column at dstq-2, i.e. writes the 4 pixels straddling a
; vertical edge on 16 consecutive lines.  Net effect on dstq: advanced by
; 16 lines (4 * strideq inside the loop + stride3q*4 == strideq*12 at the
; end).  Clobbers m%1, m%2, m%4, m%5 (m%3 is consumed by the transpose).
%macro TRANSPOSE_16x4_AND_WRITE_4x16 5
    ; transpose 16x4
    punpcklbw    m%5, m%1, m%2
    punpckhbw    m%1, m%2
    punpcklbw    m%2, m%3, m%4
    punpckhbw    m%3, m%4
    punpcklwd    m%4, m%5, m%2
    punpckhwd    m%5, m%2
    punpcklwd    m%2, m%1, m%3
    punpckhwd    m%1, m%3

    ; write out: after the word interleave each register holds the columns
    ; for lines n, n+4, n+8, n+12 (note stride3q*4 == strideq*12); each
    ; iteration stores one dword per register, steps dstq one line, and
    ; shifts the next column down into the low dword.
%assign %%n 0
%rep 4
    movd [dstq+strideq *0-2], xm%4
    movd [dstq+strideq *4-2], xm%5
    movd [dstq+strideq *8-2], xm%2
    movd [dstq+stride3q*4-2], xm%1
    add         dstq, strideq
%if %%n < 3
    psrldq      xm%4, 4
    psrldq      xm%5, 4
    psrldq      xm%2, 4
    psrldq      xm%1, 4
%endif
%assign %%n (%%n+1)
%endrep
    lea         dstq, [dstq+stride3q*4]
%endmacro
93
; TRANSPOSE_16X16B output_transpose, mem
; Full 16x16 byte transpose of rows m0-m15 done in four passes
; (bytes -> words -> dwords -> qwords).  %2 is a single scratch memory
; slot used to spill one register at a time, since all 16 registers are
; live during each pass.  %1 distinguishes the first (load-time, %1 == 0)
; transpose from the second (write-back, %1 == 1) one: the two use
; different [esp+N*16] spill slots and end with different register
; permutations.  On x86-32 only m0-m7 exist, so the upper eight rows live
; in stack slots and are shuttled through the low registers; several mN
; names are re-aliased to memory via %define along the way.  The SWAP
; macros on x86-64 are compile-time register renames (no data movement).
%macro TRANSPOSE_16X16B 2 ; output_transpose, mem
%if %1 == 0
    mova          %2, m15 ; m7 in 32-bit
%endif

    ; input in m0-7
    punpcklbw    m15, m0, m1
    punpckhbw     m0, m1
    punpcklbw     m1, m2, m3
    punpckhbw     m2, m3
    punpcklbw     m3, m4, m5
    punpckhbw     m4, m5
%if ARCH_X86_64
    SWAP           4, 5, 7
%else
 %if %1 == 0
    mova          m5, %2
 %else
    mova          m5, [esp+1*16]
 %endif
    mova          %2, m4
%endif
    punpcklbw     m4, m6, m5
    punpckhbw     m6, m5

    ; interleaved in m15,0,1,2,3,7,4,6
    punpcklwd     m5, m15, m1
    punpckhwd    m15, m1
    punpcklwd     m1, m0, m2
    punpckhwd     m0, m2
    punpcklwd     m2, m3, m4
    punpckhwd     m3, m4
%if ARCH_X86_64
    SWAP           3, 4, 7
%else
    mova          m4, %2
    mova          %2, m3
%endif
    punpcklwd     m3, m4, m6
    punpckhwd     m4, m6

    ; interleaved in m5,15,1,0,2,7,3,4
    punpckldq     m6, m5, m2
    punpckhdq     m5, m2
%if ARCH_X86_64
    SWAP           2, 7, 5
%else
    mova          m2, %2
    mova  [esp+1*16], m5
%endif
    punpckldq     m5, m15, m2
    punpckhdq    m15, m2
    punpckldq     m2, m1, m3
    punpckhdq     m1, m3
    punpckldq     m3, m0, m4
    punpckhdq     m0, m4

    ; x86-32: spill the finished dword-interleaved halves of the lower
    ; eight rows and pull the upper eight rows in from their stack slots
    ; (slot numbers differ between the %1 == 0 and %1 == 1 layouts).
%if ARCH_X86_32
    mova  [esp+0*16], m6
    mova  [esp+2*16], m5
    mova  [esp+3*16], m15
    mova  [esp+4*16], m2
    mova  [esp+5*16], m1
    mova  [esp+6*16], m3
    mova  [esp+7*16], m0
    mova          m8, [esp+ 8*16]
    mova          m9, [esp+ 9*16]
    mova         m10, [esp+10*16]
 %if %1 == 0
    mova         m11, [esp+11*16]
    mova         m12, [esp+12*16]
    mova         m13, [esp+13*16]
    mova         m14, [esp+14*16]
 %else
    mova         m11, [esp+20*16]
    mova         m12, [esp+15*16]
    mova         m13, [esp+16*16]
    mova         m14, [esp+17*16]
 %endif
%endif

    ; input in m8-m15
%if ARCH_X86_64
    SWAP           7, 4
%endif
    punpcklbw     m7, m8, m9
    punpckhbw     m8, m9
    punpcklbw     m9, m10, m11
    punpckhbw    m10, m11
    punpcklbw    m11, m12, m13
    punpckhbw    m12, m13
%if ARCH_X86_64
    mova         m13, %2
%else
 %if %1 == 0
    mova         m13, [esp+15*16]
 %else
    mova         m13, [esp+18*16]
 %endif
%endif
    mova          %2, m12
    punpcklbw    m12, m14, m13
    punpckhbw    m14, m14, m13

    ; interleaved in m7,8,9,10,11,rsp%2,12,14
    punpcklwd    m13, m7, m9
    punpckhwd     m7, m9
    punpcklwd     m9, m8, m10
    punpckhwd     m8, m10
    punpcklwd    m10, m11, m12
    punpckhwd    m11, m12
    mova         m12, %2
    mova          %2, m11
    punpcklwd    m11, m12, m14
    punpckhwd    m12, m14

    ; interleaved in m13,7,9,8,10,rsp%2,11,12
    punpckldq    m14, m13, m10
    punpckhdq    m13, m10
    punpckldq    m10, m9, m11
    punpckhdq     m9, m11
    punpckldq    m11, m8, m12
    punpckhdq     m8, m12
    mova         m12, %2
    mova          %2, m8
    punpckldq     m8, m7, m12
    punpckhdq     m7, m12

    ; x86-32: stash the upper-half dword results and reload the lower
    ; half for the final qword pass.
%if ARCH_X86_32
    mova [esp+ 8*16], m10
    mova [esp+ 9*16], m9
    mova [esp+10*16], m11
    SWAP           6, 1
    SWAP           4, 2
    SWAP           5, 3
    mova          m6, [esp+0*16]
    mova          m4, [esp+1*16]
    mova          m5, [esp+2*16]
%endif

    ; interleaved in m6,7,5,15,2,1,3,0,14,13,10,9,11,rsp%2,8,7
    punpcklqdq   m12, m6, m14
    punpckhqdq    m6, m14
    punpcklqdq   m14, m4, m13
    punpckhqdq    m4, m13
    punpcklqdq   m13, m5, m8
    punpckhqdq    m5, m8
%if ARCH_X86_64
    SWAP           8, 5
%else
    mova          m8, [esp+3*16]
    mova [esp+27*16], m5
 %define m15 m8
%endif
    punpcklqdq    m5, m15, m7
    punpckhqdq   m15, m7

%if ARCH_X86_32
    mova [esp+11*16], m12
    mova [esp+12*16], m6
    mova [esp+13*16], m14
    mova [esp+14*16], m4
    mova [esp+26*16], m13
    mova [esp+ 0*16], m5
    mova [esp+ 1*16], m15
    mova          m2, [esp+ 4*16]
    mova         m10, [esp+ 8*16]
    mova          m1, [esp+ 5*16]
    mova          m9, [esp+ 9*16]
    mova          m3, [esp+ 6*16]
    mova         m11, [esp+10*16]
    mova          m0, [esp+ 7*16]
%endif

    punpcklqdq    m7, m2, m10
    punpckhqdq    m2, m10
    punpcklqdq   m10, m1, m9
    punpckhqdq    m1, m9
    punpcklqdq    m9, m3, m11
    punpckhqdq    m3, m11
    mova         m11, %2
%if ARCH_X86_32
 %define m12 m3
%endif
    mova          %2, m12
    punpcklqdq   m12, m0, m11
    punpckhqdq    m0, m11
%if %1 == 1
    mova         m11, %2
%endif

    ; Final permutation: put the transposed rows back in ascending
    ; register order (x86-64 renames; x86-32 spills/renames).
%if ARCH_X86_64
    ; interleaved m11,6,14,4,13,8,5,15,7,2,10,1,9,3,12,0
    SWAP           0, 11, 1, 6, 5, 8, 7, 15
    SWAP           2, 14, 12, 9
    SWAP           3, 4, 13
%else
 %if %1 == 0
    mova [esp+15*16], m9
    mova [esp+17*16], m12
    mova [esp+18*16], m0
    mova [esp+28*16], m10
    mova [esp+29*16], m1
    mova          m3, [esp+0*16]
    mova          m4, [esp+1*16]
    SWAP          m5, m7
    SWAP          m6, m2
 %else
    SWAP           0, 7
    SWAP           3, 1, 2, 4, 6
 %endif
%endif
%endmacro
307
308%macro FILTER 2 ; width [4/6/8/16], dir [h/v]
309%if ARCH_X86_64
310 %define %%flat8mem [rsp+0*16]
311 %define %%q2mem    [rsp+1*16]
312 %define %%q3mem    [rsp+2*16]
313%else
314 %if %1 == 4 || %1 == 6
315  %define %%p2mem      [esp+ 8*16]
316  %define %%q2mem      [esp+ 9*16]
317  %define %%flat8mem   [esp+10*16]
318 %else
319  %ifidn %2, v
320   %define %%p2mem      [esp+16*16]
321   %define %%q2mem      [esp+ 1*16]
322   %define %%q3mem      [esp+18*16]
323   %define %%flat8mem   [esp+ 0*16]
324   %define %%flat16mem  [esp+20*16]
325  %else
326   %define %%p2mem     [esp+27*16]
327   %define %%q2mem     [esp+28*16]
328   %define %%q3mem     [esp+29*16]
329   %define %%flat8mem  [esp+21*16]
330   %define %%flat16mem [esp+30*16]
331  %endif
332 %endif
333 %xdefine m12reg m12
334%endif
335
336%if ARCH_X86_32
337    lea     stride3q, [strideq*3]
338%endif
339    ; load data
340%ifidn %2, v
341%if ARCH_X86_32
342    mov     mstrideq, strideq
343    neg     mstrideq
344%endif
345%if %1 == 4
346    lea         tmpq, [dstq+mstrideq*2]
347    mova          m3, [tmpq+strideq*0]          ; p1
348    mova          m4, [tmpq+strideq*1]          ; p0
349    mova          m5, [tmpq+strideq*2]          ; q0
350    mova          m6, [tmpq+stride3q]           ; q1
351%else
352    ; load 6-8 pixels, remainder (for wd=16) will be read inline
353    lea         tmpq, [dstq+mstrideq*4]
354    ; we load p3 later
355%define %%p3mem [dstq+mstrideq*4]
356 %if ARCH_X86_32
357  %define m13 m0
358  %define m14 m1
359  %define m15 m2
360 %endif
361    mova         m13, [tmpq+strideq*1]
362    mova          m3, [tmpq+strideq*2]
363    mova          m4, [tmpq+stride3q]
364    mova          m5, [dstq+strideq*0]
365    mova          m6, [dstq+strideq*1]
366    mova         m14, [dstq+strideq*2]
367%if %1 != 6
368    mova         m15, [dstq+stride3q]
369%endif
370 %if ARCH_X86_32
371    mova     %%p2mem, m13
372    mova     %%q2mem, m14
373  %define m13 %%p2mem
374  %define m14 %%q2mem
375  %if %1 != 6
376    mova     %%q3mem, m15
377   %define m15 %%q3mem
378  %endif
379 %endif
380%endif
381%else ; %2 == h
382    ; load lines
383%if %1 == 4
384    ; transpose 4x16
385    movd          m7, [dstq+strideq*0-2]
386    movd          m3, [dstq+strideq*1-2]
387    movd          m4, [dstq+strideq*2-2]
388    movd          m5, [dstq+stride3q -2]
389    lea         tmpq, [dstq+strideq*4]
390    punpcklbw     m7, m3
391    punpcklbw     m4, m5
392    movd          m3, [tmpq+strideq*0-2]
393    movd          m1, [tmpq+strideq*1-2]
394    movd          m5, [tmpq+strideq*2-2]
395    movd          m6, [tmpq+stride3q -2]
396    lea         tmpq, [tmpq+strideq*4]
397    punpcklbw     m3, m1
398    punpcklbw     m5, m6
399    movd          m0, [tmpq+strideq*0-2]
400    movd          m1, [tmpq+strideq*1-2]
401    punpcklbw     m0, m1
402    movd          m1, [tmpq+strideq*2-2]
403    movd          m2, [tmpq+stride3q -2]
404    punpcklbw     m1, m2
405    punpcklqdq    m7, m0
406    punpcklqdq    m4, m1
407    lea         tmpq, [tmpq+strideq*4]
408    movd          m0, [tmpq+strideq*0-2]
409    movd          m1, [tmpq+strideq*1-2]
410    punpcklbw     m0, m1
411    movd          m1, [tmpq+strideq*2-2]
412    movd          m2, [tmpq+stride3q -2]
413    punpcklbw     m1, m2
414    punpcklqdq    m3, m0
415    punpcklqdq    m5, m1
416    ; xm7: A0-1,B0-1,C0-1,D0-1,A8-9,B8-9,C8-9,D8-9
417    ; xm3: A4-5,B4-5,C4-5,D4-5,A12-13,B12-13,C12-13,D12-13
418    ; xm4: A2-3,B2-3,C2-3,D2-3,A10-11,B10-11,C10-11,D10-11
419    ; xm5: A6-7,B6-7,C6-7,D6-7,A14-15,B14-15,C14-15,D14-15
420    punpcklwd     m6, m7, m4
421    punpckhwd     m7, m4
422    punpcklwd     m4, m3, m5
423    punpckhwd     m3, m5
424    ; xm6: A0-3,B0-3,C0-3,D0-3
425    ; xm7: A8-11,B8-11,C8-11,D8-11
426    ; xm4: A4-7,B4-7,C4-7,D4-7
427    ; xm3: A12-15,B12-15,C12-15,D12-15
428    punpckldq     m5, m6, m4
429    punpckhdq     m6, m4
430    punpckldq     m4, m7, m3
431    punpckhdq     m7, m3
432    ; xm5: A0-7,B0-7
433    ; xm6: C0-7,D0-7
434    ; xm4: A8-15,B8-15
435    ; xm7: C8-15,D8-15
436    punpcklqdq    m3, m5, m4
437    punpckhqdq    m5, m5, m4
438    punpcklqdq    m4, m6, m7
439    punpckhqdq    m6, m7
440    ; xm3: A0-15
441    ; xm5: B0-15
442    ; xm4: C0-15
443    ; xm6: D0-15
444    SWAP           4, 5
445%elif %1 == 6 || %1 == 8
446    ; transpose 8x16
447    movq          m7, [dstq+strideq*0-%1/2]
448    movq          m3, [dstq+strideq*1-%1/2]
449    movq          m4, [dstq+strideq*2-%1/2]
450    movq          m5, [dstq+stride3q -%1/2]
451    lea         tmpq, [dstq+strideq*8]
452    punpcklbw     m7, m3
453    punpcklbw     m4, m5
454    movq          m3, [tmpq+strideq*0-%1/2]
455    movq          m1, [tmpq+strideq*1-%1/2]
456    movq          m5, [tmpq+strideq*2-%1/2]
457    movq          m6, [tmpq+stride3q -%1/2]
458    lea         tmpq, [dstq+strideq*4]
459    punpcklbw     m3, m1
460    punpcklbw     m5, m6
461    movq          m6, [tmpq+strideq*0-%1/2]
462    movq          m0, [tmpq+strideq*1-%1/2]
463    movq          m1, [tmpq+strideq*2-%1/2]
464    movq          m2, [tmpq+stride3q -%1/2]
465    lea         tmpq, [tmpq+strideq*8]
466    punpcklbw     m6, m0
467    punpcklbw     m1, m2
468    movq          m2, [tmpq+strideq*2-%1/2]
469    movq          m0, [tmpq+stride3q -%1/2]
470    punpcklbw     m2, m0
471%if ARCH_X86_64
472    SWAP         m15, m2
473%else
474 %define m15 [esp+3*16]
475    mova         m15, m2
476%endif
477    movq          m0, [tmpq+strideq*0-%1/2]
478    movq          m2, [tmpq+strideq*1-%1/2]
479    punpcklbw     m0, m2
480    ; xm7: A0-1,B0-1,C0-1,D0-1,E0-1,F0-1,G0-1,H0-1
481    ; xm3: A8-9,B8-9,C8-9,D8-9,E8-9,F8-9,G8-9,H8-9
482    ; xm4: A2-3,B2-3,C2-3,D2-3,E2-3,F2-3,G2-3,H2-3
483    ; xm5: A10-11,B10-11,C10-11,D10-11,E10-11,F10-11,G10-11,H10-11
484    ; xm6: A4-5,B4-5,C4-5,D4-5,E4-5,F4-5,G4-5,H4-5
485    ; xm0: A12-13,B12-13,C12-13,D12-13,E12-13,F12-13,G12-13,H12-13
486    ; xm1: A6-7,B6-7,C6-7,D6-7,E6-7,F6-7,G6-7,H6-7
487    ; xm2: A14-15,B14-15,C14-15,D14-15,E14-15,F14-15,G14-15,H14-15
488    punpcklwd     m2, m7, m4
489    punpckhwd     m7, m4
490    punpcklwd     m4, m3, m5
491    punpckhwd     m3, m5
492    punpcklwd     m5, m6, m1
493    punpckhwd     m6, m1
494    punpcklwd     m1, m0, m15
495    punpckhwd     m0, m15
496%if ARCH_X86_64
497    SWAP         m15, m0
498%else
499    mova         m15, m0
500%endif
501    ; xm2: A0-3,B0-3,C0-3,D0-3
502    ; xm7: E0-3,F0-3,G0-3,H0-3
503    ; xm4: A8-11,B8-11,C8-11,D8-11
504    ; xm3: E8-11,F8-11,G8-11,H8-11
505    ; xm5: A4-7,B4-7,C4-7,D4-7
506    ; xm6: E4-7,F4-7,G4-7,H4-7
507    ; xm1: A12-15,B12-15,C12-15,D12-15
508    ; xm0: E12-15,F12-15,G12-15,H12-15
509    punpckldq     m0, m2, m5
510    punpckhdq     m2, m5
511    punpckldq     m5, m7, m6
512%if %1 != 6
513    punpckhdq     m7, m6
514%endif
515    punpckldq     m6, m4, m1
516    punpckhdq     m4, m1
517    punpckldq     m1, m3, m15
518%if %1 != 6
519    punpckhdq     m3, m15
520 %if ARCH_X86_64
521    SWAP         m15, m3
522 %else
523    mova         m15, m3
524 %endif
525%endif
526    ; xm0: A0-7,B0-7
527    ; xm2: C0-7,D0-7
528    ; xm5: E0-7,F0-7
529    ; xm7: G0-7,H0-7
530    ; xm6: A8-15,B8-15
531    ; xm4: C8-15,D8-15
532    ; xm1: E8-15,F8-15
533    ; xm3: G8-15,H8-15
534    punpcklqdq    m3, m0, m6
535    punpckhqdq    m0, m6
536    punpckhqdq    m6, m2, m4
537    punpcklqdq    m2, m4
538    punpcklqdq    m4, m5, m1
539    punpckhqdq    m5, m1
540%if %1 == 8
541    punpcklqdq    m1, m7, m15
542    punpckhqdq    m7, m15
543    ; xm3: A0-15
544    ; xm0: B0-15
545    ; xm2: C0-15
546    ; xm6: D0-15
547    ; xm4: E0-15
548    ; xm5: F0-15
549    ; xm1: G0-15
550    ; xm7: H0-15
551%if ARCH_X86_64
552    SWAP          11, 3, 2
553    SWAP          13, 0
554    SWAP           6, 5, 4
555    SWAP          14, 1
556    SWAP          15, 7
557    ; 3,0,2,6,4,5,1,7 -> 11,13,3,4,5,6,14,15
558    mova [rsp+21*16], m11
559 %define %%p3mem [rsp+21*16]
560%else
561 %define m11 [esp+26*16]
562 %define m13 [esp+27*16]
563 %define m14 [esp+28*16]
564 %define m15 [esp+29*16]
565    mova         m11, m3
566    mova         m13, m0
567    SWAP           3, 2
568    SWAP           6, 5, 4
569    mova         m14, m1
570    mova         m15, m7
571 %define %%p3mem [esp+26*16]
572%endif
573%else
574 %if ARCH_X86_64
575    SWAP          13, 3, 0
576    SWAP          14, 5, 6, 4, 2
577    ; 3,0,2,6,4,5 -> 13,3,4,5,6,14
578 %else
579  %define m13 %%p2mem
580  %define m14 %%q2mem
581    mova         m13, m3
582    mova         m14, m5
583    SWAP           3, 0
584    SWAP           5, 6, 4, 2
585    ; 0,2,6,4 -> 3,4,5,6
586 %endif
587%endif
588%else
589%if ARCH_X86_64
590    mova [rsp+20*16], m12
591%endif
592    ; load and 16x16 transpose. We only use 14 pixels but we'll need the
593    ; remainder at the end for the second transpose
594%if ARCH_X86_32
595 %xdefine m8  m0
596 %xdefine m9  m1
597 %xdefine m10 m2
598 %xdefine m11 m3
599 %xdefine m12 m4
600 %xdefine m13 m5
601 %xdefine m14 m6
602 %xdefine m15 m7
603    lea         tmpq, [dstq+strideq*8]
604    movu          m8, [tmpq+strideq*0-8]
605    movu          m9, [tmpq+strideq*1-8]
606    movu         m10, [tmpq+strideq*2-8]
607    movu         m11, [tmpq+stride3q -8]
608    lea         tmpq, [tmpq+strideq*4]
609    movu         m12, [tmpq+strideq*0-8]
610    movu         m13, [tmpq+strideq*1-8]
611    movu         m14, [tmpq+strideq*2-8]
612    movu         m15, [tmpq+stride3q -8]
613    mova [esp+ 8*16], m8
614    mova [esp+ 9*16], m9
615    mova [esp+10*16], m10
616    mova [esp+11*16], m11
617    mova [esp+12*16], m12
618    mova [esp+13*16], m13
619    mova [esp+14*16], m14
620    mova [esp+15*16], m15
621%endif
622    movu          m0, [dstq+strideq*0-8]
623    movu          m1, [dstq+strideq*1-8]
624    movu          m2, [dstq+strideq*2-8]
625    movu          m3, [dstq+stride3q -8]
626    lea         tmpq, [dstq+strideq*4]
627    movu          m4, [tmpq+strideq*0-8]
628    movu          m5, [tmpq+strideq*1-8]
629    movu          m6, [tmpq+strideq*2-8]
630    movu          m7, [tmpq+stride3q -8]
631    lea         tmpq, [tmpq+strideq*4]
632%if ARCH_X86_64
633    movu          m8, [tmpq+strideq*0-8]
634    movu          m9, [tmpq+strideq*1-8]
635    movu         m10, [tmpq+strideq*2-8]
636    movu         m11, [tmpq+stride3q -8]
637    lea         tmpq, [tmpq+strideq*4]
638    movu         m12, [tmpq+strideq*0-8]
639    movu         m13, [tmpq+strideq*1-8]
640    movu         m14, [tmpq+strideq*2-8]
641    movu         m15, [tmpq+stride3q -8]
642%endif
643
644%if ARCH_X86_64
645    TRANSPOSE_16X16B 0, [rsp+11*16]
646    mova [rsp+12*16], m1
647    mova [rsp+13*16], m2
648    mova [rsp+14*16], m3
649    mova [rsp+15*16], m12
650    mova [rsp+16*16], m13
651    mova [rsp+17*16], m14
652    mova [rsp+18*16], m15
653    ; 4,5,6,7,8,9,10,11 -> 12,13,3,4,5,6,14,15
654    SWAP          12, 4, 7
655    SWAP          13, 5, 8
656    SWAP           3, 6, 9
657    SWAP          10, 14
658    SWAP          11, 15
659    mova [rsp+21*16], m12
660 %define %%p3mem [rsp+21*16]
661    mova         m12, [rsp+20*16]
662%else
663    TRANSPOSE_16X16B 0, [esp+16*16]
664 %define %%p3mem [esp+26*16]
665 %define m11 %%p3mem
666 %define m13 %%p2mem
667 %define m14 %%q2mem
668 %define m15 %%q3mem
669%endif
670%endif ; if 4 elif 6 or 8 else 16
671%endif ; if v else h
672
673    ; load L/E/I/H
674%if ARCH_X86_32
675    mov    l_strideq, l_stridem
676%endif
677%ifidn %2, v
678    movu          m1, [lq]
679    movu          m0, [lq+l_strideq]
680%else
681 %if ARCH_X86_32
682    lea   l_stride3q, [l_strideq*3]
683 %endif
684    movq         xm1, [lq]
685    movq         xm2, [lq+l_strideq*2]
686    movhps       xm1, [lq+l_strideq]
687    movhps       xm2, [lq+l_stride3q]
688    shufps        m0, m1, m2, q3131
689    shufps        m1, m2, q2020
690 %if ARCH_X86_32
691    lea     stride3q, [strideq*3]
692 %endif
693%endif
694
695%if ARCH_X86_32
696 %ifidn %2, v
697    mov         lutd, lutm
698 %endif
699%endif
700    pxor          m2, m2
701    pcmpeqb       m7, m2, m0
702    pand          m1, m7
703    por           m0, m1                        ; l[x][] ? l[x][] : l[x-stride][]
704    pshufb        m0, [PIC_sym(pb_4x0_4x4_4x8_4x12)] ; l[x][1]
705    pcmpeqb       m2, m0                        ; !L
706    psrlq         m7, m0, [lutq+128]
707    pand          m7, [PIC_sym(pb_63)]
708    pminub        m7, minlvl
709    pmaxub        m7, [PIC_sym(pb_1)]           ; I
710    pand          m1, m0, [PIC_sym(pb_240)]
711    psrlq         m1, 4                         ; H
712    paddb         m0, [PIC_sym(pb_2)]
713    paddb         m0, m0
714    paddb         m0, m7                        ; E
715    pxor          m1, [PIC_sym(pb_128)]
716    pxor          m7, [PIC_sym(pb_128)]
717    pxor          m0, [PIC_sym(pb_128)]
718    SWAP           2, 7
719
720%if ARCH_X86_64
721    SWAP           0, 8
722    SWAP           2, 10
723%else
724 %ifidn %2, v
725    mov     mstrideq, strideq
726    neg     mstrideq
727  %if %1 == 4
728    lea         tmpq, [dstq+mstrideq*2]
729  %elif %1 == 6 || %1 == 8
730    lea         tmpq, [dstq+mstrideq*4]
731  %endif
732 %endif
733    mova  [esp+3*16], m0
734    mova  [esp+4*16], m2
735%endif
736
737    ABSSUB        m0, m3, m4, m2                ; abs(p1-p0)
738    pmaxub        m0, m7
739    ABSSUB        m2, m5, m6, m7                ; abs(q1-q0)
740    pmaxub        m0, m2
741%if %1 == 4
742    pxor          m0, [PIC_sym(pb_128)]
743    pcmpgtb       m7, m0, m1                    ; hev
744 %if ARCH_X86_64
745    SWAP           7, 11
746 %else
747    mova  [esp+5*16], m7
748 %endif
749%else
750    pxor          m7, m0, [PIC_sym(pb_128)]
751    pcmpgtb       m7, m1                        ; hev
752%if ARCH_X86_64
753    SWAP           7, 11
754%else
755    mova  [esp+5*16], m7
756%endif
757
758%if %1 == 6
759    ABSSUB        m1, m13, m4, m7               ; abs(p2-p0)
760    pmaxub        m1, m0
761%else
762    mova          m2, %%p3mem
763    ABSSUB        m1, m2, m4, m7                ; abs(p3-p0)
764    pmaxub        m1, m0
765    ABSSUB        m7, m13, m4, m2               ; abs(p2-p0)
766    pmaxub        m1, m7
767%endif
768    ABSSUB        m7, m5, m14, m2               ; abs(p2-p0)
769    pmaxub        m1, m7
770%if %1 != 6
771    ABSSUB        m7, m5, m15, m2               ; abs(q3-q0)
772    pmaxub        m1, m7
773%endif
774    pxor          m1, [PIC_sym(pb_128)]
775    pcmpgtb       m1, [PIC_sym(pb_129)]         ; !flat8in
776%if ARCH_X86_64
777    SWAP           1, 9
778%else
779    mova  [esp+6*16], m1
780%endif
781
782%if %1 == 6
783    ABSSUB        m7, m13, m3, m1               ; abs(p2-p1)
784%else
785    mova          m2, %%p3mem
786    ABSSUB        m7, m2, m13, m1               ; abs(p3-p2)
787    ABSSUB        m2, m13, m3, m1               ; abs(p2-p1)
788    pmaxub        m7, m2
789    ABSSUB        m2, m14, m15, m1              ; abs(q3-q2)
790    pmaxub        m7, m2
791%endif
792    ABSSUB        m2, m14, m6,  m1              ; abs(q2-q1)
793    pmaxub        m7, m2
794%if ARCH_X86_32
795 %define m12 m1
796    mova         m12, maskmem
797%endif
798    pand          m2, m12, mask1
799    pcmpeqd       m2, m12
800    pand          m7, m2                        ; only apply fm-wide to wd>4 blocks
801    pmaxub        m0, m7
802
803    pxor          m0, [PIC_sym(pb_128)]
804%endif ; %if %1 == 4 else
805%if ARCH_X86_64
806    SWAP           2, 10
807    pcmpgtb       m0, m2
808%else
809    pcmpgtb       m0, [esp+4*16]
810%endif
811
812    ABSSUB        m1, m3, m6, m7                ; abs(p1-q1)
813    ABSSUB        m7, m4, m5, m2                ; abs(p0-q0)
814    paddusb       m7, m7
815    pand          m1, [PIC_sym(pb_254)]
816    psrlq         m1, 1
817    paddusb       m1, m7                        ; abs(p0-q0)*2+(abs(p1-q1)>>1)
818    pxor          m1, [PIC_sym(pb_128)]
819%if ARCH_X86_64
820    pcmpgtb       m1, m8                        ; abs(p0-q0)*2+(abs(p1-q1)>>1) > E
821%else
822    pcmpgtb       m1, [esp+3*16]
823%endif
824    por           m0, m1
825
826%if %1 == 16
827%if ARCH_X86_64
828    SWAP           0, 8
829%else
830    mova  [esp+3*16], m0
831%endif
832%ifidn %2, v
833    lea         tmpq, [dstq+mstrideq*8]
834    mova          m0, [tmpq+strideq*1]
835%else
836    mova          m0, [rsp+12*16]
837%endif
838    ABSSUB        m1, m0, m4, m2
839%ifidn %2, v
840    mova          m0, [tmpq+strideq*2]
841%else
842    mova          m0, [rsp+13*16]
843%endif
844    ABSSUB        m2, m0, m4, m7
845    pmaxub        m1, m2
846%ifidn %2, v
847    mova          m0, [tmpq+stride3q]
848%else
849    mova          m0, [rsp+14*16]
850%endif
851    ABSSUB        m2, m0, m4, m7
852    pmaxub        m1, m2
853%ifidn %2, v
854    lea         tmpq, [dstq+strideq*4]
855    mova          m0, [tmpq+strideq*0]
856%else
857    mova          m0, [rsp+15*16]
858%endif
859    ABSSUB        m2, m0, m5, m7
860    pmaxub        m1, m2
861%ifidn %2, v
862    mova          m0, [tmpq+strideq*1]
863%else
864    mova          m0, [rsp+16*16]
865%endif
866    ABSSUB        m2, m0, m5, m7
867    pmaxub        m1, m2
868%ifidn %2, v
869    mova          m0, [tmpq+strideq*2]
870%else
871    mova          m0, [rsp+17*16]
872%endif
873    ABSSUB        m2, m0, m5, m7
874    pmaxub        m1, m2
875    pxor          m1, [PIC_sym(pb_128)]
876    pcmpgtb       m1, [PIC_sym(pb_129)]         ; !flat8out
877%if ARCH_X86_64
878    por           m1, m9                        ; !flat8in | !flat8out
879%else
880    por           m1, [esp+6*16]
881 %define m12 m7
882    mova         m12, maskmem
883%endif
884    pand          m2, m12, mask2
885    pcmpeqd       m2, m12
886    pandn         m1, m2                        ; flat16
887%if ARCH_X86_64
888    pandn         m2, m8, m1                    ; flat16 & fm
889%else
890    pandn         m2, [esp+3*16], m1            ; flat16 & fm
891    mova %%flat16mem, m2
892%endif
893    SWAP           1, 2
894
895    pand          m2, m12, mask1
896    pcmpeqd       m2, m12
897%if ARCH_X86_64
898    pandn         m9, m2                    ; flat8in
899    pandn         m2, m8, m9
900    SWAP           2, 9
901%else
902    pandn         m0, [esp+6*16], m2
903    pandn         m2, [esp+3*16], m0
904    mova  [esp+6*16], m2
905%endif
906    pand          m2, m12, mask0
907    pcmpeqd       m2, m12
908%if ARCH_X86_64
909    pandn         m8, m2
910    pandn         m2, m9, m8                    ; fm & !flat8 & !flat16
911    SWAP           2, 8
912    pandn         m2, m1, m9                    ; flat8 & !flat16
913    SWAP           2, 9
914    SWAP           0, 8
915    SWAP           1, 10
916%else
917    pandn         m0, [esp+3*16], m2
918    pandn         m2, [esp+6*16], m0
919    SWAP           2, 0
920    pandn         m2, m1, [esp+6*16]
921    mova  %%flat8mem, m2
922%endif
923%elif %1 != 4
924 %if ARCH_X86_64
925    SWAP           1, 9
926 %else
927  %define m12 m7
928    mova         m12, maskmem
929    mova          m1, [esp+6*16]
930 %endif
931    pand          m2, m12, mask1
932    pcmpeqd       m2, m12
933    pandn         m1, m2
934    pandn         m2, m0, m1                    ; flat8 & fm
935    pand          m1, m12, mask0
936    pcmpeqd       m1, m12
937    pandn         m0, m1
938    pandn         m1, m2, m0                    ; fm & !flat8
939    SWAP           1, 2, 0
940 %if ARCH_X86_64
941    SWAP           1, 9
942 %else
943    mova  %%flat8mem, m1
944 %endif
945%else
946%if ARCH_X86_32
947 %define m12 m1
948    mova         m12, maskmem
949%endif
950    pand          m2, m12, mask0
951    pcmpeqd       m2, m12
952    pandn         m0, m2                        ; fm
953%endif
954
955    ; short filter
956
957    mova          m1, [PIC_sym(pb_128)]
958%if ARCH_X86_64
959    SWAP           7, 11
960%else
961    mova          m7, [esp+5*16]
962%endif
963    pxor          m3, m1
964    pxor          m6, m1
965    pxor          m4, m1
966    pxor          m5, m1
967    psubsb        m1, m3, m6                    ; iclip_diff(p1-q1)
968    pand          m1, m7                        ; f=iclip_diff(p1-q1)&hev
969    psubsb        m2, m5, m4
970    paddsb        m1, m2
971    paddsb        m1, m2
972    paddsb        m1, m2                        ; f=iclip_diff(3*(q0-p0)+f)
973    mova          m2, [PIC_sym(pb_16)]
974    pand          m0, m1                        ; f&=fm
975    paddsb        m1, m0, [PIC_sym(pb_3)]
976    paddsb        m0, [PIC_sym(pb_4)]
977    pand          m1, [PIC_sym(pb_248)]
978    pand          m0, [PIC_sym(pb_248)]
979    psrlq         m1, 3
980    psrlq         m0, 3
981    pxor          m1, m2
982    pxor          m0, m2
983    psubb         m1, m2                        ; f2
984    psubb         m0, m2                        ; f1
985    mova          m2, [PIC_sym(pb_128)]
986    paddsb        m4, m1
987    psubsb        m5, m0
988    pxor          m4, m2
989    pxor          m5, m2
990
991    pxor          m0, m2
992    pxor          m1, m1
993    pavgb         m0, m1                        ; f=(f1+1)>>1
994    psubb         m0, [PIC_sym(pb_64)]
995    pandn         m7, m0                        ; f&=!hev
996    paddsb        m3, m7
997    psubsb        m6, m7
998    pxor          m3, m2
999    pxor          m6, m2
1000
1001%if %1 == 16
1002    ; flat16 filter
1003%ifidn %2, v
1004    lea         tmpq, [dstq+mstrideq*8]
1005    mova          m0, [tmpq+strideq*1]          ; p6
1006    mova          m2, [tmpq+strideq*2]          ; p5
1007    mova          m7, [tmpq+stride3q]           ; p4
1008%else
1009    mova          m0, [rsp+12*16]
1010    mova          m2, [rsp+13*16]
1011    mova          m7, [rsp+14*16]
1012%endif
1013
1014%if ARCH_X86_64
1015    SWAP           1, 10
1016    mova  %%flat8mem, m9
1017    mova     %%q2mem, m14
1018    mova     %%q3mem, m15
1019    SWAP           0, 8
1020    SWAP           1, 9
1021%else
1022 %ifidn %2, v
1023    mova [esp+17*16], m0
1024    mova [esp+19*16], m3
1025    mova [esp+21*16], m4
1026    mova [esp+22*16], m5
1027    mova [esp+23*16], m6
1028  %xdefine m11 m3
1029  %xdefine m14 m4
1030  %xdefine m15 m5
1031  %xdefine m10 m6
1032  %define m13 %%p2mem
1033  %define m8  [esp+17*16]
1034  %define m9  %%flat16mem
1035  %define m3  [esp+19*16]
1036  %define m4  [esp+21*16]
1037  %define m5  [esp+22*16]
1038  %define m6  [esp+23*16]
1039 %else
1040    mova [esp+31*16], m0
1041    mova [esp+32*16], m3
1042    mova [esp+33*16], m4
1043    mova [esp+34*16], m5
1044    mova [esp+35*16], m6
1045  %xdefine m11 m3
1046  %xdefine m14 m4
1047  %xdefine m15 m5
1048  %xdefine m10 m6
1049  %define m13 %%p2mem
1050  %define m8  [esp+31*16]
1051  %define m9  %%flat16mem
1052  %define m3  [esp+32*16]
1053  %define m4  [esp+33*16]
1054  %define m5  [esp+34*16]
1055  %define m6  [esp+35*16]
1056 %endif
1057%endif
1058
1059    ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0 [p5/p4/p2/p1/p0/q0][p6/p3] A
1060    ; write -6
1061    mova         m11, %%p3mem
1062%if ARCH_X86_64
1063    punpcklbw    m14, m8, m11
1064    punpckhbw    m15, m8, m11
1065%else
1066    punpcklbw    m14, m0, m11
1067    punpckhbw    m15, m0, m11
1068%endif
1069%ifidn %2, v
1070    mova  [rsp+5*16], m11
1071%endif
1072    pmaddubsw    m10, m14, [PIC_sym(pb_7_1)]
1073    pmaddubsw    m11, m15, [PIC_sym(pb_7_1)]    ; p6*7+p3
1074    punpcklbw     m0, m2, m7
1075    punpckhbw     m1, m2, m7
1076    pmaddubsw     m0, [PIC_sym(pb_2)]
1077    pmaddubsw     m1, [PIC_sym(pb_2)]
1078    paddw        m10, m0
1079    paddw        m11, m1                        ; p6*7+p5*2+p4*2+p3
1080    punpcklbw     m0, m13, m3
1081    punpckhbw     m1, m13, m3
1082    pmaddubsw     m0, [PIC_sym(pb_1)]
1083    pmaddubsw     m1, [PIC_sym(pb_1)]
1084    paddw        m10, m0
1085    paddw        m11, m1                        ; p6*7+p5*2+p4*2+p3+p2+p1
1086    punpcklbw     m0, m4, m5
1087    punpckhbw     m1, m4, m5
1088    pmaddubsw     m0, [PIC_sym(pb_1)]
1089    pmaddubsw     m1, [PIC_sym(pb_1)]
1090    paddw        m10, m0
1091    paddw        m11, m1                        ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0
1092    pmulhrsw      m0, m10, [PIC_sym(pw_2048)]
1093    pmulhrsw      m1, m11, [PIC_sym(pw_2048)]
1094    packuswb      m0, m1
1095    pand          m0, m9
1096    pandn         m1, m9, m2
1097    por           m0, m1
1098%ifidn %2, v
1099    mova [tmpq+strideq*2], m0                   ; p5
1100%else
1101    mova [rsp+13*16], m0
1102%endif
1103
1104    ; sub p6*2, add p3/q1 [reuse p6/p3 from A][-p6,+q1|save] B
1105    ; write -5
1106    pmaddubsw    m14, [PIC_sym(pb_m1_1)]
1107    pmaddubsw    m15, [PIC_sym(pb_m1_1)]
1108    paddw        m10, m14
1109    paddw        m11, m15                       ; p6*6+p5*2+p4*2+p3*2+p2+p1+p0+q0
1110    punpcklbw     m0, m8, m6
1111    punpckhbw     m1, m8, m6
1112    pmaddubsw     m0, [PIC_sym(pb_m1_1)]
1113    pmaddubsw     m1, [PIC_sym(pb_m1_1)]
1114    mova  [rsp+3*16], m0
1115    mova  [rsp+4*16], m1
1116    paddw        m10, m0
1117    paddw        m11, m1                        ; p6*5+p5*2+p4*2+p3*2+p2+p1+p0+q0+q1
1118    pmulhrsw      m0, m10, [PIC_sym(pw_2048)]
1119    pmulhrsw      m1, m11, [PIC_sym(pw_2048)]
1120    packuswb      m0, m1
1121    pand          m0, m9
1122    pandn         m1, m9, m7
1123    por           m0, m1
1124%ifidn %2, v
1125    mova [tmpq+stride3q], m0                    ; p4
1126%else
1127    mova [rsp+14*16], m0
1128%endif
1129
1130    ; sub p6/p5, add p2/q2 [-p6,+p2][-p5,+q2|save] C
1131    ; write -4
1132    mova         m14, %%q2mem
1133    punpcklbw     m0, m8, m13
1134    punpckhbw     m1, m8, m13
1135    pmaddubsw     m0, [PIC_sym(pb_m1_1)]
1136    pmaddubsw     m1, [PIC_sym(pb_m1_1)]
1137    paddw        m10, m0
1138    paddw        m11, m1                        ; p6*4+p5*2+p4*2+p3*2+p2*2+p1+p0+q0+q1
1139    punpcklbw     m0, m2, m14
1140    punpckhbw     m2, m14
1141    pmaddubsw     m0, [PIC_sym(pb_m1_1)]
1142    pmaddubsw     m2, [PIC_sym(pb_m1_1)]
1143    mova  [rsp+1*16], m0
1144    paddw        m10, m0
1145    paddw        m11, m2                        ; p6*4+p5+p4*2+p3*2+p2*2+p1+p0+q0+q1+q2
1146    pmulhrsw      m0, m10, [PIC_sym(pw_2048)]
1147    pmulhrsw      m1, m11, [PIC_sym(pw_2048)]
1148    packuswb      m0, m1
1149    pand          m0, m9
1150    pandn         m1, m9, %%p3mem
1151    por           m0, m1
1152%ifidn %2, v
1153    mova [tmpq+strideq*4], m0                   ; p3
1154%else
1155    mova [rsp+19*16], m0
1156%endif
1157
1158    ; sub p6/p4, add p1/q3 [-p6,+p1][-p4,+q3|save] D
1159    ; write -3
1160    mova         m15, %%q3mem
1161    punpcklbw     m0, m8, m3
1162    punpckhbw     m1, m8, m3
1163    pmaddubsw     m0, [PIC_sym(pb_m1_1)]
1164    pmaddubsw     m1, [PIC_sym(pb_m1_1)]
1165    paddw        m10, m0
1166    paddw        m11, m1                        ; p6*3+p5+p4*2+p3*2+p2*2+p1*2+p0+q0+q1+q2
1167    punpcklbw     m0, m7, m15
1168    punpckhbw     m7, m15
1169    pmaddubsw     m0, [PIC_sym(pb_m1_1)]
1170    pmaddubsw     m7, [PIC_sym(pb_m1_1)]
1171    mova  [rsp+2*16], m0
1172%if ARCH_X86_32
1173 %ifidn %2, v
1174    mova [esp+24*16], m7
1175 %else
1176    mova [esp+36*16], m7
1177 %endif
1178%endif
1179    paddw        m10, m0
1180    paddw        m11, m7                        ; p6*3+p5+p4+p3*2+p2*2+p1*2+p0+q0+q1+q2+q3
1181    pmulhrsw      m0, m10, [PIC_sym(pw_2048)]
1182    pmulhrsw      m1, m11, [PIC_sym(pw_2048)]
1183    packuswb      m0, m1
1184    pand          m0, m9
1185    pandn         m1, m9, m13
1186    por           m0, m1
1187    mova  [rsp+6*16], m0                        ; don't clobber p2/m13 since we need it in F
1188
1189    ; sub p6/p3, add p0/q4 [-p6,+p0][-p3,+q4|save] E
1190    ; write -2
1191    punpcklbw     m0, m8, m4
1192    punpckhbw     m1, m8, m4
1193    pmaddubsw     m0, [PIC_sym(pb_m1_1)]
1194    pmaddubsw     m1, [PIC_sym(pb_m1_1)]
1195    paddw        m10, m0
1196    paddw        m11, m1                        ; p6*2+p5+p4+p3*2+p2*2+p1*2+p0*2+q0+q1+q2+q3
1197%if ARCH_X86_64
1198    SWAP           7, 8
1199%endif
1200%ifidn %2, v
1201    mova          m1, [dstq+strideq*4]          ; q4
1202    mova          m7, [rsp+5*16]                ; (pre-filter) p3
1203%else
1204    mova          m1, [rsp+15*16]
1205    mova          m7, %%p3mem                   ; (pre-filter) p3
1206%endif
1207    punpcklbw     m0, m1, m7
1208    punpckhbw     m1, m1, m7
1209    pmaddubsw     m0, [PIC_sym(pb_m1_1)]
1210    pmaddubsw     m1, [PIC_sym(pb_m1_1)]
1211    mova  [rsp+7*16], m0
1212    mova  [rsp+5*16], m1
1213    psubw        m10, m0
1214    psubw        m11, m1                        ; p6*2+p5+p4+p3+p2*2+p1*2+p0*2+q0+q1+q2+q3+q4
1215    pmulhrsw      m0, m10, [PIC_sym(pw_2048)]
1216    pmulhrsw      m1, m11, [PIC_sym(pw_2048)]
1217    packuswb      m0, m1
1218    pand          m0, m9
1219    pandn         m1, m9, m3
1220    por           m0, m1
1221    mova  [rsp+8*16], m0                        ; don't clobber p1/m3 since we need it in G
1222
1223    ; sub p6/p2, add q0/q5 [-p6,+q0][-p2,+q5|save] F
1224    ; write -1
1225%ifidn %2, v
1226    mova          m7, [tmpq+strideq*1]          ; p6
1227    lea         tmpq, [dstq+strideq*4]
1228    mova          m1, [tmpq+strideq*1]          ; q5
1229%else
1230    mova          m7, [rsp+12*16]               ; p6
1231    mova          m1, [rsp+16*16]
1232%endif
1233    punpcklbw     m0, m7, m5
1234    punpckhbw     m7, m5
1235    pmaddubsw     m0, [PIC_sym(pb_m1_1)]
1236    pmaddubsw     m7, [PIC_sym(pb_m1_1)]
1237    paddw        m10, m0
1238    paddw        m11, m7                        ; p6+p5+p4+p3+p2*2+p1*2+p0*2+q0*2+q1+q2+q3+q4
1239    punpcklbw     m7, m13, m1
1240    pmaddubsw     m7, [PIC_sym(pb_m1_1)]
1241    mova  [rsp+9*16], m7
1242    paddw        m10, m7
1243%if ARCH_X86_64
1244    punpckhbw    m13, m1
1245    mova          m1, [rsp+6*16]
1246    SWAP           1, 13
1247%else
1248    punpckhbw     m7, m13, m1
1249    mova          m1, [esp+6*16]
1250    mova         m13, m1
1251    SWAP           1, 7
1252%endif
1253    pmaddubsw     m1, [PIC_sym(pb_m1_1)]
1254    mova [rsp+10*16], m1
1255    paddw        m11, m1                        ; p6+p5+p4+p3+p2+p1*2+p0*2+q0*2+q1+q2+q3+q4+q5
1256    pmulhrsw      m7, m10, [PIC_sym(pw_2048)]
1257    pmulhrsw      m0, m11, [PIC_sym(pw_2048)]
1258    packuswb      m7, m0
1259    pand          m7, m9
1260    pandn         m0, m9, m4
1261    por           m7, m0
1262    mova  [rsp+6*16], m7                        ; don't clobber p0/m4 since we need it in H
1263
1264    ; sub p6/p1, add q1/q6 [reuse -p6,+q1 from B][-p1,+q6|save] G
1265    ; write +0
1266%ifidn %2, v
1267    mova          m7, [tmpq+strideq*2]          ; q6
1268%else
1269    mova          m7, [rsp+17*16]
1270%endif
1271    paddw        m10, [rsp+3*16]
1272    paddw        m11, [rsp+4*16]                ; p5+p4+p3+p2+p1*2+p0*2+q0*2+q1*2+q2+q3+q4+q5
1273    punpcklbw     m0, m3, m7
1274    punpckhbw     m1, m3, m7
1275%if ARCH_X86_64
1276    mova          m3, [rsp+8*16]
1277%endif
1278    pmaddubsw     m0, [PIC_sym(pb_m1_1)]
1279    pmaddubsw     m1, [PIC_sym(pb_m1_1)]
1280    mova  [rsp+3*16], m0
1281    mova  [rsp+4*16], m1
1282    paddw        m10, m0
1283    paddw        m11, m1                        ; p5+p4+p3+p2+p1+p0*2+q0*2+q1*2+q2+q3+q4+q5+q6
1284    pmulhrsw      m0, m10, [PIC_sym(pw_2048)]
1285    pmulhrsw      m1, m11, [PIC_sym(pw_2048)]
1286    packuswb      m0, m1
1287    pand          m0, m9
1288    pandn         m1, m9, m5
1289    por           m0, m1
1290%if ARCH_X86_32
1291    mova          m1, [esp+8*16]
1292    mova          m3, m1
1293%endif
1294    mova  [rsp+8*16], m0                        ; don't clobber q0/m5 since we need it in I
1295
1296    ; sub p5/p0, add q2/q6 [reuse -p5,+q2 from C][-p0,+q6] H
1297    ; write +1
1298    paddw        m10, [rsp+1*16]
1299    paddw        m11, m2                        ; p4+p3+p2+p1+p0*2+q0*2+q1*2+q2*2+q3+q4+q5+q6
1300    punpcklbw     m0, m4, m7
1301    punpckhbw     m2, m4, m7
1302    pmaddubsw     m0, [PIC_sym(pb_m1_1)]
1303    pmaddubsw     m2, [PIC_sym(pb_m1_1)]
1304    paddw        m10, m0
1305    paddw        m11, m2                        ; p4+p3+p2+p1+p0+q0*2+q1*2+q2*2+q3+q4+q5+q6*2
1306%if ARCH_X86_64
1307    mova          m4, [rsp+6*16]
1308%else
1309 %define m4 [esp+6*16]
1310%endif
1311    pmulhrsw      m2, m10, [PIC_sym(pw_2048)]
1312    pmulhrsw      m1, m11, [PIC_sym(pw_2048)]
1313    packuswb      m2, m1
1314    pand          m2, m9
1315    pandn         m1, m9, m6
1316    por           m2, m1                        ; don't clobber q1/m6 since we need it in K
1317
1318    ; sub p4/q0, add q3/q6 [reuse -p4,+q3 from D][-q0,+q6] I
1319    ; write +2
1320    paddw        m10, [rsp+2*16]
1321%if ARCH_X86_64
1322    SWAP           7, 8
1323    paddw        m11, m7
1324%else
1325    mova          m8, m7
1326 %ifidn %2, v
1327    paddw        m11, [esp+24*16]               ; p3+p2+p1+p0+q0*2+q1*2+q2*2+q3*2+q4+q5+q6*2
1328 %else
1329    paddw        m11, [esp+36*16]               ; p3+p2+p1+p0+q0*2+q1*2+q2*2+q3*2+q4+q5+q6*2
1330 %endif
1331%endif
1332    punpcklbw     m0, m5, m8
1333    punpckhbw     m1, m5, m8
1334%if ARCH_X86_64
1335    mova          m5, [rsp+8*16]
1336%else
1337 %define m5 [esp+8*16]
1338%endif
1339    pmaddubsw     m0, [PIC_sym(pb_m1_1)]
1340    pmaddubsw     m1, [PIC_sym(pb_m1_1)]
1341    paddw        m10, m0
1342    paddw        m11, m1                        ; p3+p2+p1+p0+q0+q1*2+q2*2+q3*2+q4+q5+q6*3
1343    pmulhrsw      m7, m10, [PIC_sym(pw_2048)]
1344    pmulhrsw      m1, m11, [PIC_sym(pw_2048)]
1345    packuswb      m7, m1
1346    pand          m7, m9
1347    pandn         m1, m9, m14
1348    por           m7, m1                        ; don't clobber q2/m14 since we need it in K
1349
1350    ; sub p3/q1, add q4/q6 [reuse -p3,+q4 from E][-q1,+q6] J
1351    ; write +3
1352    psubw        m10, [rsp+7*16]
1353    psubw        m11, [rsp+5*16]                ; p2+p1+p0+q0+q1*2+q2*2+q3*2+q4*2+q5+q6*3
1354    punpcklbw     m0, m6, m8
1355    punpckhbw     m1, m6, m8
1356    pmaddubsw     m0, [PIC_sym(pb_m1_1)]
1357    pmaddubsw     m1, [PIC_sym(pb_m1_1)]
1358    paddw        m10, m0
1359    paddw        m11, m1                        ; p2+p1+p0+q0+q1+q2*2+q3*2+q4*2+q5+q6*4
1360    pmulhrsw      m0, m10, [PIC_sym(pw_2048)]
1361    pmulhrsw      m1, m11, [PIC_sym(pw_2048)]
1362    packuswb      m0, m1
1363    pand          m0, m9
1364    pandn         m1, m9, m15
1365    por           m0, m1
1366%ifidn %2, v
1367    mova [tmpq+mstrideq], m0                    ; q3
1368%else
1369    mova [rsp+20*16], m0
1370%endif
1371
1372    ; sub p2/q2, add q5/q6 [reuse -p2,+q5 from F][-q2,+q6] K
1373    ; write +4
1374    paddw        m10, [rsp+ 9*16]
1375    paddw        m11, [rsp+10*16]               ; p1+p0+q0+q1+q2*2+q3*2+q4*2+q5*2+q6*4
1376    punpcklbw     m0, m14, m8
1377    punpckhbw     m1, m14, m8
1378    pmaddubsw     m0, [PIC_sym(pb_m1_1)]
1379    pmaddubsw     m1, [PIC_sym(pb_m1_1)]
1380    paddw        m10, m0
1381    paddw        m11, m1                        ; p1+p0+q0+q1+q2+q3*2+q4*2+q5*2+q6*5
1382    pmulhrsw      m0, m10, [PIC_sym(pw_2048)]
1383    pmulhrsw      m1, m11, [PIC_sym(pw_2048)]
1384    packuswb      m0, m1
1385    pand          m0, m9
1386%ifidn %2, v
1387    pandn         m1, m9, [tmpq+strideq*0]
1388%else
1389    pandn         m1, m9, [rsp+15*16]
1390%endif
1391    por           m0, m1
1392%ifidn %2, v
1393    mova [tmpq+strideq*0], m0                    ; q4
1394%else
1395    mova [rsp+15*16], m0
1396%endif
1397
1398    ; sub p1/q3, add q6*2 [reuse -p1,+q6 from G][-q3,+q6] L
1399    ; write +5
1400    paddw        m10, [rsp+3*16]
1401    paddw        m11, [rsp+4*16]                ; p1+p0+q0+q1+q2*2+q3*2+q4*2+q5*2+q6*4
1402    punpcklbw     m0, m15, m8
1403    punpckhbw     m1, m15, m8
1404    pmaddubsw     m0, [PIC_sym(pb_m1_1)]
1405    pmaddubsw     m1, [PIC_sym(pb_m1_1)]
1406    paddw        m10, m0
1407    paddw        m11, m1                        ; p1+p0+q0+q1+q2+q3*2+q4*2+q5*2+q6*5
1408    pmulhrsw     m10, [PIC_sym(pw_2048)]
1409    pmulhrsw     m11, [PIC_sym(pw_2048)]
1410    packuswb     m10, m11
1411    pand         m10, m9
1412%ifidn %2, v
1413    pandn        m11, m9, [tmpq+strideq*1]
1414%else
1415    pandn        m11, m9, [rsp+16*16]
1416%endif
1417    por          m10, m11
1418%ifidn %2, v
1419    mova [tmpq+strideq*1], m10                  ; q5
1420%else
1421    mova [rsp+16*16], m10
1422%endif
1423
1424%if ARCH_X86_64
1425    SWAP           0, 8
1426    SWAP           1, 9
1427    SWAP          14, 7
1428%else
1429 %xdefine m3 m11
1430 %xdefine m4 m14
1431 %xdefine m5 m15
1432 %xdefine m6 m10
1433    mova     %%q2mem, m7
1434 %ifidn %2, v
1435    mova          m3, [esp+19*16]
1436 %else
1437    mova          m3, [esp+32*16]
1438 %endif
1439    mova          m4, [esp+ 6*16]
1440    mova          m5, [esp+ 8*16]
1441%endif
1442    SWAP          m6, m2
1443
1444%if ARCH_X86_64
1445    mova          m9, %%flat8mem
1446%endif
1447%ifidn %2, v
1448    lea         tmpq, [dstq+mstrideq*4]
1449%endif
1450%endif ; if %1 == 16
1451%if %1 >= 8
1452    ; flat8 filter
1453%if ARCH_X86_32
1454 %define m9  %%flat8mem
1455 %define m11 m1
1456 %define m13 %%p2mem
1457 %define m14 %%q2mem
1458 %define m15 %%q3mem
1459%endif
1460    mova         m11, %%p3mem
1461    punpcklbw     m0, m11, m3
1462    punpcklbw     m7, m13, m4
1463    pmaddubsw     m2, m0, [PIC_sym(pb_3_1)] ; 3 * p3 + p1
1464    pmaddubsw     m7, [PIC_sym(pb_2_1)]
1465    paddw         m2, m7                    ; 3 * p3 + 2 * p2 + p1 + p0
1466    punpcklbw     m7, m5, [PIC_sym(pb_4)]
1467    pmaddubsw     m7, [PIC_sym(pb_1)]
1468    paddw         m2, m7                    ; 3 * p3 + 2 * p2 + p1 + p0 + q0 + 4
1469    punpckhbw     m1, m11, m3
1470    pmaddubsw     m7, m1, [PIC_sym(pb_3_1)] ; 3 * p3 + p1
1471    punpckhbw     m0, m13, m4
1472    pmaddubsw     m0, [PIC_sym(pb_2_1)]
1473    paddw         m7, m0                    ; 3 * p3 + 2 * p2 + p1 + p0
1474    punpckhbw     m0, m5, [PIC_sym(pb_4)]
1475    pmaddubsw     m0, [PIC_sym(pb_1)]
1476    paddw         m7, m0                    ; 3 * p3 + 2 * p2 + p1 + p0 + q0 + 4
1477    psrlw         m0, m2, 3
1478    psrlw         m1, m7, 3
1479    packuswb      m0, m1
1480    pand          m0, m9
1481    pandn         m1, m9, m13
1482    por           m0, m1                    ; p2
1483%ifidn %2, v
1484    mova [tmpq+strideq*1], m0
1485%else
1486 %if ARCH_X86_64
1487    SWAP           0, 10
1488 %else
1489    mova  [esp+2*16], m0
1490 %endif
1491%endif
1492
1493%if ARCH_X86_32
1494    mova         m11, %%p3mem
1495%endif
1496    punpcklbw     m0, m11, m3
1497    punpckhbw     m1, m11, m3
1498    pmaddubsw     m0, [PIC_sym(pb_m1_1)]
1499    pmaddubsw     m1, [PIC_sym(pb_m1_1)]
1500    paddw         m2, m0
1501    paddw         m7, m1
1502    punpcklbw     m0, m13, m6
1503    punpckhbw     m1, m13, m6
1504    pmaddubsw     m0, [PIC_sym(pb_m1_1)]
1505    pmaddubsw     m1, [PIC_sym(pb_m1_1)]
1506    paddw         m2, m0
1507    paddw         m7, m1            ; 2 * p3 + p2 + 2 * p1 + p0 + q0 + q1 + 4
1508    psrlw         m0, m2, 3
1509    psrlw         m1, m7, 3
1510    packuswb      m0, m1
1511    pand          m0, m9
1512    pandn         m1, m9, m3
1513    por           m0, m1            ; p1
1514%ifidn %2, v
1515    mova [tmpq+strideq*2], m0
1516%else
1517    mova  [rsp+0*16], m0
1518%endif
1519
1520%if ARCH_X86_32
1521    mova         m11, %%p3mem
1522%endif
1523    punpcklbw     m0, m11, m3
1524    punpckhbw     m1, m11, m3
1525    pmaddubsw     m0, [PIC_sym(pb_1)]
1526    pmaddubsw     m1, [PIC_sym(pb_1)]
1527    psubw         m2, m0
1528    psubw         m7, m1
1529    punpcklbw     m0, m4, m14
1530    punpckhbw     m1, m4, m14
1531    pmaddubsw     m0, [PIC_sym(pb_1)]
1532    pmaddubsw     m1, [PIC_sym(pb_1)]
1533    paddw         m2, m0
1534    paddw         m7, m1            ; p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2 + 4
1535    psrlw         m0, m2, 3
1536    psrlw         m1, m7, 3
1537    packuswb      m0, m1
1538    pand          m0, m9
1539    pandn         m1, m9, m4
1540    por           m0, m1            ; p0
1541%ifidn %2, v
1542    mova [tmpq+stride3q], m0
1543%else
1544    mova  [rsp+1*16], m0
1545%endif
1546
1547    punpcklbw     m0, m5, m15
1548    punpckhbw     m1, m5, m15
1549    pmaddubsw     m0, [PIC_sym(pb_1)]
1550    pmaddubsw     m1, [PIC_sym(pb_1)]
1551    paddw         m2, m0
1552    paddw         m7, m1
1553%if ARCH_X86_32
1554    mova         m11, %%p3mem
1555%endif
1556    punpcklbw     m0, m11, m4
1557    punpckhbw    m11, m11, m4
1558    pmaddubsw     m0, [PIC_sym(pb_1)]
1559    pmaddubsw    m11, [PIC_sym(pb_1)]
1560    psubw         m2, m0
1561    psubw         m7, m11           ; p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3 + 4
1562    psrlw         m0, m2, 3
1563    psrlw        m11, m7, 3
1564    packuswb      m0, m11
1565    pand          m0, m9
1566    pandn        m11, m9, m5
1567    por          m11, m0            ; q0
1568%ifidn %2, v
1569    mova [dstq+strideq*0], m11
1570%elif ARCH_X86_32
1571    mova  [esp+8*16], m11
1572%endif
1573
1574    punpcklbw     m0, m5, m15
1575    punpckhbw     m1, m5, m15
1576    pmaddubsw     m0, [PIC_sym(pb_m1_1)]
1577    pmaddubsw     m1, [PIC_sym(pb_m1_1)]
1578    paddw         m2, m0
1579    paddw         m7, m1
1580    punpcklbw     m0, m13, m6
1581    punpckhbw     m1, m13, m6
1582    pmaddubsw     m0, [PIC_sym(pb_m1_1)]
1583    pmaddubsw     m1, [PIC_sym(pb_m1_1)]
1584    paddw         m2, m0
1585    paddw         m7, m1            ; p1 + p0 + q0 + 2 * q1 + q2 + 2 * q3 + 4
1586    psrlw         m0, m2, 3
1587    psrlw         m1, m7, 3
1588    packuswb      m0, m1
1589    pand          m0, m9
1590    pandn         m1, m9, m6
1591    por           m0, m1            ; q1
1592%ifidn %2, v
1593    mova [dstq+strideq*1], m0
1594%else
1595 %if ARCH_X86_64
1596    SWAP           0, 13
1597 %else
1598    mova  [esp+9*16], m0
1599 %endif
1600%endif
1601
1602    punpcklbw     m0, m3, m6
1603    punpckhbw     m1, m3, m6
1604    pmaddubsw     m0, [PIC_sym(pb_1)]
1605    pmaddubsw     m1, [PIC_sym(pb_1)]
1606    psubw         m2, m0
1607    psubw         m7, m1
1608    punpcklbw     m0, m14, m15
1609    punpckhbw     m1, m14, m15
1610    pmaddubsw     m0, [PIC_sym(pb_1)]
1611    pmaddubsw     m1, [PIC_sym(pb_1)]
1612    paddw         m2, m0
1613    paddw         m7, m1            ; p0 + q0 + q1 + q2 + 2 * q2 + 3 * q3 + 4
1614    psrlw         m2, 3
1615    psrlw         m7, 3
1616    packuswb      m2, m7
1617    pand          m2, m9
1618    pandn         m7, m9, m14
1619    por           m2, m7            ; q2
1620%ifidn %2, v
1621    mova [dstq+strideq*2], m2
1622%else
1623    mova          m0, [rsp+0*16]
1624%if %1 == 8
1625    mova          m1, [rsp+1*16]
1626    mova          m4, %%p3mem
1627
1628%if ARCH_X86_32
1629 %define m10 [esp+2*16]
1630 %define m11 [esp+8*16]
1631 %define m13 [esp+9*16]
1632%endif
1633
1634    ; 16x8 transpose
1635    punpcklbw     m3, m4, m10
1636    punpckhbw     m4, m10
1637    punpcklbw     m5, m0, m1
1638    punpckhbw     m0, m1
1639    punpcklbw     m1, m11, m13
1640    punpckhbw     m6, m11, m13
1641    punpcklbw     m7, m2, m15
1642    punpckhbw     m2, m15
1643%if ARCH_X86_64
1644    SWAP           2, 15
1645%else
1646    mova         m15, m2
1647%endif
1648
1649    punpcklwd     m2, m3, m5
1650    punpckhwd     m3, m5
1651    punpcklwd     m5, m4, m0
1652    punpckhwd     m4, m0
1653    punpcklwd     m0, m1, m7
1654    punpckhwd     m1, m7
1655    punpcklwd     m7, m6, m15
1656    punpckhwd     m6, m15
1657%if ARCH_X86_64
1658    SWAP           6, 15
1659%else
1660    mova         m15, m6
1661%endif
1662
1663    punpckldq     m6, m2, m0
1664    punpckhdq     m2, m0
1665    punpckldq     m0, m3, m1
1666    punpckhdq     m3, m1
1667    punpckldq     m1, m5, m7
1668    punpckhdq     m5, m7
1669    punpckldq     m7, m4, m15
1670    punpckhdq     m4, m15
1671
1672    ; write 8x16
1673    movq   [dstq+strideq*0-4], xm6
1674    movhps [dstq+strideq*1-4], xm6
1675    movq   [dstq+strideq*2-4], xm2
1676    movhps [dstq+stride3q -4], xm2
1677    lea         dstq, [dstq+strideq*4]
1678    movq   [dstq+strideq*0-4], xm0
1679    movhps [dstq+strideq*1-4], xm0
1680    movq   [dstq+strideq*2-4], xm3
1681    movhps [dstq+stride3q -4], xm3
1682    lea         dstq, [dstq+strideq*4]
1683    movq   [dstq+strideq*0-4], xm1
1684    movhps [dstq+strideq*1-4], xm1
1685    movq   [dstq+strideq*2-4], xm5
1686    movhps [dstq+stride3q -4], xm5
1687    lea         dstq, [dstq+strideq*4]
1688    movq   [dstq+strideq*0-4], xm7
1689    movhps [dstq+strideq*1-4], xm7
1690    movq   [dstq+strideq*2-4], xm4
1691    movhps [dstq+stride3q -4], xm4
1692    lea         dstq, [dstq+strideq*4]
1693%else
1694    ; 16x16 transpose and store
1695    SWAP           6, 0
1696    SWAP           7, 1
1697 %if ARCH_X86_64
1698    SWAP           5, 10, 2
1699    SWAP           8, 11
1700    SWAP           9, 13
1701    mova [rsp+21*16], m12
1702 %else
1703    mova [esp+10*16], m2
1704  %xdefine m8  m0
1705  %xdefine m9  m1
1706  %xdefine m10 m2
1707  %xdefine m11 m3
1708  %xdefine m12 m4
1709  %xdefine m13 m5
1710  %xdefine m14 m6
1711  %xdefine m15 m7
1712 %endif
1713    mova          m0, [rsp+11*16]
1714    mova          m1, [rsp+12*16]
1715    mova          m2, [rsp+13*16]
1716    mova          m3, [rsp+14*16]
1717    mova          m4, [rsp+19*16]
1718%if ARCH_X86_64
1719    mova          m7, [rsp+ 1*16]
1720    mova         m11, [rsp+20*16]
1721    mova         m12, [rsp+15*16]
1722    mova         m13, [rsp+16*16]
1723    mova         m14, [rsp+17*16]
1724    TRANSPOSE_16X16B 1, [rsp+18*16]
1725%else
1726    mova          m5, [esp+ 2*16]
1727    TRANSPOSE_16X16B 1, [esp+32*16]
1728    mov         tmpq, dstq
1729    lea         dstq, [dstq+strideq*8]
1730%endif
1731    movu [dstq+strideq*0-8], xm0
1732    movu [dstq+strideq*1-8], xm1
1733    movu [dstq+strideq*2-8], xm2
1734    movu [dstq+stride3q -8], xm3
1735    lea         dstq, [dstq+strideq*4]
1736    movu [dstq+strideq*0-8], xm4
1737    movu [dstq+strideq*1-8], xm5
1738    movu [dstq+strideq*2-8], xm6
1739    movu [dstq+stride3q -8], xm7
1740%if ARCH_X86_64
1741    lea         dstq, [dstq+strideq*4]
1742%else
1743  %xdefine m8  m0
1744  %xdefine m9  m1
1745  %xdefine m10 m2
1746  %xdefine m11 m3
1747  %xdefine m12 m4
1748  %xdefine m13 m5
1749  %xdefine m14 m6
1750  %xdefine m15 m7
1751    mova          m8, [esp+11*16]
1752    mova          m9, [esp+12*16]
1753    mova         m10, [esp+13*16]
1754    mova         m11, [esp+14*16]
1755    mova         m12, [esp+26*16]
1756    mova         m13, [esp+27*16]
1757    mova         m14, [esp+ 0*16]
1758    mova         m15, [esp+ 1*16]
1759    mov         dstq, tmpq
1760%endif
1761    movu [dstq+strideq*0-8], xm8
1762    movu [dstq+strideq*1-8], xm9
1763    movu [dstq+strideq*2-8], xm10
1764    movu [dstq+stride3q -8], xm11
1765    lea         dstq, [dstq+strideq*4]
1766    movu [dstq+strideq*0-8], xm12
1767    movu [dstq+strideq*1-8], xm13
1768    movu [dstq+strideq*2-8], xm14
1769    movu [dstq+stride3q -8], xm15
1770    lea         dstq, [dstq+strideq*4]
1771%if ARCH_X86_32
1772    lea         dstq, [dstq+strideq*8]
1773%else
1774    mova         m12, [rsp+21*16]
1775%endif
1776
1777%endif ; if %1 == 8
1778%endif ; ifidn %2, v
1779%elif %1 == 6
1780    ; flat6 filter
1781%if ARCH_X86_32
1782    mova  [esp+3*16], m3
1783    mova  [esp+4*16], m4
1784    mova  [esp+5*16], m5
1785    mova  [esp+6*16], m6
1786 %xdefine m8  m3
1787 %xdefine m10 m4
1788 %xdefine m11 m5
1789 %xdefine m15 m6
1790 %define m3  [esp+3*16]
1791 %define m4  [esp+4*16]
1792 %define m5  [esp+5*16]
1793 %define m6  [esp+6*16]
1794 %define m9  %%flat8mem
1795 %define m13 %%p2mem
1796 %define m14 %%q2mem
1797%endif
1798
1799    punpcklbw     m8, m13, m5
1800    punpckhbw    m11, m13, m5
1801    pmaddubsw     m0, m8, [PIC_sym(pb_3_1)]
1802    pmaddubsw     m1, m11, [PIC_sym(pb_3_1)]
1803    punpcklbw     m7, m4, m3
1804    punpckhbw    m10, m4, m3
1805    pmaddubsw     m2, m7, [PIC_sym(pb_2)]
1806    pmaddubsw    m15, m10, [PIC_sym(pb_2)]
1807    paddw         m0, m2
1808    paddw         m1, m15
1809    pmulhrsw      m2, m0, [PIC_sym(pw_4096)]
1810    pmulhrsw     m15, m1, [PIC_sym(pw_4096)]
1811    packuswb      m2, m15
1812    pand          m2, m9
1813    pandn        m15, m9, m3
1814    por           m2, m15
1815%ifidn %2, v
1816    mova [tmpq+strideq*2], m2                   ; p1
1817%elif ARCH_X86_32
1818    mova [esp+11*16], m2
1819%endif
1820
1821    pmaddubsw     m8, [PIC_sym(pb_m1_1)]
1822    pmaddubsw    m11, [PIC_sym(pb_m1_1)]
1823    paddw         m0, m8
1824    paddw         m1, m11
1825    punpcklbw     m8, m13, m6
1826    punpckhbw    m11, m13, m6
1827%if ARCH_X86_64
1828    SWAP           2, 13
1829%endif
1830    pmaddubsw     m8, [PIC_sym(pb_m1_1)]
1831    pmaddubsw    m11, [PIC_sym(pb_m1_1)]
1832    paddw         m0, m8
1833    paddw         m1, m11
1834    pmulhrsw      m2, m0, [PIC_sym(pw_4096)]
1835    pmulhrsw     m15, m1, [PIC_sym(pw_4096)]
1836    packuswb      m2, m15
1837    pand          m2, m9
1838    pandn        m15, m9, m4
1839    por           m2, m15
1840%ifidn %2, v
1841    mova [tmpq+stride3q], m2                    ; p0
1842%elif ARCH_X86_32
1843    mova  [esp+8*16], m2
1844%endif
1845
1846    paddw         m0, m8
1847    paddw         m1, m11
1848    punpcklbw     m8, m3, m14
1849    punpckhbw    m11, m3, m14
1850%if ARCH_X86_64
1851    SWAP           2, 14
1852%endif
1853    pmaddubsw     m2, m8, [PIC_sym(pb_m1_1)]
1854    pmaddubsw    m15, m11, [PIC_sym(pb_m1_1)]
1855    paddw         m0, m2
1856    paddw         m1, m15
1857    pmulhrsw      m2, m0, [PIC_sym(pw_4096)]
1858    pmulhrsw     m15, m1, [PIC_sym(pw_4096)]
1859    packuswb      m2, m15
1860    pand          m2, m9
1861    pandn        m15, m9, m5
1862    por           m2, m15
1863%ifidn %2, v
1864    mova [dstq+strideq*0], m2                   ; q0
1865%endif
1866
1867    pmaddubsw     m8, [PIC_sym(pb_m1_2)]
1868    pmaddubsw    m11, [PIC_sym(pb_m1_2)]
1869    paddw         m0, m8
1870    paddw         m1, m11
1871    pmaddubsw     m7, [PIC_sym(pb_m1_0)]
1872    pmaddubsw    m10, [PIC_sym(pb_m1_0)]
1873    paddw         m0, m7
1874    paddw         m1, m10
1875    pmulhrsw      m0, [PIC_sym(pw_4096)]
1876    pmulhrsw      m1, [PIC_sym(pw_4096)]
1877    packuswb      m0, m1
1878    pand          m0, m9
1879    pandn         m1, m9, m6
1880    por           m0, m1
1881%if ARCH_X86_32
1882 %xdefine m3 m8
1883 %xdefine m4 m10
1884 %xdefine m5 m11
1885 %xdefine m6 m15
1886%endif
1887%ifidn %2, v
1888    mova [dstq+strideq*1], m0                   ; q1
1889%else
1890 %if ARCH_X86_64
1891    SWAP           3, 13
1892    SWAP           4, 14
1893 %else
1894    mova          m3, [esp+11*16]
1895    mova          m4, [esp+ 8*16]
1896 %endif
1897    SWAP           5, 2
1898    SWAP           6, 0
1899    TRANSPOSE_16x4_AND_WRITE_4x16 3, 4, 5, 6, 7
1900%endif
1901%else ; if %1 == 4
1902%ifidn %2, v
1903    mova [tmpq+strideq*0], m3                   ; p1
1904    mova [tmpq+strideq*1], m4                   ; p0
1905    mova [tmpq+strideq*2], m5                   ; q0
1906    mova [tmpq+stride3q ], m6                   ; q1
1907%else
1908    TRANSPOSE_16x4_AND_WRITE_4x16 3, 4, 5, 6, 7
1909%endif
1910%endif
1911%if ARCH_X86_32
1912 %define m12 m12reg
1913%endif
1914%endmacro
1915
1916;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1917;;          32-bit PIC helpers          ;;
1918;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1919
%if ARCH_X86_32
 ; 32-bit x86 has no RIP-relative addressing, so every RODATA reference
 ; is made relative to a base register holding the address of $$ (the
 ; start of the current section).
 %define PIC_base_offset $$

 ; Load the PIC base ($$) into PIC_reg (r2). PIC_reg_stk_offset is the
 ; stack slot used by XCHG_PIC_REG to spill the base; it sits just above
 ; the relocated-argument slots (4 of them when copy_args is set — see
 ; RELOC_ARGS).
 %macro SETUP_PIC 0 ; PIC_reg
  %define PIC_reg r2
  %assign PIC_reg_stk_offset stack_size-gprsize*(1+copy_args*4)
    LEA      PIC_reg, $$
 %endmacro

 ; r2 is shared between the PIC base and the mask pointer. This macro
 ; swaps between the two uses:
 ;   %1 == 0: spill the PIC base to its stack slot, load maskm into r2
 ;   %1 == 1: reload the PIC base from the stack slot
 ; Only `mov`s are used, so CPU flags are preserved (callers rely on
 ; this, e.g. `XCHG_PIC_REG 0` between a `sub` and its `jg`).
 %macro XCHG_PIC_REG 1 ; 0=mask 1=PIC_base
  %if %1 == 0
    mov [esp+PIC_reg_stk_offset], PIC_reg
    mov      PIC_reg, maskm
  %else
    mov      PIC_reg, [esp+PIC_reg_stk_offset]
  %endif
 %endmacro

 ; Rewrite an absolute symbol reference as PIC_reg-relative.
 %define PIC_sym(sym) (PIC_reg+(sym)-PIC_base_offset)

%else
 ; x86-64 addresses RODATA RIP-relatively; no PIC register is needed,
 ; so these helpers become no-ops.
 %macro XCHG_PIC_REG 1
 %endmacro
 %define PIC_sym(sym) (sym)
%endif
1945
1946;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1947
%if ARCH_X86_32
 ; When the platform ABI guarantees less stack alignment than the SIMD
 ; code requires, x86inc realigns the stack on function entry; the
 ; original argument home slots are then no longer reachable through
 ; esp, so RELOC_ARGS must copy the arguments into the new frame.
 %if STACK_ALIGNMENT < required_stack_alignment
  %assign copy_args 1
 %else
  %assign copy_args 0
 %endif
%endif
1955
; Make the stack-passed arguments addressable after stack realignment
; (32-bit only). %1 names the 7th argument (w for the vertical filter,
; h for the horizontal one); its value is fetched from r6m.
; Note: l_stridem is only *defined* here — the caller stores l_strided
; into it separately (it is still live in a register at this point).
%macro RELOC_ARGS 1
 %if copy_args
  ; Stack was realigned: copy mask, lut and the 7th arg into fixed
  ; slots at the top of the new frame.
  %define maskm     [esp+stack_size-gprsize*1]
  %define l_stridem [esp+stack_size-gprsize*2]
  %define lutm      [esp+stack_size-gprsize*3]
  %define %1m       [esp+stack_size-gprsize*4]
    mov          r6d, r6m
    mov        maskm, maskd
    mov         lutm, lutd
    mov          %1m, r6d
 %else
  ; No realignment: the original home slot is still valid.
  %define %1m       r6m
 %endif
%endmacro
1970
%if ARCH_X86_32
 ; 32-bit has too few GPRs for dedicated copies of these values, so
 ; alias them onto scratch registers. stride3q and l_stride3q both map
 ; to r6 and therefore must never be live at the same time.
 %define tmpq       r4
 %define mstrideq   r5
 %define stride3q   r6
 %define l_stride3q r6
%endif
1977
;-----------------------------------------------------------------------
; lpf_v_sb_y — vertical loop filter for a luma superblock row.
; Args (from cglobal): dst, stride, mask, l, l_stride, lut, w.
; Walks the superblock in 4-pixel-wide column groups; for each group it
; tests the per-edge vmask bits and dispatches to the widest applicable
; filter (16 -> 8 -> 4), or skips filtering entirely.
; NOTE(review): exact pointer/element types follow the C prototype in
; the project headers — not visible from this file; confirm there.
;-----------------------------------------------------------------------
INIT_XMM ssse3
%if ARCH_X86_64
cglobal lpf_v_sb_y, 7, 11, 16, 16 * 15, \
                    dst, stride, mask, l, l_stride, lut, \
                    w, stride3, mstride, tmp, mask_bits
%else
cglobal lpf_v_sb_y, 6, 7, 8, -16 * (26 + copy_args), \
                    dst, stride, mask, l, l_stride, lut, mask_bits
    RELOC_ARGS w                                ; save mask/lut/w across the realigned stack
    SETUP_PIC                                   ; r2 = PIC base for RODATA access
 %define m12 m5                                 ; only 8 XMM regs: m12 shares m5, spilled to maskmem around FILTER
%endif
    shl    l_strideq, 2                         ; l_stride in bytes (l entries are 4 bytes)
    sub           lq, l_strideq                 ; point l one row up (filter reads the row above too)
%if ARCH_X86_64
    mov     mstrideq, strideq
    neg     mstrideq                            ; mstride = -stride
    lea     stride3q, [strideq*3]
%else
    mov    l_stridem, l_strided                 ; slot was defined by RELOC_ARGS; stored here while still in a reg
%endif
    mov   mask_bitsd, 0xf                       ; 4 mask bits = current 4-column group
    mova         m12, [PIC_sym(pd_mask)]
    XCHG_PIC_REG   0                            ; r2 now holds maskm (32-bit); preserves flags
    movu          m0, [maskq]                   ; vmask[0..2] (three dwords)
    pxor          m4, m4
    movd          m3, [lutq+136]
    pshufb        m3, m4                        ; broadcast byte lut[136] -> minlvl
    pshufd        m2, m0, q2222                 ; splat vmask[2]
    pshufd        m1, m0, q1111                 ; splat vmask[1]
    pshufd        m0, m0, q0000                 ; splat vmask[0]
    por           m1, m2                        ; mask1 = vmask[1] | vmask[2]
    por           m0, m1                        ; mask0 = vmask[0] | vmask[1] | vmask[2]
    mova [rsp+11*16], m0
    mova [rsp+12*16], m1
    mova [rsp+13*16], m2
    mova [rsp+14*16], m3

; Spill slots shared with the FILTER macro:
%define maskmem [esp+15*16]
%define mask0   [rsp+11*16]
%define mask1   [rsp+12*16]
%define mask2   [rsp+13*16]
%define minlvl  [rsp+14*16]

.loop:
    ; Dispatch on the widest filter requested for this 4-column group.
    test   [maskq+8], mask_bitsd                ; vmask[2]
    je .no_flat16

%if ARCH_X86_32
    XCHG_PIC_REG   1                            ; restore PIC base for RODATA loads inside FILTER
    mov  [esp+25*16], mask_bitsd                ; FILTER clobbers mask_bits on 32-bit
    mova     maskmem, m12                       ; ... and m12 (aliased to m5)
%endif
    FILTER        16, v
    jmp .end

.no_flat16:
    test   [maskq+4], mask_bitsd                ; vmask[1]
    je .no_flat

%if ARCH_X86_32
    XCHG_PIC_REG   1
    mov  [esp+25*16], mask_bitsd
    mova     maskmem, m12
%endif
    FILTER         8, v
    jmp .end

.no_flat:
    test   [maskq+0], mask_bitsd                ; vmask[0]
    XCHG_PIC_REG   1
    je .no_filter

%if ARCH_X86_32
    mov  [esp+25*16], mask_bitsd
    mova     maskmem, m12
%endif
    FILTER         4, v

.end:
%if ARCH_X86_32
    mova         m12, maskmem                   ; undo the pre-FILTER spills
    mov   mask_bitsd, [esp+25*16]
%endif
.no_filter:
    ; Advance to the next 4-column group.
    pslld        m12, 4
    shl   mask_bitsd, 4
    add           lq, 16
    add         dstq, 16
%if ARCH_X86_64
    sub           wd, 4
%else
    sub     dword wm, 4
%endif
    XCHG_PIC_REG   0                            ; mov-only, keeps flags from the `sub` above
    jg .loop
    RET
2075
INIT_XMM ssse3
;---------------------------------------------------------------------------
; lpf_h_sb_y(dst, stride, mask, l, l_stride, lut, h)
; Luma loop filter, horizontal filtering direction (i.e. across vertical
; edges), one superblock strip.  mask points to three 32-bit edge bitmasks
; (vmask[0..2], tested at maskq+0 / +4 / +8) selecting per 4-pixel unit
; whether the 4-, 8- or 16-wide filter applies; the widest set bit wins.
; Each loop iteration consumes 4 mask bits (mask_bits), i.e. four 4-row
; units = 16 rows.  The FILTER macro (defined earlier in this file, not
; visible here) performs the actual filtering.
;---------------------------------------------------------------------------
%if ARCH_X86_64
cglobal lpf_h_sb_y, 7, 11, 16, 16 * 26, \
                    dst, stride, mask, l, l_stride, lut, \
                    h, stride3, l_stride3, tmp, mask_bits
%else
; x86-32: only 8 xmm and few GPRs; args are relocated onto the stack and
; m12 is aliased to m5
cglobal lpf_h_sb_y, 6, 7, 8, -16 * (39 + copy_args), \
                    dst, stride, mask, l, l_stride, lut, mask_bits
    RELOC_ARGS h
    SETUP_PIC
 %define m12 m5
%endif
    sub           lq, 4                         ; step back one l[] entry (4 bytes) for the left-neighbor levels
    shl    l_strideq, 2                         ; l_stride *= 4 (l[] entries are 4 bytes wide)
%if ARCH_X86_64
    lea     stride3q, [strideq*3]
    lea   l_stride3q, [l_strideq*3]
%else
    mov    l_stridem, l_strided                 ; spill l_stride; reloaded at .end
%endif
    mov   mask_bitsd, 0xf                       ; 4 mask bits per iteration, one per 4-pixel unit
    mova         m12, [PIC_sym(pd_mask)]        ; per-dword lane masks mirroring mask_bits
    XCHG_PIC_REG   0
    movu          m0, [maskq]                   ; loads vmask[0..3]; only dwords 0..2 are used
    pxor          m4, m4
    movd          m3, [lutq+136]                ; filter-level threshold byte from lut (stored below as minlvl)
    pshufb        m3, m4                        ; broadcast that byte to all lanes
    pshufd        m2, m0, q2222                 ; m2 = vmask[2] broadcast
    pshufd        m1, m0, q1111                 ; m1 = vmask[1] broadcast
    pshufd        m0, m0, q0000                 ; m0 = vmask[0] broadcast
    por           m1, m2                        ; mask1 = vmask[1] | vmask[2]  (filter width >= 8)
    por           m0, m1                        ; mask0 = vmask[0] | mask1     (any filtering at all)
    mova [rsp+22*16], m0
    mova [rsp+23*16], m1
    mova [rsp+24*16], m2
    mova [rsp+25*16], m3

; stack-slot aliases consumed by the FILTER macro
%define maskmem [esp+37*16]
%define mask0   [rsp+22*16]
%define mask1   [rsp+23*16]
%define mask2   [rsp+24*16]
%define minlvl  [rsp+25*16]

.loop:
    ; widest applicable filter wins: try 16, then 8, then 4
    test   [maskq+8], mask_bitsd                ; vmask[2]
    je .no_flat16

%if ARCH_X86_32
    XCHG_PIC_REG   1
    mov  [esp+25*16], mask_bitsd                ; FILTER clobbers mask_bits/m12 on x86-32; save them
    mova     maskmem, m12
%endif
    FILTER        16, h
    jmp .end

.no_flat16:
    test   [maskq+4], mask_bitsd                ; vmask[1]
    je .no_flat

%if ARCH_X86_32
    XCHG_PIC_REG   1
    mov  [esp+38*16], mask_bitsd                ; save mask_bits/m12 around FILTER (x86-32)
    mova     maskmem, m12
%endif
    FILTER         8, h
    jmp .end

.no_flat:
    test   [maskq+0], mask_bitsd                ; vmask[0]
    XCHG_PIC_REG   1
    je .no_filter

%if ARCH_X86_32
    mov  [esp+38*16], mask_bitsd                ; save mask_bits/m12 around FILTER (x86-32)
    mova     maskmem, m12
%endif
    FILTER         4, h
    jmp .end

.no_filter:
    ; the FILTER paths appear to advance dstq past the 16 rows they
    ; process; when nothing was filtered, advance manually (2 * stride*8)
    lea         dstq, [dstq+strideq*8]
    lea         dstq, [dstq+strideq*8]
%if ARCH_X86_32
    jmp .end_noload
.end:
    mova         m12, maskmem                   ; restore state saved before FILTER
    mov    l_strideq, l_stridem
    mov   mask_bitsd, [esp+38*16]
.end_noload:
%else
.end:
%endif
    lea           lq, [lq+l_strideq*4]          ; advance l[] by 4 rows
    pslld        m12, 4                         ; move lane masks to the next 4 units
    shl   mask_bitsd, 4                         ; ditto for the scalar mask bits
%if ARCH_X86_64
    sub           hd, 4
%else
    sub     dword hm, 4
%endif
    XCHG_PIC_REG   0
    jg .loop
    RET
2179
INIT_XMM ssse3
;---------------------------------------------------------------------------
; lpf_v_sb_uv(dst, stride, mask, l, l_stride, lut, w)
; Chroma loop filter, vertical filtering direction (i.e. across horizontal
; edges), one superblock strip.  Chroma only has two edge bitmasks
; (vmask[0..1], tested at maskq+0 / +4) selecting per 4-pixel unit
; whether the 4- or 6-wide filter applies; the wider one wins.  Each loop
; iteration consumes 4 mask bits (mask_bits), i.e. 16 pixel columns.
; Filtering itself is done by the FILTER macro defined earlier in the file.
;---------------------------------------------------------------------------
%if ARCH_X86_64
cglobal lpf_v_sb_uv, 7, 11, 16, 3 * 16, \
                     dst, stride, mask, l, l_stride, lut, \
                     w, stride3, mstride, tmp, mask_bits
%else
; x86-32: only 8 xmm and few GPRs; args are relocated onto the stack and
; m12 is aliased to m4
cglobal lpf_v_sb_uv, 6, 7, 8, -16 * (12 + copy_args), \
                     dst, stride, mask, l, l_stride, lut, mask_bits
    RELOC_ARGS w
    SETUP_PIC
 %define m12 m4
%endif
    shl    l_strideq, 2                         ; l_stride *= 4 (l[] entries are 4 bytes wide)
    sub           lq, l_strideq                 ; step back one l[] row for the above-neighbor levels
%if ARCH_X86_64
    mov     mstrideq, strideq
    neg     mstrideq                            ; mstride = -stride, to address rows above dst
    lea     stride3q, [strideq*3]
%else
    mov    l_stridem, l_strided                 ; spill l_stride to the stack
%endif
    mov   mask_bitsd, 0xf                       ; 4 mask bits per iteration, one per 4-pixel unit
    mova         m12, [PIC_sym(pd_mask)]        ; per-dword lane masks mirroring mask_bits
    XCHG_PIC_REG   0
    movq          m0, [maskq]                   ; vmask[0..1] (chroma has no 16-wide mask)
    pxor          m3, m3
    movd          m2, [lutq+136]                ; filter-level threshold byte from lut (stored below as minlvl)
    pshufb        m2, m3                        ; broadcast that byte to all lanes
    pshufd        m1, m0, q1111                 ; m1 = vmask[1] broadcast
    pshufd        m0, m0, q0000                 ; m0 = vmask[0] broadcast
    por           m0, m1                        ; mask0 = vmask[0] | vmask[1] (any filtering at all)
    mova  [rsp+0*16], m0
    mova  [rsp+1*16], m1
    mova  [rsp+2*16], m2

; stack-slot aliases consumed by the FILTER macro
%define maskmem [esp+7*16]
%define mask0   [rsp+0*16]
%define mask1   [rsp+1*16]
%define minlvl  [rsp+2*16]

.loop:
    ; wider applicable filter wins: try 6, then 4
    test   [maskq+4], mask_bitsd                ; vmask[1]
    je .no_flat

%if ARCH_X86_32
    XCHG_PIC_REG   1
    mov  [esp+11*16], mask_bitsd                ; FILTER clobbers mask_bits/m12 on x86-32; save them
    mova     maskmem, m12
%endif
    FILTER         6, v
    jmp .end

.no_flat:
    test   [maskq+0], mask_bitsd                ; vmask[0]
    XCHG_PIC_REG   1
    je .no_filter

%if ARCH_X86_32
    mov  [esp+11*16], mask_bitsd                ; save mask_bits/m12 around FILTER (x86-32)
    mova     maskmem, m12
%endif
    FILTER         4, v

.end:
%if ARCH_X86_32
    mova         m12, maskmem                   ; restore state saved before FILTER
    mov   mask_bitsd, [esp+11*16]
%endif
.no_filter:
    pslld        m12, 4                         ; move lane masks to the next 4 units
    shl   mask_bitsd, 4                         ; ditto for the scalar mask bits
    add           lq, 16                        ; next four l[] entries (4 * 4 bytes)
    add         dstq, 16                        ; next 16 pixel columns
%if ARCH_X86_64
    sub           wd, 4
%else
    sub     dword wm, 4
%endif
    XCHG_PIC_REG   0
    jg .loop
    RET
2261
INIT_XMM ssse3
;---------------------------------------------------------------------------
; lpf_h_sb_uv(dst, stride, mask, l, l_stride, lut, h)
; Chroma loop filter, horizontal filtering direction (i.e. across vertical
; edges), one superblock strip.  Chroma only has two edge bitmasks
; (vmask[0..1], tested at maskq+0 / +4) selecting per 4-pixel unit
; whether the 4- or 6-wide filter applies; the wider one wins.  Each loop
; iteration consumes 4 mask bits (mask_bits), i.e. four 4-row units =
; 16 rows.  Filtering itself is done by the FILTER macro defined earlier.
;---------------------------------------------------------------------------
%if ARCH_X86_64
cglobal lpf_h_sb_uv, 7, 11, 16, 16 * 3, \
                     dst, stride, mask, l, l_stride, lut, \
                     h, stride3, l_stride3, tmp, mask_bits
%else
; x86-32: only 8 xmm and few GPRs; args are relocated onto the stack and
; m12 is aliased to m4
cglobal lpf_h_sb_uv, 6, 7, 8, -16 * (13 + copy_args), \
                     dst, stride, mask, l, l_stride, lut, mask_bits
    RELOC_ARGS h
    SETUP_PIC
 %define m12 m4
%endif
    sub           lq, 4                         ; step back one l[] entry (4 bytes) for the left-neighbor levels
    shl    l_strideq, 2                         ; l_stride *= 4 (l[] entries are 4 bytes wide)
%if ARCH_X86_64
    lea     stride3q, [strideq*3]
    lea   l_stride3q, [l_strideq*3]
%else
    mov    l_stridem, l_strided                 ; spill l_stride; reloaded at .end
%endif
    mov   mask_bitsd, 0xf                       ; 4 mask bits per iteration, one per 4-pixel unit
    mova         m12, [PIC_sym(pd_mask)]        ; per-dword lane masks mirroring mask_bits
    XCHG_PIC_REG   0
    movq          m0, [maskq]                   ; vmask[0..1] (chroma has no 16-wide mask)
    pxor          m3, m3
    movd          m2, [lutq+136]                ; filter-level threshold byte from lut (stored below as minlvl)
    pshufb        m2, m3                        ; broadcast that byte to all lanes
    pshufd        m1, m0, q1111                 ; m1 = vmask[1] broadcast
    pshufd        m0, m0, q0000                 ; m0 = vmask[0] broadcast
    por           m0, m1                        ; mask0 = vmask[0] | vmask[1] (any filtering at all)
    mova  [rsp+0*16], m0
    mova  [rsp+1*16], m1
    mova  [rsp+2*16], m2

; stack-slot aliases consumed by the FILTER macro
%define maskmem [esp+7*16]
%define mask0   [rsp+0*16]
%define mask1   [rsp+1*16]
%define minlvl  [rsp+2*16]

.loop:
    ; wider applicable filter wins: try 6, then 4
    test   [maskq+4], mask_bitsd                ; vmask[1]
    je .no_flat

%if ARCH_X86_32
    XCHG_PIC_REG   1
    mov  [esp+12*16], mask_bitsd                ; FILTER clobbers mask_bits/m12 on x86-32; save them
    mova     maskmem, m12
%endif
    FILTER         6, h
    jmp .end

.no_flat:
    test   [maskq+0], mask_bitsd                ; vmask[0]
    XCHG_PIC_REG   1
    je .no_filter

%if ARCH_X86_32
    mov  [esp+12*16], mask_bitsd                ; save mask_bits/m12 around FILTER (x86-32)
    mova     maskmem, m12
%endif
    FILTER         4, h
    jmp .end

.no_filter:
    ; the FILTER paths appear to advance dstq past the 16 rows they
    ; process; when nothing was filtered, advance manually (2 * stride*8)
    lea         dstq, [dstq+strideq*8]
    lea         dstq, [dstq+strideq*8]
%if ARCH_X86_32
    jmp .end_noload
.end:
    mova         m12, maskmem                   ; restore state saved before FILTER
    mov    l_strided, l_stridem
    mov   mask_bitsd, [esp+12*16]
.end_noload:
%else
.end:
%endif
    lea           lq, [lq+l_strideq*4]          ; advance l[] by 4 rows
    pslld        m12, 4                         ; move lane masks to the next 4 units
    shl   mask_bitsd, 4                         ; ditto for the scalar mask bits
%if ARCH_X86_64
    sub           hd, 4
%else
    sub     dword hm, 4
%endif
    XCHG_PIC_REG   0
    jg .loop
    RET
2349