; Copyright © 2018, VideoLAN and dav1d authors
; Copyright © 2018, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

%include "config.asm"
%include "ext/x86/x86inc.asm"

%if ARCH_X86_64

SECTION_RODATA 32

pb_4x1_4x5_4x9_4x13: times 2 db 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12
pb_7_1: times 16 db 7, 1
pb_3_1: times 16 db 3, 1
pb_2_1: times 16 db 2, 1
pb_m1_0: times 16 db -1, 0
pb_m1_1: times 16 db -1, 1
pb_m1_2: times 16 db -1, 2
pb_1: times 32 db 1
pb_2: times 32 db 2
pb_3: times 32 db 3
pb_4: times 32 db 4
pb_16: times 32 db 16
pb_63: times 32 db 63
pb_64: times 32 db 64
pb_128: times 32 db 0x80
pb_129: times 32 db 0x81
pb_240: times 32 db 0xf0
pb_248: times 32 db 0xf8
pb_254: times 32 db 0xfe

pw_2048: times 16 dw 2048
pw_4096: times 16 dw 4096

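; pb_mask expands the per-4px bits of a vmask word into full dword masks:
; the word is vpbroadcastd, ANDed with pb_mask and compared with pcmpeqd,
; yielding an all-ones byte mask for every enabled 4-pixel unit.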
pb_mask: dd 1, 2, 4, 8, 16, 32, 64, 128

SECTION .text

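; ABSSUB computes a per-byte absolute difference without widening:
; |a - b| = (a -us b) | (b -us a), where "-us" is unsigned saturating
; subtraction (one of the two differences always saturates to zero).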
%macro ABSSUB 4 ; dst, a, b, tmp
    psubusb       %1, %2, %3
    psubusb       %4, %3, %2
    por           %1, %4
%endmacro

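; Transpose the four filtered columns (p1/p0/q0/q1 for 32 rows, one ymm
; register each) back into rows and store 4 bytes per row at dst-2, i.e.
; the two pixels on either side of a vertical edge; dstq is advanced past
; all 32 rows in the process.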
%macro TRANSPOSE_16x4_AND_WRITE_4x32 5
    ; transpose 16x4
    punpcklbw    m%5, m%1, m%2
    punpckhbw    m%1, m%2
    punpcklbw    m%2, m%3, m%4
    punpckhbw    m%3, m%4
    punpcklwd    m%4, m%5, m%2
    punpckhwd    m%5, m%2
    punpcklwd    m%2, m%1, m%3
    punpckhwd    m%1, m%3

    ; write out
    movd [dstq+strideq*0-2], xm%4
    pextrd [dstq+strideq*1-2], xm%4, 1
    pextrd [dstq+strideq*2-2], xm%4, 2
    pextrd [dstq+stride3q-2], xm%4, 3
    lea         dstq, [dstq+strideq*4]
    movd [dstq+strideq*0-2], xm%5
    pextrd [dstq+strideq*1-2], xm%5, 1
    pextrd [dstq+strideq*2-2], xm%5, 2
    pextrd [dstq+stride3q-2], xm%5, 3
    lea         dstq, [dstq+strideq*4]
    movd [dstq+strideq*0-2], xm%2
    pextrd [dstq+strideq*1-2], xm%2, 1
    pextrd [dstq+strideq*2-2], xm%2, 2
    pextrd [dstq+stride3q-2], xm%2, 3
    lea         dstq, [dstq+strideq*4]
    movd [dstq+strideq*0-2], xm%1
    pextrd [dstq+strideq*1-2], xm%1, 1
    pextrd [dstq+strideq*2-2], xm%1, 2
    pextrd [dstq+stride3q-2], xm%1, 3
    lea         dstq, [dstq+strideq*4]

    vextracti128 xm%4, m%4, 1
    vextracti128 xm%5, m%5, 1
    vextracti128 xm%2, m%2, 1
    vextracti128 xm%1, m%1, 1

    movd [dstq+strideq*0-2], xm%4
    pextrd [dstq+strideq*1-2], xm%4, 1
    pextrd [dstq+strideq*2-2], xm%4, 2
    pextrd [dstq+stride3q-2], xm%4, 3
    lea         dstq, [dstq+strideq*4]
    movd [dstq+strideq*0-2], xm%5
    pextrd [dstq+strideq*1-2], xm%5, 1
    pextrd [dstq+strideq*2-2], xm%5, 2
    pextrd [dstq+stride3q-2], xm%5, 3
    lea         dstq, [dstq+strideq*4]
    movd [dstq+strideq*0-2], xm%2
    pextrd [dstq+strideq*1-2], xm%2, 1
    pextrd [dstq+strideq*2-2], xm%2, 2
    pextrd [dstq+stride3q-2], xm%2, 3
    lea         dstq, [dstq+strideq*4]
    movd [dstq+strideq*0-2], xm%1
    pextrd [dstq+strideq*1-2], xm%1, 1
    pextrd [dstq+strideq*2-2], xm%1, 2
    pextrd [dstq+stride3q-2], xm%1, 3
    lea         dstq, [dstq+strideq*4]
%endmacro

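; Full 16x16 byte transpose of m0-m15, done independently within each
; 128-bit lane (only in-lane punpck stages are used). One stack slot (%3)
; serves as a 17th register so every intermediate fits; per the parameter
; names, %1 selects whether the m15 input starts out in that slot and %2
; whether output 0 is left stored there.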
%macro TRANSPOSE_16X16B 3 ; in_load_15_from_mem, out_store_0_in_mem, mem
%if %1 == 0
    mova          %3, m15
%endif

    ; input in m0-15
    punpcklbw    m15, m0, m1
    punpckhbw     m0, m1
    punpcklbw     m1, m2, m3
    punpckhbw     m2, m3
    punpcklbw     m3, m4, m5
    punpckhbw     m4, m5
    punpcklbw     m5, m6, m7
    punpckhbw     m6, m7
    punpcklbw     m7, m8, m9
    punpckhbw     m8, m9
    punpcklbw     m9, m10, m11
    punpckhbw    m10, m11
    punpcklbw    m11, m12, m13
    punpckhbw    m12, m13
    mova         m13, %3
    mova          %3, m12
    punpcklbw    m12, m14, m13
    punpckhbw    m13, m14, m13

    ; interleaved in m15,0,1,2,3,4,5,6,7,8,9,10,11,rsp%3,12,13
    punpcklwd    m14, m15, m1
    punpckhwd    m15, m1
    punpcklwd     m1, m0, m2
    punpckhwd     m0, m2
    punpcklwd     m2, m3, m5
    punpckhwd     m3, m5
    punpcklwd     m5, m4, m6
    punpckhwd     m4, m6
    punpcklwd     m6, m7, m9
    punpckhwd     m7, m9
    punpcklwd     m9, m8, m10
    punpckhwd     m8, m10
    punpcklwd    m10, m11, m12
    punpckhwd    m11, m12
    mova         m12, %3
    mova          %3, m11
    punpcklwd    m11, m12, m13
    punpckhwd    m12, m13

    ; interleaved in m14,15,1,0,2,3,5,4,6,7,9,8,10,rsp%3,11,12
    punpckldq    m13, m14, m2
    punpckhdq    m14, m2
    punpckldq     m2, m15, m3
    punpckhdq    m15, m3
    punpckldq     m3, m1, m5
    punpckhdq     m1, m5
    punpckldq     m5, m0, m4
    punpckhdq     m0, m4
    punpckldq     m4, m6, m10
    punpckhdq     m6, m10
    punpckldq    m10, m9, m11
    punpckhdq     m9, m11
    punpckldq    m11, m8, m12
    punpckhdq     m8, m12
    mova         m12, %3
    mova          %3, m8
    punpckldq     m8, m7, m12
    punpckhdq     m7, m12

    ; interleaved in m13,14,2,15,3,1,5,0,4,6,8,7,10,9,11,rsp%3
    punpcklqdq   m12, m13, m4
    punpckhqdq   m13, m4
    punpcklqdq    m4, m14, m6
    punpckhqdq   m14, m6
    punpcklqdq    m6, m2, m8
    punpckhqdq    m2, m8
    punpcklqdq    m8, m15, m7
    punpckhqdq   m15, m7
    punpcklqdq    m7, m3, m10
    punpckhqdq    m3, m10
    punpcklqdq   m10, m1, m9
    punpckhqdq    m1, m9
    punpcklqdq    m9, m5, m11
    punpckhqdq    m5, m11
    mova         m11, %3
    mova          %3, m12
    punpcklqdq   m12, m0, m11
    punpckhqdq    m0, m11
%if %2 == 0
    mova         m11, %3
%endif

    ; interleaved m11,13,4,14,6,2,8,15,7,3,10,1,9,5,12,0
    SWAP          0, 11, 1, 13, 5, 2, 4, 6, 8, 7, 15
    SWAP          3, 14, 12, 9
%endmacro

%macro FILTER 2 ; width [4/6/8/16], dir [h/v]
    ; load data
%ifidn %2, v
%if %1 == 4
    lea         tmpq, [dstq+mstrideq*2]
    mova          m3, [tmpq+strideq*0]          ; p1
    mova          m4, [tmpq+strideq*1]          ; p0
    mova          m5, [tmpq+strideq*2]          ; q0
    mova          m6, [tmpq+stride3q]           ; q1
%else
    ; load 6-8 pixels, remainder (for wd=16) will be read inline
    lea         tmpq, [dstq+mstrideq*4]
%if %1 != 6
    mova         m12, [tmpq+strideq*0]
%endif
    mova         m13, [tmpq+strideq*1]
    mova          m3, [tmpq+strideq*2]
    mova          m4, [tmpq+stride3q]
    mova          m5, [dstq+strideq*0]
    mova          m6, [dstq+strideq*1]
    mova         m14, [dstq+strideq*2]
%if %1 != 6
    mova         m15, [dstq+stride3q]
%endif
%endif
%else
    ; load lines
%if %1 == 4
    movd         xm3, [dstq+strideq*0-2]
    movd         xm4, [dstq+strideq*1-2]
    movd         xm5, [dstq+strideq*2-2]
    movd         xm6, [dstq+stride3q -2]
    lea         tmpq, [dstq+strideq*4]
    pinsrd       xm3, [tmpq+strideq*0-2], 2
    pinsrd       xm4, [tmpq+strideq*1-2], 2
    pinsrd       xm5, [tmpq+strideq*2-2], 2
    pinsrd       xm6, [tmpq+stride3q -2], 2
    lea         tmpq, [tmpq+strideq*4]
    pinsrd       xm3, [tmpq+strideq*0-2], 1
    pinsrd       xm4, [tmpq+strideq*1-2], 1
    pinsrd       xm5, [tmpq+strideq*2-2], 1
    pinsrd       xm6, [tmpq+stride3q -2], 1
    lea         tmpq, [tmpq+strideq*4]
    pinsrd       xm3, [tmpq+strideq*0-2], 3
    pinsrd       xm4, [tmpq+strideq*1-2], 3
    pinsrd       xm5, [tmpq+strideq*2-2], 3
    pinsrd       xm6, [tmpq+stride3q -2], 3
    lea         tmpq, [tmpq+strideq*4]
    movd        xm12, [tmpq+strideq*0-2]
    movd        xm13, [tmpq+strideq*1-2]
    movd        xm14, [tmpq+strideq*2-2]
    movd        xm15, [tmpq+stride3q -2]
    lea         tmpq, [tmpq+strideq*4]
    pinsrd      xm12, [tmpq+strideq*0-2], 2
    pinsrd      xm13, [tmpq+strideq*1-2], 2
    pinsrd      xm14, [tmpq+strideq*2-2], 2
    pinsrd      xm15, [tmpq+stride3q -2], 2
    lea         tmpq, [tmpq+strideq*4]
    pinsrd      xm12, [tmpq+strideq*0-2], 1
    pinsrd      xm13, [tmpq+strideq*1-2], 1
    pinsrd      xm14, [tmpq+strideq*2-2], 1
    pinsrd      xm15, [tmpq+stride3q -2], 1
    lea         tmpq, [tmpq+strideq*4]
    pinsrd      xm12, [tmpq+strideq*0-2], 3
    pinsrd      xm13, [tmpq+strideq*1-2], 3
    pinsrd      xm14, [tmpq+strideq*2-2], 3
    pinsrd      xm15, [tmpq+stride3q -2], 3
    vinserti128   m3, xm12, 1
    vinserti128   m4, xm13, 1
    vinserti128   m5, xm14, 1
    vinserti128   m6, xm15, 1

    ; transpose 4x16
    ; xm3: A-D0,A-D8,A-D4,A-D12
    ; xm4: A-D1,A-D9,A-D5,A-D13
    ; xm5: A-D2,A-D10,A-D6,A-D14
    ; xm6: A-D3,A-D11,A-D7,A-D15
    punpcklbw     m7, m3, m4
    punpckhbw     m3, m4
    punpcklbw     m4, m5, m6
    punpckhbw     m5, m6
    ; xm7: A0-1,B0-1,C0-1,D0-1,A8-9,B8-9,C8-9,D8-9
    ; xm3: A4-5,B4-5,C4-5,D4-5,A12-13,B12-13,C12-13,D12-13
    ; xm4: A2-3,B2-3,C2-3,D2-3,A10-11,B10-11,C10-11,D10-11
    ; xm5: A6-7,B6-7,C6-7,D6-7,A14-15,B14-15,C14-15,D14-15
    punpcklwd     m6, m7, m4
    punpckhwd     m7, m4
    punpcklwd     m4, m3, m5
    punpckhwd     m3, m5
    ; xm6: A0-3,B0-3,C0-3,D0-3
    ; xm7: A8-11,B8-11,C8-11,D8-11
    ; xm4: A4-7,B4-7,C4-7,D4-7
    ; xm3: A12-15,B12-15,C12-15,D12-15
    punpckldq     m5, m6, m4
    punpckhdq     m6, m4
    punpckldq     m4, m7, m3
    punpckhdq     m7, m3
    ; xm5: A0-7,B0-7
    ; xm6: C0-7,D0-7
    ; xm4: A8-15,B8-15
    ; xm7: C8-15,D8-15
    punpcklqdq    m3, m5, m4
    punpckhqdq    m4, m5, m4
    punpcklqdq    m5, m6, m7
    punpckhqdq    m6, m7
    ; xm3: A0-15
    ; xm5: B0-15
    ; xm4: C0-15
    ; xm6: D0-15
%elif %1 == 6 || %1 == 8
    movq         xm3, [dstq+strideq*0-%1/2]
    movq         xm4, [dstq+strideq*1-%1/2]
    movq         xm5, [dstq+strideq*2-%1/2]
    movq         xm6, [dstq+stride3q -%1/2]
    lea         tmpq, [dstq+strideq*8]
    movhps       xm3, [tmpq+strideq*0-%1/2]
    movhps       xm4, [tmpq+strideq*1-%1/2]
    movhps       xm5, [tmpq+strideq*2-%1/2]
    movhps       xm6, [tmpq+stride3q -%1/2]
    lea         tmpq, [tmpq+strideq*8]
    movq         xm7, [tmpq+strideq*0-%1/2]
    movq         xm8, [tmpq+strideq*1-%1/2]
    movq         xm9, [tmpq+strideq*2-%1/2]
    movq        xm11, [tmpq+stride3q -%1/2]
    lea         tmpq, [tmpq+strideq*8]
    movhps       xm7, [tmpq+strideq*0-%1/2]
    movhps       xm8, [tmpq+strideq*1-%1/2]
    movhps       xm9, [tmpq+strideq*2-%1/2]
    movhps      xm11, [tmpq+stride3q -%1/2]
    vinserti128   m3, xm7, 1
    vinserti128   m4, xm8, 1
    vinserti128   m5, xm9, 1
    vinserti128   m6, xm11, 1
    lea         tmpq, [dstq+strideq*4]
    movq        xm12, [tmpq+strideq*0-%1/2]
    movq        xm13, [tmpq+strideq*1-%1/2]
    movq        xm14, [tmpq+strideq*2-%1/2]
    movq        xm15, [tmpq+stride3q -%1/2]
    lea         tmpq, [tmpq+strideq*8]
    movhps      xm12, [tmpq+strideq*0-%1/2]
    movhps      xm13, [tmpq+strideq*1-%1/2]
    movhps      xm14, [tmpq+strideq*2-%1/2]
    movhps      xm15, [tmpq+stride3q -%1/2]
    lea         tmpq, [tmpq+strideq*8]
    movq         xm7, [tmpq+strideq*0-%1/2]
    movq         xm8, [tmpq+strideq*1-%1/2]
    movq         xm9, [tmpq+strideq*2-%1/2]
    movq        xm11, [tmpq+stride3q -%1/2]
    lea         tmpq, [tmpq+strideq*8]
    movhps       xm7, [tmpq+strideq*0-%1/2]
    movhps       xm8, [tmpq+strideq*1-%1/2]
    movhps       xm9, [tmpq+strideq*2-%1/2]
    movhps      xm11, [tmpq+stride3q -%1/2]
    vinserti128  m12, xm7, 1
    vinserti128  m13, xm8, 1
    vinserti128  m14, xm9, 1
    vinserti128  m15, xm11, 1

    ; transpose 8x16
    ; xm3: A-H0,A-H8
    ; xm4: A-H1,A-H9
    ; xm5: A-H2,A-H10
    ; xm6: A-H3,A-H11
    ; xm12: A-H4,A-H12
    ; xm13: A-H5,A-H13
    ; xm14: A-H6,A-H14
    ; xm15: A-H7,A-H15
    punpcklbw    m7, m3, m4
    punpckhbw    m3, m4
    punpcklbw    m4, m5, m6
    punpckhbw    m5, m6
    punpcklbw    m6, m12, m13
    punpckhbw   m12, m13
    punpcklbw   m13, m14, m15
    punpckhbw   m14, m15
    ; xm7: A0-1,B0-1,C0-1,D0-1,E0-1,F0-1,G0-1,H0-1
    ; xm3: A8-9,B8-9,C8-9,D8-9,E8-9,F8-9,G8-9,H8-9
    ; xm4: A2-3,B2-3,C2-3,D2-3,E2-3,F2-3,G2-3,H2-3
    ; xm5: A10-11,B10-11,C10-11,D10-11,E10-11,F10-11,G10-11,H10-11
    ; xm6: A4-5,B4-5,C4-5,D4-5,E4-5,F4-5,G4-5,H4-5
    ; xm12: A12-13,B12-13,C12-13,D12-13,E12-13,F12-13,G12-13,H12-13
    ; xm13: A6-7,B6-7,C6-7,D6-7,E6-7,F6-7,G6-7,H6-7
    ; xm14: A14-15,B14-15,C14-15,D14-15,E14-15,F14-15,G14-15,H14-15
    punpcklwd   m15, m7, m4
    punpckhwd    m7, m4
    punpcklwd    m4, m3, m5
    punpckhwd    m3, m5
    punpcklwd    m5, m6, m13
    punpckhwd    m6, m13
    punpcklwd   m13, m12, m14
    punpckhwd   m12, m14
    ; xm15: A0-3,B0-3,C0-3,D0-3
    ; xm7: E0-3,F0-3,G0-3,H0-3
    ; xm4: A8-11,B8-11,C8-11,D8-11
    ; xm3: E8-11,F8-11,G8-11,H8-11
    ; xm5: A4-7,B4-7,C4-7,D4-7
    ; xm6: E4-7,F4-7,G4-7,H4-7
    ; xm13: A12-15,B12-15,C12-15,D12-15
    ; xm12: E12-15,F12-15,G12-15,H12-15
    punpckldq   m14, m15, m5
    punpckhdq   m15, m5
    punpckldq    m5, m7, m6
%if %1 != 6
    punpckhdq    m7, m6
%endif
    punpckldq    m6, m4, m13
    punpckhdq    m4, m13
    punpckldq   m13, m3, m12
%if %1 != 6
    punpckhdq   m12, m3, m12
%endif
    ; xm14: A0-7,B0-7
    ; xm15: C0-7,D0-7
    ; xm5: E0-7,F0-7
    ; xm7: G0-7,H0-7
    ; xm6: A8-15,B8-15
    ; xm4: C8-15,D8-15
    ; xm13: E8-15,F8-15
    ; xm12: G8-15,H8-15
    punpcklqdq   m3, m14, m6
    punpckhqdq  m14, m6
    punpckhqdq   m6, m15, m4
    punpcklqdq  m15, m4
    punpcklqdq   m4, m5, m13
    punpckhqdq  m13, m5, m13
%if %1 == 8
    punpcklqdq   m5, m7, m12
    punpckhqdq  m12, m7, m12
    ; xm3: A0-15
    ; xm14: B0-15
    ; xm15: C0-15
    ; xm6: D0-15
    ; xm4: E0-15
    ; xm13: F0-15
    ; xm5: G0-15
    ; xm12: H0-15
    SWAP         12, 3, 15
    SWAP         13, 14, 5, 4, 6
    ; 3,14,15,6,4,13,5,12 -> 12,13,3,4,5,6,14,15
%else
    SWAP         13, 3, 14
    SWAP          6, 4, 15, 5
    ; 3,14,15,6,4,13 -> 13,3,4,5,6,14
%endif
%else
    ; load and 16x16 transpose. We only use 14 pixels but we'll need the
    ; remainder at the end for the second transpose
    movu         xm0, [dstq+strideq*0-8]
    movu         xm1, [dstq+strideq*1-8]
    movu         xm2, [dstq+strideq*2-8]
    movu         xm3, [dstq+stride3q -8]
    lea         tmpq, [dstq+strideq*4]
    movu         xm4, [tmpq+strideq*0-8]
    movu         xm5, [tmpq+strideq*1-8]
    movu         xm6, [tmpq+strideq*2-8]
    movu         xm7, [tmpq+stride3q -8]
    lea         tmpq, [tmpq+strideq*4]
    movu         xm8, [tmpq+strideq*0-8]
    movu         xm9, [tmpq+strideq*1-8]
    movu        xm10, [tmpq+strideq*2-8]
    movu        xm11, [tmpq+stride3q -8]
    lea         tmpq, [tmpq+strideq*4]
    movu        xm12, [tmpq+strideq*0-8]
    movu        xm13, [tmpq+strideq*1-8]
    movu        xm14, [tmpq+strideq*2-8]
    movu        xm15, [tmpq+stride3q -8]
    lea         tmpq, [tmpq+strideq*4]
    vinserti128   m0, [tmpq+strideq*0-8], 1
    vinserti128   m1, [tmpq+strideq*1-8], 1
    vinserti128   m2, [tmpq+strideq*2-8], 1
    vinserti128   m3, [tmpq+stride3q -8], 1
    lea         tmpq, [tmpq+strideq*4]
    vinserti128   m4, [tmpq+strideq*0-8], 1
    vinserti128   m5, [tmpq+strideq*1-8], 1
    vinserti128   m6, [tmpq+strideq*2-8], 1
    vinserti128   m7, [tmpq+stride3q -8], 1
    lea         tmpq, [tmpq+strideq*4]
    vinserti128   m8, [tmpq+strideq*0-8], 1
    vinserti128   m9, [tmpq+strideq*1-8], 1
    vinserti128  m10, [tmpq+strideq*2-8], 1
    vinserti128  m11, [tmpq+stride3q -8], 1
    lea         tmpq, [tmpq+strideq*4]
    vinserti128  m12, [tmpq+strideq*0-8], 1
    vinserti128  m13, [tmpq+strideq*1-8], 1
    vinserti128  m14, [tmpq+strideq*2-8], 1
    vinserti128  m15, [tmpq+stride3q -8], 1

    TRANSPOSE_16X16B 0, 1, [rsp+11*32]
    mova  [rsp+12*32], m1
    mova  [rsp+13*32], m2
    mova  [rsp+14*32], m3
    mova  [rsp+15*32], m12
    mova  [rsp+16*32], m13
    mova  [rsp+17*32], m14
    mova  [rsp+18*32], m15
    ; 4,5,6,7,8,9,10,11 -> 12,13,3,4,5,6,14,15
    SWAP           12, 4, 7
    SWAP           13, 5, 8
    SWAP            3, 6, 9
    SWAP           10, 14
    SWAP           11, 15
%endif
%endif

    ; load L/E/I/H
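    ; L is the filter level of each 4px unit (falling back to the previous
    ; unit's level when it is 0). From it the thresholds are derived as in
    ; the C reference: I = max(min((L >> sharp0) & 63, sharp1), 1),
    ; E = 2*(L+2) + I, H = L >> 4, where sharp0/sharp1 are assumed to be
    ; the 64-bit sharpness fields at lut+128/lut+136. The pb_128 xors below
    ; bias the unsigned bytes into signed range so pcmpgtb can be used for
    ; the comparisons.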
%ifidn %2, v
    movu          m1, [lq]
    movu          m0, [lq+l_strideq]
%else
    movq         xm1, [lq]
    movq         xm2, [lq+l_strideq*2]
    movhps       xm1, [lq+l_strideq]
    movhps       xm2, [lq+l_stride3q]
    lea           lq, [lq+l_strideq*4]
    movq        xm10, [lq]
    movq         xm0, [lq+l_strideq*2]
    movhps      xm10, [lq+l_strideq]
    movhps       xm0, [lq+l_stride3q]
    lea           lq, [lq+l_strideq*4]
    vinserti128   m1, xm10, 1
    vinserti128   m2, xm0, 1
    shufps        m0, m1, m2, q3131
    shufps        m1, m2, q2020
%endif
    pxor          m2, m2
    pcmpeqb      m10, m2, m0
    pand          m1, m10
    por           m0, m1                        ; l[x][] ? l[x][] : l[x-stride][]
    pshufb        m0, [pb_4x1_4x5_4x9_4x13]     ; l[x][1]
    pcmpeqb      m10, m2, m0                    ; !L
    psrlq         m2, m0, [lutq+128]
    pand          m2, [pb_63]
    vpbroadcastb  m1, [lutq+136]
    pminub        m2, m1
    pmaxub        m2, [pb_1]                    ; I
    pand          m1, m0, [pb_240]
    psrlq         m1, 4                         ; H
    paddb         m0, [pb_2]
    paddb         m0, m0
    paddb         m0, m2                        ; E
    pxor          m1, [pb_128]
    pxor          m2, [pb_128]
    pxor          m0, [pb_128]

    ABSSUB        m8, m3, m4, m9                ; abs(p1-p0)
    pmaxub        m8, m10
    ABSSUB        m9, m5, m6, m10               ; abs(q1-q0)
    pmaxub        m8, m9
%if %1 == 4
    pxor          m8, [pb_128]
    pcmpgtb       m7, m8, m1                    ; hev
%else
    pxor          m7, m8, [pb_128]
    pcmpgtb       m7, m1                        ; hev

%if %1 == 6
    ABSSUB        m9, m13, m4, m10              ; abs(p2-p0)
    pmaxub        m9, m8
%else
    ABSSUB        m9, m12, m4, m10              ; abs(p3-p0)
    pmaxub        m9, m8
    ABSSUB       m10, m13, m4, m11              ; abs(p2-p0)
    pmaxub        m9, m10
%endif
    ABSSUB       m10, m5,  m14, m11             ; abs(q2-q0)
    pmaxub        m9, m10
%if %1 != 6
    ABSSUB       m10, m5,  m15, m11             ; abs(q3-q0)
    pmaxub        m9, m10
%endif
    pxor          m9, [pb_128]
    pcmpgtb       m9, [pb_129]                  ; !flat8in

%if %1 == 6
    ABSSUB       m10, m13, m3,  m1              ; abs(p2-p1)
%else
    ABSSUB       m10, m12, m13, m11             ; abs(p3-p2)
    ABSSUB       m11, m13, m3,  m1              ; abs(p2-p1)
    pmaxub       m10, m11
    ABSSUB       m11, m14, m15, m1              ; abs(q3-q2)
    pmaxub       m10, m11
%endif
    ABSSUB       m11, m14, m6,  m1              ; abs(q2-q1)
    pmaxub       m10, m11
%if %1 == 16
    vpbroadcastd m11, [maskq+8]
    vpbroadcastd  m1, [maskq+4]
    por          m11, m1
    pand         m11, [pb_mask]
    pcmpeqd      m11, [pb_mask]
    pand         m10, m11
%else
    vpbroadcastd m11, [maskq+4]
    pand         m11, [pb_mask]
    pcmpeqd      m11, [pb_mask]
    pand         m10, m11                       ; only apply fm-wide to wd>4 blocks
%endif
    pmaxub        m8, m10

    pxor          m8, [pb_128]
%endif
    pcmpgtb       m8, m2

    ABSSUB       m10, m3, m6, m11               ; abs(p1-q1)
    ABSSUB       m11, m4, m5, m2                ; abs(p0-q0)
    paddusb      m11, m11
    pand         m10, [pb_254]
    psrlq        m10, 1
    paddusb      m10, m11                       ; abs(p0-q0)*2+(abs(p1-q1)>>1)
    pxor         m10, [pb_128]
    pcmpgtb      m10, m0                        ; abs(p0-q0)*2+(abs(p1-q1)>>1) > E
    por           m8, m10
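    ; m8 now holds ~fm, i.e. 0xff for pixels where the edge is left
    ; untouched: set where the largest of the p1/p0 and q1/q0 differences
    ; (and, for wd > 4, the p3/p2/p1 and q1/q2/q3 step differences)
    ; exceeds I, or where abs(p0-q0)*2 + (abs(p1-q1)>>1) exceeds E.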

%if %1 == 16
%ifidn %2, v
    lea         tmpq, [dstq+mstrideq*8]
    mova          m0, [tmpq+strideq*1]
%else
    mova          m0, [rsp+12*32]
%endif
    ABSSUB        m1, m0, m4, m2
%ifidn %2, v
    mova          m0, [tmpq+strideq*2]
%else
    mova          m0, [rsp+13*32]
%endif
    ABSSUB        m2, m0, m4, m10
    pmaxub        m1, m2
%ifidn %2, v
    mova          m0, [tmpq+stride3q]
%else
    mova          m0, [rsp+14*32]
%endif
    ABSSUB        m2, m0, m4, m10
    pmaxub        m1, m2
%ifidn %2, v
    lea         tmpq, [dstq+strideq*4]
    mova          m0, [tmpq+strideq*0]
%else
    mova          m0, [rsp+15*32]
%endif
    ABSSUB        m2, m0, m5, m10
    pmaxub        m1, m2
%ifidn %2, v
    mova          m0, [tmpq+strideq*1]
%else
    mova          m0, [rsp+16*32]
%endif
    ABSSUB        m2, m0, m5, m10
    pmaxub        m1, m2
%ifidn %2, v
    mova          m0, [tmpq+strideq*2]
%else
    mova          m0, [rsp+17*32]
%endif
    ABSSUB        m2, m0, m5, m10
    pmaxub        m1, m2
    pxor          m1, [pb_128]
    pcmpgtb       m1, [pb_129]                  ; !flat8out
    por           m1, m9                        ; !flat8in | !flat8out
    vpbroadcastd  m2, [maskq+8]
    pand         m10, m2, [pb_mask]
    pcmpeqd      m10, [pb_mask]
    pandn         m1, m10                       ; flat16
    pandn         m1, m8, m1                    ; flat16 & fm

    vpbroadcastd m10, [maskq+4]
    por          m10, m2
    pand          m2, m10, [pb_mask]
    pcmpeqd       m2, [pb_mask]
    pandn         m9, m2                        ; flat8in
    pandn         m9, m8, m9
    vpbroadcastd  m2, [maskq+0]
    por           m2, m10
    pand          m2, [pb_mask]
    pcmpeqd       m2, [pb_mask]
    pandn         m8, m2
    pandn         m8, m9, m8                    ; fm & !flat8 & !flat16
    pandn         m9, m1, m9                    ; flat8 & !flat16
%elif %1 != 4
    vpbroadcastd  m0, [maskq+4]
    pand          m2, m0, [pb_mask]
    pcmpeqd       m2, [pb_mask]
    pandn         m9, m2
    pandn         m9, m8, m9                    ; flat8 & fm
    vpbroadcastd  m2, [maskq+0]
    por           m0, m2
    pand          m0, [pb_mask]
    pcmpeqd       m0, [pb_mask]
    pandn         m8, m0
    pandn         m8, m9, m8                    ; fm & !flat8
%else
    vpbroadcastd  m0, [maskq+0]
    pand          m0, [pb_mask]
    pcmpeqd       m0, [pb_mask]
    pandn         m8, m0                        ; fm
%endif

    ; short filter
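    ; narrow 4-pixel filter, corresponding to the scalar logic
    ;   f  = iclip_diff(p1 - q1) & hev
    ;   f  = iclip_diff(3 * (q0 - p0) + f) & fm
    ;   f1 = min(f + 4, 127) >> 3,  f2 = min(f + 3, 127) >> 3
    ;   p0 += f2, q0 -= f1
    ;   if (!hev) { f = (f1 + 1) >> 1; p1 += f; q1 -= f; }
    ; carried out on bytes biased by 128 so signed saturating ops apply.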

    pxor          m3, [pb_128]
    pxor          m6, [pb_128]
    psubsb       m10, m3, m6                    ; iclip_diff(p1-q1)
    pand         m10, m7                        ; f=iclip_diff(p1-q1)&hev
    pxor          m4, [pb_128]
    pxor          m5, [pb_128]
    psubsb       m11, m5, m4
    paddsb       m10, m11
    paddsb       m10, m11
    paddsb       m10, m11                       ; f=iclip_diff(3*(q0-p0)+f)
    pand          m8, m10                       ; f&=fm
    paddsb       m10, m8, [pb_3]
    paddsb        m8, [pb_4]
    pand         m10, [pb_248]
    pand          m8, [pb_248]
    psrlq        m10, 3
    psrlq         m8, 3
    pxor         m10, [pb_16]
    pxor          m8, [pb_16]
    psubb        m10, [pb_16]                   ; f2
    psubb         m8, [pb_16]                   ; f1
    paddsb        m4, m10
    psubsb        m5, m8
    pxor          m4, [pb_128]
    pxor          m5, [pb_128]

    pxor          m8, [pb_128]
    pxor         m10, m10
    pavgb         m8, m10                       ; f=(f1+1)>>1
    psubb         m8, [pb_64]
    pandn         m8, m7, m8                    ; f&=!hev
    paddsb        m3, m8
    psubsb        m6, m8
    pxor          m3, [pb_128]
    pxor          m6, [pb_128]

%if %1 == 16
    ; flat16 filter
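    ; wd=16: each output is a weighted sum of p6..q6 with total weight 16
    ; (outer taps repeated), rounded to (sum + 8) >> 4 via pmulhrsw with
    ; pw_2048. The sum is carried between outputs (steps A-L): every step
    ; subtracts the taps that drop out and adds the ones that enter, then
    ; blends the result with the unfiltered pixel using the flat16 mask
    ; held in m1.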
%ifidn %2, v
    lea         tmpq, [dstq+mstrideq*8]
    mova          m0, [tmpq+strideq*1]          ; p6
    mova          m2, [tmpq+strideq*2]          ; p5
    mova          m7, [tmpq+stride3q]           ; p4
%else
    mova          m0, [rsp+12*32]
    mova          m2, [rsp+13*32]
    mova          m7, [rsp+14*32]
%endif

    mova  [rsp+0*32], m9
    mova  [rsp+1*32], m14
    mova  [rsp+2*32], m15

    ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0 [p5/p4/p2/p1/p0/q0][p6/p3] A
    ; write -6
    punpcklbw    m14, m0, m12
    punpckhbw    m15, m0, m12
    pmaddubsw    m10, m14, [pb_7_1]
    pmaddubsw    m11, m15, [pb_7_1]             ; p6*7+p3
    punpcklbw     m8, m2, m7
    punpckhbw     m9, m2, m7
    pmaddubsw     m8, [pb_2]
    pmaddubsw     m9, [pb_2]
    paddw        m10, m8
    paddw        m11, m9                        ; p6*7+p5*2+p4*2+p3
    punpcklbw     m8, m13, m3
    punpckhbw     m9, m13, m3
    pmaddubsw     m8, [pb_1]
    pmaddubsw     m9, [pb_1]
    paddw        m10, m8
    paddw        m11, m9                        ; p6*7+p5*2+p4*2+p3+p2+p1
    punpcklbw     m8, m4, m5
    punpckhbw     m9, m4, m5
    pmaddubsw     m8, [pb_1]
    pmaddubsw     m9, [pb_1]
    paddw        m10, m8
    paddw        m11, m9                        ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0
    pmulhrsw      m8, m10, [pw_2048]
    pmulhrsw      m9, m11, [pw_2048]
    packuswb      m8, m9
    pand          m8, m1
    pandn         m9, m1, m2
    por           m8, m9
%ifidn %2, v
    mova [tmpq+strideq*2], m8                   ; p5
%else
    mova [rsp+13*32], m8
%endif

    ; sub p6*2, add p3/q1 [reuse p6/p3 from A][-p6,+q1|save] B
    ; write -5
    pmaddubsw    m14, [pb_m1_1]
    pmaddubsw    m15, [pb_m1_1]
    paddw        m10, m14
    paddw        m11, m15                       ; p6*6+p5*2+p4*2+p3*2+p2+p1+p0+q0
    punpcklbw     m8, m0, m6
    punpckhbw     m9, m0, m6
    pmaddubsw     m8, [pb_m1_1]
    pmaddubsw     m9, [pb_m1_1]
    mova  [rsp+3*32], m8
    mova  [rsp+4*32], m9
    paddw        m10, m8
    paddw        m11, m9                        ; p6*5+p5*2+p4*2+p3*2+p2+p1+p0+q0+q1
    pmulhrsw      m8, m10, [pw_2048]
    pmulhrsw      m9, m11, [pw_2048]
    packuswb      m8, m9
    pand          m8, m1
    pandn         m9, m1, m7
    por           m8, m9
%ifidn %2, v
    mova [tmpq+stride3q], m8                    ; p4
%else
    mova [rsp+14*32], m8
%endif

    ; sub p6/p5, add p2/q2 [-p6,+p2][-p5,+q2|save] C
    ; write -4
    mova         m14, [rsp+1*32]
    punpcklbw     m8, m0, m13
    punpckhbw     m9, m0, m13
    pmaddubsw     m8, [pb_m1_1]
    pmaddubsw     m9, [pb_m1_1]
    paddw        m10, m8
    paddw        m11, m9                        ; p6*4+p5*2+p4*2+p3*2+p2*2+p1+p0+q0+q1
    punpcklbw     m8, m2, m14
    punpckhbw     m2, m14
    pmaddubsw     m8, [pb_m1_1]
    pmaddubsw     m2, [pb_m1_1]
    mova  [rsp+1*32], m8
    paddw        m10, m8
    paddw        m11, m2                        ; p6*4+p5+p4*2+p3*2+p2*2+p1+p0+q0+q1+q2
    pmulhrsw      m8, m10, [pw_2048]
    pmulhrsw      m9, m11, [pw_2048]
    packuswb      m8, m9
    pand          m8, m1
    pandn         m9, m1, m12
    por           m8, m9
%ifidn %2, v
    mova [tmpq+strideq*4], m8                   ; p3
%else
    mova [rsp+19*32], m8
%endif

    ; sub p6/p4, add p1/q3 [-p6,+p1][-p4,+q3|save] D
    ; write -3
    mova         m15, [rsp+2*32]
    punpcklbw     m8, m0, m3
    punpckhbw     m9, m0, m3
    pmaddubsw     m8, [pb_m1_1]
    pmaddubsw     m9, [pb_m1_1]
    paddw        m10, m8
    paddw        m11, m9                        ; p6*3+p5+p4*2+p3*2+p2*2+p1*2+p0+q0+q1+q2
    punpcklbw     m8, m7, m15
    punpckhbw     m7, m15
    pmaddubsw     m8, [pb_m1_1]
    pmaddubsw     m7, [pb_m1_1]
    mova  [rsp+2*32], m8
    paddw        m10, m8
    paddw        m11, m7                        ; p6*3+p5+p4+p3*2+p2*2+p1*2+p0+q0+q1+q2+q3
    pmulhrsw      m8, m10, [pw_2048]
    pmulhrsw      m9, m11, [pw_2048]
    packuswb      m8, m9
    pand          m8, m1
    pandn         m9, m1, m13
    por           m8, m9
    mova  [rsp+6*32], m8                        ; don't clobber p2/m13 since we need it in F

    ; sub p6/p3, add p0/q4 [-p6,+p0][-p3,+q4|save] E
    ; write -2
%ifidn %2, v
    lea         tmpq, [dstq+strideq*4]
%endif
    punpcklbw     m8, m0, m4
    punpckhbw     m9, m0, m4
    pmaddubsw     m8, [pb_m1_1]
    pmaddubsw     m9, [pb_m1_1]
    paddw        m10, m8
    paddw        m11, m9                        ; p6*2+p5+p4+p3*2+p2*2+p1*2+p0*2+q0+q1+q2+q3
%ifidn %2, v
    mova          m9, [tmpq+strideq*0]          ; q4
%else
    mova          m9, [rsp+15*32]
%endif
    punpcklbw     m8, m12, m9
    punpckhbw     m9, m12, m9
    pmaddubsw     m8, [pb_m1_1]
    pmaddubsw     m9, [pb_m1_1]
    mova  [rsp+7*32], m8
    mova  [rsp+5*32], m9
    paddw        m10, m8
    paddw        m11, m9                        ; p6*2+p5+p4+p3+p2*2+p1*2+p0*2+q0+q1+q2+q3+q4
    pmulhrsw      m8, m10, [pw_2048]
    pmulhrsw      m9, m11, [pw_2048]
    packuswb      m8, m9
    pand          m8, m1
    pandn         m9, m1, m3
    por           m8, m9
    mova  [rsp+8*32], m8                        ; don't clobber p1/m3 since we need it in G

    ; sub p6/p2, add q0/q5 [-p6,+q0][-p2,+q5|save] F
    ; write -1
%ifidn %2, v
    mova          m9, [tmpq+strideq*1]          ; q5
%else
    mova          m9, [rsp+16*32]
%endif
    punpcklbw     m8, m0, m5
    punpckhbw     m0, m5
    pmaddubsw     m8, [pb_m1_1]
    pmaddubsw     m0, [pb_m1_1]
    paddw        m10, m8
    paddw        m11, m0                        ; p6+p5+p4+p3+p2*2+p1*2+p0*2+q0*2+q1+q2+q3+q4
    punpcklbw     m0, m13, m9
    punpckhbw     m9, m13, m9
    mova         m13, [rsp+6*32]
    pmaddubsw     m0, [pb_m1_1]
    pmaddubsw     m9, [pb_m1_1]
    mova [rsp+ 9*32], m0
    mova [rsp+10*32], m9
    paddw        m10, m0
    paddw        m11, m9                        ; p6+p5+p4+p3+p2+p1*2+p0*2+q0*2+q1+q2+q3+q4+q5
    pmulhrsw      m0, m10, [pw_2048]
    pmulhrsw      m8, m11, [pw_2048]
    packuswb      m0, m8
    pand          m0, m1
    pandn         m8, m1, m4
    por           m0, m8
    mova  [rsp+6*32], m0                        ; don't clobber p0/m4 since we need it in H

    ; sub p6/p1, add q1/q6 [reuse -p6,+q1 from B][-p1,+q6|save] G
    ; write +0
%ifidn %2, v
    mova          m0, [tmpq+strideq*2]          ; q6
%else
    mova          m0, [rsp+17*32]
%endif
    paddw        m10, [rsp+3*32]
    paddw        m11, [rsp+4*32]                ; p5+p4+p3+p2+p1*2+p0*2+q0*2+q1*2+q2+q3+q4+q5
    punpcklbw     m8, m3, m0
    punpckhbw     m9, m3, m0
    mova          m3, [rsp+8*32]
    pmaddubsw     m8, [pb_m1_1]
    pmaddubsw     m9, [pb_m1_1]
    mova  [rsp+3*32], m8
    mova  [rsp+4*32], m9
    paddw        m10, m8
    paddw        m11, m9                        ; p5+p4+p3+p2+p1+p0*2+q0*2+q1*2+q2+q3+q4+q5+q6
    pmulhrsw      m8, m10, [pw_2048]
    pmulhrsw      m9, m11, [pw_2048]
    packuswb      m8, m9
    pand          m8, m1
    pandn         m9, m1, m5
    por           m8, m9
    mova  [rsp+8*32], m8                        ; don't clobber q0/m5 since we need it in I

    ; sub p5/p0, add q2/q6 [reuse -p5,+q2 from C][-p0,+q6] H
    ; write +1
    paddw        m10, [rsp+1*32]
    paddw        m11, m2                        ; p4+p3+p2+p1+p0*2+q0*2+q1*2+q2*2+q3+q4+q5+q6
    punpcklbw     m8, m4, m0
    punpckhbw     m2, m4, m0
    mova          m4, [rsp+6*32]
    pmaddubsw     m8, [pb_m1_1]
    pmaddubsw     m2, [pb_m1_1]
    paddw        m10, m8
    paddw        m11, m2                        ; p4+p3+p2+p1+p0+q0*2+q1*2+q2*2+q3+q4+q5+q6*2
    pmulhrsw      m2, m10, [pw_2048]
    pmulhrsw      m9, m11, [pw_2048]
    packuswb      m2, m9
    pand          m2, m1
    pandn         m9, m1, m6
    por           m2, m9                        ; don't clobber q1/m6 since we need it in J

    ; sub p4/q0, add q3/q6 [reuse -p4,+q3 from D][-q0,+q6] I
    ; write +2
    paddw        m10, [rsp+2*32]
    paddw        m11, m7                        ; p3+p2+p1+p0+q0*2+q1*2+q2*2+q3*2+q4+q5+q6*2
    punpcklbw     m8, m5, m0
    punpckhbw     m9, m5, m0
    mova          m5, [rsp+8*32]
    pmaddubsw     m8, [pb_m1_1]
    pmaddubsw     m9, [pb_m1_1]
    paddw        m10, m8
    paddw        m11, m9                        ; p3+p2+p1+p0+q0+q1*2+q2*2+q3*2+q4+q5+q6*3
    pmulhrsw      m7, m10, [pw_2048]
    pmulhrsw      m9, m11, [pw_2048]
    packuswb      m7, m9
    pand          m7, m1
    pandn         m9, m1, m14
    por           m7, m9                        ; don't clobber q2/m14 since we need it in K

    ; sub p3/q1, add q4/q6 [reuse -p3,+q4 from E][-q1,+q6] J
    ; write +3
    paddw        m10, [rsp+7*32]
    paddw        m11, [rsp+5*32]                ; p2+p1+p0+q0+q1*2+q2*2+q3*2+q4*2+q5+q6*3
    punpcklbw     m8, m6, m0
    punpckhbw     m9, m6, m0
    SWAP           2, 6
    pmaddubsw     m8, [pb_m1_1]
    pmaddubsw     m9, [pb_m1_1]
    paddw        m10, m8
    paddw        m11, m9                        ; p2+p1+p0+q0+q1+q2*2+q3*2+q4*2+q5+q6*4
    pmulhrsw      m8, m10, [pw_2048]
    pmulhrsw      m9, m11, [pw_2048]
    packuswb      m8, m9
    pand          m8, m1
    pandn         m9, m1, m15
    por           m8, m9
%ifidn %2, v
    mova [tmpq+mstrideq], m8                    ; q3
%else
    mova [rsp+20*32], m8
%endif

    ; sub p2/q2, add q5/q6 [reuse -p2,+q5 from F][-q2,+q6] K
    ; write +4
    paddw        m10, [rsp+ 9*32]
    paddw        m11, [rsp+10*32]               ; p1+p0+q0+q1+q2*2+q3*2+q4*2+q5*2+q6*4
    punpcklbw     m8, m14, m0
    punpckhbw     m9, m14, m0
    SWAP          14, 7
    pmaddubsw     m8, [pb_m1_1]
    pmaddubsw     m9, [pb_m1_1]
    paddw        m10, m8
    paddw        m11, m9                        ; p1+p0+q0+q1+q2+q3*2+q4*2+q5*2+q6*5
    pmulhrsw      m8, m10, [pw_2048]
    pmulhrsw      m9, m11, [pw_2048]
    packuswb      m8, m9
    pand          m8, m1
%ifidn %2, v
    pandn         m9, m1, [tmpq+strideq*0]
%else
    pandn         m9, m1, [rsp+15*32]
%endif
    por           m8, m9
%ifidn %2, v
    mova [tmpq+strideq*0], m8                    ; q4
%else
    mova [rsp+15*32], m8
%endif

    ; sub p1/q3, add q6*2 [reuse -p1,+q6 from G][-q3,+q6] L
    ; write +5
    paddw        m10, [rsp+3*32]
    paddw        m11, [rsp+4*32]                ; p0+q0+q1+q2+q3*2+q4*2+q5*2+q6*6
    punpcklbw     m8, m15, m0
    punpckhbw     m9, m15, m0
    pmaddubsw     m8, [pb_m1_1]
    pmaddubsw     m9, [pb_m1_1]
    paddw        m10, m8
    paddw        m11, m9                        ; p0+q0+q1+q2+q3+q4*2+q5*2+q6*7
    pmulhrsw     m10, [pw_2048]
    pmulhrsw     m11, [pw_2048]
    packuswb     m10, m11
    pand         m10, m1
%ifidn %2, v
    pandn        m11, m1, [tmpq+strideq*1]
%else
    pandn        m11, m1, [rsp+16*32]
%endif
    por          m10, m11
%ifidn %2, v
    mova [tmpq+strideq*1], m10                  ; q5
%else
    mova [rsp+16*32], m10
%endif

    mova          m9, [rsp+0*32]
%ifidn %2, v
    lea         tmpq, [dstq+mstrideq*4]
%endif
%endif
%if %1 >= 8
    ; flat8 filter
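    ; wd=8: sums over p3..q3 with total weight 8 and the +4 rounding term
    ; folded in via pb_4 (e.g. p2' = (3*p3 + 2*p2 + p1 + p0 + q0 + 4) >> 3),
    ; updated incrementally like the wd=16 case and blended with the
    ; original pixels using the flat8 mask in m9.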
    punpcklbw     m0, m12, m3
    punpckhbw     m1, m12, m3
    pmaddubsw     m2, m0, [pb_3_1]
    pmaddubsw     m7, m1, [pb_3_1]              ; 3 * p3 + p1
    punpcklbw     m8, m13, m4
    punpckhbw    m11, m13, m4
    pmaddubsw     m8, [pb_2_1]
    pmaddubsw    m11, [pb_2_1]
    paddw         m2, m8
    paddw         m7, m11                       ; 3 * p3 + 2 * p2 + p1 + p0
    punpcklbw     m8, m5, [pb_4]
    punpckhbw    m11, m5, [pb_4]
    pmaddubsw     m8, [pb_1]
    pmaddubsw    m11, [pb_1]
    paddw         m2, m8
    paddw         m7, m11                       ; 3 * p3 + 2 * p2 + p1 + p0 + q0 + 4
    psrlw         m8, m2, 3
    psrlw        m11, m7, 3
    packuswb      m8, m11
    pand          m8, m9
    pandn        m11, m9, m13
    por          m10, m8, m11                  ; p2
%ifidn %2, v
    mova [tmpq+strideq*1], m10                 ; p2
%endif

    pmaddubsw     m8, m0, [pb_m1_1]
    pmaddubsw    m11, m1, [pb_m1_1]
    paddw         m2, m8
    paddw         m7, m11
    punpcklbw     m8, m13, m6
    punpckhbw    m11, m13, m6
    pmaddubsw     m8, [pb_m1_1]
    pmaddubsw    m11, [pb_m1_1]
    paddw         m2, m8
    paddw         m7, m11                       ; 2 * p3 + p2 + 2 * p1 + p0 + q0 + q1 + 4
    psrlw         m8, m2, 3
    psrlw        m11, m7, 3
    packuswb      m8, m11
    pand          m8, m9
    pandn        m11, m9, m3
    por           m8, m11                       ; p1
%ifidn %2, v
    mova [tmpq+strideq*2], m8                   ; p1
%else
    mova  [rsp+0*32], m8
%endif

    pmaddubsw     m0, [pb_1]
    pmaddubsw     m1, [pb_1]
    psubw         m2, m0
    psubw         m7, m1
    punpcklbw     m8, m4, m14
    punpckhbw    m11, m4, m14
    pmaddubsw     m8, [pb_1]
    pmaddubsw    m11, [pb_1]
    paddw         m2, m8
    paddw         m7, m11                       ; p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2 + 4
    psrlw         m8, m2, 3
    psrlw        m11, m7, 3
    packuswb      m8, m11
    pand          m8, m9
    pandn        m11, m9, m4
    por           m8, m11                       ; p0
%ifidn %2, v
    mova [tmpq+stride3q ], m8                   ; p0
%else
    mova  [rsp+1*32], m8
%endif

    punpcklbw     m0, m5, m15
    punpckhbw     m1, m5, m15
    pmaddubsw     m8, m0, [pb_1]
    pmaddubsw    m11, m1, [pb_1]
    paddw         m2, m8
    paddw         m7, m11
    punpcklbw     m8, m4, m12
    punpckhbw    m11, m4, m12
    pmaddubsw     m8, [pb_1]
    pmaddubsw    m11, [pb_1]
    psubw         m2, m8
    psubw         m7, m11                       ; p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3 + 4
    psrlw         m8, m2, 3
    psrlw        m11, m7, 3
    packuswb      m8, m11
    pand          m8, m9
    pandn        m11, m9, m5
    por          m11, m8, m11                   ; q0
%ifidn %2, v
    mova [dstq+strideq*0], m11                  ; q0
%endif

    pmaddubsw     m0, [pb_m1_1]
    pmaddubsw     m1, [pb_m1_1]
    paddw         m2, m0
    paddw         m7, m1
    punpcklbw     m8, m13, m6
    punpckhbw    m13, m6
    pmaddubsw     m8, [pb_m1_1]
    pmaddubsw    m13, [pb_m1_1]
    paddw         m2, m8
    paddw         m7, m13                       ; p1 + p0 + q0 + 2 * q1 + q2 + 2 * q3 + 4
    psrlw         m8, m2, 3
    psrlw        m13, m7, 3
    packuswb      m8, m13
    pand          m8, m9
    pandn        m13, m9, m6
    por          m13, m8, m13                   ; q1
%ifidn %2, v
    mova [dstq+strideq*1], m13                  ; q1
%endif

    punpcklbw     m0, m3, m6
    punpckhbw     m1, m3, m6
    pmaddubsw     m0, [pb_1]
    pmaddubsw     m1, [pb_1]
    psubw         m2, m0
    psubw         m7, m1
    punpcklbw     m0, m14, m15
    punpckhbw     m1, m14, m15
    pmaddubsw     m0, [pb_1]
    pmaddubsw     m1, [pb_1]
    paddw         m2, m0
    paddw         m7, m1                        ; p0 + q0 + q1 + 2 * q2 + 3 * q3 + 4
    psrlw         m2, 3
    psrlw         m7, 3
    packuswb      m2, m7
    pand          m2, m9
    pandn         m7, m9, m14
    por           m2, m7                        ; q2
%ifidn %2, v
    mova [dstq+strideq*2], m2                   ; q2
%else
    mova          m0, [rsp+0*32]
    mova          m1, [rsp+1*32]
%if %1 == 8
    ; 16x8 transpose
    punpcklbw     m3, m12, m10
    punpckhbw    m12, m10
    punpcklbw    m10, m0, m1
    punpckhbw     m0, m1
    punpcklbw     m1, m11, m13
    punpckhbw    m11, m13
    punpcklbw    m13, m2, m15
    punpckhbw     m2, m15

    punpcklwd    m15, m3, m10
    punpckhwd     m3, m10
    punpcklwd    m10, m12, m0
    punpckhwd    m12, m0
    punpcklwd     m0, m1, m13
    punpckhwd     m1, m13
    punpcklwd    m13, m11, m2
    punpckhwd    m11, m2

    punpckldq     m2, m15, m0
    punpckhdq    m15, m0
    punpckldq     m0, m3, m1
    punpckhdq     m3, m1
    punpckldq     m1, m10, m13
    punpckhdq    m10, m13
    punpckldq    m13, m12, m11
    punpckhdq    m12, m11

    ; write 8x32
    movq   [dstq+strideq*0-4], xm2
    movhps [dstq+strideq*1-4], xm2
    movq   [dstq+strideq*2-4], xm15
    movhps [dstq+stride3q -4], xm15
    lea         dstq, [dstq+strideq*4]
    movq   [dstq+strideq*0-4], xm0
    movhps [dstq+strideq*1-4], xm0
    movq   [dstq+strideq*2-4], xm3
    movhps [dstq+stride3q -4], xm3
    lea         dstq, [dstq+strideq*4]
    movq   [dstq+strideq*0-4], xm1
    movhps [dstq+strideq*1-4], xm1
    movq   [dstq+strideq*2-4], xm10
    movhps [dstq+stride3q -4], xm10
    lea         dstq, [dstq+strideq*4]
    movq   [dstq+strideq*0-4], xm13
    movhps [dstq+strideq*1-4], xm13
    movq   [dstq+strideq*2-4], xm12
    movhps [dstq+stride3q -4], xm12
    lea         dstq, [dstq+strideq*4]

    vextracti128  xm2,  m2, 1
    vextracti128 xm15, m15, 1
    vextracti128  xm0,  m0, 1
    vextracti128  xm3,  m3, 1
    vextracti128  xm1,  m1, 1
    vextracti128 xm10, m10, 1
    vextracti128 xm13, m13, 1
    vextracti128 xm12, m12, 1

    movq   [dstq+strideq*0-4], xm2
    movhps [dstq+strideq*1-4], xm2
    movq   [dstq+strideq*2-4], xm15
    movhps [dstq+stride3q -4], xm15
    lea         dstq, [dstq+strideq*4]
    movq   [dstq+strideq*0-4], xm0
    movhps [dstq+strideq*1-4], xm0
    movq   [dstq+strideq*2-4], xm3
    movhps [dstq+stride3q -4], xm3
    lea         dstq, [dstq+strideq*4]
    movq   [dstq+strideq*0-4], xm1
    movhps [dstq+strideq*1-4], xm1
    movq   [dstq+strideq*2-4], xm10
    movhps [dstq+stride3q -4], xm10
    lea         dstq, [dstq+strideq*4]
    movq   [dstq+strideq*0-4], xm13
    movhps [dstq+strideq*1-4], xm13
    movq   [dstq+strideq*2-4], xm12
    movhps [dstq+stride3q -4], xm12
    lea         dstq, [dstq+strideq*4]
%else
    ; 16x16 transpose and store
    SWAP           5, 10, 2
    SWAP           6, 0
    SWAP           7, 1
    SWAP           8, 11
    SWAP           9, 13
    mova          m0, [rsp+11*32]
    mova          m1, [rsp+12*32]
    mova          m2, [rsp+13*32]
    mova          m3, [rsp+14*32]
    mova          m4, [rsp+19*32]
    mova         m11, [rsp+20*32]
    mova         m12, [rsp+15*32]
    mova         m13, [rsp+16*32]
    mova         m14, [rsp+17*32]
    TRANSPOSE_16X16B 1, 0, [rsp+18*32]
    movu [dstq+strideq*0-8], xm0
    movu [dstq+strideq*1-8], xm1
    movu [dstq+strideq*2-8], xm2
    movu [dstq+stride3q -8], xm3
    lea         dstq, [dstq+strideq*4]
    movu [dstq+strideq*0-8], xm4
    movu [dstq+strideq*1-8], xm5
    movu [dstq+strideq*2-8], xm6
    movu [dstq+stride3q -8], xm7
    lea         dstq, [dstq+strideq*4]
    movu [dstq+strideq*0-8], xm8
    movu [dstq+strideq*1-8], xm9
    movu [dstq+strideq*2-8], xm10
    movu [dstq+stride3q -8], xm11
    lea         dstq, [dstq+strideq*4]
    movu [dstq+strideq*0-8], xm12
    movu [dstq+strideq*1-8], xm13
    movu [dstq+strideq*2-8], xm14
    movu [dstq+stride3q -8], xm15
    lea         dstq, [dstq+strideq*4]
    vextracti128 [dstq+strideq*0-8], m0, 1
    vextracti128 [dstq+strideq*1-8], m1, 1
    vextracti128 [dstq+strideq*2-8], m2, 1
    vextracti128 [dstq+stride3q -8], m3, 1
    lea         dstq, [dstq+strideq*4]
    vextracti128 [dstq+strideq*0-8], m4, 1
    vextracti128 [dstq+strideq*1-8], m5, 1
    vextracti128 [dstq+strideq*2-8], m6, 1
    vextracti128 [dstq+stride3q -8], m7, 1
    lea         dstq, [dstq+strideq*4]
    vextracti128 [dstq+strideq*0-8], m8, 1
    vextracti128 [dstq+strideq*1-8], m9, 1
    vextracti128 [dstq+strideq*2-8], m10, 1
    vextracti128 [dstq+stride3q -8], m11, 1
    lea         dstq, [dstq+strideq*4]
    vextracti128 [dstq+strideq*0-8], m12, 1
    vextracti128 [dstq+strideq*1-8], m13, 1
    vextracti128 [dstq+strideq*2-8], m14, 1
    vextracti128 [dstq+stride3q -8], m15, 1
    lea         dstq, [dstq+strideq*4]
%endif
%endif
%elif %1 == 6
    ; flat6 filter
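    ; wd=6 (used by the chroma entry points): sums over p2..q2 with total
    ; weight 8, e.g. p1' = (3*p2 + 2*p1 + 2*p0 + q0 + 4) >> 3; pmulhrsw with
    ; pw_4096 performs the (sum + 4) >> 3 rounding, and m9 holds the flat
    ; mask used for blending.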

    punpcklbw     m8, m13, m5
    punpckhbw    m11, m13, m5
    pmaddubsw     m0, m8, [pb_3_1]
    pmaddubsw     m1, m11, [pb_3_1]
    punpcklbw     m7, m4, m3
    punpckhbw    m10, m4, m3
    pmaddubsw     m2, m7, [pb_2]
    pmaddubsw    m12, m10, [pb_2]
    paddw         m0, m2
    paddw         m1, m12
    pmulhrsw      m2, m0, [pw_4096]
    pmulhrsw     m12, m1, [pw_4096]
    packuswb      m2, m12
    pand          m2, m9
    pandn        m12, m9, m3
    por           m2, m12
%ifidn %2, v
    mova [tmpq+strideq*2], m2                   ; p1
%endif

    pmaddubsw     m8, [pb_m1_1]
    pmaddubsw    m11, [pb_m1_1]
    paddw         m0, m8
    paddw         m1, m11
    punpcklbw     m8, m13, m6
    punpckhbw    m11, m13, m6
    pmaddubsw     m8, [pb_m1_1]
    pmaddubsw    m11, [pb_m1_1]
    paddw         m0, m8
    paddw         m1, m11
    pmulhrsw     m12, m0, [pw_4096]
    pmulhrsw     m13, m1, [pw_4096]
    packuswb     m12, m13
    pand         m12, m9
    pandn        m13, m9, m4
    por          m12, m13
%ifidn %2, v
    mova [tmpq+stride3q], m12                   ; p0
%endif

    paddw         m0, m8
    paddw         m1, m11
    punpcklbw     m8, m3, m14
    punpckhbw    m11, m3, m14
    pmaddubsw    m14, m8, [pb_m1_1]
    pmaddubsw    m13, m11, [pb_m1_1]
    paddw         m0, m14
    paddw         m1, m13
    pmulhrsw     m14, m0, [pw_4096]
    pmulhrsw     m13, m1, [pw_4096]
    packuswb     m14, m13
    pand         m14, m9
    pandn        m13, m9, m5
    por          m14, m13
%ifidn %2, v
    mova [dstq+strideq*0], m14                  ; q0
%endif

    pmaddubsw     m8, [pb_m1_2]
    pmaddubsw    m11, [pb_m1_2]
    paddw         m0, m8
    paddw         m1, m11
    pmaddubsw     m7, [pb_m1_0]
    pmaddubsw    m10, [pb_m1_0]
    paddw         m0, m7
    paddw         m1, m10
    pmulhrsw      m0, [pw_4096]
    pmulhrsw      m1, [pw_4096]
    packuswb      m0, m1
    pand          m0, m9
    pandn         m9, m6
    por           m0, m9
%ifidn %2, v
    mova [dstq+strideq*1], m0                   ; q1
%else
    TRANSPOSE_16x4_AND_WRITE_4x32 2, 12, 14, 0, 1
%endif
%else
%ifidn %2, v
    mova [tmpq+strideq*0], m3                   ; p1
    mova [tmpq+strideq*1], m4                   ; p0
    mova [tmpq+strideq*2], m5                   ; q0
    mova [tmpq+stride3q ], m6                   ; q1
%else
    TRANSPOSE_16x4_AND_WRITE_4x32 3, 4, 5, 6, 7
%endif
%endif
%endmacro

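; Entry points. Each loop iteration handles a 32-pixel strip (eight 4px
; units); the low byte of each vmask word covers exactly those units, so
; maskq advances one byte per iteration. The masks are tested widest-first:
; [maskq+8] (wd=16, luma only), then [maskq+4] (wd=8 for luma, wd=6 for
; chroma), then [maskq+0] (wd=4); the first non-zero byte selects the
; FILTER variant used for the whole strip.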
INIT_YMM avx2
cglobal lpf_v_sb_y, 7, 10, 16, 32 * 11, \
                    dst, stride, mask, l, l_stride, lut, \
                    w, stride3, mstride, tmp
    shl    l_strideq, 2
    sub           lq, l_strideq
    mov     mstrideq, strideq
    neg     mstrideq
    lea     stride3q, [strideq*3]

.loop:
    cmp byte [maskq+8], 0                       ; vmask[2]
    je .no_flat16

    FILTER        16, v
    jmp .end

.no_flat16:
    cmp byte [maskq+4], 0                       ; vmask[1]
    je .no_flat

    FILTER         8, v
    jmp .end

.no_flat:
    cmp byte [maskq+0], 0                       ; vmask[0]
    je .end

    FILTER         4, v

.end:
    add           lq, 32
    add         dstq, 32
    add        maskq, 1
    sub           wd, 8
    jg .loop
    RET

INIT_YMM avx2
cglobal lpf_h_sb_y, 7, 10, 16, 32 * 21, \
                    dst, stride, mask, l, l_stride, lut, \
                    h, stride3, l_stride3, tmp
    shl    l_strideq, 2
    sub           lq, 4
    lea     stride3q, [strideq*3]
    lea   l_stride3q, [l_strideq*3]

.loop:
    cmp byte [maskq+8], 0                       ; vmask[2]
    je .no_flat16

    FILTER        16, h
    jmp .end

.no_flat16:
    cmp byte [maskq+4], 0                       ; vmask[1]
    je .no_flat

    FILTER         8, h
    jmp .end

.no_flat:
    cmp byte [maskq+0], 0                       ; vmask[0]
    je .no_filter

    FILTER         4, h
    jmp .end

.no_filter:
    lea         dstq, [dstq+stride3q*8]
    lea           lq, [lq+l_strideq*8]
    lea         dstq, [dstq+strideq*8]
.end:
    add        maskq, 1
    sub           hd, 8
    jg .loop
    RET

INIT_YMM avx2
cglobal lpf_v_sb_uv, 7, 10, 16, \
                     dst, stride, mask, l, l_stride, lut, \
                     w, stride3, mstride, tmp
    shl    l_strideq, 2
    sub           lq, l_strideq
    mov     mstrideq, strideq
    neg     mstrideq
    lea     stride3q, [strideq*3]

.loop:
    cmp byte [maskq+4], 0                       ; vmask[1]
    je .no_flat

    FILTER         6, v
    jmp .end

.no_flat:
    cmp byte [maskq+0], 0                       ; vmask[0]
    je .end

    FILTER         4, v

.end:
    add           lq, 32
    add         dstq, 32
    add        maskq, 1
    sub           wd, 8
    jg .loop
    RET

INIT_YMM avx2
cglobal lpf_h_sb_uv, 7, 10, 16, \
                     dst, stride, mask, l, l_stride, lut, \
                     h, stride3, l_stride3, tmp
    shl    l_strideq, 2
    sub           lq, 4
    lea     stride3q, [strideq*3]
    lea   l_stride3q, [l_strideq*3]

.loop:
    cmp byte [maskq+4], 0                       ; vmask[1]
    je .no_flat

    FILTER         6, h
    jmp .end

.no_flat:
    cmp byte [maskq+0], 0                       ; vmask[0]
    je .no_filter

    FILTER         4, h
    jmp .end

.no_filter:
    lea         dstq, [dstq+stride3q*8]
    lea           lq, [lq+l_strideq*8]
    lea         dstq, [dstq+strideq*8]
.end:
    add        maskq, 1
    sub           hd, 8
    jg .loop
    RET

%endif ; ARCH_X86_64