;******************************************************************************
;* VP9 loop filter SIMD optimizations
;*
;* Copyright (C) 2013-2014 Clément Bœsch <u pkh me>
;* Copyright (C) 2014 Ronald S. Bultje <rsbultje@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

cextern pb_3
cextern pb_80

pb_4:   times 16 db 0x04
pb_10:  times 16 db 0x10
pb_40:  times 16 db 0x40
pb_81:  times 16 db 0x81
pb_f8:  times 16 db 0xf8
pb_fe:  times 16 db 0xfe
pb_ff:  times 16 db 0xff

cextern pw_4
cextern pw_8

; for the mix functions, two 8-bit thresholds are packed into one 16-bit value;
; the following mask is used to splat both halves across the same register
mask_mix: times 8 db 0
          times 8 db 1

mask_mix84: times 8 db 0xff
            times 8 db 0x00
mask_mix48: times 8 db 0x00
            times 8 db 0xff

SECTION .text

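; SCRATCH/UNSCRATCH: when 16 XMM registers are available (m8 is defined), park
; a value by swapping it into a high register; otherwise spill it to the given
; stack slot and reload it from there later.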
%macro SCRATCH 3
%ifdef m8
    SWAP                %1, %2
%else
    mova              [%3], m%1
%endif
%endmacro

%macro UNSCRATCH 3
%ifdef m8
    SWAP                %1, %2
%else
    mova               m%1, [%3]
%endif
%endmacro

; %1 = abs(%2-%3)
%macro ABSSUB 4 ; dst, src1 (RO), src2 (RO), tmp
%ifdef m8
    psubusb             %1, %3, %2
    psubusb             %4, %2, %3
%else
    mova                %1, %3
    mova                %4, %2
    psubusb             %1, %2
    psubusb             %4, %3
%endif
    por                 %1, %4
%endmacro

; %1 = %1>%2
%macro CMP_GT 2-3 ; src/dst, cmp, pb_80
%if %0 == 3
    pxor                %1, %3
%endif
    pcmpgtb             %1, %2
%endmacro

; %1 = abs(%2-%3) > %4
%macro ABSSUB_GT 5-6 [pb_80]; dst, src1, src2, cmp, tmp, [pb_80]
    ABSSUB              %1, %2, %3, %5      ; dst = abs(src1-src2)
    CMP_GT              %1, %4, %6          ; dst > cmp
%endmacro

%macro MASK_APPLY 4 ; %1=new_data/dst %2=old_data %3=mask %4=tmp
    pand                %1, %3              ; new &= mask
    pandn               %4, %3, %2          ; tmp = ~mask & old
    por                 %1, %4              ; new&mask | old&~mask
%endmacro

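; UNPACK: %2 = punpck%1bw(%3, %4) with a separate destination register, so %3
; is left intact (on x86-32 it may be a memory operand).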
%macro UNPACK 4
%ifdef m8
    punpck%1bw          %2, %3, %4
%else
    mova                %2, %3
    punpck%1bw          %2, %4
%endif
%endmacro

%macro FILTER_SUBx2_ADDx2 11 ; %1=dst %2=h/l %3=cache %4=stack_off %5=sub1 %6=sub2 %7=add1
                             ; %8=add2 %9=rshift, [unpack], [unpack_is_mem_on_x86_32]
    psubw               %3, [rsp+%4+%5*mmsize*2]
    psubw               %3, [rsp+%4+%6*mmsize*2]
    paddw               %3, [rsp+%4+%7*mmsize*2]
%ifnidn %10, ""
%if %11 == 0
    punpck%2bw          %1, %10, m0
%else
    UNPACK          %2, %1, %10, m0
%endif
    mova [rsp+%4+%8*mmsize*2], %1
    paddw               %3, %1
%else
    paddw               %3, [rsp+%4+%8*mmsize*2]
%endif
    psraw               %1, %3, %9
%endmacro

; FIXME interleave l/h better (for instruction pairing)
%macro FILTER_INIT 9 ; tmp1, tmp2, cacheL, cacheH, dstp, stack_off, filterid, mask, source
    FILTER%7_INIT       %1, l, %3, %6 +      0
    FILTER%7_INIT       %2, h, %4, %6 + mmsize
    packuswb            %1, %2
    MASK_APPLY          %1, %9, %8, %2
    mova                %5, %1
%endmacro


%macro FILTER_UPDATE 12-16 "", "", "", 0 ; tmp1, tmp2, cacheL, cacheH, dstp, stack_off, -, -, +, +, rshift,
                                         ; mask, [source], [unpack + src], [unpack_is_mem_on_x86_32]
; FIXME interleave this properly with the subx2/addx2
%ifnidn %15, ""
%if %16 == 0 || ARCH_X86_64
    mova               %14, %15
%endif
%endif
    FILTER_SUBx2_ADDx2  %1, l, %3, %6 +      0, %7, %8, %9, %10, %11, %14, %16
    FILTER_SUBx2_ADDx2  %2, h, %4, %6 + mmsize, %7, %8, %9, %10, %11, %14, %16
    packuswb            %1, %2
%ifnidn %13, ""
    MASK_APPLY          %1, %13, %12, %2
%else
    MASK_APPLY          %1, %5, %12, %2
%endif
    mova                %5, %1
%endmacro

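; arithmetic >>3 of two registers of packed signed bytes, done with a logical
; shift: clear the low bits (pb_f8), shift, then sign-extend the 5-bit result
; using the pb_10 constant passed in %3.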
%macro SRSHIFT3B_2X 4 ; reg1, reg2, [pb_10], tmp
    mova                %4, [pb_f8]
    pand                %1, %4
    pand                %2, %4
    psrlq               %1, 3
    psrlq               %2, 3
    pxor                %1, %3
    pxor                %2, %3
    psubb               %1, %3
    psubb               %2, %3
%endmacro

%macro EXTRACT_POS_NEG 3 ; i8, neg, pos
    pxor                %3, %3
    pxor                %2, %2
    pcmpgtb             %3, %1                          ; i8 < 0 mask
    psubb               %2, %1                          ; negate (only the originally negative values are kept below)
    pand                %2, %3                          ; negative values of i8 (stored as positive)
    pandn               %3, %1                          ; positive values of i8
%endmacro

; clip_u8(u8 + i8)
%macro SIGN_ADD 4 ; dst, u8, i8, tmp1
    EXTRACT_POS_NEG     %3, %4, %1
    paddusb             %1, %2                          ; add the positives
    psubusb             %1, %4                          ; sub the negatives
%endmacro

; clip_u8(u8 - i8)
%macro SIGN_SUB 4 ; dst, u8, i8, tmp1
    EXTRACT_POS_NEG     %3, %1, %4
    paddusb             %1, %2                          ; add the negatives
    psubusb             %1, %4                          ; sub the positives
%endmacro

%macro FILTER6_INIT 4 ; %1=dst %2=h/l %3=cache, %4=stack_off
    UNPACK          %2, %1, rp3, m0                     ; p3: B->W
    mova [rsp+%4+0*mmsize*2], %1
    paddw               %3, %1, %1                      ; p3*2
    paddw               %3, %1                          ; p3*3
    punpck%2bw          %1, m1,  m0                     ; p2: B->W
    mova [rsp+%4+1*mmsize*2], %1
    paddw               %3, %1                          ; p3*3 + p2
    paddw               %3, %1                          ; p3*3 + p2*2
    UNPACK          %2, %1, rp1, m0                     ; p1: B->W
    mova [rsp+%4+2*mmsize*2], %1
    paddw               %3, %1                          ; p3*3 + p2*2 + p1
    UNPACK          %2, %1, rp0, m0                     ; p0: B->W
    mova [rsp+%4+3*mmsize*2], %1
    paddw               %3, %1                          ; p3*3 + p2*2 + p1 + p0
    UNPACK          %2, %1, rq0, m0                     ; q0: B->W
    mova [rsp+%4+4*mmsize*2], %1
    paddw               %3, %1                          ; p3*3 + p2*2 + p1 + p0 + q0
    paddw               %3, [pw_4]                      ; p3*3 + p2*2 + p1 + p0 + q0 + 4
    psraw               %1, %3, 3                       ; (p3*3 + p2*2 + p1 + p0 + q0 + 4) >> 3
%endmacro

%macro FILTER14_INIT 4 ; %1=dst %2=h/l %3=cache, %4=stack_off
    punpck%2bw          %1, m2, m0                      ; p7: B->W
    mova [rsp+%4+ 8*mmsize*2], %1
    psllw               %3, %1, 3                       ; p7*8
    psubw               %3, %1                          ; p7*7
    punpck%2bw          %1, m3, m0                      ; p6: B->W
    mova [rsp+%4+ 9*mmsize*2], %1
    paddw               %3, %1                          ; p7*7 + p6
    paddw               %3, %1                          ; p7*7 + p6*2
    UNPACK          %2, %1, rp5, m0                     ; p5: B->W
    mova [rsp+%4+10*mmsize*2], %1
    paddw               %3, %1                          ; p7*7 + p6*2 + p5
    UNPACK          %2, %1, rp4, m0                     ; p4: B->W
    mova [rsp+%4+11*mmsize*2], %1
    paddw               %3, %1                          ; p7*7 + p6*2 + p5 + p4
    paddw               %3, [rsp+%4+ 0*mmsize*2]        ; p7*7 + p6*2 + p5 + p4 + p3
    paddw               %3, [rsp+%4+ 1*mmsize*2]        ; p7*7 + p6*2 + p5 + .. + p2
    paddw               %3, [rsp+%4+ 2*mmsize*2]        ; p7*7 + p6*2 + p5 + .. + p1
    paddw               %3, [rsp+%4+ 3*mmsize*2]        ; p7*7 + p6*2 + p5 + .. + p0
    paddw               %3, [rsp+%4+ 4*mmsize*2]        ; p7*7 + p6*2 + p5 + .. + p0 + q0
    paddw               %3, [pw_8]                      ; p7*7 + p6*2 + p5 + .. + p0 + q0 + 8
    psraw               %1, %3, 4                       ; (p7*7 + p6*2 + p5 + .. + p0 + q0 + 8) >> 4
%endmacro

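; transpose a 16x16 block of bytes held in m%1..m%16; %17 is a 16-byte memory
; slot used as scratch space for the register that has to be spilled during
; each pass.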
%macro TRANSPOSE16x16B 17
    mova %17, m%16
    SBUTTERFLY bw,  %1,  %2,  %16
    SBUTTERFLY bw,  %3,  %4,  %16
    SBUTTERFLY bw,  %5,  %6,  %16
    SBUTTERFLY bw,  %7,  %8,  %16
    SBUTTERFLY bw,  %9,  %10, %16
    SBUTTERFLY bw,  %11, %12, %16
    SBUTTERFLY bw,  %13, %14, %16
    mova m%16,  %17
    mova  %17, m%14
    SBUTTERFLY bw,  %15, %16, %14
    SBUTTERFLY wd,  %1,  %3,  %14
    SBUTTERFLY wd,  %2,  %4,  %14
    SBUTTERFLY wd,  %5,  %7,  %14
    SBUTTERFLY wd,  %6,  %8,  %14
    SBUTTERFLY wd,  %9,  %11, %14
    SBUTTERFLY wd,  %10, %12, %14
    SBUTTERFLY wd,  %13, %15, %14
    mova m%14,  %17
    mova  %17, m%12
    SBUTTERFLY wd,  %14, %16, %12
    SBUTTERFLY dq,  %1,  %5,  %12
    SBUTTERFLY dq,  %2,  %6,  %12
    SBUTTERFLY dq,  %3,  %7,  %12
    SBUTTERFLY dq,  %4,  %8,  %12
    SBUTTERFLY dq,  %9,  %13, %12
    SBUTTERFLY dq,  %10, %14, %12
    SBUTTERFLY dq,  %11, %15, %12
    mova m%12, %17
    mova  %17, m%8
    SBUTTERFLY dq,  %12, %16, %8
    SBUTTERFLY qdq, %1,  %9,  %8
    SBUTTERFLY qdq, %2,  %10, %8
    SBUTTERFLY qdq, %3,  %11, %8
    SBUTTERFLY qdq, %4,  %12, %8
    SBUTTERFLY qdq, %5,  %13, %8
    SBUTTERFLY qdq, %6,  %14, %8
    SBUTTERFLY qdq, %7,  %15, %8
    mova m%8, %17
    mova %17, m%1
    SBUTTERFLY qdq, %8,  %16, %1
    mova m%1, %17
    SWAP %2,  %9
    SWAP %3,  %5
    SWAP %4,  %13
    SWAP %6,  %11
    SWAP %8,  %15
    SWAP %12, %14
%endmacro

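; 8x8 byte transpose for builds with only 8 XMM registers: %1-%8 are register
; numbers, row %7 is loaded late from memory %9 (movdq%10, i.e. aligned or
; unaligned), %11 is a memory scratch slot, and the two transposed rows that
; would land in m%2 are written straight to %12/%13 to free the register.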
%macro TRANSPOSE8x8B 13
    SBUTTERFLY bw,  %1, %2, %7
    movdq%10 m%7, %9
    movdqa %11, m%2
    SBUTTERFLY bw,  %3, %4, %2
    SBUTTERFLY bw,  %5, %6, %2
    SBUTTERFLY bw,  %7, %8, %2
    SBUTTERFLY wd,  %1, %3, %2
    movdqa m%2, %11
    movdqa %11, m%3
    SBUTTERFLY wd,  %2, %4, %3
    SBUTTERFLY wd,  %5, %7, %3
    SBUTTERFLY wd,  %6, %8, %3
    SBUTTERFLY dq, %1, %5, %3
    SBUTTERFLY dq, %2, %6, %3
    movdqa m%3, %11
    movh   %12, m%2
    movhps %13, m%2
    SBUTTERFLY dq, %3, %7, %2
    SBUTTERFLY dq, %4, %8, %2
    SWAP %2, %5
    SWAP %4, %7
%endmacro

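; address P7..Q7 as the 16 lines/columns around the edge being filtered:
; P7..Q0 are relative to dstq, Q1..Q7 to dst2q (which must hold dstq + 8*strideq);
; %1 is an optional byte offset.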
%macro DEFINE_REAL_P7_TO_Q7 0-1 0
%define P7 dstq  + 4*mstrideq  + %1
%define P6 dstq  +   mstride3q + %1
%define P5 dstq  + 2*mstrideq  + %1
%define P4 dstq  +   mstrideq  + %1
%define P3 dstq                + %1
%define P2 dstq  +    strideq  + %1
%define P1 dstq  + 2* strideq  + %1
%define P0 dstq  +    stride3q + %1
%define Q0 dstq  + 4* strideq  + %1
%define Q1 dst2q +   mstride3q + %1
%define Q2 dst2q + 2*mstrideq  + %1
%define Q3 dst2q +   mstrideq  + %1
%define Q4 dst2q               + %1
%define Q5 dst2q +    strideq  + %1
%define Q6 dst2q + 2* strideq  + %1
%define Q7 dst2q +    stride3q + %1
%endmacro

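; same P7..Q7 names, but pointing at 16 consecutive mmsize-wide stack slots
; holding the transposed lines (only P3..Q3 exist when mmsize == 8);
; %1 is an optional byte offset.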
%macro DEFINE_TRANSPOSED_P7_TO_Q7 0-1 0
%define P3 rsp +  0*mmsize + %1
%define P2 rsp +  1*mmsize + %1
%define P1 rsp +  2*mmsize + %1
%define P0 rsp +  3*mmsize + %1
%define Q0 rsp +  4*mmsize + %1
%define Q1 rsp +  5*mmsize + %1
%define Q2 rsp +  6*mmsize + %1
%define Q3 rsp +  7*mmsize + %1
%if mmsize == 16
%define P7 rsp +  8*mmsize + %1
%define P6 rsp +  9*mmsize + %1
%define P5 rsp + 10*mmsize + %1
%define P4 rsp + 11*mmsize + %1
%define Q4 rsp + 12*mmsize + %1
%define Q5 rsp + 13*mmsize + %1
%define Q6 rsp + 14*mmsize + %1
%define Q7 rsp + 15*mmsize + %1
%endif
%endmacro

; ..............AB -> AAAAAAAABBBBBBBB
%macro SPLATB_MIX 1-2 [mask_mix]
%if cpuflag(ssse3)
    pshufb     %1, %2
%else
    punpcklbw  %1, %1
    punpcklwd  %1, %1
    punpckldq  %1, %1
%endif
%endmacro

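; %2 selects the filter variant: 16 is the full 16-wide flat filter, 4/8 are
; the narrow MMX cases, and the two-digit values (44/48/84/88) are the mixed
; cases where each 8px half of the block uses the filter width given by one
; digit.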
%macro LOOPFILTER 5 ; %1=v/h %2=size1 %3+%4=stack, %5=mmx/32bit stack only
%assign %%ext 0
%if ARCH_X86_32 || mmsize == 8
%assign %%ext %5
%endif

%if UNIX64
cglobal vp9_loop_filter_%1_%2_ %+ mmsize, 5, 9, 16, %3 + %4 + %%ext, dst, stride, E, I, H, mstride, dst2, stride3, mstride3
%else
%if WIN64
cglobal vp9_loop_filter_%1_%2_ %+ mmsize, 4, 8, 16, %3 + %4 + %%ext, dst, stride, E, I, mstride, dst2, stride3, mstride3
%else
cglobal vp9_loop_filter_%1_%2_ %+ mmsize, 2, 6, 16, %3 + %4 + %%ext, dst, stride, mstride, dst2, stride3, mstride3
%define Ed dword r2m
%define Id dword r3m
%endif
%define Hd dword r4m
%endif

    mov               mstrideq, strideq
    neg               mstrideq

    lea               stride3q, [strideq*3]
    lea              mstride3q, [mstrideq*3]

%ifidn %1, h
%if %2 != 16
%if mmsize == 16
%define movx movh
%else
%define movx mova
%endif
    lea                   dstq, [dstq + 4*strideq - 4]
%else
%define movx movu
    lea                   dstq, [dstq + 4*strideq - 8] ; go from top center (h pos) to center left (v pos)
%endif
%else
    lea                   dstq, [dstq + 4*mstrideq]
%endif
    ; FIXME we shouldn't need two dst registers if mmsize == 8
    lea                  dst2q, [dstq + 8*strideq]

    DEFINE_REAL_P7_TO_Q7

%ifidn %1, h
    movx                    m0, [P7]
    movx                    m1, [P6]
    movx                    m2, [P5]
    movx                    m3, [P4]
    movx                    m4, [P3]
    movx                    m5, [P2]
%if (ARCH_X86_64 && mmsize == 16) || %2 > 16
    movx                    m6, [P1]
%endif
    movx                    m7, [P0]
%ifdef m8
    movx                    m8, [Q0]
    movx                    m9, [Q1]
    movx                   m10, [Q2]
    movx                   m11, [Q3]
    movx                   m12, [Q4]
    movx                   m13, [Q5]
    movx                   m14, [Q6]
    movx                   m15, [Q7]
    DEFINE_TRANSPOSED_P7_TO_Q7
%if %2 == 16
    TRANSPOSE16x16B 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, [rsp]
    mova           [P7],  m0
    mova           [P6],  m1
    mova           [P5],  m2
    mova           [P4],  m3
%else ; %2 == 44/48/84/88
    ; 8x16 transpose
    punpcklbw        m0,  m1
    punpcklbw        m2,  m3
    punpcklbw        m4,  m5
    punpcklbw        m6,  m7
    punpcklbw        m8,  m9
    punpcklbw       m10, m11
    punpcklbw       m12, m13
    punpcklbw       m14, m15
    TRANSPOSE8x8W     0, 2, 4, 6, 8, 10, 12, 14, 15
    SWAP              0,  4
    SWAP              2,  5
    SWAP              0,  6
    SWAP              0,  7
    SWAP             10,  9
    SWAP             12, 10
    SWAP             14, 11
%endif ; %2
    mova           [P3],  m4
    mova           [P2],  m5
    mova           [P1],  m6
    mova           [P0],  m7
    mova           [Q0],  m8
    mova           [Q1],  m9
    mova           [Q2], m10
    mova           [Q3], m11
%if %2 == 16
    mova           [Q4], m12
    mova           [Q5], m13
    mova           [Q6], m14
    mova           [Q7], m15
%endif ; %2
%else ; x86-32
%if %2 == 16
    TRANSPOSE8x8B    0, 1, 2, 3, 4, 5, 6, 7, [P1], u, [rsp+%3+%4], [rsp+64], [rsp+80]
    DEFINE_TRANSPOSED_P7_TO_Q7
    movh          [P7], m0
    movh          [P5], m1
    movh          [P3], m2
    movh          [P1], m3
    movh          [Q2], m5
    movh          [Q4], m6
    movh          [Q6], m7
    movhps        [P6], m0
    movhps        [P4], m1
    movhps        [P2], m2
    movhps        [P0], m3
    movhps        [Q3], m5
    movhps        [Q5], m6
    movhps        [Q7], m7
    DEFINE_REAL_P7_TO_Q7
    movx                    m0, [Q0]
    movx                    m1, [Q1]
    movx                    m2, [Q2]
    movx                    m3, [Q3]
    movx                    m4, [Q4]
    movx                    m5, [Q5]
    movx                    m7, [Q7]
    TRANSPOSE8x8B 0, 1, 2, 3, 4, 5, 6, 7, [Q6], u, [rsp+%3+%4], [rsp+72], [rsp+88]
    DEFINE_TRANSPOSED_P7_TO_Q7 8
    movh          [P7], m0
    movh          [P5], m1
    movh          [P3], m2
    movh          [P1], m3
    movh          [Q2], m5
    movh          [Q4], m6
    movh          [Q6], m7
    movhps        [P6], m0
    movhps        [P4], m1
    movhps        [P2], m2
    movhps        [P0], m3
    movhps        [Q3], m5
    movhps        [Q5], m6
    movhps        [Q7], m7
    DEFINE_TRANSPOSED_P7_TO_Q7
%elif %2 > 16 ; %2 == 44/48/84/88
    punpcklbw        m0, m1
    punpcklbw        m2, m3
    punpcklbw        m4, m5
    punpcklbw        m6, m7
    movx             m1, [Q0]
    movx             m3, [Q1]
    movx             m5, [Q2]
    movx             m7, [Q3]
    punpcklbw        m1, m3
    punpcklbw        m5, m7
    movx             m3, [Q4]
    movx             m7, [Q5]
    punpcklbw        m3, m7
    mova          [rsp], m3
    movx             m3, [Q6]
    movx             m7, [Q7]
    punpcklbw        m3, m7
    DEFINE_TRANSPOSED_P7_TO_Q7
    TRANSPOSE8x8W     0, 2, 4, 6, 1, 5, 7, 3, [rsp], [Q0], 1
    mova           [P3],  m0
    mova           [P2],  m2
    mova           [P1],  m4
    mova           [P0],  m6
    mova           [Q1],  m5
    mova           [Q2],  m7
    mova           [Q3],  m3
%else ; %2 == 4 || %2 == 8
    SBUTTERFLY       bw, 0, 1, 6
    SBUTTERFLY       bw, 2, 3, 6
    SBUTTERFLY       bw, 4, 5, 6
    mova [rsp+4*mmsize], m5
    mova             m6, [P1]
    SBUTTERFLY       bw, 6, 7, 5
    DEFINE_TRANSPOSED_P7_TO_Q7
    TRANSPOSE4x4W     0, 2, 4, 6, 5
    mova           [P3], m0
    mova           [P2], m2
    mova           [P1], m4
    mova           [P0], m6
    mova             m5, [rsp+4*mmsize]
    TRANSPOSE4x4W     1, 3, 5, 7, 0
    mova           [Q0], m1
    mova           [Q1], m3
    mova           [Q2], m5
    mova           [Q3], m7
%endif ; %2
%endif ; x86-32/64
%endif ; %1 == h

    ; calc fm mask
%if %2 == 16 || mmsize == 8
%if cpuflag(ssse3)
    pxor                m0, m0
%endif
    SPLATB_REG          m2, I, m0                       ; I I I I ...
    SPLATB_REG          m3, E, m0                       ; E E E E ...
%else
%if cpuflag(ssse3)
    mova                m0, [mask_mix]
%endif
    movd                m2, Id
    movd                m3, Ed
    SPLATB_MIX          m2, m0
    SPLATB_MIX          m3, m0
%endif
    mova                m0, [pb_80]
    pxor                m2, m0
    pxor                m3, m0
%ifdef m8
%ifidn %1, v
    mova                m8, [P3]
    mova                m9, [P2]
    mova               m10, [P1]
    mova               m11, [P0]
    mova               m12, [Q0]
    mova               m13, [Q1]
    mova               m14, [Q2]
    mova               m15, [Q3]
%else
    ; In the horizontal case, P3..Q3 are already present in some registers due
    ; to the previous transpose, so we just swap registers.
    SWAP                 8,  4, 12
    SWAP                 9,  5, 13
    SWAP                10,  6, 14
    SWAP                11,  7, 15
%endif
%define rp3 m8
%define rp2 m9
%define rp1 m10
%define rp0 m11
%define rq0 m12
%define rq1 m13
%define rq2 m14
%define rq3 m15
%else
%define rp3 [P3]
%define rp2 [P2]
%define rp1 [P1]
%define rp0 [P0]
%define rq0 [Q0]
%define rq1 [Q1]
%define rq2 [Q2]
%define rq3 [Q3]
%endif
    ABSSUB_GT           m5, rp3, rp2, m2, m7, m0        ; m5 = abs(p3-p2) <= I
    ABSSUB_GT           m1, rp2, rp1, m2, m7, m0        ; m1 = abs(p2-p1) <= I
    por                 m5, m1
    ABSSUB_GT           m1, rp1, rp0, m2, m7, m0        ; m1 = abs(p1-p0) <= I
    por                 m5, m1
    ABSSUB_GT           m1, rq0, rq1, m2, m7, m0        ; m1 = abs(q1-q0) <= I
    por                 m5, m1
    ABSSUB_GT           m1, rq1, rq2, m2, m7, m0        ; m1 = abs(q2-q1) <= I
    por                 m5, m1
    ABSSUB_GT           m1, rq2, rq3, m2, m7, m0        ; m1 = abs(q3-q2) <= I
    por                 m5, m1
    ABSSUB              m1, rp0, rq0, m7                ; abs(p0-q0)
    paddusb             m1, m1                          ; abs(p0-q0) * 2
    ABSSUB              m2, rp1, rq1, m7                ; abs(p1-q1)
    pand                m2, [pb_fe]                     ; drop lsb so shift can work
    psrlq               m2, 1                           ; abs(p1-q1)/2
    paddusb             m1, m2                          ; abs(p0-q0)*2 + abs(p1-q1)/2
    pxor                m1, m0
    pcmpgtb             m1, m3
    por                 m1, m5                          ; fm final value
    SWAP                 1, 3
    pxor                m3, [pb_ff]

    ; (m3: fm, m8..15: p3 p2 p1 p0 q0 q1 q2 q3)
    ; calc flat8in (if not 44_16) and hev masks
%if %2 != 44 && %2 != 4
    mova                m6, [pb_81]                     ; [1 1 1 1 ...] ^ 0x80
    ABSSUB_GT           m2, rp3, rp0, m6, m5            ; abs(p3 - p0) <= 1
%ifdef m8
    mova                m8, [pb_80]
%define rb80 m8
%else
%define rb80 [pb_80]
%endif
    ABSSUB_GT           m1, rp2, rp0, m6, m5, rb80      ; abs(p2 - p0) <= 1
    por                 m2, m1
    ABSSUB              m4, rp1, rp0, m5                ; abs(p1 - p0)
%if %2 <= 16
%if cpuflag(ssse3)
    pxor                m0, m0
%endif
    SPLATB_REG          m7, H, m0                       ; H H H H ...
%else
    movd                m7, Hd
    SPLATB_MIX          m7
%endif
    pxor                m7, rb80
    pxor                m4, rb80
    pcmpgtb             m0, m4, m7                      ; abs(p1 - p0) > H (1/2 hev condition)
    CMP_GT              m4, m6                          ; abs(p1 - p0) <= 1
    por                 m2, m4                          ; (flat8in)
    ABSSUB              m4, rq1, rq0, m1                ; abs(q1 - q0)
    pxor                m4, rb80
    pcmpgtb             m5, m4, m7                      ; abs(q1 - q0) > H (2/2 hev condition)
    por                 m0, m5                          ; hev final value
    CMP_GT              m4, m6                          ; abs(q1 - q0) <= 1
    por                 m2, m4                          ; (flat8in)
    ABSSUB_GT           m1, rq2, rq0, m6, m5, rb80      ; abs(q2 - q0) <= 1
    por                 m2, m1
    ABSSUB_GT           m1, rq3, rq0, m6, m5, rb80      ; abs(q3 - q0) <= 1
    por                 m2, m1                          ; flat8in final value
    pxor                m2, [pb_ff]
%if %2 == 84 || %2 == 48
    pand                m2, [mask_mix%2]
%endif
%else
    mova                m6, [pb_80]
%if %2 == 44
    movd                m7, Hd
    SPLATB_MIX          m7
%else
%if cpuflag(ssse3)
    pxor                m0, m0
%endif
    SPLATB_REG          m7, H, m0                       ; H H H H ...
%endif
    pxor                m7, m6
    ABSSUB              m4, rp1, rp0, m1                ; abs(p1 - p0)
    pxor                m4, m6
    pcmpgtb             m0, m4, m7                      ; abs(p1 - p0) > H (1/2 hev condition)
    ABSSUB              m4, rq1, rq0, m1                ; abs(q1 - q0)
    pxor                m4, m6
    pcmpgtb             m5, m4, m7                      ; abs(q1 - q0) > H (2/2 hev condition)
    por                 m0, m5                          ; hev final value
%endif

%if %2 == 16
    ; (m0: hev, m2: flat8in, m3: fm, m6: pb_81, m9..15: p2 p1 p0 q0 q1 q2 q3)
    ; calc flat8out mask
%ifdef m8
    mova                m8, [P7]
    mova                m9, [P6]
%define rp7 m8
%define rp6 m9
%else
%define rp7 [P7]
%define rp6 [P6]
%endif
    ABSSUB_GT           m1, rp7, rp0, m6, m5            ; abs(p7 - p0) <= 1
    ABSSUB_GT           m7, rp6, rp0, m6, m5            ; abs(p6 - p0) <= 1
    por                 m1, m7
%ifdef m8
    mova                m8, [P5]
    mova                m9, [P4]
%define rp5 m8
%define rp4 m9
%else
%define rp5 [P5]
%define rp4 [P4]
%endif
    ABSSUB_GT           m7, rp5, rp0, m6, m5            ; abs(p5 - p0) <= 1
    por                 m1, m7
    ABSSUB_GT           m7, rp4, rp0, m6, m5            ; abs(p4 - p0) <= 1
    por                 m1, m7
%ifdef m8
    mova                m14, [Q4]
    mova                m15, [Q5]
%define rq4 m14
%define rq5 m15
%else
%define rq4 [Q4]
%define rq5 [Q5]
%endif
    ABSSUB_GT           m7, rq4, rq0, m6, m5            ; abs(q4 - q0) <= 1
    por                 m1, m7
    ABSSUB_GT           m7, rq5, rq0, m6, m5            ; abs(q5 - q0) <= 1
    por                 m1, m7
%ifdef m8
    mova                m14, [Q6]
    mova                m15, [Q7]
%define rq6 m14
%define rq7 m15
%else
%define rq6 [Q6]
%define rq7 [Q7]
%endif
    ABSSUB_GT           m7, rq6, rq0, m6, m5            ; abs(q6 - q0) <= 1
    por                 m1, m7
    ABSSUB_GT           m7, rq7, rq0, m6, m5            ; abs(q7 - q0) <= 1
    por                 m1, m7                          ; flat8out final value
    pxor                m1, [pb_ff]
%endif

    ; if (fm) {
    ;     if (out && in) filter_14()
    ;     else if (in)   filter_6()
    ;     else if (hev)  filter_2()
    ;     else           filter_4()
    ; }
    ;
    ; f14:                                                                            fm &  out &  in
    ; f6:  fm & ~f14 & in        => fm & ~(out & in) & in                          => fm & ~out &  in
    ; f2:  fm & ~f14 & ~f6 & hev => fm & ~(out & in) & ~(~out & in) & hev          => fm &  ~in &  hev
    ; f4:  fm & ~f14 & ~f6 & ~f2 => fm & ~(out & in) & ~(~out & in) & ~(~in & hev) => fm &  ~in & ~hev

    ; (m0: hev, [m1: flat8out], [m2: flat8in], m3: fm, m8..15: p5 p4 p1 p0 q0 q1 q6 q7)
    ; filter2()
%if %2 != 44 && %2 != 4
    mova                m6, [pb_80]                     ; already in m6 if 44_16
    SCRATCH              2, 15, rsp+%3+%4
%if %2 == 16
    SCRATCH              1,  8, rsp+%3+%4+16
%endif
%endif
    pxor                m2, m6, rq0                     ; q0 ^ 0x80
    pxor                m4, m6, rp0                     ; p0 ^ 0x80
    psubsb              m2, m4                          ; (signed) q0 - p0
    pxor                m4, m6, rp1                     ; p1 ^ 0x80
    pxor                m5, m6, rq1                     ; q1 ^ 0x80
    psubsb              m4, m5                          ; (signed) p1 - q1
    paddsb              m4, m2                          ;   (q0 - p0) + (p1 - q1)
    paddsb              m4, m2                          ; 2*(q0 - p0) + (p1 - q1)
    paddsb              m4, m2                          ; 3*(q0 - p0) + (p1 - q1)
    paddsb              m6, m4, [pb_4]                  ; m6: f1 = clip(f + 4, 127)
    paddsb              m4, [pb_3]                      ; m4: f2 = clip(f + 3, 127)
%ifdef m8
    mova                m14, [pb_10]                    ; will be reused in filter4()
%define rb10 m14
%else
%define rb10 [pb_10]
%endif
    SRSHIFT3B_2X        m6, m4, rb10, m7                ; f1 and f2 sign byte shift by 3
    SIGN_SUB            m7, rq0, m6, m5                 ; m7 = q0 - f1
    SIGN_ADD            m1, rp0, m4, m5                 ; m1 = p0 + f2
%if %2 != 44 && %2 != 4
%ifdef m8
    pandn               m6, m15, m3                     ;  ~mask(in) & mask(fm)
%else
    mova                m6, [rsp+%3+%4]
    pandn               m6, m3
%endif
    pand                m6, m0                          ; (~mask(in) & mask(fm)) & mask(hev)
%else
    pand                m6, m3, m0
%endif
    MASK_APPLY          m7, rq0, m6, m5                 ; m7 = filter2(q0) & mask / we write it in filter4()
    MASK_APPLY          m1, rp0, m6, m5                 ; m1 = filter2(p0) & mask / we write it in filter4()

    ; (m0: hev, m1: p0', m2: q0-p0, m3: fm, m7: q0', [m8: flat8out], m10..13: p1 p0 q0 q1, m14: pb_10, [m15: flat8in], )
    ; filter4()
    mova                m4, m2
    paddsb              m2, m4                          ; 2 * (q0 - p0)
    paddsb              m2, m4                          ; 3 * (q0 - p0)
    paddsb              m6, m2, [pb_4]                  ; m6:  f1 = clip(f + 4, 127)
    paddsb              m2, [pb_3]                      ; m2: f2 = clip(f + 3, 127)
    SRSHIFT3B_2X        m6, m2, rb10, m4                ; f1 and f2 sign byte shift by 3
%if %2 != 44 && %2 != 4
%ifdef m8
    pandn               m5, m15, m3                     ;               ~mask(in) & mask(fm)
%else
    mova                m5, [rsp+%3+%4]
    pandn               m5, m3
%endif
    pandn               m0, m5                          ; ~mask(hev) & (~mask(in) & mask(fm))
%else
    pandn               m0, m3
%endif
    SIGN_SUB            m5, rq0, m6, m4                 ; q0 - f1
    MASK_APPLY          m5, m7, m0, m4                  ; filter4(q0) & mask
    mova                [Q0], m5
    SIGN_ADD            m7, rp0, m2, m4                 ; p0 + f2
    MASK_APPLY          m7, m1, m0, m4                  ; filter4(p0) & mask
    mova                [P0], m7
    paddb               m6, [pb_80]                     ;
    pxor                m1, m1                          ;   f=(f1+1)>>1
    pavgb               m6, m1                          ;
    psubb               m6, [pb_40]                     ;
    SIGN_ADD            m1, rp1, m6, m2                 ; p1 + f
    SIGN_SUB            m4, rq1, m6, m2                 ; q1 - f
    MASK_APPLY          m1, rp1, m0, m2                 ; m1 = filter4(p1)
    MASK_APPLY          m4, rq1, m0, m2                 ; m4 = filter4(q1)
    mova                [P1], m1
    mova                [Q1], m4

%if %2 != 44 && %2 != 4
    UNSCRATCH            2, 15, rsp+%3+%4
%endif

    ; ([m1: flat8out], m2: flat8in, m3: fm, m10..13: p1 p0 q0 q1)
    ; filter6()
%if %2 != 44 && %2 != 4
    pxor                m0, m0
%if %2 != 16
    pand                m3, m2
%else
    pand                m2, m3                          ;               mask(fm) & mask(in)
%ifdef m8
    pandn               m3, m8, m2                      ; ~mask(out) & (mask(fm) & mask(in))
%else
    mova                m3, [rsp+%3+%4+16]
    pandn               m3, m2
%endif
%endif
%ifdef m8
    mova               m14, [P3]
    mova                m9, [Q3]
%define rp3 m14
%define rq3 m9
%else
%define rp3 [P3]
%define rq3 [Q3]
%endif
    mova                m1, [P2]
    FILTER_INIT         m4, m5, m6, m7, [P2], %4, 6,             m3,  m1             ; [p2]
    mova                m1, [Q2]
    FILTER_UPDATE       m4, m5, m6, m7, [P1], %4, 0, 1, 2, 5, 3, m3,  "", rq1, "", 1 ; [p1] -p3 -p2 +p1 +q1
    FILTER_UPDATE       m4, m5, m6, m7, [P0], %4, 0, 2, 3, 6, 3, m3,  "", m1         ; [p0] -p3 -p1 +p0 +q2
    FILTER_UPDATE       m4, m5, m6, m7, [Q0], %4, 0, 3, 4, 7, 3, m3,  "", rq3, "", 1 ; [q0] -p3 -p0 +q0 +q3
    FILTER_UPDATE       m4, m5, m6, m7, [Q1], %4, 1, 4, 5, 7, 3, m3,  ""             ; [q1] -p2 -q0 +q1 +q3
    FILTER_UPDATE       m4, m5, m6, m7, [Q2], %4, 2, 5, 6, 7, 3, m3,  m1             ; [q2] -p1 -q1 +q2 +q3
%endif

%if %2 == 16
    UNSCRATCH            1,  8, rsp+%3+%4+16
%endif

    ; (m0: 0, [m1: flat8out], m2: fm & flat8in, m8..15: q2 q3 p1 p0 q0 q1 p3 p2)
    ; filter14()
    ;
    ;                            m2  m3  m8  m9 m14 m15 m10 m11 m12 m13
    ;
    ;                                    q2  q3  p3  p2  p1  p0  q0  q1
    ; p6  -7                     p7  p6  p5  p4   .   .   .   .   .
    ; p5  -6  -p7 -p6 +p5 +q1     .   .   .                           .
    ; p4  -5  -p7 -p5 +p4 +q2     .       .   .                      q2
    ; p3  -4  -p7 -p4 +p3 +q3     .           .   .                  q3
    ; p2  -3  -p7 -p3 +p2 +q4     .               .   .              q4
    ; p1  -2  -p7 -p2 +p1 +q5     .                   .   .          q5
    ; p0  -1  -p7 -p1 +p0 +q6     .                       .   .      q6
    ; q0  +0  -p7 -p0 +q0 +q7     .                           .   .  q7
    ; q1  +1  -p6 -q0 +q1 +q7    q1   .                           .   .
    ; q2  +2  -p5 -q1 +q2 +q7     .  q2   .                           .
    ; q3  +3  -p4 -q2 +q3 +q7         .  q3   .                       .
    ; q4  +4  -p3 -q3 +q4 +q7             .  q4   .                   .
    ; q5  +5  -p2 -q4 +q5 +q7                 .  q5   .               .
    ; q6  +6  -p1 -q5 +q6 +q7                     .  q6   .           .

%if %2 == 16
    pand            m1, m2                                                              ; mask(out) & (mask(fm) & mask(in))
    mova            m2, [P7]
    mova            m3, [P6]
%ifdef m8
    mova            m8, [P5]
    mova            m9, [P4]
%define rp5 m8
%define rp4 m9
%define rp5s m8
%define rp4s m9
%define rp3s m14
%define rq4 m8
%define rq5 m9
%define rq6 m14
%define rq7 m15
%define rq4s m8
%define rq5s m9
%define rq6s m14
%else
%define rp5 [P5]
%define rp4 [P4]
%define rp5s ""
%define rp4s ""
%define rp3s ""
%define rq4 [Q4]
%define rq5 [Q5]
%define rq6 [Q6]
%define rq7 [Q7]
%define rq4s ""
%define rq5s ""
%define rq6s ""
%endif
    FILTER_INIT     m4, m5, m6, m7, [P6], %4, 14,                m1,  m3            ; [p6]
    FILTER_UPDATE   m4, m5, m6, m7, [P5], %4,  8,  9, 10,  5, 4, m1, rp5s           ; [p5] -p7 -p6 +p5 +q1
    FILTER_UPDATE   m4, m5, m6, m7, [P4], %4,  8, 10, 11,  6, 4, m1, rp4s           ; [p4] -p7 -p5 +p4 +q2
    FILTER_UPDATE   m4, m5, m6, m7, [P3], %4,  8, 11,  0,  7, 4, m1, rp3s           ; [p3] -p7 -p4 +p3 +q3
    FILTER_UPDATE   m4, m5, m6, m7, [P2], %4,  8,  0,  1, 12, 4, m1,  "", rq4, [Q4], 1 ; [p2] -p7 -p3 +p2 +q4
    FILTER_UPDATE   m4, m5, m6, m7, [P1], %4,  8,  1,  2, 13, 4, m1,  "", rq5, [Q5], 1 ; [p1] -p7 -p2 +p1 +q5
    FILTER_UPDATE   m4, m5, m6, m7, [P0], %4,  8,  2,  3, 14, 4, m1,  "", rq6, [Q6], 1 ; [p0] -p7 -p1 +p0 +q6
    FILTER_UPDATE   m4, m5, m6, m7, [Q0], %4,  8,  3,  4, 15, 4, m1,  "", rq7, [Q7], 1 ; [q0] -p7 -p0 +q0 +q7
    FILTER_UPDATE   m4, m5, m6, m7, [Q1], %4,  9,  4,  5, 15, 4, m1,  ""            ; [q1] -p6 -q0 +q1 +q7
    FILTER_UPDATE   m4, m5, m6, m7, [Q2], %4, 10,  5,  6, 15, 4, m1,  ""            ; [q2] -p5 -q1 +q2 +q7
    FILTER_UPDATE   m4, m5, m6, m7, [Q3], %4, 11,  6,  7, 15, 4, m1,  ""            ; [q3] -p4 -q2 +q3 +q7
    FILTER_UPDATE   m4, m5, m6, m7, [Q4], %4,  0,  7, 12, 15, 4, m1, rq4s           ; [q4] -p3 -q3 +q4 +q7
    FILTER_UPDATE   m4, m5, m6, m7, [Q5], %4,  1, 12, 13, 15, 4, m1, rq5s           ; [q5] -p2 -q4 +q5 +q7
    FILTER_UPDATE   m4, m5, m6, m7, [Q6], %4,  2, 13, 14, 15, 4, m1, rq6s           ; [q6] -p1 -q5 +q6 +q7
%endif

%ifidn %1, h
%if %2 == 16
    mova                    m0, [P7]
    mova                    m1, [P6]
    mova                    m2, [P5]
    mova                    m3, [P4]
    mova                    m4, [P3]
    mova                    m5, [P2]
%if ARCH_X86_64
    mova                    m6, [P1]
%endif
    mova                    m7, [P0]
%if ARCH_X86_64
    mova                    m8, [Q0]
    mova                    m9, [Q1]
    mova                   m10, [Q2]
    mova                   m11, [Q3]
    mova                   m12, [Q4]
    mova                   m13, [Q5]
    mova                   m14, [Q6]
    mova                   m15, [Q7]
    TRANSPOSE16x16B 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, [rsp]
    DEFINE_REAL_P7_TO_Q7
    movu  [P7],  m0
    movu  [P6],  m1
    movu  [P5],  m2
    movu  [P4],  m3
    movu  [P3],  m4
    movu  [P2],  m5
    movu  [P1],  m6
    movu  [P0],  m7
    movu  [Q0],  m8
    movu  [Q1],  m9
    movu  [Q2], m10
    movu  [Q3], m11
    movu  [Q4], m12
    movu  [Q5], m13
    movu  [Q6], m14
    movu  [Q7], m15
%else
    DEFINE_REAL_P7_TO_Q7
    TRANSPOSE8x8B 0, 1, 2, 3, 4, 5, 6, 7, [rsp+32], a, [rsp+%3+%4], [Q0], [Q1]
    movh   [P7],  m0
    movh   [P5],  m1
    movh   [P3],  m2
    movh   [P1],  m3
    movh   [Q2],  m5
    movh   [Q4],  m6
    movh   [Q6],  m7
    movhps [P6],  m0
    movhps [P4],  m1
    movhps [P2],  m2
    movhps [P0],  m3
    movhps [Q3],  m5
    movhps [Q5],  m6
    movhps [Q7],  m7
    DEFINE_TRANSPOSED_P7_TO_Q7
    mova                    m0, [Q0]
    mova                    m1, [Q1]
    mova                    m2, [Q2]
    mova                    m3, [Q3]
    mova                    m4, [Q4]
    mova                    m5, [Q5]
    mova                    m7, [Q7]
    DEFINE_REAL_P7_TO_Q7 8
    TRANSPOSE8x8B 0, 1, 2, 3, 4, 5, 6, 7, [rsp+224], a, [rsp+%3+%4], [Q0], [Q1]
    movh   [P7],  m0
    movh   [P5],  m1
    movh   [P3],  m2
    movh   [P1],  m3
    movh   [Q2],  m5
    movh   [Q4],  m6
    movh   [Q6],  m7
    movhps [P6],  m0
    movhps [P4],  m1
    movhps [P2],  m2
    movhps [P0],  m3
    movhps [Q3],  m5
    movhps [Q5],  m6
    movhps [Q7],  m7
%endif
%elif %2 == 44 || %2 == 4
    SWAP 0, 1   ; m0 = p1
    SWAP 1, 7   ; m1 = p0
    SWAP 2, 5   ; m2 = q0
    SWAP 3, 4   ; m3 = q1
    DEFINE_REAL_P7_TO_Q7 2
    SBUTTERFLY  bw, 0, 1, 4
    SBUTTERFLY  bw, 2, 3, 4
    SBUTTERFLY  wd, 0, 2, 4
    SBUTTERFLY  wd, 1, 3, 4
%if mmsize == 16
    movd  [P7], m0
    movd  [P3], m2
    movd  [Q0], m1
    movd  [Q4], m3
    psrldq  m0, 4
    psrldq  m1, 4
    psrldq  m2, 4
    psrldq  m3, 4
    movd  [P6], m0
    movd  [P2], m2
    movd  [Q1], m1
    movd  [Q5], m3
    psrldq  m0, 4
    psrldq  m1, 4
    psrldq  m2, 4
    psrldq  m3, 4
    movd  [P5], m0
    movd  [P1], m2
    movd  [Q2], m1
    movd  [Q6], m3
    psrldq  m0, 4
    psrldq  m1, 4
    psrldq  m2, 4
    psrldq  m3, 4
    movd  [P4], m0
    movd  [P0], m2
    movd  [Q3], m1
    movd  [Q7], m3
%else
    movd  [P7], m0
    movd  [P5], m2
    movd  [P3], m1
    movd  [P1], m3
    psrlq   m0, 32
    psrlq   m2, 32
    psrlq   m1, 32
    psrlq   m3, 32
    movd  [P6], m0
    movd  [P4], m2
    movd  [P2], m1
    movd  [P0], m3
%endif
%else
    ; the following code does a transpose of 8 full lines to 16 half
    ; lines (high part). It is inlined to avoid the need for a staging area
    mova                    m0, [P3]
    mova                    m1, [P2]
    mova                    m2, [P1]
    mova                    m3, [P0]
    mova                    m4, [Q0]
    mova                    m5, [Q1]
%ifdef m8
    mova                    m6, [Q2]
%endif
    mova                    m7, [Q3]
    DEFINE_REAL_P7_TO_Q7
%ifdef m8
    SBUTTERFLY  bw,  0,  1, 8
    SBUTTERFLY  bw,  2,  3, 8
    SBUTTERFLY  bw,  4,  5, 8
    SBUTTERFLY  bw,  6,  7, 8
    SBUTTERFLY  wd,  0,  2, 8
    SBUTTERFLY  wd,  1,  3, 8
    SBUTTERFLY  wd,  4,  6, 8
    SBUTTERFLY  wd,  5,  7, 8
    SBUTTERFLY  dq,  0,  4, 8
    SBUTTERFLY  dq,  1,  5, 8
    SBUTTERFLY  dq,  2,  6, 8
    SBUTTERFLY  dq,  3,  7, 8
%else
    SBUTTERFLY  bw,  0,  1, 6
    mova [rsp+mmsize*4], m1
    mova        m6, [rsp+mmsize*6]
    SBUTTERFLY  bw,  2,  3, 1
    SBUTTERFLY  bw,  4,  5, 1
    SBUTTERFLY  bw,  6,  7, 1
    SBUTTERFLY  wd,  0,  2, 1
    mova [rsp+mmsize*6], m2
    mova        m1, [rsp+mmsize*4]
    SBUTTERFLY  wd,  1,  3, 2
    SBUTTERFLY  wd,  4,  6, 2
    SBUTTERFLY  wd,  5,  7, 2
    SBUTTERFLY  dq,  0,  4, 2
    SBUTTERFLY  dq,  1,  5, 2
%if mmsize == 16
    movh      [Q0], m1
    movhps    [Q1], m1
%else
    mova      [P3], m1
%endif
    mova        m2, [rsp+mmsize*6]
    SBUTTERFLY  dq,  2,  6, 1
    SBUTTERFLY  dq,  3,  7, 1
%endif
    SWAP         3, 6
    SWAP         1, 4
%if mmsize == 16
    movh      [P7], m0
    movhps    [P6], m0
    movh      [P5], m1
    movhps    [P4], m1
    movh      [P3], m2
    movhps    [P2], m2
    movh      [P1], m3
    movhps    [P0], m3
%ifdef m8
    movh      [Q0], m4
    movhps    [Q1], m4
%endif
    movh      [Q2], m5
    movhps    [Q3], m5
    movh      [Q4], m6
    movhps    [Q5], m6
    movh      [Q6], m7
    movhps    [Q7], m7
%else
    mova      [P7], m0
    mova      [P6], m1
    mova      [P5], m2
    mova      [P4], m3
    mova      [P2], m5
    mova      [P1], m6
    mova      [P0], m7
%endif
%endif
%endif

    RET
%endmacro

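; instantiate the vertical and horizontal filters of one size for a given
; instruction set; %2 is the stack space shared by both variants, %3 the extra
; stack the h variant needs for its transpose buffer, %4 the extra stack used
; only on x86-32 / MMX.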
%macro LPF_16_VH 5
INIT_XMM %5
LOOPFILTER v, %1, %2,  0, %4
LOOPFILTER h, %1, %2, %3, %4
%endmacro

%macro LPF_16_VH_ALL_OPTS 4
LPF_16_VH %1, %2, %3, %4, sse2
LPF_16_VH %1, %2, %3, %4, ssse3
LPF_16_VH %1, %2, %3, %4, avx
%endmacro

LPF_16_VH_ALL_OPTS 16, 512, 256, 32
LPF_16_VH_ALL_OPTS 44,   0, 128,  0
LPF_16_VH_ALL_OPTS 48, 256, 128, 16
LPF_16_VH_ALL_OPTS 84, 256, 128, 16
LPF_16_VH_ALL_OPTS 88, 256, 128, 16

INIT_MMX mmxext
LOOPFILTER v, 4,   0,  0, 0
LOOPFILTER h, 4,   0, 64, 0
LOOPFILTER v, 8, 128,  0, 8
LOOPFILTER h, 8, 128, 64, 8