;******************************************************************************
;* VP9 loop filter SIMD optimizations
;*
;* Copyright (C) 2015 Ronald S. Bultje <rsbultje@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

pw_511: times 16 dw 511
pw_2047: times 16 dw 2047
pw_16384: times 16 dw 16384
pw_m512: times 16 dw -512
pw_m2048: times 16 dw -2048

cextern pw_1
cextern pw_3
cextern pw_4
cextern pw_8
cextern pw_16
cextern pw_256
cextern pw_1023
cextern pw_4095
cextern pw_m1

SECTION .text

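; SCRATCH/UNSCRATCH/PRELOAD manage register pressure across the two targets:
; on x86-64 a value is simply kept in one of the high xmm registers (m8-m15),
; while on x86-32 SCRATCH spills it to a stack slot and UNSCRATCH reloads it.
; PRELOAD likewise caches a constant in a register on x86-64 but leaves it as
; a memory operand on x86-32. The optional last argument creates a reg_<name>
; alias that expands to whichever form is in use.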
%macro SCRATCH 3-4
%if ARCH_X86_64
    SWAP                %1, %2
%if %0 == 4
%define reg_%4 m%2
%endif
%else
    mova              [%3], m%1
%if %0 == 4
%define reg_%4 [%3]
%endif
%endif
%endmacro

%macro UNSCRATCH 3-4
%if ARCH_X86_64
    SWAP                %1, %2
%else
    mova               m%1, [%3]
%endif
%if %0 == 4
%undef reg_%4
%endif
%endmacro

%macro PRELOAD 2-3
%if ARCH_X86_64
    mova               m%1, [%2]
%if %0 == 3
%define reg_%3 m%1
%endif
%elif %0 == 3
%define reg_%3 [%2]
%endif
%endmacro

; calculate p or q portion of flat8out
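; (reg_F holds 1 << (bit_depth - 8), the "flat" threshold scaled to the working
; bit depth; the result left in m7 is the inverted condition, i.e. set wherever
; any of abs(q4-q0)..abs(q7-q0) exceeds that threshold)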
%macro FLAT8OUT_HALF 0
    psubw               m4, m0                      ; q4-q0
    psubw               m5, m0                      ; q5-q0
    psubw               m6, m0                      ; q6-q0
    psubw               m7, m0                      ; q7-q0
    ABS2                m4, m5, m2, m3              ; abs(q4-q0) | abs(q5-q0)
    ABS2                m6, m7, m2, m3              ; abs(q6-q0) | abs(q7-q0)
    pcmpgtw             m4, reg_F                   ; abs(q4-q0) > F
    pcmpgtw             m5, reg_F                   ; abs(q5-q0) > F
    pcmpgtw             m6, reg_F                   ; abs(q6-q0) > F
    pcmpgtw             m7, reg_F                   ; abs(q7-q0) > F
    por                 m5, m4
    por                 m7, m6
    por                 m7, m5                      ; !flat8out, q portion
%endmacro

; calculate p or q portion of flat8in/hev/fm (excluding mb_edge condition)
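; on return (q side shown, p side is symmetric):
;   m7 = hev:      abs(q1-q0) > H
;   m2 = !fm:      any of abs(q3-q2), abs(q2-q1), abs(q1-q0) > I
;   m4 = !flat8in: any of abs(q3-q0), abs(q2-q0), abs(q1-q0) > F (wd > 4 only)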
%macro FLAT8IN_HALF 1
%if %1 > 4
    psubw               m4, m3, m0                  ; q3-q0
    psubw               m5, m2, m0                  ; q2-q0
    ABS2                m4, m5, m6, m7              ; abs(q3-q0) | abs(q2-q0)
    pcmpgtw             m4, reg_F                   ; abs(q3-q0) > F
    pcmpgtw             m5, reg_F                   ; abs(q2-q0) > F
%endif
    psubw               m3, m2                      ; q3-q2
    psubw               m2, m1                      ; q2-q1
    ABS2                m3, m2, m6, m7              ; abs(q3-q2) | abs(q2-q1)
    pcmpgtw             m3, reg_I                   ; abs(q3-q2) > I
    pcmpgtw             m2, reg_I                   ; abs(q2-q1) > I
%if %1 > 4
    por                 m4, m5
%endif
    por                 m2, m3
    psubw               m3, m1, m0                  ; q1-q0
    ABS1                m3, m5                      ; abs(q1-q0)
%if %1 > 4
    pcmpgtw             m6, m3, reg_F               ; abs(q1-q0) > F
%endif
    pcmpgtw             m7, m3, reg_H               ; abs(q1-q0) > H
    pcmpgtw             m3, reg_I                   ; abs(q1-q0) > I
%if %1 > 4
    por                 m4, m6
%endif
    por                 m2, m3
%endmacro

; one step in filter_14/filter_6
;
; take sum $reg, downshift, apply mask and write into dst
;
; if sub2/add1-2 are present, add/sub as appropriate to prepare for the next
; step's sum $reg. This is omitted for the last row in each filter.
;
; if dont_store is set, don't write the result into memory; instead, keep the
; value in a register so we can write it out later
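;
; roughly, in scalar form, one invocation does:
;   delta  = (sum >> shift) - src
;   sum    = sum - src - sub2 + add1 + add2    ; slide the window (if sub2 given)
;   delta &= mask
;   dst    = src + delta                       ; stored, or kept in src if dont_store
; so filter_14/filter_6 maintain a single running sum and update it with two
; adds and two subtracts per output pixel instead of recomputing the whole tap sum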
%macro FILTER_STEP 6-10 "", "", "", 0 ; tmp, reg, mask, shift, dst, \
                                      ; src/sub1, sub2, add1, add2, dont_store
    psrlw               %1, %2, %4
    psubw               %1, %6                      ; abs->delta
%ifnidn %7, ""
    psubw               %2, %6
    psubw               %2, %7
    paddw               %2, %8
    paddw               %2, %9
%endif
    pand                %1, reg_%3                  ; apply mask
%if %10 == 1
    paddw               %6, %1                      ; delta->abs
%else
    paddw               %1, %6                      ; delta->abs
    mova              [%5], %1
%endif
%endmacro

; FIXME avx2 versions for 16_16 and mix2_{4,8}{4,8}

%macro LOOP_FILTER 3 ; dir[h/v], wd[4/8/16], bpp[10/12]

%if ARCH_X86_64
%if %2 == 16
%assign %%num_xmm_regs 16
%elif %2 == 8
%assign %%num_xmm_regs 15
%else ; %2 == 4
%assign %%num_xmm_regs 14
%endif ; %2
%assign %%bak_mem 0
%else ; ARCH_X86_32
%assign %%num_xmm_regs 8
%if %2 == 16
%assign %%bak_mem 7
%elif %2 == 8
%assign %%bak_mem 6
%else ; %2 == 4
%assign %%bak_mem 5
%endif ; %2
%endif ; ARCH_X86_64/32

%if %2 == 16
%ifidn %1, v
%assign %%num_gpr_regs 6
%else ; %1 == h
%assign %%num_gpr_regs 5
%endif ; %1
%assign %%wd_mem 6
%else ; %2 == 8/4
%assign %%num_gpr_regs 5
%if ARCH_X86_32 && %2 == 8
%assign %%wd_mem 2
%else ; ARCH_X86_64 || %2 == 4
%assign %%wd_mem 0
%endif ; ARCH_X86_64/32 etc.
%endif ; %2

%ifidn %1, v
%assign %%tsp_mem 0
%elif %2 == 16 ; && %1 == h
%assign %%tsp_mem 16
%else ; %1 == h && %2 == 8/4
%assign %%tsp_mem 8
%endif ; %1/%2

%assign %%off %%wd_mem
%assign %%tspoff %%bak_mem+%%wd_mem
%assign %%stack_mem ((%%bak_mem+%%wd_mem+%%tsp_mem)*mmsize)
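; resulting stack layout, in mmsize-sized slots:
;   [0, %%wd_mem)              pre-filter pixel copies used by the wide filters
;   [%%off, %%off+%%bak_mem)   x86-32 spill slots used by SCRATCH
;   [%%tspoff, ...)            transposed rows for the h direction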

%if %3 == 10
%define %%maxsgn 511
%define %%minsgn m512
%define %%maxusgn 1023
%define %%maxf 4
%else ; %3 == 12
%define %%maxsgn 2047
%define %%minsgn m2048
%define %%maxusgn 4095
%define %%maxf 16
%endif ; %3

cglobal vp9_loop_filter_%1_%2_%3, 5, %%num_gpr_regs, %%num_xmm_regs, %%stack_mem, dst, stride, E, I, H
    ; prepare E, I and H masks
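    ; (E, I and H are the loop filter's edge limit, interior limit and
    ; high-edge-variance threshold; they arrive as 8-bit values and are
    ; shifted up to the working bit depth here)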
    shl                 Ed, %3-8
    shl                 Id, %3-8
    shl                 Hd, %3-8
%if cpuflag(ssse3)
    mova                m0, [pw_256]
%endif
    movd                m1, Ed
    movd                m2, Id
    movd                m3, Hd
%if cpuflag(ssse3)
    pshufb              m1, m0                      ; E << (bit_depth - 8)
    pshufb              m2, m0                      ; I << (bit_depth - 8)
    pshufb              m3, m0                      ; H << (bit_depth - 8)
%else
    punpcklwd           m1, m1
    punpcklwd           m2, m2
    punpcklwd           m3, m3
    pshufd              m1, m1, q0000
    pshufd              m2, m2, q0000
    pshufd              m3, m3, q0000
%endif
    SCRATCH              1,  8, rsp+(%%off+0)*mmsize,  E
    SCRATCH              2,  9, rsp+(%%off+1)*mmsize,  I
    SCRATCH              3, 10, rsp+(%%off+2)*mmsize,  H
%if %2 > 4
    PRELOAD                 11, pw_ %+ %%maxf, F
%endif

    ; set up variables to load data
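    ; for the v direction, rows are addressed directly through per-row pointers;
    ; for the h direction, the columns are first transposed into the stack
    ; scratch area, filtered there, and transposed back when storing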
%ifidn %1, v
    DEFINE_ARGS dst8, stride, stride3, dst0, dst4, dst12
    lea           stride3q, [strideq*3]
    neg            strideq
%if %2 == 16
    lea              dst0q, [dst8q+strideq*8]
%else
    lea              dst4q, [dst8q+strideq*4]
%endif
    neg            strideq
%if %2 == 16
    lea             dst12q, [dst8q+strideq*4]
    lea              dst4q, [dst0q+strideq*4]
%endif

%if %2 == 16
%define %%p7 dst0q
%define %%p6 dst0q+strideq
%define %%p5 dst0q+strideq*2
%define %%p4 dst0q+stride3q
%endif
%define %%p3 dst4q
%define %%p2 dst4q+strideq
%define %%p1 dst4q+strideq*2
%define %%p0 dst4q+stride3q
%define %%q0 dst8q
%define %%q1 dst8q+strideq
%define %%q2 dst8q+strideq*2
%define %%q3 dst8q+stride3q
%if %2 == 16
%define %%q4 dst12q
%define %%q5 dst12q+strideq
%define %%q6 dst12q+strideq*2
%define %%q7 dst12q+stride3q
%endif
%else ; %1 == h
    DEFINE_ARGS dst0, stride, stride3, dst4
    lea           stride3q, [strideq*3]
    lea              dst4q, [dst0q+strideq*4]

%define %%p3 rsp+(%%tspoff+0)*mmsize
%define %%p2 rsp+(%%tspoff+1)*mmsize
%define %%p1 rsp+(%%tspoff+2)*mmsize
%define %%p0 rsp+(%%tspoff+3)*mmsize
%define %%q0 rsp+(%%tspoff+4)*mmsize
%define %%q1 rsp+(%%tspoff+5)*mmsize
%define %%q2 rsp+(%%tspoff+6)*mmsize
%define %%q3 rsp+(%%tspoff+7)*mmsize

%if %2 < 16
    movu                m0, [dst0q+strideq*0-8]
    movu                m1, [dst0q+strideq*1-8]
    movu                m2, [dst0q+strideq*2-8]
    movu                m3, [dst0q+stride3q -8]
    movu                m4, [dst4q+strideq*0-8]
    movu                m5, [dst4q+strideq*1-8]
    movu                m6, [dst4q+strideq*2-8]
    movu                m7, [dst4q+stride3q -8]

%if ARCH_X86_64
    TRANSPOSE8x8W        0, 1, 2, 3, 4, 5, 6, 7, 12
%else
    TRANSPOSE8x8W        0, 1, 2, 3, 4, 5, 6, 7, [%%p0], [%%q0]
%endif

    mova            [%%p3], m0
    mova            [%%p2], m1
    mova            [%%p1], m2
    mova            [%%p0], m3
%if ARCH_X86_64
    mova            [%%q0], m4
%endif
    mova            [%%q1], m5
    mova            [%%q2], m6
    mova            [%%q3], m7

    ; FIXME investigate if we can _not_ load q0-3 below if h, and adjust register
    ; order here accordingly
%else ; %2 == 16

%define %%p7 rsp+(%%tspoff+ 8)*mmsize
%define %%p6 rsp+(%%tspoff+ 9)*mmsize
%define %%p5 rsp+(%%tspoff+10)*mmsize
%define %%p4 rsp+(%%tspoff+11)*mmsize
%define %%q4 rsp+(%%tspoff+12)*mmsize
%define %%q5 rsp+(%%tspoff+13)*mmsize
%define %%q6 rsp+(%%tspoff+14)*mmsize
%define %%q7 rsp+(%%tspoff+15)*mmsize

    mova                m0, [dst0q+strideq*0-16]
    mova                m1, [dst0q+strideq*1-16]
    mova                m2, [dst0q+strideq*2-16]
    mova                m3, [dst0q+stride3q -16]
    mova                m4, [dst4q+strideq*0-16]
    mova                m5, [dst4q+strideq*1-16]
%if ARCH_X86_64
    mova                m6, [dst4q+strideq*2-16]
%endif
    mova                m7, [dst4q+stride3q -16]

%if ARCH_X86_64
    TRANSPOSE8x8W        0, 1, 2, 3, 4, 5, 6, 7, 12
%else
    TRANSPOSE8x8W        0, 1, 2, 3, 4, 5, 6, 7, [dst4q+strideq*2-16], [%%p3], 1
%endif

    mova            [%%p7], m0
    mova            [%%p6], m1
    mova            [%%p5], m2
    mova            [%%p4], m3
%if ARCH_X86_64
    mova            [%%p3], m4
%endif
    mova            [%%p2], m5
    mova            [%%p1], m6
    mova            [%%p0], m7

    mova                m0, [dst0q+strideq*0]
    mova                m1, [dst0q+strideq*1]
    mova                m2, [dst0q+strideq*2]
    mova                m3, [dst0q+stride3q ]
    mova                m4, [dst4q+strideq*0]
    mova                m5, [dst4q+strideq*1]
%if ARCH_X86_64
    mova                m6, [dst4q+strideq*2]
%endif
    mova                m7, [dst4q+stride3q ]

%if ARCH_X86_64
    TRANSPOSE8x8W        0, 1, 2, 3, 4, 5, 6, 7, 12
%else
    TRANSPOSE8x8W        0, 1, 2, 3, 4, 5, 6, 7, [dst4q+strideq*2], [%%q4], 1
%endif

    mova            [%%q0], m0
    mova            [%%q1], m1
    mova            [%%q2], m2
    mova            [%%q3], m3
%if ARCH_X86_64
    mova            [%%q4], m4
%endif
    mova            [%%q5], m5
    mova            [%%q6], m6
    mova            [%%q7], m7

    ; FIXME investigate if we can _not_ load q0|q4-7 below if h, and adjust register
    ; order here accordingly
%endif ; %2
%endif ; %1

    ; load q0|q4-7 data
    mova                m0, [%%q0]
%if %2 == 16
    mova                m4, [%%q4]
    mova                m5, [%%q5]
    mova                m6, [%%q6]
    mova                m7, [%%q7]

    ; flat8out q portion
    FLAT8OUT_HALF
    SCRATCH              7, 15, rsp+(%%off+6)*mmsize, F8O
%endif

    ; load q1-3 data
    mova                m1, [%%q1]
    mova                m2, [%%q2]
    mova                m3, [%%q3]

    ; r6-8|pw_4[m8-11]=reg_E/I/H/F
    ; r9[m15]=!flatout[q]
    ; m12-14=free
    ; m0-3=q0-q3
    ; m4-7=free

    ; flat8in|fm|hev q portion
    FLAT8IN_HALF        %2
    SCRATCH              7, 13, rsp+(%%off+4)*mmsize, HEV
%if %2 > 4
    SCRATCH              4, 14, rsp+(%%off+5)*mmsize, F8I
%endif

    ; r6-8|pw_4[m8-11]=reg_E/I/H/F
    ; r9[m15]=!flat8out[q]
    ; r10[m13]=hev[q]
    ; r11[m14]=!flat8in[q]
    ; m2=!fm[q]
    ; m0,1=q0-q1
    ; m3-7=free
    ; m12=free

    ; load p0-1
    mova                m3, [%%p0]
    mova                m4, [%%p1]

    ; fm mb_edge portion
    psubw               m5, m3, m0                  ; q0-p0
    psubw               m6, m4, m1                  ; q1-p1
%if ARCH_X86_64
    ABS2                m5, m6, m7, m12             ; abs(q0-p0) | abs(q1-p1)
%else
    ABS1                m5, m7                      ; abs(q0-p0)
    ABS1                m6, m7                      ; abs(q1-p1)
%endif
    paddw               m5, m5
    psraw               m6, 1
    paddw               m6, m5                      ; abs(q0-p0)*2+(abs(q1-p1)>>1)
    pcmpgtw             m6, reg_E
    por                 m2, m6
    SCRATCH              2, 12, rsp+(%%off+3)*mmsize, FM

    ; r6-8|pw_4[m8-11]=reg_E/I/H/F
    ; r9[m15]=!flat8out[q]
    ; r10[m13]=hev[q]
    ; r11[m14]=!flat8in[q]
    ; r12[m12]=!fm[q]
    ; m3-4=p0-1
    ; m0-2/5-7=free

    ; load p4-7 data
    SWAP                 3, 0                       ; p0
    SWAP                 4, 1                       ; p1
%if %2 == 16
    mova                m7, [%%p7]
    mova                m6, [%%p6]
    mova                m5, [%%p5]
    mova                m4, [%%p4]

    ; flat8out p portion
    FLAT8OUT_HALF
    por                 m7, reg_F8O
    SCRATCH              7, 15, rsp+(%%off+6)*mmsize, F8O
%endif

    ; r6-8|pw_4[m8-11]=reg_E/I/H/F
    ; r9[m15]=!flat8out
    ; r10[m13]=hev[q]
    ; r11[m14]=!flat8in[q]
    ; r12[m12]=!fm[q]
    ; m0=p0
    ; m1=p1
    ; m2-7=free

    ; load p2-3 data
    mova                m2, [%%p2]
    mova                m3, [%%p3]

    ; flat8in|fm|hev p portion
    FLAT8IN_HALF        %2
    por                 m7, reg_HEV
%if %2 > 4
    por                 m4, reg_F8I
%endif
    por                 m2, reg_FM
%if %2 > 4
    por                 m4, m2                      ; !flat8|!fm
%if %2 == 16
    por                 m5, m4, reg_F8O             ; !flat16|!fm
    pandn               m2, m4                      ; filter4_mask
    pandn               m4, m5                      ; filter8_mask
    pxor                m5, [pw_m1]                 ; filter16_mask
    SCRATCH              5, 15, rsp+(%%off+6)*mmsize, F16M
%else
    pandn               m2, m4                      ; filter4_mask
    pxor                m4, [pw_m1]                 ; filter8_mask
%endif
    SCRATCH              4, 14, rsp+(%%off+5)*mmsize, F8M
%else
    pxor                m2, [pw_m1]                 ; filter4_mask
%endif
    SCRATCH              7, 13, rsp+(%%off+4)*mmsize, HEV
    SCRATCH              2, 12, rsp+(%%off+3)*mmsize, F4M

    ; r9[m15]=filter16_mask
    ; r10[m13]=hev
    ; r11[m14]=filter8_mask
    ; r12[m12]=filter4_mask
    ; m0,1=p0-p1
    ; m2-7=free
    ; m8-11=free

%if %2 > 4
%if %2 == 16
    ; filter_14
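    ; 15-tap smooth filter; in scalar form the first output is
    ;   p6' = (p7*7 + p6*2 + p5 + p4 + p3 + p2 + p1 + p0 + q0 + 8) >> 4
    ; and each subsequent output reuses the running sum via FILTER_STEP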
    mova                m2, [%%p7]
    mova                m3, [%%p6]
    mova                m6, [%%p5]
    mova                m7, [%%p4]
    PRELOAD              8, %%p3, P3
    PRELOAD              9, %%p2, P2
%endif
    PRELOAD             10, %%q0, Q0
    PRELOAD             11, %%q1, Q1
%if %2 == 16
    psllw               m4, m2, 3
    paddw               m5, m3, m3
    paddw               m4, m6
    paddw               m5, m7
    paddw               m4, reg_P3
    paddw               m5, reg_P2
    paddw               m4, m1
    paddw               m5, m0
    paddw               m4, reg_Q0                  ; q0+p1+p3+p5+p7*8
    psubw               m5, m2                      ; p0+p2+p4+p6*2-p7
    paddw               m4, [pw_8]
    paddw               m5, m4                      ; q0+p0+p1+p2+p3+p4+p5+p6*2+p7*7+8

    ; below, we use r0-5 for storing pre-filter pixels for subsequent subtraction
    ; at the end of the filter

    mova    [rsp+0*mmsize], m3
    FILTER_STEP         m4, m5, F16M, 4, %%p6, m3,     m2,             m6,     reg_Q1
%endif
    mova                m3, [%%q2]
%if %2 == 16
    mova    [rsp+1*mmsize], m6
    FILTER_STEP         m4, m5, F16M, 4, %%p5, m6,     m2,             m7,     m3
%endif
    mova                m6, [%%q3]
%if %2 == 16
    mova    [rsp+2*mmsize], m7
    FILTER_STEP         m4, m5, F16M, 4, %%p4, m7,     m2,             reg_P3, m6
    mova                m7, [%%q4]
%if ARCH_X86_64
    mova    [rsp+3*mmsize], reg_P3
%else
    mova                m4, reg_P3
    mova    [rsp+3*mmsize], m4
%endif
    FILTER_STEP         m4, m5, F16M, 4, %%p3, reg_P3, m2,             reg_P2, m7
    PRELOAD              8, %%q5, Q5
%if ARCH_X86_64
    mova    [rsp+4*mmsize], reg_P2
%else
    mova                m4, reg_P2
    mova    [rsp+4*mmsize], m4
%endif
    FILTER_STEP         m4, m5, F16M, 4, %%p2, reg_P2, m2,             m1,     reg_Q5
    PRELOAD              9, %%q6, Q6
    mova    [rsp+5*mmsize], m1
    FILTER_STEP         m4, m5, F16M, 4, %%p1, m1,     m2,             m0,     reg_Q6
    mova                m1, [%%q7]
    FILTER_STEP         m4, m5, F16M, 4, %%p0, m0,     m2,             reg_Q0, m1,     1
    FILTER_STEP         m4, m5, F16M, 4, %%q0, reg_Q0, [rsp+0*mmsize], reg_Q1, m1,     ARCH_X86_64
    FILTER_STEP         m4, m5, F16M, 4, %%q1, reg_Q1, [rsp+1*mmsize], m3,     m1,     ARCH_X86_64
    FILTER_STEP         m4, m5, F16M, 4, %%q2, m3,     [rsp+2*mmsize], m6,     m1,     1
    FILTER_STEP         m4, m5, F16M, 4, %%q3, m6,     [rsp+3*mmsize], m7,     m1
    FILTER_STEP         m4, m5, F16M, 4, %%q4, m7,     [rsp+4*mmsize], reg_Q5, m1
    FILTER_STEP         m4, m5, F16M, 4, %%q5, reg_Q5, [rsp+5*mmsize], reg_Q6, m1
    FILTER_STEP         m4, m5, F16M, 4, %%q6, reg_Q6

    mova                m7, [%%p1]
%else
    SWAP                 1, 7
%endif

    mova                m2, [%%p3]
    mova                m1, [%%p2]

    ; reg_Q0-1 (m10-m11)
    ; m0=p0
    ; m1=p2
    ; m2=p3
    ; m3=q2
    ; m4-5=free
    ; m6=q3
    ; m7=p1
    ; m8-9 unused

    ; filter_6
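    ; 7-tap filter; in scalar form the first output is
    ;   p2' = (p3*3 + p2*2 + p1 + p0 + q0 + 4) >> 3
    ; with the remaining outputs again produced by sliding the running sum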
    psllw               m4, m2, 2
    paddw               m5, m1, m1
    paddw               m4, m7
    psubw               m5, m2
    paddw               m4, m0
    paddw               m5, reg_Q0
    paddw               m4, [pw_4]
    paddw               m5, m4

%if ARCH_X86_64
    mova                m8, m1
    mova                m9, m7
%else
    mova    [rsp+0*mmsize], m1
    mova    [rsp+1*mmsize], m7
%endif
%ifidn %1, v
    FILTER_STEP         m4, m5, F8M, 3, %%p2, m1,     m2,             m7,     reg_Q1
%else
    FILTER_STEP         m4, m5, F8M, 3, %%p2, m1,     m2,             m7,     reg_Q1, 1
%endif
    FILTER_STEP         m4, m5, F8M, 3, %%p1, m7,     m2,             m0,     m3, 1
    FILTER_STEP         m4, m5, F8M, 3, %%p0, m0,     m2,             reg_Q0, m6, 1
%if ARCH_X86_64
    FILTER_STEP         m4, m5, F8M, 3, %%q0, reg_Q0, m8,             reg_Q1, m6, ARCH_X86_64
    FILTER_STEP         m4, m5, F8M, 3, %%q1, reg_Q1, m9,             m3,     m6, ARCH_X86_64
%else
    FILTER_STEP         m4, m5, F8M, 3, %%q0, reg_Q0, [rsp+0*mmsize], reg_Q1, m6, ARCH_X86_64
    FILTER_STEP         m4, m5, F8M, 3, %%q1, reg_Q1, [rsp+1*mmsize], m3,     m6, ARCH_X86_64
%endif
    FILTER_STEP         m4, m5, F8M, 3, %%q2, m3

    UNSCRATCH            2, 10, %%q0
    UNSCRATCH            6, 11, %%q1
%else
    SWAP                 1, 7
    mova                m2, [%%q0]
    mova                m6, [%%q1]
%endif
    UNSCRATCH            3, 13, rsp+(%%off+4)*mmsize, HEV

    ; m0=p0
    ; m1=p2
    ; m2=q0
    ; m3=hev_mask
    ; m4-5=free
    ; m6=q1
    ; m7=p1

    ; filter_4
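    ; in scalar form, roughly:
    ;   f  = clip(p1 - q1) & hev
    ;   f  = clip(3 * (q0 - p0) + f) & filter4_mask
    ;   f1 = min(f + 4, maxsgn) >> 3,  f2 = min(f + 3, maxsgn) >> 3
    ;   q0 = q0 - f1,  p0 = p0 + f2
    ;   if !hev:  p1 += (f1 + 1) >> 1,  q1 -= (f1 + 1) >> 1
    ; where clip() saturates to the signed range implied by the bit depth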
    psubw               m4, m7, m6              ; p1-q1
    psubw               m5, m2, m0              ; q0-p0
    pand                m4, m3
    pminsw              m4, [pw_ %+ %%maxsgn]
    pmaxsw              m4, [pw_ %+ %%minsgn]   ; clip_intp2(p1-q1, 9) -> f
    paddw               m4, m5
    paddw               m5, m5
    paddw               m4, m5                  ; 3*(q0-p0)+f
    pminsw              m4, [pw_ %+ %%maxsgn]
    pmaxsw              m4, [pw_ %+ %%minsgn]   ; clip_intp2(3*(q0-p0)+f, 9) -> f
    pand                m4, reg_F4M
    paddw               m5, m4, [pw_4]
    paddw               m4, [pw_3]
    pminsw              m5, [pw_ %+ %%maxsgn]
    pminsw              m4, [pw_ %+ %%maxsgn]
    psraw               m5, 3                   ; min_intp2(f+4, 9)>>3 -> f1
    psraw               m4, 3                   ; min_intp2(f+3, 9)>>3 -> f2
    psubw               m2, m5                  ; q0-f1
    paddw               m0, m4                  ; p0+f2
    pandn               m3, m5                  ; f1 & !hev (for p1/q1 adj)
    pxor                m4, m4
    mova                m5, [pw_ %+ %%maxusgn]
    pmaxsw              m2, m4
    pmaxsw              m0, m4
    pminsw              m2, m5
    pminsw              m0, m5
%if cpuflag(ssse3)
    pmulhrsw            m3, [pw_16384]          ; (f1+1)>>1
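    ; (pmulhrsw against 16384 yields (((x * 16384) >> 14) + 1) >> 1 = (x + 1) >> 1,
    ; avoiding the separate add+shift of the non-ssse3 path below)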
%else
    paddw               m3, [pw_1]
    psraw               m3, 1
%endif
    paddw               m7, m3                  ; p1+f
    psubw               m6, m3                  ; q1-f
    pmaxsw              m7, m4
    pmaxsw              m6, m4
    pminsw              m7, m5
    pminsw              m6, m5

    ; store
%ifidn %1, v
    mova            [%%p1], m7
    mova            [%%p0], m0
    mova            [%%q0], m2
    mova            [%%q1], m6
%else ; %1 == h
%if %2 == 4
    TRANSPOSE4x4W        7, 0, 2, 6, 1
    movh   [dst0q+strideq*0-4], m7
    movhps [dst0q+strideq*1-4], m7
    movh   [dst0q+strideq*2-4], m0
    movhps [dst0q+stride3q -4], m0
    movh   [dst4q+strideq*0-4], m2
    movhps [dst4q+strideq*1-4], m2
    movh   [dst4q+strideq*2-4], m6
    movhps [dst4q+stride3q -4], m6
%elif %2 == 8
    mova                m3, [%%p3]
    mova                m4, [%%q2]
    mova                m5, [%%q3]

%if ARCH_X86_64
    TRANSPOSE8x8W        3, 1, 7, 0, 2, 6, 4, 5, 8
%else
    TRANSPOSE8x8W        3, 1, 7, 0, 2, 6, 4, 5, [%%q2], [%%q0], 1
    mova                m2, [%%q0]
%endif

    movu [dst0q+strideq*0-8], m3
    movu [dst0q+strideq*1-8], m1
    movu [dst0q+strideq*2-8], m7
    movu [dst0q+stride3q -8], m0
    movu [dst4q+strideq*0-8], m2
    movu [dst4q+strideq*1-8], m6
    movu [dst4q+strideq*2-8], m4
    movu [dst4q+stride3q -8], m5
%else ; %2 == 16
    SCRATCH              2, 8, %%q0
    SCRATCH              6, 9, %%q1
    mova                m2, [%%p7]
    mova                m3, [%%p6]
    mova                m4, [%%p5]
    mova                m5, [%%p4]
    mova                m6, [%%p3]

%if ARCH_X86_64
    TRANSPOSE8x8W        2, 3, 4, 5, 6, 1, 7, 0, 10
%else
    mova            [%%p1], m7
    TRANSPOSE8x8W        2, 3, 4, 5, 6, 1, 7, 0, [%%p1], [dst4q+strideq*0-16], 1
%endif

    mova [dst0q+strideq*0-16], m2
    mova [dst0q+strideq*1-16], m3
    mova [dst0q+strideq*2-16], m4
    mova [dst0q+stride3q -16], m5
%if ARCH_X86_64
    mova [dst4q+strideq*0-16], m6
%endif
    mova [dst4q+strideq*1-16], m1
    mova [dst4q+strideq*2-16], m7
    mova [dst4q+stride3q -16], m0

    UNSCRATCH            2, 8, %%q0
    UNSCRATCH            6, 9, %%q1
    mova                m0, [%%q2]
    mova                m1, [%%q3]
    mova                m3, [%%q4]
    mova                m4, [%%q5]
%if ARCH_X86_64
    mova                m5, [%%q6]
%endif
    mova                m7, [%%q7]

%if ARCH_X86_64
    TRANSPOSE8x8W        2, 6, 0, 1, 3, 4, 5, 7, 8
%else
    TRANSPOSE8x8W        2, 6, 0, 1, 3, 4, 5, 7, [%%q6], [dst4q+strideq*0], 1
%endif

    mova [dst0q+strideq*0], m2
    mova [dst0q+strideq*1], m6
    mova [dst0q+strideq*2], m0
    mova [dst0q+stride3q ], m1
%if ARCH_X86_64
    mova [dst4q+strideq*0], m3
%endif
    mova [dst4q+strideq*1], m4
    mova [dst4q+strideq*2], m5
    mova [dst4q+stride3q ], m7
%endif ; %2
%endif ; %1
    RET
%endmacro

%macro LOOP_FILTER_CPUSETS 3
INIT_XMM sse2
LOOP_FILTER %1, %2, %3
INIT_XMM ssse3
LOOP_FILTER %1, %2, %3
INIT_XMM avx
LOOP_FILTER %1, %2, %3
%endmacro

%macro LOOP_FILTER_WDSETS 2
LOOP_FILTER_CPUSETS %1,  4, %2
LOOP_FILTER_CPUSETS %1,  8, %2
LOOP_FILTER_CPUSETS %1, 16, %2
%endmacro

LOOP_FILTER_WDSETS h, 10
LOOP_FILTER_WDSETS v, 10
LOOP_FILTER_WDSETS h, 12
LOOP_FILTER_WDSETS v, 12