;*****************************************************************************
;* x86-optimized functions for yadif filter
;*
;* Copyright (C) 2006 Michael Niedermayer <michaelni@gmx.at>
;* Copyright (c) 2013 Daniel Kang <daniel.d.kang@gmail.com>
;* Copyright (c) 2011-2013 James Darnley <james.darnley@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

; Eight 16-bit words, each holding 1.  CHECK uses it as a per-word
; least-significant-bit mask, and FILTER subtracts it from the initial
; spatial score.
pw_1: times 8 dw 1

SECTION .text

; PMAXUW dst, src
; Per-word unsigned maximum: dst = max(dst, src).
; When the sse4 cpuflag is set the native pmaxuw instruction is emitted.
; Otherwise the result is built from saturating arithmetic: psubusw leaves
; max(dst - src, 0) in dst, and adding src back gives max(dst, src).
; Clobbers: dst only.
%macro PMAXUW 2
%if cpuflag(sse4)
    pmaxuw %1, %2
%else
    psubusw %1, %2
    paddusw %1, %2
%endif
%endmacro

; CHECK a, b
; Evaluates one interpolation candidate from two rows of the current field.
;   a - offset in pixels added to the row at curq+t1
;   b - offset in pixels added to the row at curq+t0
; Pixels are 16-bit words, hence the *2 in both addresses.
; Out:      m5 = candidate prediction (rounded-down average of the two rows)
;           m2 = candidate score (sum of shifted absolute differences)
; Clobbers: m3, m4
; NOTE(review): RSHIFT comes from the included utility macros; the comments
; below assume it shifts the whole register right by a byte count, so 2 and 4
; mean one and two pixels.  Confirm against its definition.
%macro CHECK 2
    movu      m2, [curq+t1+%1*2]
    movu      m3, [curq+t0+%2*2]
    mova      m4, m2
    mova      m5, m2
    pxor      m4, m3
    pavgw     m5, m3                ; rounded-up average of the two rows
    pand      m4, [pw_1]            ; 1 where the two inputs differ in their LSB
    psubusw   m5, m4                ; undo the round-up: m5 = (x + y) >> 1
    RSHIFT    m5, 2
    mova      m4, m2
    psubusw   m2, m3
    psubusw   m3, m4
    PMAXUW    m2, m3                ; m2 = |x - y| per word (unsigned)
    mova      m3, m2
    mova      m4, m2
    RSHIFT    m3, 2
    RSHIFT    m4, 4
    paddw     m2, m3                ; score = sum of the difference and its two
    paddw     m2, m4                ; shifted copies
%endmacro

; CHECK1
; Accepts the candidate produced by CHECK wherever its score is lower than the
; running minimum (signed word compare).
; In:       m0 = running minimum score, m1 = current prediction,
;           m2 = candidate score,       m5 = candidate prediction
; Out:      m0 = min(m0, m2)
;           m1 = candidate prediction where accepted, old m1 elsewhere
;           m6 = acceptance mask (all ones per accepted word), read by CHECK2
; Clobbers: m3, m5
%macro CHECK1 0
    mova    m3, m0
    pcmpgtw m3, m2                  ; mask = old score > candidate score
    pminsw  m0, m2
    mova    m6, m3                  ; keep the mask for the following CHECK2
    pand    m5, m3                  ; candidate prediction where accepted
    pandn   m3, m1                  ; old prediction where rejected
    por     m3, m5
    mova    m1, m3
%endmacro

; Previous implementation of CHECK2, kept for reference:
;
; %macro CHECK2 0
;     paddw   m6, [pw_1]
;     psllw   m6, 14
;     paddsw  m2, m6
;     mova    m3, m0
;     pcmpgtw m3, m2
;     pminsw  m0, m2
;     pand    m5, m3
;     pandn   m3, m1
;     por     m3, m5
;     mova    m1, m3
; %endmacro

; The version of CHECK2 below is required for 14-bit samples.  The left-shift
; trick used by the old code does not produce a large enough value to select
; pixels or scores correctly.

; CHECK2
; Second stage of a CHECK1/CHECK2 pair: the new candidate is accepted only
; where its score beats the running minimum AND the preceding CHECK1 accepted
; its own candidate (mask in m6).
; In:       m0 = running minimum score, m1 = current prediction,
;           m2 = candidate score,       m5 = candidate prediction,
;           m6 = acceptance mask left by CHECK1
; Out:      m0 = candidate score where accepted, old score elsewhere
;           m1 = candidate prediction where accepted, old prediction elsewhere
; Clobbers: m2, m3, m5, m6
%macro CHECK2 0
    mova    m3, m0                  ; save the old score
    pcmpgtw m0, m2                  ; old score > candidate score
    pand    m0, m6                  ; ... and CHECK1 accepted: combined mask
    mova    m6, m0
    pand    m5, m6                  ; candidate prediction where accepted
    pand    m2, m0                  ; candidate score where accepted
    pandn   m6, m1                  ; old prediction where rejected
    pandn   m0, m3                  ; old score where rejected
    por     m6, m5
    por     m0, m2
    mova    m1, m6
%endmacro

; LOAD dst, src
; Loads one full vector register from memory with movu, so src does not need
; to be aligned.
%macro LOAD 2
    movu      %1, %2
%endmacro

; FILTER id, rowA, rowB
; Loop body of the line filter.  Each iteration produces one register of
; 16-bit output pixels at dstq.
;   id   - suffix that makes the local labels unique (.loop<id>, .end<id>)
;   rowA - base register of the first row used for the temporal average
;   rowB - base register of the second row used for the temporal average
; Expects: dstq/prevq/curq/nextq set up by the caller, t0 = prefs and
;          t1 = mrefs (line offsets in bytes), 64 bytes of scratch at rsp,
;          r4m = remaining width, r8m = mode.
; Names used in the comments below:
;   c = cur[mrefs], e = cur[prefs], d = (rowA + rowB) >> 1
; Stack:   rsp+0 = c, rsp+16 = d, rsp+32 = e, rsp+48 = temporal difference
; Clobbers: m0-m7, flags; advances dstq, prevq, curq, nextq; decrements r4m.
; NOTE(review): ABS1, ABS2 and RSHIFT are defined in the included utility
; macros.  The comments assume ABS1/ABS2 take per-word absolute values using
; their extra operands as scratch, and that RSHIFT shifts the whole register
; right by a byte count.  Confirm against their definitions.
%macro FILTER 3
.loop%1:
    ; NOTE(review): no instruction visible in this macro reads m7 before it
    ; is reloaded from rsp+32 further down; confirm the helper macros do not
    ; depend on it before treating this as dead code.
    pxor         m7, m7
    LOAD         m0, [curq+t1]      ; c
    LOAD         m1, [curq+t0]      ; e
    LOAD         m2, [%2]
    LOAD         m3, [%3]
    mova         m4, m3
    paddw        m3, m2
    psraw        m3, 1              ; d = (rowA + rowB) >> 1
    mova   [rsp+ 0], m0
    mova   [rsp+16], m3
    mova   [rsp+32], m1
    psubw        m2, m4
    ABS1         m2, m4             ; |rowA - rowB|
    LOAD         m3, [prevq+t1]
    LOAD         m4, [prevq+t0]
    psubw        m3, m0
    psubw        m4, m1
    ABS2         m3, m4, m5, m6     ; |prev[mrefs] - c|, |prev[prefs] - e|
    paddw        m3, m4
    psrlw        m2, 1              ; diff0 = |rowA - rowB| >> 1
    psrlw        m3, 1              ; diff1 = (sum of the two above) >> 1
    pmaxsw       m2, m3
    LOAD         m3, [nextq+t1]
    LOAD         m4, [nextq+t0]
    psubw        m3, m0
    psubw        m4, m1
    ABS2         m3, m4, m5, m6     ; |next[mrefs] - c|, |next[prefs] - e|
    paddw        m3, m4
    psrlw        m3, 1              ; diff2
    pmaxsw       m2, m3             ; diff = max(diff0, diff1, diff2)
    mova   [rsp+48], m2

    ; Initial spatial prediction and score from the pixels directly above
    ; and below.
    paddw        m1, m0             ; c + e
    paddw        m0, m0
    psubw        m0, m1             ; c - e
    psrlw        m1, 1              ; m1 = spatial prediction = (c + e) >> 1
    ABS1         m0, m2             ; m0 = |c - e|

    ; Add the absolute differences taken from the loads that start one pixel
    ; to the left, and from the same vector shifted by RSHIFT 4.
    movu         m2, [curq+t1-1*2]
    movu         m3, [curq+t0-1*2]
    mova         m4, m2
    psubusw      m2, m3
    psubusw      m3, m4
    PMAXUW       m2, m3
    mova         m3, m2
    RSHIFT       m3, 4
    paddw        m0, m2
    paddw        m0, m3
    psubw        m0, [pw_1]         ; m0 = initial spatial score - 1

    ; Try the diagonal candidates.  Each CHECK2 only applies where the
    ; CHECK1 before it accepted its candidate.
    CHECK -2, 0
    CHECK1
    CHECK -3, 1
    CHECK2
    CHECK 0, -2
    CHECK1
    CHECK 1, -3
    CHECK2

    mova         m6, [rsp+48]       ; m6 = diff
    cmp   DWORD r8m, 2              ; r8m is the 9th argument (mode)
    jge .end%1                      ; mode >= 2 skips the extra check below
    LOAD         m2, [%2+t1*2]
    LOAD         m4, [%3+t1*2]
    LOAD         m3, [%2+t0*2]
    LOAD         m5, [%3+t0*2]
    paddw        m2, m4
    paddw        m3, m5
    psrlw        m2, 1              ; b = (rowA[2*mrefs] + rowB[2*mrefs]) >> 1
    psrlw        m3, 1              ; f = (rowA[2*prefs] + rowB[2*prefs]) >> 1
    mova         m4, [rsp+ 0]       ; c
    mova         m5, [rsp+16]       ; d
    mova         m7, [rsp+32]       ; e
    psubw        m2, m4             ; b - c
    psubw        m3, m7             ; f - e
    mova         m0, m5
    psubw        m5, m4             ; d - c
    psubw        m0, m7             ; d - e
    mova         m4, m2
    pminsw       m2, m3             ; min(b - c, f - e)
    pmaxsw       m3, m4             ; max(b - c, f - e)
    pmaxsw       m2, m5
    pminsw       m3, m5
    pmaxsw       m2, m0             ; m2 = max(d - e, d - c, min(b - c, f - e))
    pminsw       m3, m0             ; m3 = min(d - e, d - c, max(b - c, f - e))
    pxor         m4, m4
    pmaxsw       m6, m3             ; diff = max(diff, m3)
    psubw        m4, m2
    pmaxsw       m6, m4             ; diff = max(diff, -m2)

.end%1:
    ; Clamp the spatial prediction to [d - diff, d + diff].
    mova         m2, [rsp+16]
    mova         m3, m2
    psubw        m2, m6
    paddw        m3, m6
    pmaxsw       m1, m2
    pminsw       m1, m3

    ; A full register is stored, but the pointers advance by only mmsize-4
    ; bytes (mmsize/2-2 pixels), so the next iteration rewrites the last two
    ; words of this store.
    movu     [dstq], m1
    add        dstq, mmsize-4
    add       prevq, mmsize-4
    add        curq, mmsize-4
    add       nextq, mmsize-4
    sub   DWORD r4m, mmsize/2-2     ; r4m is the 5th argument (w)
    jg .loop%1
%endmacro

; YADIF
; Emits yadif_filter_line_10bit for the instruction set selected by the
; preceding INIT_* invocation.
; Named arguments, in order: dst, prev, cur, next, w, prefs, mrefs, parity,
; mode.  After the prologue t0 holds prefs and t1 holds mrefs.  The parity
; argument selects which pair of rows FILTER averages:
;   parity != 0 : prev and cur
;   parity == 0 : cur and next
; NOTE(review): the numeric cglobal operands follow the x86inc convention,
; which is not visible in this file.  They are presumably the number of
; arguments loaded into registers, the number of general-purpose registers
; used, the number of vector registers used, and the stack bytes reserved
; (FILTER uses rsp+0 .. rsp+63).
%macro YADIF 0
%if ARCH_X86_32
cglobal yadif_filter_line_10bit, 4, 6, 8, 80, dst, prev, cur, next, w, \
                                              prefs, mrefs, parity, mode
    ; r4 would otherwise be the register for w; FILTER reads the width
    ; through r4m instead, so r4 and r5 can carry the two line offsets.
    mov            r4, r5mp
    mov            r5, r6mp
    DECLARE_REG_TMP 4,5
%else
cglobal yadif_filter_line_10bit, 4, 7, 8, 80, dst, prev, cur, next, w, \
                                              prefs, mrefs, parity, mode
    ; prefs and mrefs are read as 32-bit values and sign-extended so they can
    ; be used in 64-bit addresses.
    movsxd         r5, DWORD r5m
    movsxd         r6, DWORD r6m
    DECLARE_REG_TMP 5,6
%endif

    cmp DWORD paritym, 0
    je .parity0
    FILTER 1, prevq, curq
    jmp .ret

.parity0:
    FILTER 0, curq, nextq

.ret:
    RET
%endmacro

; Instantiate the filter once per supported instruction set.
INIT_XMM ssse3
YADIF
INIT_XMM sse2
YADIF
; The MMX-register variant is only assembled for 32-bit x86.
%if ARCH_X86_32
INIT_MMX mmxext
YADIF
%endif
