1 /*
2 * Copyright (C) 2006 Michael Niedermayer <michaelni@gmx.at>
3 *
4 * This file is part of Libav.
5 *
6 * Libav is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * Libav is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License along
17 * with Libav; if not, write to the Free Software Foundation, Inc.,
18 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
19 */
20
/*
 * Instruction-set selection for this template.
 *
 * Every macro below expands to a piece of an inline-asm string, so the
 * filter body can be written once and assembled either for the xmm
 * registers (COMPILE_TEMPLATE_SSE2 defined) or for the mm registers.
 *
 *   MM     register-name prefix; MM"3" names register 3 of the chosen set
 *   MOV    STEP-byte load/store (movq = 8 bytes, movd = 4 bytes)
 *   MOVQ   full-register move (movdqa in the SSE2 build, movq otherwise)
 *   MOVQU  full-register load used for the off-by-a-few-bytes reads of cur
 *          (movdqu in the SSE2 build, plain movq otherwise)
 *   STEP   bytes handled per loop iteration; the pointers advance by this
 *   LOAD   load STEP bytes, then interleave them with register 7.  The
 *          filter zeroes register 7 first, so the bytes end up zero-extended
 *          to 16-bit words.
 *   PSRL1  shift the whole register right by one byte
 *   PSRL2  shift the whole register right by two bytes
 *   PSHUF  NOTE: despite the parameter names, the FIRST argument is the
 *          register that is written.  It receives the second argument's
 *          contents moved down by two bytes.
 */
21 #ifdef COMPILE_TEMPLATE_SSE2
22 #define MM "%%xmm"
23 #define MOV "movq"
24 #define MOVQ "movdqa"
25 #define MOVQU "movdqu"
26 #define STEP 8
27 #define LOAD(mem,dst) \
28 MOV" "mem", "dst" \n\t"\
29 "punpcklbw "MM"7, "dst" \n\t"
/* psrldq shifts by a byte count. */
30 #define PSRL1(reg) "psrldq $1, "reg" \n\t"
31 #define PSRL2(reg) "psrldq $2, "reg" \n\t"
/* Copy the second argument into the first, then shift the copy by 2 bytes. */
32 #define PSHUF(src,dst) "movdqa "dst", "src" \n\t"\
33 "psrldq $2, "src" \n\t"
34 #else
35 #define MM "%%mm"
36 #define MOV "movd"
37 #define MOVQ "movq"
38 #define MOVQU "movq"
39 #define STEP 4
40 #define LOAD(mem,dst) \
41 MOV" "mem", "dst" \n\t"\
42 "punpcklbw "MM"7, "dst" \n\t"
/* psrlq shifts by a bit count: 8 and 16 bits are one and two bytes. */
43 #define PSRL1(reg) "psrlq $8, "reg" \n\t"
44 #define PSRL2(reg) "psrlq $16, "reg" \n\t"
/*
 * pshufw $9 places source words 1 and 2 in result words 0 and 1, which for
 * the low four bytes equals a two-byte right shift.  Result words 2 and 3
 * are both source word 0, i.e. NOT a shift; the one use in this file only
 * unpacks the low half afterwards.
 */
45 #define PSHUF(src,dst) "pshufw $9, "dst", "src" \n\t"
46 #endif
47
/*
 * PABS(tmp, dst): replace each signed 16-bit word of dst with its absolute
 * value.
 *
 * With COMPILE_TEMPLATE_SSSE3 this is a single pabsw and tmp is not touched.
 * Otherwise tmp is overwritten: it is cleared, dst is subtracted from it
 * (giving -dst), and dst becomes the signed per-word maximum of dst and -dst.
 * Callers must therefore treat tmp as clobbered in either build.
 */
48 #ifdef COMPILE_TEMPLATE_SSSE3
49 #define PABS(tmp,dst) \
50 "pabsw "dst", "dst" \n\t"
51 #else
52 #define PABS(tmp,dst) \
53 "pxor "tmp", "tmp" \n\t"\
54 "psubw "dst", "tmp" \n\t"\
55 "pmaxsw "tmp", "dst" \n\t"
56 #endif
57
/*
 * CHECK(pj, mj): evaluate one diagonal interpolation direction.
 *
 * pj and mj are stringified and used as byte displacements: the upper row
 * is read from pj(cur + mrefs) and the lower row from mj(cur + prefs).
 *
 * Register contract:
 *   in    register 7 must be zero (it is the unpack partner)
 *   out   register 5 = candidate prediction as 16-bit words
 *         register 2 = score for this direction as 16-bit words
 *   temp  registers 3 and 4 are overwritten
 *
 * Steps:
 *   1. Average the two rows per byte.  pavgb rounds up, so the low bit of
 *      the XOR of the inputs, masked with pb_1, is subtracted again to get
 *      the rounded-down average.  pb_1 is defined outside this file;
 *      presumably every byte of it is 1 -- TODO confirm.
 *   2. Drop the first byte (PSRL1) and widen to words: the candidate is the
 *      average of the pixels one byte past the load address.
 *   3. Form the per-byte absolute difference of the two rows using two
 *      saturating subtractions and a byte maximum.
 *   4. Add the absolute differences at byte offsets 0, 1 and 2 from the load
 *      address, each widened to words, to give the score.
 */
58 #define CHECK(pj,mj) \
59 MOVQU" "#pj"(%[cur],%[mrefs]), "MM"2 \n\t" /* cur[x-refs-1+j] */\
60 MOVQU" "#mj"(%[cur],%[prefs]), "MM"3 \n\t" /* cur[x+refs-1-j] */\
61 MOVQ" "MM"2, "MM"4 \n\t"\
62 MOVQ" "MM"2, "MM"5 \n\t"\
63 "pxor "MM"3, "MM"4 \n\t"\
64 "pavgb "MM"3, "MM"5 \n\t"\
65 "pand "MANGLE(pb_1)", "MM"4 \n\t"\
66 "psubusb "MM"4, "MM"5 \n\t"\
67 PSRL1(MM"5") \
68 "punpcklbw "MM"7, "MM"5 \n\t" /* (cur[x-refs+j] + cur[x+refs-j])>>1 */\
69 MOVQ" "MM"2, "MM"4 \n\t"\
70 "psubusb "MM"3, "MM"2 \n\t"\
71 "psubusb "MM"4, "MM"3 \n\t"\
72 "pmaxub "MM"3, "MM"2 \n\t"\
73 MOVQ" "MM"2, "MM"3 \n\t"\
74 MOVQ" "MM"2, "MM"4 \n\t" /* ABS(cur[x-refs-1+j] - cur[x+refs-1-j]) */\
75 PSRL1(MM"3") /* ABS(cur[x-refs +j] - cur[x+refs -j]) */\
76 PSRL2(MM"4") /* ABS(cur[x-refs+1+j] - cur[x+refs+1-j]) */\
77 "punpcklbw "MM"7, "MM"2 \n\t"\
78 "punpcklbw "MM"7, "MM"3 \n\t"\
79 "punpcklbw "MM"7, "MM"4 \n\t"\
80 "paddw "MM"3, "MM"2 \n\t"\
81 "paddw "MM"4, "MM"2 \n\t" /* score */
82
/*
 * CHECK1: accept the direction just evaluated by CHECK wherever it scores
 * lower than the best score so far.
 *
 * Register contract:
 *   in    register 0 = best score so far, register 1 = current prediction,
 *         register 2 = new score, register 5 = new candidate prediction
 *   out   register 0 = per-word minimum of the old and new score
 *         register 1 = candidate where (old score > new score), else kept
 *         register 6 = the comparison mask (all ones per word where the
 *                      candidate won), saved for the CHECK2 that follows
 *   temp  register 3 is overwritten; register 5 is left masked
 *
 * The select is the usual and / and-not / or blend driven by the mask.
 */
83 #define CHECK1 \
84 MOVQ" "MM"0, "MM"3 \n\t"\
85 "pcmpgtw "MM"2, "MM"3 \n\t" /* if(score < spatial_score) */\
86 "pminsw "MM"2, "MM"0 \n\t" /* spatial_score= score; */\
87 MOVQ" "MM"3, "MM"6 \n\t"\
88 "pand "MM"3, "MM"5 \n\t"\
89 "pandn "MM"1, "MM"3 \n\t"\
90 "por "MM"5, "MM"3 \n\t"\
91 MOVQ" "MM"3, "MM"1 \n\t" /* spatial_pred= (cur[x-refs+j] + cur[x+refs-j])>>1; */
92
/*
 * CHECK2: same compare-and-select as CHECK1, but only allowed to win in
 * lanes where the preceding CHECK1 accepted its candidate.
 *
 * Register 6 holds the CHECK1 mask (all ones per word where it won, zero
 * elsewhere).  Adding pw_1 and shifting left by 14 turns that into a
 * per-word penalty, which is added to the new score with signed saturation
 * before the comparison.  pw_1 is defined outside this file; if every word
 * of it is 1 (presumably -- TODO confirm) the penalty is 0 where CHECK1 won
 * and 0x4000 where it lost.
 *
 * Register contract: registers 0, 1, 2 and 5 as for CHECK1; register 6 is
 * consumed (overwritten with the penalty); register 3 is overwritten.  No
 * new mask is saved.
 */
93 #define CHECK2 /* pretend not to have checked dir=2 if dir=1 was bad.\
94 hurts both quality and speed, but matches the C version. */\
95 "paddw "MANGLE(pw_1)", "MM"6 \n\t"\
96 "psllw $14, "MM"6 \n\t"\
97 "paddsw "MM"6, "MM"2 \n\t"\
98 MOVQ" "MM"0, "MM"3 \n\t"\
99 "pcmpgtw "MM"2, "MM"3 \n\t"\
100 "pminsw "MM"2, "MM"0 \n\t"\
101 "pand "MM"3, "MM"5 \n\t"\
102 "pandn "MM"1, "MM"3 \n\t"\
103 "por "MM"5, "MM"3 \n\t"\
104 MOVQ" "MM"3, "MM"1 \n\t"
105
/*
 * Filter one line with the SIMD variant selected by the template macros.
 *
 * The loop handles STEP bytes per iteration and runs while x < w, so it
 * produces w rounded up to a multiple of STEP output bytes.
 *
 * Parameters:
 *   dst     output line; STEP bytes are stored per iteration
 *   prev    pointer into the previous picture at the current position
 *   cur     pointer into the current picture at the current position
 *   next    pointer into the next picture at the current position
 *           (prev/cur/next meaning is taken from the names and the asm
 *           comments -- confirm against the caller)
 *   w       number of bytes to process (see the rounding note above)
 *   prefs   byte offset added to a base pointer to reach the row the asm
 *           comments call "x+refs"
 *   mrefs   byte offset added to a base pointer to reach the row the asm
 *           comments call "x-refs"
 *   parity  nonzero: prev2 = prev and next2 = cur;
 *           zero:    prev2 = cur  and next2 = next
 *   mode    when mode >= 2 the extra bound check that reads the rows at
 *           2*mrefs and 2*prefs is skipped
 *
 * Memory accessed beyond the nominal position:
 *   - cur is read with full-register loads at displacements -3 .. +1 on
 *     both neighbouring rows, so bytes before and after the STEP-wide
 *     window are touched;
 *   - prev2/next2 are read at 2*mrefs and 2*prefs when mode < 2.
 *
 * NOTE(review): the main asm statement has no output operands and no
 * clobber list, although it writes tmp and modifies registers 0-7, and its
 * result is handed to the following asm statement in register 1.  This
 * depends on the compiler not using those registers in between.
 */
RENAME(yadif_filter_line)106 static void RENAME (yadif_filter_line) (guint8 * dst, guint8 * prev,
107 guint8 * cur, guint8 * next, int w, int prefs, int mrefs, int parity,
108 int mode)
109 {
/*
 * Spill area for the asm, four 16-byte slots:
 *   tmp +  0  c    (row x-refs of cur, as words)
 *   tmp + 16  d    ((prev2 + next2) >> 1)
 *   tmp + 32  e    (row x+refs of cur, as words)
 *   tmp + 48  diff (temporal difference bound)
 */
110 DECLARE_ALIGNED (16, guint8, tmp)[16 * 4];
111 int x;
112
/*
 * FILTER expands to the complete per-line loop.  It is a macro because the
 * operand names prev2/next2 are pasted into the asm string and differ
 * between the two parity cases below.
 *
 * Per iteration:
 *   1. Zero register 7, load c, e, prev2 and next2 as words, compute d and
 *      temporal_diff0 = |prev2 - next2|, and spill c, d, e.
 *   2. temporal_diff1 = |prev[x-refs] - c| + |prev[x+refs] - e| and
 *      temporal_diff2 likewise with next; diff is the maximum of the three
 *      halved values and is spilled.
 *   3. Register 1 = spatial_pred = (c + e) >> 1.  Register 0 =
 *      spatial_score = |c - e| plus the absolute differences one byte to
 *      the left and one byte to the right, minus pw_1.
 *   4. Four diagonal candidates are tried in the order (-2,0), (-3,1),
 *      (0,-2), (1,-3); the first of each pair goes through CHECK1 and the
 *      second through CHECK2.
 *   5. If mode < 2: with b and f the (prev2 + next2) >> 1 averages at
 *      2*mrefs and 2*prefs,
 *        max  = MAX3(d - e, d - c, MIN(b - c, f - e))
 *        min  = MIN3(d - e, d - c, MAX(b - c, f - e))
 *        diff = MAX3(diff, min, -max)
 *      Register 7 is reused for e here, so it is no longer zero afterwards;
 *      step 1 of the next iteration clears it again.
 *   6. Clamp spatial_pred to [d - diff, d + diff], pack the words back to
 *      bytes with unsigned saturation, and store STEP bytes to dst from a
 *      second asm statement.
 */
113 #define FILTER\
114 for(x=0; x<w; x+=STEP){\
115 __asm__ volatile(\
116 "pxor "MM"7, "MM"7 \n\t"\
117 LOAD("(%[cur],%[mrefs])", MM"0") /* c = cur[x-refs] */\
118 LOAD("(%[cur],%[prefs])", MM"1") /* e = cur[x+refs] */\
119 LOAD("(%["prev2"])", MM"2") /* prev2[x] */\
120 LOAD("(%["next2"])", MM"3") /* next2[x] */\
121 MOVQ" "MM"3, "MM"4 \n\t"\
122 "paddw "MM"2, "MM"3 \n\t"\
123 "psraw $1, "MM"3 \n\t" /* d = (prev2[x] + next2[x])>>1 */\
124 MOVQ" "MM"0, (%[tmp]) \n\t" /* c */\
125 MOVQ" "MM"3, 16(%[tmp]) \n\t" /* d */\
126 MOVQ" "MM"1, 32(%[tmp]) \n\t" /* e */\
127 "psubw "MM"4, "MM"2 \n\t"\
128 PABS( MM"4", MM"2") /* temporal_diff0 */\
129 LOAD("(%[prev],%[mrefs])", MM"3") /* prev[x-refs] */\
130 LOAD("(%[prev],%[prefs])", MM"4") /* prev[x+refs] */\
131 "psubw "MM"0, "MM"3 \n\t"\
132 "psubw "MM"1, "MM"4 \n\t"\
133 PABS( MM"5", MM"3")\
134 PABS( MM"5", MM"4")\
135 "paddw "MM"4, "MM"3 \n\t" /* temporal_diff1 */\
136 "psrlw $1, "MM"2 \n\t"\
137 "psrlw $1, "MM"3 \n\t"\
138 "pmaxsw "MM"3, "MM"2 \n\t"\
139 LOAD("(%[next],%[mrefs])", MM"3") /* next[x-refs] */\
140 LOAD("(%[next],%[prefs])", MM"4") /* next[x+refs] */\
141 "psubw "MM"0, "MM"3 \n\t"\
142 "psubw "MM"1, "MM"4 \n\t"\
143 PABS( MM"5", MM"3")\
144 PABS( MM"5", MM"4")\
145 "paddw "MM"4, "MM"3 \n\t" /* temporal_diff2 */\
146 "psrlw $1, "MM"3 \n\t"\
147 "pmaxsw "MM"3, "MM"2 \n\t"\
148 MOVQ" "MM"2, 48(%[tmp]) \n\t" /* diff */\
149 \
150 "paddw "MM"0, "MM"1 \n\t"\
151 "paddw "MM"0, "MM"0 \n\t"\
152 "psubw "MM"1, "MM"0 \n\t"\
153 "psrlw $1, "MM"1 \n\t" /* spatial_pred */\
154 PABS( MM"2", MM"0") /* ABS(c-e) */\
155 \
156 MOVQU" -1(%[cur],%[mrefs]), "MM"2 \n\t" /* cur[x-refs-1] */\
157 MOVQU" -1(%[cur],%[prefs]), "MM"3 \n\t" /* cur[x+refs-1] */\
158 MOVQ" "MM"2, "MM"4 \n\t"\
159 "psubusb "MM"3, "MM"2 \n\t"\
160 "psubusb "MM"4, "MM"3 \n\t"\
161 "pmaxub "MM"3, "MM"2 \n\t"\
162 PSHUF(MM"3", MM"2") \
163 "punpcklbw "MM"7, "MM"2 \n\t" /* ABS(cur[x-refs-1] - cur[x+refs-1]) */\
164 "punpcklbw "MM"7, "MM"3 \n\t" /* ABS(cur[x-refs+1] - cur[x+refs+1]) */\
165 "paddw "MM"2, "MM"0 \n\t"\
166 "paddw "MM"3, "MM"0 \n\t"\
167 "psubw "MANGLE(pw_1)", "MM"0 \n\t" /* spatial_score */\
168 \
169 CHECK(-2,0)\
170 CHECK1\
171 CHECK(-3,1)\
172 CHECK2\
173 CHECK(0,-2)\
174 CHECK1\
175 CHECK(1,-3)\
176 CHECK2\
177 \
178 /* if(p->mode<2) ... */\
179 MOVQ" 48(%[tmp]), "MM"6 \n\t" /* diff */\
180 "cmpl $2, %[mode] \n\t"\
181 "jge 1f \n\t"\
182 LOAD("(%["prev2"],%[mrefs],2)", MM"2") /* prev2[x-2*refs] */\
183 LOAD("(%["next2"],%[mrefs],2)", MM"4") /* next2[x-2*refs] */\
184 LOAD("(%["prev2"],%[prefs],2)", MM"3") /* prev2[x+2*refs] */\
185 LOAD("(%["next2"],%[prefs],2)", MM"5") /* next2[x+2*refs] */\
186 "paddw "MM"4, "MM"2 \n\t"\
187 "paddw "MM"5, "MM"3 \n\t"\
188 "psrlw $1, "MM"2 \n\t" /* b */\
189 "psrlw $1, "MM"3 \n\t" /* f */\
190 MOVQ" (%[tmp]), "MM"4 \n\t" /* c */\
191 MOVQ" 16(%[tmp]), "MM"5 \n\t" /* d */\
192 MOVQ" 32(%[tmp]), "MM"7 \n\t" /* e */\
193 "psubw "MM"4, "MM"2 \n\t" /* b-c */\
194 "psubw "MM"7, "MM"3 \n\t" /* f-e */\
195 MOVQ" "MM"5, "MM"0 \n\t"\
196 "psubw "MM"4, "MM"5 \n\t" /* d-c */\
197 "psubw "MM"7, "MM"0 \n\t" /* d-e */\
198 MOVQ" "MM"2, "MM"4 \n\t"\
199 "pminsw "MM"3, "MM"2 \n\t"\
200 "pmaxsw "MM"4, "MM"3 \n\t"\
201 "pmaxsw "MM"5, "MM"2 \n\t"\
202 "pminsw "MM"5, "MM"3 \n\t"\
203 "pmaxsw "MM"0, "MM"2 \n\t" /* max */\
204 "pminsw "MM"0, "MM"3 \n\t" /* min */\
205 "pxor "MM"4, "MM"4 \n\t"\
206 "pmaxsw "MM"3, "MM"6 \n\t"\
207 "psubw "MM"2, "MM"4 \n\t" /* -max */\
208 "pmaxsw "MM"4, "MM"6 \n\t" /* diff= MAX3(diff, min, -max); */\
209 "1: \n\t"\
210 \
211 MOVQ" 16(%[tmp]), "MM"2 \n\t" /* d */\
212 MOVQ" "MM"2, "MM"3 \n\t"\
213 "psubw "MM"6, "MM"2 \n\t" /* d-diff */\
214 "paddw "MM"6, "MM"3 \n\t" /* d+diff */\
215 "pmaxsw "MM"2, "MM"1 \n\t"\
216 "pminsw "MM"3, "MM"1 \n\t" /* d = clip(spatial_pred, d-diff, d+diff); */\
217 "packuswb "MM"1, "MM"1 \n\t"\
218 \
219 ::[prev] "r"(prev),\
220 [cur] "r"(cur),\
221 [next] "r"(next),\
222 [prefs]"r"((x86_reg)prefs),\
223 [mrefs]"r"((x86_reg)mrefs),\
224 [mode] "g"(mode),\
225 [tmp] "r"(tmp)\
226 );\
227 __asm__ volatile(MOV" "MM"1, %0" :"=m"(*dst));\
228 dst += STEP;\
229 prev+= STEP;\
230 cur += STEP;\
231 next+= STEP;\
232 }
233
/*
 * Instantiate the loop once per parity.  prev2/next2 are string macros that
 * pick which named asm operand is used for the two temporal neighbours; they
 * are undefined again right after each expansion.
 */
234 if (parity) {
/* parity != 0: temporal neighbours are prev and cur */
235 #define prev2 "prev"
236 #define next2 "cur"
237 FILTER
238 #undef prev2
239 #undef next2
240 } else {
/* parity == 0: temporal neighbours are cur and next */
241 #define prev2 "cur"
242 #define next2 "next"
243 FILTER
244 #undef prev2
245 #undef next2
246 }
247 }
248
/*
 * Drop every helper macro defined above so none of them leaks past this
 * file.  Presumably this also lets the file be included again with a
 * different COMPILE_TEMPLATE_* / RENAME setting -- confirm against the
 * including source.
 */
249 #undef STEP
250 #undef MM
251 #undef MOV
252 #undef MOVQ
253 #undef MOVQU
254 #undef PSHUF
255 #undef PSRL1
256 #undef PSRL2
257 #undef LOAD
258 #undef PABS
259 #undef CHECK
260 #undef CHECK1
261 #undef CHECK2
262 #undef FILTER
263