1 /*
2  * Copyright (C) 2006 Michael Niedermayer <michaelni@gmx.at>
3  *
4  * This file is part of Libav.
5  *
6  * Libav is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation; either version 2 of the License, or
9  * (at your option) any later version.
10  *
11  * Libav is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License along
17  * with Libav; if not, write to the Free Software Foundation, Inc.,
18  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
19  */
20 
/*
 * Instruction-set selection for this template.
 *
 * The macros below expand to string fragments that are pasted into the
 * inline assembly further down, so the same source can be built for 128-bit
 * SSE2 registers (COMPILE_TEMPLATE_SSE2 defined) or 64-bit MMX registers.
 *
 *   MM     register name prefix; the register number is appended at use sites
 *   MOV    STEP-byte move (movq = 8 bytes, movd = 4 bytes); used by LOAD and
 *          for the final store to dst
 *   MOVQ   full-register move (movdqa is the aligned 128-bit form)
 *   MOVQU  full-register move from a possibly unaligned address
 *   STEP   number of pixels handled per loop iteration of FILTER
 *   LOAD   load STEP bytes and interleave them with the bytes of register 7;
 *          FILTER clears register 7 with pxor first, so the result is the
 *          loaded bytes widened to 16-bit words
 *   PSRL1  shift the whole register right by one byte
 *   PSRL2  shift the whole register right by two bytes
 *   PSHUF  see the per-variant notes below
 */
21 #ifdef COMPILE_TEMPLATE_SSE2
22 #define MM "%%xmm"
23 #define MOV  "movq"
24 #define MOVQ "movdqa"
25 #define MOVQU "movdqu"
26 #define STEP 8
27 #define LOAD(mem,dst) \
28             MOV"       "mem", "dst" \n\t"\
29             "punpcklbw "MM"7, "dst" \n\t"
30 #define PSRL1(reg) "psrldq $1, "reg" \n\t"
31 #define PSRL2(reg) "psrldq $2, "reg" \n\t"
/* Note the argument naming: in AT&T operand order the register passed as
 * "src" is the one written. It receives a copy of "dst" and is then shifted
 * right by two bytes. */
32 #define PSHUF(src,dst) "movdqa "dst", "src" \n\t"\
33                        "psrldq $2, "src"     \n\t"
34 #else
35 #define MM "%%mm"
36 #define MOV  "movd"
37 #define MOVQ "movq"
38 #define MOVQU "movq"
39 #define STEP 4
40 #define LOAD(mem,dst) \
41             MOV"       "mem", "dst" \n\t"\
42             "punpcklbw "MM"7, "dst" \n\t"
43 #define PSRL1(reg) "psrlq $8, "reg" \n\t"
44 #define PSRL2(reg) "psrlq $16, "reg" \n\t"
/* Same argument naming as the SSE2 variant: the register passed as "src" is
 * written. Shuffle control 9 puts words 1 and 2 of "dst" into the low two
 * words of "src", i.e. the low four bytes are those of "dst" moved down by
 * two bytes (only the low four bytes are consumed by the following
 * punpcklbw in FILTER). */
45 #define PSHUF(src,dst) "pshufw $9, "dst", "src" \n\t"
46 #endif
47 
/*
 * PABS(tmp,dst): replace each signed 16-bit word of dst by its absolute value.
 *
 * With COMPILE_TEMPLATE_SSSE3 the single pabsw instruction is used and tmp is
 * left untouched. Otherwise tmp is overwritten: it is zeroed, made to hold
 * -dst, and dst becomes the signed per-word maximum of dst and -dst.
 */
48 #ifdef COMPILE_TEMPLATE_SSSE3
49 #define PABS(tmp,dst) \
50             "pabsw     "dst", "dst" \n\t"
51 #else
52 #define PABS(tmp,dst) \
53             "pxor     "tmp", "tmp" \n\t"\
54             "psubw    "dst", "tmp" \n\t"\
55             "pmaxsw   "tmp", "dst" \n\t"
56 #endif
57 
/*
 * CHECK(pj,mj): evaluate one diagonal direction for the spatial predictor.
 *
 * pj and mj are stringified and used as byte displacements for two unaligned
 * full-register loads, from pj(%[cur],%[mrefs]) and mj(%[cur],%[prefs]).
 *
 * On exit:
 *   register 5  candidate prediction as 16-bit words: the rounded-down
 *               average of the two loaded rows, taken one byte in from the
 *               load address. pavgb rounds up, so the low bit of the XOR of
 *               the inputs (masked with pb_1) is subtracted to round down.
 *   register 2  score as 16-bit words: sum of the absolute byte differences
 *               of the two rows at three adjacent positions (load address,
 *               +1 byte, +2 bytes).
 * Registers 3 and 4 are used as scratch. Register 7 must hold zero, because
 * it is the interleave source for punpcklbw.
 *
 * pb_1 is referenced through MANGLE() and is defined outside this file;
 * presumably it holds the value 1 in every byte -- confirm at the definition.
 */
58 #define CHECK(pj,mj) \
59             MOVQU" "#pj"(%[cur],%[mrefs]), "MM"2 \n\t" /* cur[x-refs-1+j] */\
60             MOVQU" "#mj"(%[cur],%[prefs]), "MM"3 \n\t" /* cur[x+refs-1-j] */\
61             MOVQ"      "MM"2, "MM"4 \n\t"\
62             MOVQ"      "MM"2, "MM"5 \n\t"\
63             "pxor      "MM"3, "MM"4 \n\t"\
64             "pavgb     "MM"3, "MM"5 \n\t"\
65             "pand     "MANGLE(pb_1)", "MM"4 \n\t"\
66             "psubusb   "MM"4, "MM"5 \n\t"\
67             PSRL1(MM"5")                 \
68             "punpcklbw "MM"7, "MM"5 \n\t" /* (cur[x-refs+j] + cur[x+refs-j])>>1 */\
69             MOVQ"      "MM"2, "MM"4 \n\t"\
70             "psubusb   "MM"3, "MM"2 \n\t"\
71             "psubusb   "MM"4, "MM"3 \n\t"\
72             "pmaxub    "MM"3, "MM"2 \n\t"\
73             MOVQ"      "MM"2, "MM"3 \n\t"\
74             MOVQ"      "MM"2, "MM"4 \n\t" /* ABS(cur[x-refs-1+j] - cur[x+refs-1-j]) */\
75             PSRL1(MM"3")                  /* ABS(cur[x-refs  +j] - cur[x+refs  -j]) */\
76             PSRL2(MM"4")                  /* ABS(cur[x-refs+1+j] - cur[x+refs+1-j]) */\
77             "punpcklbw "MM"7, "MM"2 \n\t"\
78             "punpcklbw "MM"7, "MM"3 \n\t"\
79             "punpcklbw "MM"7, "MM"4 \n\t"\
80             "paddw     "MM"3, "MM"2 \n\t"\
81             "paddw     "MM"4, "MM"2 \n\t"       /* score */
82 
/*
 * CHECK1: accept the candidate produced by CHECK where its score is better.
 *
 * Expects register 0 = best score so far, register 1 = current prediction,
 * register 2 = new score, register 5 = new candidate (all 16-bit words).
 * A per-word mask (register 0 > register 2, signed) is built in register 3
 * and copied to register 6 so that a following CHECK2 can tell whether this
 * step succeeded. Register 0 becomes the per-word minimum of both scores and
 * register 1 takes the candidate wherever the mask is set. Registers 3 and 5
 * are overwritten in the process.
 */
83 #define CHECK1 \
84             MOVQ"      "MM"0, "MM"3 \n\t"\
85             "pcmpgtw   "MM"2, "MM"3 \n\t" /* if(score < spatial_score) */\
86             "pminsw    "MM"2, "MM"0 \n\t" /* spatial_score= score; */\
87             MOVQ"      "MM"3, "MM"6 \n\t"\
88             "pand      "MM"3, "MM"5 \n\t"\
89             "pandn     "MM"1, "MM"3 \n\t"\
90             "por       "MM"5, "MM"3 \n\t"\
91             MOVQ"      "MM"3, "MM"1 \n\t"       /* spatial_pred= (cur[x-refs+j] + cur[x+refs-j])>>1; */
92 
/*
 * CHECK2: like CHECK1, but only lets the candidate win where the preceding
 * CHECK1 also won.
 *
 * Register 6 holds the mask saved by CHECK1 (all ones or all zeros per word).
 * pw_1 is added to it and the result is shifted left by 14 before being added
 * to the new score (register 2) with signed saturation. pw_1 is defined
 * outside this file; presumably it is 1 in every word, in which case a set
 * mask becomes 0 (score unchanged) and a clear mask becomes 0x4000 (score
 * pushed up so the comparison below fails) -- confirm at the definition.
 * The compare/select that follows is the same as in CHECK1, except that the
 * mask is not saved again. Registers 3, 5 and 6 are overwritten.
 */
93 #define CHECK2 /* pretend not to have checked dir=2 if dir=1 was bad.\
94                   hurts both quality and speed, but matches the C version. */\
95             "paddw    "MANGLE(pw_1)", "MM"6 \n\t"\
96             "psllw     $14,   "MM"6 \n\t"\
97             "paddsw    "MM"6, "MM"2 \n\t"\
98             MOVQ"      "MM"0, "MM"3 \n\t"\
99             "pcmpgtw   "MM"2, "MM"3 \n\t"\
100             "pminsw    "MM"2, "MM"0 \n\t"\
101             "pand      "MM"3, "MM"5 \n\t"\
102             "pandn     "MM"1, "MM"3 \n\t"\
103             "por       "MM"5, "MM"3 \n\t"\
104             MOVQ"      "MM"3, "MM"1 \n\t"
105 
/*
 * Filter one output line, STEP pixels per loop iteration, using inline
 * MMX/SSE2 assembly.
 *
 * For each group of pixels the code forms a spatial prediction from the two
 * rows cur+mrefs and cur+prefs (straight average, then up to four diagonal
 * candidates via CHECK/CHECK1/CHECK2) and clips it to [d - diff, d + diff],
 * where d is the average of prev2 and next2 at the current position and diff
 * is derived from the temporal differences computed below.
 *
 * dst     output row; STEP bytes are stored per iteration
 * prev,
 * cur,
 * next    input rows, advanced by STEP per iteration (presumably the same
 *         line position in three consecutive pictures -- confirm with caller)
 * w       number of pixels to produce; the loop runs while x < w
 * prefs,
 * mrefs   byte offsets added to the row pointers to reach the neighbouring
 *         lines (the inline comments treat them as +refs and -refs)
 * parity  non-zero: prev2 = prev, next2 = cur; zero: prev2 = cur, next2 = next
 * mode    values below 2 enable the extra check that also reads the rows at
 *         2*mrefs and 2*prefs; values of 2 or more jump over it
 *
 * NOTE(review): the loop always works on whole groups of STEP pixels, so when
 * w is not a multiple of STEP the last iteration stores past dst + w. The
 * unaligned loads use displacements from -3 to +1 and are one full register
 * wide, so bytes before and after the pixels being produced are read as
 * well. Callers presumably provide suitably padded rows -- confirm.
 */
RENAME(yadif_filter_line)106 static void RENAME (yadif_filter_line) (guint8 * dst, guint8 * prev,
107     guint8 * cur, guint8 * next, int w, int prefs, int mrefs, int parity,
108     int mode)
109 {
  /* Scratch area addressed from the assembly as four 16-byte slots:
   * offset 0 = c, 16 = d, 32 = e, 48 = diff. */
110   DECLARE_ALIGNED (16, guint8, tmp)[16 * 4];
111   int x;
112 

/*
 * FILTER is the loop body shared by both parity branches; it is expanded
 * twice with different definitions of prev2/next2, which are pasted into the
 * operand names of the assembly.
 *
 * NOTE(review): the first asm statement declares only input operands even
 * though it stores to tmp and modifies the vector registers, and the second
 * asm statement stores register 1 as left behind by the first. Neither
 * dependency is expressed in the constraints or clobber lists -- confirm this
 * is acceptable for the compilers in use.
 */
113 #define FILTER\
114     for(x=0; x<w; x+=STEP){\
115         __asm__ volatile(\
116             "pxor      "MM"7, "MM"7 \n\t"\
117             LOAD("(%[cur],%[mrefs])", MM"0") /* c = cur[x-refs] */\
118             LOAD("(%[cur],%[prefs])", MM"1") /* e = cur[x+refs] */\
119             LOAD("(%["prev2"])", MM"2") /* prev2[x] */\
120             LOAD("(%["next2"])", MM"3") /* next2[x] */\
121             MOVQ"      "MM"3, "MM"4 \n\t"\
122             "paddw     "MM"2, "MM"3 \n\t"\
123             "psraw     $1,    "MM"3 \n\t" /* d = (prev2[x] + next2[x])>>1 */\
124             MOVQ"      "MM"0,   (%[tmp]) \n\t" /* c */\
125             MOVQ"      "MM"3, 16(%[tmp]) \n\t" /* d */\
126             MOVQ"      "MM"1, 32(%[tmp]) \n\t" /* e */\
127             "psubw     "MM"4, "MM"2 \n\t"\
128             PABS(      MM"4", MM"2") /* temporal_diff0 */\
129             LOAD("(%[prev],%[mrefs])", MM"3") /* prev[x-refs] */\
130             LOAD("(%[prev],%[prefs])", MM"4") /* prev[x+refs] */\
131             "psubw     "MM"0, "MM"3 \n\t"\
132             "psubw     "MM"1, "MM"4 \n\t"\
133             PABS(      MM"5", MM"3")\
134             PABS(      MM"5", MM"4")\
135             "paddw     "MM"4, "MM"3 \n\t" /* temporal_diff1 */\
136             "psrlw     $1,    "MM"2 \n\t"\
137             "psrlw     $1,    "MM"3 \n\t"\
138             "pmaxsw    "MM"3, "MM"2 \n\t"\
139             LOAD("(%[next],%[mrefs])", MM"3") /* next[x-refs] */\
140             LOAD("(%[next],%[prefs])", MM"4") /* next[x+refs] */\
141             "psubw     "MM"0, "MM"3 \n\t"\
142             "psubw     "MM"1, "MM"4 \n\t"\
143             PABS(      MM"5", MM"3")\
144             PABS(      MM"5", MM"4")\
145             "paddw     "MM"4, "MM"3 \n\t" /* temporal_diff2 */\
146             "psrlw     $1,    "MM"3 \n\t"\
147             "pmaxsw    "MM"3, "MM"2 \n\t"\
148             MOVQ"      "MM"2, 48(%[tmp]) \n\t" /* diff */\
149 \
150             "paddw     "MM"0, "MM"1 \n\t" /* c+e */\
151             "paddw     "MM"0, "MM"0 \n\t" /* 2*c */\
152             "psubw     "MM"1, "MM"0 \n\t" /* 2*c - (c+e) = c-e */\
153             "psrlw     $1,    "MM"1 \n\t" /* spatial_pred */\
154             PABS(      MM"2", MM"0")      /* ABS(c-e) */\
155 \
156             MOVQU" -1(%[cur],%[mrefs]), "MM"2 \n\t" /* cur[x-refs-1] */\
157             MOVQU" -1(%[cur],%[prefs]), "MM"3 \n\t" /* cur[x+refs-1] */\
158             MOVQ"      "MM"2, "MM"4 \n\t"\
159             "psubusb   "MM"3, "MM"2 \n\t"\
160             "psubusb   "MM"4, "MM"3 \n\t"\
161             "pmaxub    "MM"3, "MM"2 \n\t"\
162             PSHUF(MM"3", MM"2") \
163             "punpcklbw "MM"7, "MM"2 \n\t" /* ABS(cur[x-refs-1] - cur[x+refs-1]) */\
164             "punpcklbw "MM"7, "MM"3 \n\t" /* ABS(cur[x-refs+1] - cur[x+refs+1]) */\
165             "paddw     "MM"2, "MM"0 \n\t"\
166             "paddw     "MM"3, "MM"0 \n\t"\
167             "psubw    "MANGLE(pw_1)", "MM"0 \n\t" /* spatial_score */\
168 \
169             CHECK(-2,0)\
170             CHECK1\
171             CHECK(-3,1)\
172             CHECK2\
173             CHECK(0,-2)\
174             CHECK1\
175             CHECK(1,-3)\
176             CHECK2\
177 \
178             /* if(p->mode<2) ... */\
179             MOVQ" 48(%[tmp]), "MM"6 \n\t" /* diff */\
180             "cmpl      $2, %[mode] \n\t"\
181             "jge       1f \n\t"\
182             LOAD("(%["prev2"],%[mrefs],2)", MM"2") /* prev2[x-2*refs] */\
183             LOAD("(%["next2"],%[mrefs],2)", MM"4") /* next2[x-2*refs] */\
184             LOAD("(%["prev2"],%[prefs],2)", MM"3") /* prev2[x+2*refs] */\
185             LOAD("(%["next2"],%[prefs],2)", MM"5") /* next2[x+2*refs] */\
186             "paddw     "MM"4, "MM"2 \n\t"\
187             "paddw     "MM"5, "MM"3 \n\t"\
188             "psrlw     $1,    "MM"2 \n\t" /* b */\
189             "psrlw     $1,    "MM"3 \n\t" /* f */\
190             MOVQ"   (%[tmp]), "MM"4 \n\t" /* c */\
191             MOVQ" 16(%[tmp]), "MM"5 \n\t" /* d */\
192             MOVQ" 32(%[tmp]), "MM"7 \n\t" /* e */\
193             "psubw     "MM"4, "MM"2 \n\t" /* b-c */\
194             "psubw     "MM"7, "MM"3 \n\t" /* f-e */\
195             MOVQ"      "MM"5, "MM"0 \n\t"\
196             "psubw     "MM"4, "MM"5 \n\t" /* d-c */\
197             "psubw     "MM"7, "MM"0 \n\t" /* d-e */\
198             MOVQ"      "MM"2, "MM"4 \n\t"\
199             "pminsw    "MM"3, "MM"2 \n\t"\
200             "pmaxsw    "MM"4, "MM"3 \n\t"\
201             "pmaxsw    "MM"5, "MM"2 \n\t"\
202             "pminsw    "MM"5, "MM"3 \n\t"\
203             "pmaxsw    "MM"0, "MM"2 \n\t" /* max */\
204             "pminsw    "MM"0, "MM"3 \n\t" /* min */\
205             "pxor      "MM"4, "MM"4 \n\t"\
206             "pmaxsw    "MM"3, "MM"6 \n\t"\
207             "psubw     "MM"2, "MM"4 \n\t" /* -max */\
208             "pmaxsw    "MM"4, "MM"6 \n\t" /* diff= MAX3(diff, min, -max); */\
209             "1: \n\t"\
210 \
211             MOVQ" 16(%[tmp]), "MM"2 \n\t" /* d */\
212             MOVQ"      "MM"2, "MM"3 \n\t"\
213             "psubw     "MM"6, "MM"2 \n\t" /* d-diff */\
214             "paddw     "MM"6, "MM"3 \n\t" /* d+diff */\
215             "pmaxsw    "MM"2, "MM"1 \n\t"\
216             "pminsw    "MM"3, "MM"1 \n\t" /* d = clip(spatial_pred, d-diff, d+diff); */\
217             "packuswb  "MM"1, "MM"1 \n\t" /* words back to bytes, unsigned saturation */\
218 \
219             ::[prev] "r"(prev),\
220              [cur]  "r"(cur),\
221              [next] "r"(next),\
222              [prefs]"r"((x86_reg)prefs),\
223              [mrefs]"r"((x86_reg)mrefs),\
224              [mode] "g"(mode),\
225              [tmp]  "r"(tmp)\
226         );\
227         __asm__ volatile(MOV" "MM"1, %0" :"=m"(*dst));\
228         dst += STEP;\
229         prev+= STEP;\
230         cur += STEP;\
231         next+= STEP;\
232     }
233 

  /* prev2/next2 name the asm operands used for the temporal pair: with
   * parity set it is (prev, cur), otherwise (cur, next). */
234   if (parity) {
235 #define prev2 "prev"
236 #define next2 "cur"
237     FILTER
238 #undef prev2
239 #undef next2
240   } else {
241 #define prev2 "cur"
242 #define next2 "next"
243     FILTER
244 #undef prev2
245 #undef next2
246   }
247 }
248 
/* Remove every helper macro defined above (presumably so this template can be
 * included again with a different COMPILE_TEMPLATE_* selection -- confirm
 * with the including file). */
249 #undef STEP
250 #undef MM
251 #undef MOV
252 #undef MOVQ
253 #undef MOVQU
254 #undef PSHUF
255 #undef PSRL1
256 #undef PSRL2
257 #undef LOAD
258 #undef PABS
259 #undef CHECK
260 #undef CHECK1
261 #undef CHECK2
262 #undef FILTER
263