1 /*****************************************************************************
2 ** Copyright (c) 2000 John Adcock, Tom Barry, Steve Grimm All rights reserved.
3 ** port copyright (c) 2003 Miguel Freitas
4 ******************************************************************************
5 **
6 ** This file is subject to the terms of the GNU General Public License as
7 ** published by the Free Software Foundation. A copy of this license is
8 ** included with this software distribution in the file COPYING. If you
9 ** do not have a copy, you may obtain a copy by writing to the Free
10 ** Software Foundation, 51 Franklin St, Fifth Floor, Boston, MA
11 ** 02110-1301, USA.
12 **
13 ** This software is distributed in the hope that it will be useful,
14 ** but WITHOUT ANY WARRANTY; without even the implied warranty of
15 ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 ** GNU General Public License for more details
17 ******************************************************************************
18 ** CVS Log
19 **
20 ** Revision 1.10 2006/12/21 09:54:45 dgp85
21 ** Apply the textrel patch from Gentoo, thanks to PaX team for providing it. The patch was applied and tested for a while in Gentoo and Pardus, and solves also Debian's problems with non-PIC code. If problems will arise, they'll be debugged.
22 **
23 ** Revision 1.9 2006/02/04 14:06:29 miguelfreitas
24 ** Enable AMD64 mmx/sse support in some plugins (tvtime, libmpeg2, goom...)
25 ** patch by dani3l
26 **
27 ** Revision 1.8 2005/06/05 16:00:06 miguelfreitas
28 ** quite some hacks for gcc 2.95 compatibility
29 **
30 ** Revision 1.7 2004/04/09 02:57:06 miguelfreitas
31 ** tvtime deinterlacing algorithms assumed top_field_first=1
32 ** top_field_first=0 (aka bottom_field_first) should now work as expected
33 **
34 ** Revision 1.6 2004/02/12 20:53:31 mroi
35 ** my gcc (partly 3.4 already) optimizes these away, because they are only used
36 ** inside inline assembler (which the compiler does not recognize); so actually
37 ** the code is wrong (the asm parts should list these as inputs), but telling
38 ** the compiler to keep them is the easier fix
39 **
40 ** Revision 1.5 2004/01/05 12:15:55 siggi
41 ** wonder why Mike isn't complaining about C++ style comments, any more...
42 **
43 ** Revision 1.4 2004/01/05 01:47:26 tmmm
44 ** DOS/Win CRs are forbidden, verboten, interdit
45 **
46 ** Revision 1.3 2004/01/02 20:53:43 miguelfreitas
47 ** better MANGLE from ffmpeg
48 **
49 ** Revision 1.2 2004/01/02 20:47:03 miguelfreitas
50 ** my small contribution to the cygwin port ;-)
51 **
52 ** Revision 1.1 2003/06/22 17:30:03 miguelfreitas
53 ** use our own port of greedy2frame (tvtime port is currently broken)
54 **
55 ** Revision 1.8 2001/11/23 17:18:54 adcockj
56 ** Fixed silly and/or confusion
57 **
58 ** Revision 1.7 2001/11/22 22:27:00 adcockj
59 ** Bug Fixes
60 **
61 ** Revision 1.6 2001/11/21 15:21:40 adcockj
62 ** Renamed DEINTERLACE_INFO to TDeinterlaceInfo in line with standards
63 ** Changed TDeinterlaceInfo structure to have history of pictures.
64 **
65 ** Revision 1.5 2001/07/31 06:48:33 adcockj
66 ** Fixed index bug spotted by Peter Gubanov
67 **
68 ** Revision 1.4 2001/07/13 16:13:33 adcockj
69 ** Added CVS tags and removed tabs
70 **
71 *****************************************************************************/
72
73 /*
74 * This is the implementation of the Greedy 2-frame deinterlace algorithm
75 * described in DI_Greedy2Frame.c. It's in a separate file so we can compile
76 * variants for different CPU types; most of the code is the same in the
77 * different variants.
78 */
79
80
81 /****************************************************************************
82 ** Field 1 | Field 2 | Field 3 | Field 4 |
83 ** T0 | | T1 | |
84 ** | M0 | | M1 |
85 ** B0 | | B1 | |
86 */
87
#if defined(ARCH_X86)
/*
 * 0x7f replicated into all 16 bytes.  The SSE2 code below shifts whole
 * 16-bit words right by one (psrlw) and then ANDs with this mask, which
 * clears the bit that crossed over from the neighbouring byte.
 */
static const sse_t Mask128 = {
    .uq = { 0x7f7f7f7f7f7f7f7fll, 0x7f7f7f7f7f7f7f7fll }
};

/*
 * Comparison thresholds for pcmpgtb: 16 bytes alternating
 * GREEDYTWOFRAMETHRESHOLD (even byte positions) and
 * GREEDYTWOFRAMETHRESHOLD2 (odd byte positions).
 * NOTE(review): presumably one threshold per component of a packed
 * 2-bytes-per-pixel format -- confirm against where the macros are defined.
 */
#define G2F_THRESHOLD_PAIR GREEDYTWOFRAMETHRESHOLD, GREEDYTWOFRAMETHRESHOLD2
static const sse_t GreedyTwoFrameThreshold128 = {
    .ub = {
        G2F_THRESHOLD_PAIR, G2F_THRESHOLD_PAIR,
        G2F_THRESHOLD_PAIR, G2F_THRESHOLD_PAIR,
        G2F_THRESHOLD_PAIR, G2F_THRESHOLD_PAIR,
        G2F_THRESHOLD_PAIR, G2F_THRESHOLD_PAIR
    }
};
#undef G2F_THRESHOLD_PAIR
#endif
94
/*
 * Greedy 2-frame deinterlacer, SSE2 variant.
 *
 * Builds one progressive output frame: lines of the most recent field (T1)
 * are copied verbatim, and each line in between is either woven from the
 * other field (average of M1 and M0) or bobbed (average of the T1 line above
 * and the B1 line below), decided per 32-bit group by comparing the two
 * most recent pictures of each line against GreedyTwoFrameThreshold128.
 *
 *   output       - destination frame
 *   outstride    - distance in bytes between consecutive output lines
 *   data         - source pictures; f0, f1 and f2 are read
 *   bottom_field - non-zero selects the bottom-field line layout
 *   second_field - non-zero selects the second field of the frame pair
 *   width        - line width; each source line is width * 2 bytes long
 *   height       - number of output lines
 *
 * The body is compiled only when ARCH_X86 is defined; otherwise the function
 * does nothing.
 *
 * Requirements imposed by the instructions used:
 *   - movdqa / movntdq need 16-byte aligned addresses, so the source lines
 *     and the output lines must be 16-byte aligned.
 *   - Only whole 16-byte chunks are processed in the main loop; if
 *     width * 2 is not a multiple of 16 the trailing bytes of those output
 *     lines are left unwritten.
 */
static void DeinterlaceGreedy2Frame_SSE2(uint8_t *output, int outstride,
                                         deinterlace_frame_data_t *data,
                                         int bottom_field, int second_field,
                                         int width, int height )
{
#if defined(ARCH_X86)
    int Line;
    int stride = width * 2;              /* bytes per source line */
    register uint8_t* M1;
    register uint8_t* M0;
    register uint8_t* T1;
    register uint8_t* T0;
    uint8_t* Dest = output;
    register uint8_t* Dest2;
    register uint8_t* Destc;
    register int count;
    uint32_t Pitch = stride * 2;         /* two source lines: step between lines of one field */
    uint32_t LineLength = stride;
    /* What is left of a Pitch after the inner loop consumed the 16-byte chunks. */
    uint32_t PitchRest = Pitch - (LineLength >> 4)*16;

    /* Pick the pictures that hold the current (x1) and previous (x0) data. */
    if( second_field ) {
        M1 = data->f0;
        T1 = data->f0;
        M0 = data->f1;
        T0 = data->f1;
    } else {
        M1 = data->f0;
        T1 = data->f1;
        M0 = data->f1;
        T0 = data->f2;
    }

    if( bottom_field ) {
        /* T1/T0 start at the first line; M1/M0 one line further down. */
        M1 += stride;
        M0 += stride;
    } else {
        M1 += Pitch;
        T1 += stride;
        M0 += Pitch;
        T0 += stride;

        /* The first output line is copied straight from M1. */
        xine_fast_memcpy(Dest, M1, LineLength);
        Dest += outstride;
    }

    for (Line = 0; Line < (height / 2) - 1; ++Line)
    {
        /* Always use the most recent data verbatim.  By definition it's correct
         * (it'd be shown on an interlaced display) and our job is to fill in
         * the spaces between the new lines.
         */
        /* xine_fast_memcpy would be pretty pointless here as we load the same
         * data anyway it's just one additional mov per loop...
         * XXX I believe some cpus with sse2 (early A64?) only have one write
         *     buffer. Using movntdq with 2 different streams may have quite
         *     bad performance consequences on such cpus.
         */

        Destc = Dest;            /* line that receives the verbatim copy of T1 */
        Dest += outstride;
        Dest2 = Dest;            /* line that receives the weave/bob result */

        /* The asm statements below share state through xmm0-xmm7 without
         * declaring it; this relies on the compiler not touching xmm
         * registers in between.  The constants are reloaded once per line. */
        __asm__ __volatile__(
            "movdqa  %0, %%xmm6              \n\t"   /* xmm6 = Mask */
            "pxor    %%xmm7, %%xmm7          \n\t"   /* xmm7 = zero */
            : /* no output */
            : "m" (Mask128) );

        /* A pre-tested loop: with fewer than 16 bytes per line there is no
         * whole chunk to process, and a do/while(--count) starting at zero
         * would wrap around and run far past the buffers. */
        for (count = LineLength >> 4; count > 0; --count) {
            __asm__ __volatile__(
               /* Figure out what to do with the scanline above the one we copy.
                * See above for a description of the algorithm.
                * weave if (weave(M) AND (weave(T) OR weave(B)))
                */
               "movdqa  (%2), %%xmm1            \n\t"   /* xmm1 = T1 */
               "movdqa  (%3), %%xmm0            \n\t"   /* xmm0 = T0 */
               "movdqa  (%4,%2), %%xmm3         \n\t"   /* xmm3 = B1 */
               "movdqa  (%4,%3), %%xmm2         \n\t"   /* xmm2 = B0 */

               /* calculate |T1-T0| keep T1 put result in xmm5 */
               "movdqa  %%xmm1, %%xmm5          \n\t"
               "psubusb %%xmm0, %%xmm5          \n\t"
               "psubusb %%xmm1, %%xmm0          \n\t"
               "por     %%xmm0, %%xmm5          \n\t"

               /* T1 is data for line to copy */
               "movntdq %%xmm1, %1              \n\t"

               /* if |T1-T0| > Threshold we want 0 else dword minus one */
               "psrlw   $1, %%xmm5              \n\t"
               "pand    %%xmm6, %%xmm5          \n\t"
               "pcmpgtb %0, %%xmm5              \n\t"
               "pcmpeqd %%xmm7, %%xmm5          \n\t"

               "prefetcht0  64(%4,%2)           \n\t"
               "prefetcht0  64(%4,%3)           \n\t"
               :
               : "m" (GreedyTwoFrameThreshold128),
                 "m" (*Destc), "r" (T1), "r" (T0), "r" ((void*)(intptr_t)Pitch)
               /* The asm reads the source lines through bare pointers and
                * stores 16 bytes at *Destc; tell the compiler about it. */
               : "memory" );

            __asm__ __volatile__ (
               /* calculate |B1-B0| keep B1 put result in xmm4 */
               "movdqa  %%xmm3, %%xmm4          \n\t"
               "psubusb %%xmm2, %%xmm4          \n\t"
               "psubusb %%xmm3, %%xmm2          \n\t"
               "por     %%xmm2, %%xmm4          \n\t"

               "movdqa  (%0), %%xmm0            \n\t"   /* xmm0 = M1 */
               "movdqa  (%1), %%xmm2            \n\t"   /* xmm2 = M0 */

               /* if |B1-B0| > Threshold we want 0 else dword minus one */
               "psrlw   $1, %%xmm4              \n\t"
               "pand    %%xmm6, %%xmm4          \n\t"
               "pcmpgtb %2, %%xmm4              \n\t"
               "pcmpeqd %%xmm7, %%xmm4          \n\t"

               "por     %%xmm4, %%xmm5          \n\t"

               /* Average T1 and B1 so we can do interpolated bobbing if we bob
                * onto T1 */
               "pavgb   %%xmm3, %%xmm1          \n\t"   /* xmm1 = avg(T1,B1) */

               "prefetcht0  64(%0)              \n\t"
               "prefetcht0  64(%1)              \n\t"

               /* make xmm3 the average of M1 and M0 which should make weave
                * look better when there is small amounts of movement */
               "movdqa  %%xmm2, %%xmm3          \n\t"
               "pavgb   %%xmm0, %%xmm3          \n\t"   /* xmm3 = avg(M1,M0) */

               /* calculate |M1-M0| put result in xmm4 */
               "movdqa  %%xmm0, %%xmm4          \n\t"
               "psubusb %%xmm2, %%xmm4          \n\t"
               "psubusb %%xmm0, %%xmm2          \n\t"
               "por     %%xmm2, %%xmm4          \n\t"

               /* if |M1-M0| > Threshold we want 0 else dword minus one */
               "psrlw   $1, %%xmm4              \n\t"
               "pand    %%xmm6, %%xmm4          \n\t"
               "pcmpgtb %2, %%xmm4              \n\t"
               "pcmpeqd %%xmm7, %%xmm4          \n\t"   /* do we want to bob */

               "pand    %%xmm5, %%xmm4          \n\t"

        /* debugging feature
         * output the value of xmm4 at this point which is pink where we will weave
         * and green where we are going to bob
         */
#ifdef CHECK_BOBWEAVE
               "movntdq %%xmm4, %3              \n\t"
#else
               /* xmm4 now is 1 where we want to weave and 0 where we want to bob */
               "pand    %%xmm4, %%xmm3          \n\t"
               "pandn   %%xmm1, %%xmm4          \n\t"
               "por     %%xmm3, %%xmm4          \n\t"
               "movntdq %%xmm4, %3              \n\t"
#endif
               :
               : "r" (M1), "r" (M0), "m" (GreedyTwoFrameThreshold128),
                 "m" (*Dest2)
               /* Reads *M1 / *M0 and stores 16 bytes at *Dest2. */
               : "memory" );

            /* Advance to the next set of pixels. */
            T1 += 16;
            M1 += 16;
            M0 += 16;
            T0 += 16;
            Dest2 += 16;
            Destc += 16;
        }

        Dest += outstride;

        /* Move every source pointer to the start of its next same-field line. */
        M1 += PitchRest;
        T1 += PitchRest;
        M0 += PitchRest;
        T0 += PitchRest;
    }

    /* Order the non-temporal stores before the ordinary copies below; the
     * "memory" clobber also keeps the compiler from moving accesses across. */
    __asm__ __volatile__("sfence\n\t" : : : "memory");

    /* Copy the remaining line(s) that the loop above did not produce. */
    if( bottom_field )
    {
        xine_fast_memcpy(Dest, T1, stride);
        Dest += outstride;
        xine_fast_memcpy(Dest, M1, stride);
    }
    else
    {
        xine_fast_memcpy(Dest, T1, stride);
    }
#endif
}
294
295