1 /*****************************************************************************
2 ** Copyright (c) 2000 John Adcock, Tom Barry, Steve Grimm  All rights reserved.
3 ** port copyright (c) 2003 Miguel Freitas
4 ******************************************************************************
5 **
6 **  This file is subject to the terms of the GNU General Public License as
7 **  published by the Free Software Foundation.  A copy of this license is
8 **  included with this software distribution in the file COPYING.  If you
9 **  do not have a copy, you may obtain a copy by writing to the Free
10 **  Software Foundation, 51 Franklin St, Fifth Floor, Boston, MA
11 **  02110-1301, USA.
12 **
13 **  This software is distributed in the hope that it will be useful,
14 **  but WITHOUT ANY WARRANTY; without even the implied warranty of
15 **  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16 **  GNU General Public License for more details
17 ******************************************************************************
18 ** CVS Log
19 **
20 ** Revision 1.10  2006/12/21 09:54:45  dgp85
21 ** Apply the textrel patch from Gentoo, thanks to PaX team for providing it. The patch was applied and tested for a while in Gentoo and Pardus, and solves also Debian's problems with non-PIC code. If problems will arise, they'll be debugged.
22 **
23 ** Revision 1.9  2006/02/04 14:06:29  miguelfreitas
24 ** Enable AMD64 mmx/sse support in some plugins (tvtime, libmpeg2, goom...)
25 ** patch by dani3l
26 **
27 ** Revision 1.8  2005/06/05 16:00:06  miguelfreitas
28 ** quite some hacks for gcc 2.95 compatibility
29 **
30 ** Revision 1.7  2004/04/09 02:57:06  miguelfreitas
31 ** tvtime deinterlacing algorithms assumed top_field_first=1
32 ** top_field_first=0 (aka bottom_field_first) should now work as expected
33 **
34 ** Revision 1.6  2004/02/12 20:53:31  mroi
35 ** my gcc (partly 3.4 already) optimizes these away, because they are only used
36 ** inside inline assembler (which the compiler does not recognize); so actually
37 ** the code is wrong (the asm parts should list these as inputs), but telling
38 ** the compiler to keep them is the easier fix
39 **
40 ** Revision 1.5  2004/01/05 12:15:55  siggi
41 ** wonder why Mike isn't complaining about C++ style comments, any more...
42 **
43 ** Revision 1.4  2004/01/05 01:47:26  tmmm
44 ** DOS/Win CRs are forbidden, verboten, interdit
45 **
46 ** Revision 1.3  2004/01/02 20:53:43  miguelfreitas
47 ** better MANGLE from ffmpeg
48 **
49 ** Revision 1.2  2004/01/02 20:47:03  miguelfreitas
50 ** my small contribution to the cygwin port ;-)
51 **
52 ** Revision 1.1  2003/06/22 17:30:03  miguelfreitas
53 ** use our own port of greedy2frame (tvtime port is currently broken)
54 **
55 ** Revision 1.8  2001/11/23 17:18:54  adcockj
56 ** Fixed silly and/or confusion
57 **
58 ** Revision 1.7  2001/11/22 22:27:00  adcockj
59 ** Bug Fixes
60 **
61 ** Revision 1.6  2001/11/21 15:21:40  adcockj
62 ** Renamed DEINTERLACE_INFO to TDeinterlaceInfo in line with standards
63 ** Changed TDeinterlaceInfo structure to have history of pictures.
64 **
65 ** Revision 1.5  2001/07/31 06:48:33  adcockj
66 ** Fixed index bug spotted by Peter Gubanov
67 **
68 ** Revision 1.4  2001/07/13 16:13:33  adcockj
69 ** Added CVS tags and removed tabs
70 **
71 *****************************************************************************/
72 
73 /*
74  * This is the implementation of the Greedy 2-frame deinterlace algorithm
75  * described in DI_Greedy2Frame.c.  It's in a separate file so we can compile
76  * variants for different CPU types; most of the code is the same in the
77  * different variants.
78  */
79 
80 
81 /****************************************************************************
82 ** Field 1 | Field 2 | Field 3 | Field 4 |
83 **   T0    |         |    T1   |         |
84 **         |   M0    |         |    M1   |
85 **   B0    |         |    B1   |         |
86 */
87 
#if defined(ARCH_X86)
/*
 * 128-bit constants referenced as memory operands by the SSE2 inline
 * assembly in DeinterlaceGreedy2Frame_SSE2().
 */

/* 0x7f in each of the 16 bytes: applied after a 16-bit "psrlw $1" to drop
 * the bit that was shifted in from the neighbouring byte. */
static const sse_t Mask128 = {
    .uq = {
        0x7f7f7f7f7f7f7f7fll,
        0x7f7f7f7f7f7f7f7fll
    }
};

/* Sixteen threshold bytes, alternating GREEDYTWOFRAMETHRESHOLD and
 * GREEDYTWOFRAMETHRESHOLD2 (presumably one value per component of the
 * packed pixel format -- confirm against where the macros are defined). */
#define G2F_THRESHOLD_X2 GREEDYTWOFRAMETHRESHOLD, GREEDYTWOFRAMETHRESHOLD2
#define G2F_THRESHOLD_X4 G2F_THRESHOLD_X2, G2F_THRESHOLD_X2
static const sse_t GreedyTwoFrameThreshold128 = {
    .ub = {
        G2F_THRESHOLD_X4, G2F_THRESHOLD_X4,
        G2F_THRESHOLD_X4, G2F_THRESHOLD_X4
    }
};
#undef G2F_THRESHOLD_X4
#undef G2F_THRESHOLD_X2
#endif
94 
/**
 * Greedy 2-frame deinterlacer, SSE2 implementation.
 *
 * Produces one progressive picture in @p output.  Lines of the field that
 * is kept are copied verbatim (T1); each line in between is, per 32-bit
 * group, either "woven" (average of M1 and M0) or "bobbed" (average of the
 * kept lines above and below, T1 and B1).  The decision is
 *
 *     weave if (weave(M) AND (weave(T) OR weave(B)))
 *
 * where weave(X) means that X1 and X0 differ by no more than the threshold
 * held in GreedyTwoFrameThreshold128.
 *
 * @param output       destination picture; consecutive rows are
 *                     @p outstride bytes apart.
 * @param outstride    distance in bytes between destination rows.
 * @param data         source pictures; f0, f1 and f2 are read.  Judging by
 *                     the "1"/"0" naming, f0 is presumably the most recent
 *                     picture -- confirm against the caller.
 * @param bottom_field non-zero selects the bottom-field line layout
 *                     (different start offsets and edge-line handling).
 * @param second_field non-zero takes both "1" lines from f0 and both "0"
 *                     lines from f1; otherwise f0/f1/f2 are all used.
 * @param width        picture width in pixels; a source line is width * 2
 *                     bytes long.
 * @param height       picture height in lines; height / 2 - 1 line pairs
 *                     are processed, plus the copied edge line(s).
 *
 * The inner loop uses movdqa loads and movntdq stores, both of which fault
 * on addresses that are not 16-byte aligned, so source and destination rows
 * have to be 16-byte aligned.  Only whole 16-byte groups of a line are
 * processed; a trailing partial group is skipped by the computed rows.
 *
 * When ARCH_X86 is not defined the function body is empty.
 */
static void DeinterlaceGreedy2Frame_SSE2(uint8_t *output, int outstride,
                                         deinterlace_frame_data_t *data,
                                         int bottom_field, int second_field,
                                         int width, int height )
{
#if defined(ARCH_X86)
    int Line;
    int stride = width * 2;                 /* bytes in one source line */
    uint8_t *M1;
    uint8_t *M0;
    uint8_t *T1;
    uint8_t *T0;
    uint8_t *Dest = output;
    uint8_t *Dest2;
    uint8_t *Destc;
    int count;
    uint32_t Pitch = stride * 2;            /* step between lines of one field */
    uint32_t LineLength = stride;
    /* What is left of Pitch after the inner loop advanced by whole
     * 16-byte groups. */
    uint32_t PitchRest = Pitch - (LineLength >> 4)*16;

    if( second_field ) {
        M1 = data->f0;
        T1 = data->f0;
        M0 = data->f1;
        T0 = data->f1;
    } else {
        M1 = data->f0;
        T1 = data->f1;
        M0 = data->f1;
        T0 = data->f2;
    }

    if( bottom_field ) {
        /* T1/T0 start on line 0, M1/M0 on line 1. */
        M1 += stride;
        M0 += stride;
    } else {
        /* T1/T0 start on line 1, M1/M0 on line 2; output line 0 is taken
         * unmodified from the line M1 now points at. */
        M1 += Pitch;
        T1 += stride;
        M0 += Pitch;
        T0 += stride;

        xine_fast_memcpy(Dest, M1, LineLength);
        Dest += outstride;
    }

    for (Line = 0; Line < (height / 2) - 1; ++Line)
    {
      /* Always use the most recent data verbatim.  By definition it's correct
       * (it'd be shown on an interlaced display) and our job is to fill in
       * the spaces between the new lines.
       */
      /* xine_fast_memcpy would be pretty pointless here as we load the same
       * data anyway it's just one additional mov per loop...
       * XXX I believe some cpus with sse2 (early A64?) only have one write
       * buffer. Using movntdq with 2 different streams may have quite
       * bad performance consequences on such cpus.
       */

        Destc = Dest;           /* row receiving the verbatim copy of T1 */
        Dest += outstride;
        Dest2 = Dest;           /* row receiving the weave/bob result */

        /* The asm statements below hand values to each other in xmm0-xmm7
         * without declaring them; this just relies on gcc not using xmm
         * registers in between. */
        __asm__ __volatile__(
            "movdqa  %0, %%xmm6        \n\t"  /* xmm6 = Mask */
            "pxor    %%xmm7, %%xmm7    \n\t"  /* xmm7 = zero */
            : /* no output */
            : "m" (Mask128) );

        /* Pre-tested loop: a line shorter than 16 bytes gives a count of
         * zero and must not enter the body at all (the former do/while ran
         * it once and then kept going with a negative counter). */
        for (count = LineLength >> 4; count > 0; --count) {
          __asm__ __volatile__(
       /* Figure out what to do with the scanline above the one we copy.
        * See above for a description of the algorithm.
        * weave if (weave(M) AND (weave(T) OR weave(B)))
        */
            "movdqa  (%2), %%xmm1      \n\t" /* xmm1 = T1 */
            "movdqa  (%3), %%xmm0      \n\t" /* xmm0 = T0 */
            "movdqa  (%4,%2), %%xmm3   \n\t" /* xmm3 = B1 */
            "movdqa  (%4,%3), %%xmm2   \n\t" /* xmm2 = B0 */

            /* calculate |T1-T0| keep T1 put result in xmm5 */
            "movdqa  %%xmm1, %%xmm5    \n\t"
            "psubusb %%xmm0, %%xmm5    \n\t"
            "psubusb %%xmm1, %%xmm0    \n\t"
            "por     %%xmm0, %%xmm5    \n\t"

            /* T1 is data for line to copy */
            "movntdq  %%xmm1, %1       \n\t"

            /* if |T1-T0| > Threshold we want 0 else dword minus one.
             * The difference is halved per byte first (16-bit shift plus
             * mask) so that the signed byte compare sees a value in
             * 0..127. */
            "psrlw   $1, %%xmm5        \n\t"
            "pand    %%xmm6, %%xmm5    \n\t"
            "pcmpgtb %0, %%xmm5        \n\t"
            "pcmpeqd %%xmm7, %%xmm5    \n\t"

            "prefetcht0  64(%4,%2)     \n\t"
            "prefetcht0  64(%4,%3)     \n\t"
          : /* no output: the store goes through input operand %1 */
          : "m" (GreedyTwoFrameThreshold128),
            "m" (*Destc), "r" (T1), "r" (T0), "r" ((void*)(intptr_t)Pitch)
          : "memory" );

          __asm__ __volatile__ (
            /* calculate |B1-B0| keep B1 put result in xmm4 */
            "movdqa  %%xmm3, %%xmm4    \n\t"
            "psubusb %%xmm2, %%xmm4    \n\t"
            "psubusb %%xmm3, %%xmm2    \n\t"
            "por     %%xmm2, %%xmm4    \n\t"

            "movdqa  (%0), %%xmm0      \n\t" /* xmm0 = M1 */
            "movdqa  (%1), %%xmm2      \n\t" /* xmm2 = M0 */

            /* if |B1-B0| > Threshold we want 0 else dword minus one */
            "psrlw   $1, %%xmm4        \n\t"
            "pand    %%xmm6, %%xmm4    \n\t"
            "pcmpgtb %2, %%xmm4        \n\t"
            "pcmpeqd %%xmm7, %%xmm4    \n\t"

            /* xmm5 = weave(T) OR weave(B) */
            "por     %%xmm4, %%xmm5    \n\t"

            /* Average T1 and B1 so we can do interpolated bobbing if we bob
             * onto T1 */
            "pavgb   %%xmm3, %%xmm1    \n\t" /* xmm1 = avg(T1,B1) */

            "prefetcht0  64(%0)        \n\t"
            "prefetcht0  64(%1)        \n\t"

            /* make xmm3 the average of M1 and M0 which should make weave
             * look better when there is small amounts of movement */
            "movdqa  %%xmm2, %%xmm3    \n\t"
            "pavgb   %%xmm0, %%xmm3    \n\t" /* xmm3 = avg(M1,M0) */

            /* calculate |M1-M0| put result in xmm4 */
            "movdqa  %%xmm0, %%xmm4    \n\t"
            "psubusb %%xmm2, %%xmm4    \n\t"
            "psubusb %%xmm0, %%xmm2    \n\t"
            "por     %%xmm2, %%xmm4    \n\t"

            /* if |M1-M0| > Threshold we want 0 else dword minus one */
            "psrlw   $1, %%xmm4        \n\t"
            "pand    %%xmm6, %%xmm4    \n\t"
            "pcmpgtb %2, %%xmm4        \n\t"
            "pcmpeqd %%xmm7, %%xmm4    \n\t" /* do we want to bob */

            /* xmm4 = weave(M) AND (weave(T) OR weave(B)) */
            "pand   %%xmm5, %%xmm4     \n\t"

/* debugging feature
 * output the value of xmm4 at this point which is pink where we will weave
 * and green where we are going to bob
 */
#ifdef CHECK_BOBWEAVE
            "movntdq  %%xmm4, %3       \n\t"
#else
            /* xmm4 now is 1 where we want to weave and 0 where we want to bob:
             * result = (mask & avg(M1,M0)) | (~mask & avg(T1,B1)) */
            "pand    %%xmm4, %%xmm3    \n\t"
            "pandn   %%xmm1, %%xmm4    \n\t"
            "por     %%xmm3, %%xmm4    \n\t"
            "movntdq  %%xmm4, %3       \n\t"
#endif
          : /* no output: the store goes through input operand %3 */
          : "r" (M1), "r" (M0), "m" (GreedyTwoFrameThreshold128),
            "m" (*Dest2)
          : "memory" );

          /* Advance to the next set of pixels. */
          T1 += 16;
          M1 += 16;
          M0 += 16;
          T0 += 16;
          Dest2 += 16;
          Destc += 16;
        }

        Dest += outstride;

        /* Move every source pointer on to the next line of its field. */
        M1 += PitchRest;
        T1 += PitchRest;
        M0 += PitchRest;
        T0 += PitchRest;
    }

    /* Make the non-temporal stores above globally visible before going on. */
    __asm__ __volatile__("sfence\n\t" : : : "memory");

    /* Remaining edge line(s) are copied without filtering. */
    if( bottom_field )
    {
        xine_fast_memcpy(Dest, T1, stride);
        Dest += outstride;
        xine_fast_memcpy(Dest, M1, stride);
    }
    else
    {
        xine_fast_memcpy(Dest, T1, stride);
    }
#endif
}
294 
295