// -*- c++ -*-
/////////////////////////////////////////////////////////////////////////////
// Copyright (c) 2001 Tom Barry.  All rights reserved.
/////////////////////////////////////////////////////////////////////////////
//
//	This file is subject to the terms of the GNU General Public License as
//	published by the Free Software Foundation.  A copy of this license is
//	included with this software distribution in the file COPYING.  If you
//	do not have a copy, you may obtain a copy by writing to the Free
//	Software Foundation, 51 Franklin St, Fifth Floor, Boston, MA
//	02110-1301, USA.
//
//	This software is distributed in the hope that it will be useful,
//	but WITHOUT ANY WARRANTY; without even the implied warranty of
//	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
//	GNU General Public License for more details.
//
/////////////////////////////////////////////////////////////////////////////

#include "x86-64_macros.inc"

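/* This file is an inline-asm template: a wrapper #include's it after first
 * defining FUNCT_NAME (and IS_SSE or IS_MMX for the CPU flavour being
 * built), so the same body can be compiled once per instruction set.  A
 * hypothetical sketch of such a wrapper -- the function name and file name
 * below are illustrative, not the actual ones:
 *
 *     #define IS_SSE
 *     #define FUNCT_NAME deinterlace_greedy_scanline_sse
 *     #include "this_template.c"      // i.e. this file
 *     #undef FUNCT_NAME
 *     #undef IS_SSE
 */
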
#if !defined(MASKS_DEFINED)
#define MASKS_DEFINED
static const int64_t __attribute__((__used__)) YMask        = 0x00ff00ff00ff00ffull; // to keep only luma
static const int64_t __attribute__((__used__)) UVMask       = 0xff00ff00ff00ff00ull; // to keep only chroma
static const int64_t __attribute__((__used__)) ShiftMask    = 0xfefffefffefffeffull; // to avoid shifting chroma to luma
static const int64_t __attribute__((__used__)) QW256        = 0x0100010001000100ull; // 4 256's

static int64_t MaxComb;
static int64_t MotionThreshold;
static int64_t MotionSense;
static int64_t QW256B;
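
/* Illustration only -- a scalar sketch (not called by the filter) of the
 * constant replication done at the top of FUNCT_NAME below: each 8-bit or
 * 16-bit user parameter is copied into every byte/word of a 64-bit constant
 * so a single MMX instruction can apply it to a whole qword of pixels.
 * Hypothetical helpers, kept inside the MASKS_DEFINED guard so they are
 * compiled only once however often this template is included. */
static inline int64_t __attribute__((__used__)) replicate_byte( int64_t b )
{
    int64_t q = b & 0xff;
    q |= q << 8;	/* 2 copies */
    q |= q << 16;	/* 4 copies */
    q |= q << 32;	/* 8 copies, e.g. MaxComb = replicate_byte(GreedyMaxComb) */
    return q;
}
static inline int64_t __attribute__((__used__)) replicate_word( int64_t w )
{
    int64_t q = w & 0xffff;
    q |= q << 16;	/* 2 copies */
    q |= q << 32;	/* 4 copies, e.g. MotionSense = replicate_word(GreedyMotionSense) */
    return q;
}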

#endif

static void FUNCT_NAME(uint8_t *output, int outstride,
                  deinterlace_frame_data_t *data,
                  int bottom_field, int second_field, int width, int height )
{
    int64_t i;
    int stride = (width*2);			// 2 bytes per packed 4:2:2 pixel
    int InfoIsOdd = bottom_field;

    int Line;
    long LoopCtr;
    unsigned int Pitch = stride*2;		// from one line of a field to the next
    int FieldHeight = height / 2;

    unsigned char* L1;					// ptr to Line1, of 3
    unsigned char* L2;					// ptr to Line2, the weave line
    unsigned char* L3;					// ptr to Line3

    unsigned char* L2P;					// ptr to prev Line2
    unsigned char* temp;
    unsigned char* Dest = output;

    int64_t LastAvg=0;			// interp value from the qword to the left
    // Set up the parameters that are evaluated for each pixel
    i=GreedyMaxComb;
    MaxComb = i << 56 | i << 48 | i << 40 | i << 32 | i << 24 | i << 16 | i << 8 | i;

    i = GreedyMotionThreshold;		// replicate into each 16 bit word
    MotionThreshold = i << 48 | i << 32 | i << 16 | i | UVMask;	// OR in UVMask so chroma differences never trip the threshold

    i = GreedyMotionSense;		// replicate into each 16 bit word
    MotionSense = i << 48 | i << 32 | i << 16 | i;

    i = 0xffff - 256;			// 0xfeff; 0xffffffff - 256 would leak its high bits into the upper words
    QW256B =  i << 48 |  i << 32 | i << 16 | i;  // saves a couple of instructions vs. PMINSW, see below
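
    /* A note on the QW256B trick (used on the plain-MMX path below): with each
     * word equal to 0xffff - 256, "paddusw QW256B" saturates any motion value
     * above 256 up to 0xffff, and the following "psubusw QW256B" brings it back
     * down, leaving min(value, 256) in each word without the SSE-era PMINSW. */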

    // copy the first even line no matter what, and the first odd line if we're
    // processing an EVEN field.  (Note this differs from the other deinterlace routines.)
    if( second_field ) {
        L1 = data->f0;
        L2 = data->f0;
        L2P = data->f1;
    } else {
        L1 = data->f1;
        L2 = data->f0;
        L2P = data->f1;
    }

    if( InfoIsOdd ) {
        L1 += 0;
        L2 += stride;
        L3 = L1 + Pitch;
        L2P += stride;

        // copy first even line
        xine_fast_memcpy(Dest, L1, stride);
        Dest += outstride;
    } else {
        // copy first even line
        xine_fast_memcpy(Dest, L2, stride);
        Dest += outstride;

        L1 += stride;
        L2 += Pitch;
        L3 = L1 + Pitch;
        L2P += Pitch;

        // then first odd line
        xine_fast_memcpy(Dest, L1, stride);
        Dest += outstride;
    }
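
    /* Pointer roles for each pass of the loop below:
     *   L1  -- the existing line above the line being built
     *   L2  -- the candidate weave line in the current frame
     *   L2P -- the same line in the previous frame (motion reference)
     *   L3  -- the existing line below (L1 + Pitch)
     */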

    for (Line = 0; Line < (FieldHeight - 1); ++Line) {
        LoopCtr = stride / 8 - 1; // there are stride/8 qwords per line; do 1 less and adjust at the end of the loop

/* Hans-Dieter Kosch writes:
 *
 * >  The older compilers do not understand the syntax
 * >  __asm__ ( "command %[name0]" : : [name0] "x"(arg0) )
 * >  They only understand
 * >  __asm__ ( "command %0" : : "x"(arg0) )
 *
 * now we define the arguments to make the asm code less ugly.
 */
#ifndef asmLastAvg
#define asmLastAvg      "%0"
#define asmL1           "%1"
#define asmL3           "%2"
#define asmtemp         "%3"
#define asmL2           "%4"
#define asmDest         "%5"
#define asmLoopCtr      "%6"
#endif

        // For ease of reading, the comments below assume that we're operating on an odd
        // field (i.e., that InfoIsOdd is true).  Assume the obvious for the even-field case.
        temp = L2P;
        __asm__ __volatile__
            (
             MEMREG ("mov", asmL1, "ax")
             BUMPPTR ("8", "ax", "dx")      // next qword needed by DJR
             MEMREG ("mov", asmL3, "cx")
             REG2 ("sub", "ax", "cx")       // carry L3 addr as an offset
             MEMREG ("mov", asmL2, "si")
             MEMREG ("mov", asmDest, "di")  // DL1 if Odd or DL2 if Even

             ".align 8\n\t"
             "1:\n\t"

             "movq  "MEMREF1("si")",  %%mm0\n\t"     // L2 - the newest weave pixel value
             "movq  "MEMREF1("ax")",  %%mm1\n\t"     // L1 - the top pixel
             REG1 ("push", "dx")
             MEMREG ("mov", asmtemp, "dx")
             "movq  "MEMREF1("dx")",  %%mm2\n\t"     // L2P - the prev weave pixel
             REG1 ("pop", "dx")
             "movq  "MEMREF2("ax","cx")", %%mm3\n\t" // L3, next odd row
             "movq  %%mm1,            %%mm6\n\t"     // L1 - get simple single pixel interp
             // pavgb   mm6, mm3                     // use macro below
             V_PAVGB ("%%mm6", "%%mm3", "%%mm4", "%8")

             // DJR - Diagonal Jaggie Reduction
             // In the event that we are going to use an average (Bob) pixel we do not want a jagged
             // stair step effect.  To combat this we average the 2 horizontally adjacent pixels into the
             // interpolated Bob mix.  This does horizontal smoothing for only the Bob'd pixels.
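             /* Scalar sketch of DJR (illustration only, nothing executes here):
              *   bob[x] = avg( L1[x], L3[x] );                      // simple interp
              *   bob[x] = avg( bob[x], avg(bob[x-1], bob[x+1]) );   // smooth horizontally
              * (the non-MMX build below then refines the center/adjacent weighting)
              */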

             "movq  "asmLastAvg",   %%mm4\n\t"      // the bob value from prev qword in row
             "movq  %%mm6,          "asmLastAvg"\n\t" // save for next pass
             "psrlq $48,            %%mm4\n\t"      // right justify 1 pixel
             "movq  %%mm6,          %%mm7\n\t"      // copy of simple bob pixel
             "psllq $16,            %%mm7\n\t"      // left justify 3 pixels
             "por   %%mm7,          %%mm4\n\t"      // and combine

             "movq  "MEMREF1("dx")",  %%mm5\n\t"    // next horiz qword from L1
             //			pavgb   mm5, qword ptr[ebx+ecx] // next horiz qword from L3, use macro below
             V_PAVGB ("%%mm5", MEMREF2("dx","cx"), "%%mm7", "%8")
             "psllq $48,            %%mm5\n\t"      // left just 1 pixel
             "movq  %%mm6,          %%mm7\n\t"      // another copy of simple bob pixel
             "psrlq $16,            %%mm7\n\t"      // right just 3 pixels
             "por   %%mm7,          %%mm5\n\t"      // combine
             //			pavgb	mm4, mm5			// avg of forward and prev by 1 pixel, use macro
             V_PAVGB ("%%mm4", "%%mm5", "%%mm5", "%8")   // mm5 gets modified if MMX
             //			pavgb	mm6, mm4			// avg of center and surround interp vals, use macro
             V_PAVGB ("%%mm6", "%%mm4", "%%mm7", "%8")

             // Don't do any more averaging than needed for mmx. It hurts performance and causes rounding errors.
#ifndef IS_MMX
             //          pavgb	mm4, mm6			// 1/4 center, 3/4 adjacent
             V_PAVGB ("%%mm4", "%%mm6", "%%mm7", "%8")
             //    		pavgb	mm6, mm4			// 3/8 center, 5/8 adjacent
             V_PAVGB ("%%mm6", "%%mm4", "%%mm7", "%8")
#endif

             // get abs value of possible L2 comb
             "movq    %%mm6,        %%mm4\n\t"      // work copy of interp val
             "movq    %%mm2,        %%mm7\n\t"      // L2
             "psubusb %%mm4,        %%mm7\n\t"      // L2 - avg
             "movq    %%mm4,        %%mm5\n\t"      // avg
             "psubusb %%mm2,        %%mm5\n\t"      // avg - L2
             "por     %%mm7,        %%mm5\n\t"      // abs(avg-L2)

             // get abs value of possible L2P comb
             "movq    %%mm0,        %%mm7\n\t"      // L2P
             "psubusb %%mm4,        %%mm7\n\t"      // L2P - avg
             "psubusb %%mm0,        %%mm4\n\t"      // avg - L2P
             "por     %%mm7,        %%mm4\n\t"      // abs(avg-L2P)

             // use L2 or L2P depending upon which makes smaller comb
             "psubusb %%mm5,        %%mm4\n\t"      // see if it goes to zero
             "psubusb %%mm5,        %%mm5\n\t"      // 0
             "pcmpeqb %%mm5,        %%mm4\n\t"      // if (mm4=0) then FF else 0
             "pcmpeqb %%mm4,        %%mm5\n\t"      // opposite of mm4

             // if Comb(L2P) <= Comb(L2) then mm4=ff, mm5=0 else mm4=0, mm5=ff
             "pand    %%mm2,        %%mm5\n\t"      // use L2 if mm5 == ff, else 0
             "pand    %%mm0,        %%mm4\n\t"      // use L2P if mm4 == ff, else 0
             "por     %%mm5,        %%mm4\n\t"      // may the best win

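             /* In scalar terms (illustration only, nothing executes here):
              *   best = ( Comb(L2P) <= Comb(L2) ) ? L2P : L2;
              * where Comb(x) = abs(x - avg) measures how much a candidate
              * weave pixel disagrees with the vertical interpolation.
              */
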
             // Inventory: at this point we have the following values:
             // mm0 = L2P (or L2)
             // mm1 = L1
             // mm2 = L2 (or L2P)
             // mm3 = L3
             // mm4 = the best of the L2,L2P weave pixels, based upon comb
             // mm6 = the avg interpolated value, if we need to use it

             // Let's measure movement, i.e. how much the weave pixel has changed
             "movq    %%mm2,        %%mm7\n\t"
             "psubusb %%mm0,        %%mm2\n\t"
             "psubusb %%mm7,        %%mm0\n\t"
             "por     %%mm2,        %%mm0\n\t"      // abs value of change, used later

             // Now let's clip our chosen value so it does not lie outside the
             // high/low range of L1-L3 by more than MaxComb.
             // This allows some comb but limits the damage, and also allows more
             // detail than a boring oversmoothed clip.
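             /* Scalar sketch of the clip (illustration only, nothing executes here):
              *   lo   = min(L1, L3) - MaxComb;   // saturating subtract
              *   hi   = max(L1, L3) + MaxComb;   // saturating add
              *   best = min( max(best, lo), hi );
              */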
             "movq    %%mm1,        %%mm2\n\t"      // copy L1
             //	pmaxub mm2, mm3                     // use macro
             V_PMAXUB ("%%mm2", "%%mm3")            // now = Max(L1,L3)
             "movq    %%mm1,        %%mm5\n\t"      // copy L1
             // pminub	mm5, mm3                    // now = Min(L1,L3), use macro
             V_PMINUB ("%%mm5", "%%mm3", "%%mm7")
             // allow the value to be above the high or below the low by the amount of MaxComb
             "psubusb %9,           %%mm5\n\t"      // lower min by diff
             "paddusb %9,           %%mm2\n\t"      // increase max by diff
             // pmaxub	mm4, mm5                    // now = Max(best, Min(L1,L3)-MaxComb), use macro
             V_PMAXUB ("%%mm4", "%%mm5")
             // pminub	mm4, mm2                    // now = Min( Max(best, lo), hi ) = best, clipped
             V_PMINUB ("%%mm4", "%%mm2", "%%mm7")

             // Blend weave pixel with bob pixel, depending on motion val in mm0
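             /* Scalar sketch of the blend (illustration only, nothing executes here):
              *   motion = min( sat(|L2 - L2P| - Threshold) * Sense, 256 );
              *   luma   = ( bob * motion + weave * (256 - motion) ) >> 8;
              * computed on luma only; chroma is taken from the weave pixel.
              */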
             "psubusb %10,          %%mm0\n\t"      // subtract threshold (chroma bytes cleared via the UVMask OR'd in above)
             "pmullw  %11,          %%mm0\n\t"      // mul by user factor, keep low 16 bits
             "movq    %12,          %%mm7\n\t"
#ifdef IS_SSE
             "pminsw  %%mm7,        %%mm0\n\t"      // clamp motion to 256
#else
             "paddusw %13,          %%mm0\n\t"      // add, may saturate at ffff
             "psubusw %13,          %%mm0\n\t"      // now = Min(motion, 256)
#endif
             "psubusw %%mm0,        %%mm7\n\t"      // so the 2 sum to 256, weighted avg
             "movq    %%mm4,        %%mm2\n\t"      // save weave chroma info before trashing
             "pand    %14,          %%mm4\n\t"      // keep only luma from calc'd value
             "pmullw  %%mm7,        %%mm4\n\t"      // use more weave for less motion
             "pand    %14,          %%mm6\n\t"      // keep only luma from calc'd value
             "pmullw  %%mm0,        %%mm6\n\t"      // use more bob for large motion
             "paddusw %%mm6,        %%mm4\n\t"      // combine
             "psrlw   $8,           %%mm4\n\t"      // div by 256 to get weighted avg

             // chroma comes from the weave pixel
             "pand    %15,          %%mm2\n\t"      // keep chroma
             "por     %%mm4,        %%mm2\n\t"      // and combine

             V_MOVNTQ (MEMREF1("di"), "%%mm2")     // move in our clipped best, use macro

             // bump ptrs and loop
             BUMPPTR ("8", "ax", "ax")
             BUMPPTR ("8", "dx", "dx")
             CONSTMEM ("add", "8", asmtemp)
             BUMPPTR ("8", "di", "di")
             BUMPPTR ("8", "si", "si")
             CONSTMEM ("sub", "1", asmLoopCtr)
             "jg      1b\n\t"                       // loop if not at the last qword
                                                    // note P-III default assumes backward branches taken
             "jl      1f\n\t"                       // done

             REG2 ("mov", "ax", "dx")  // last qword: point the DJR lookahead at the current qword instead of reading past the line end
             "jmp     1b\n\t"

             "1:\n\t"

             : /* no outputs */

             : "m"(LastAvg),          // %0
               "m"(L1),               // %1
               "m"(L3),               // %2
               "m"(temp),             // %3
               "m"(L2),               // %4
               "m"(Dest),             // %5
               "m"(LoopCtr),          // %6
               "m"(temp),             // %7 (unused duplicate)
               "m"(ShiftMask),        // %8
               "m"(MaxComb),          // %9
               "m"(MotionThreshold),  // %10
               "m"(MotionSense),      // %11
               "m"(QW256),            // %12
               "m"(QW256B),           // %13
               "m"(YMask),            // %14
               "m"(UVMask)            // %15

             :
#if defined(ARCH_X86_X32) || defined(ARCH_X86_64)
               "rax", "rcx", "rdx", "rsi", "rdi",
/* the following clobber list causes trouble for gcc 2.95. it shouldn't be
 * an issue as, afaik, mmx registers map to the existing fp registers.
 */
               "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7",
#elif defined(ARCH_X86)
               "eax", "ecx", "edx", "esi", "edi",
               "st", "st(1)", "st(2)", "st(3)", "st(4)", "st(5)", "st(6)", "st(7)",
#endif
               "memory", "cc"
            );

        Dest += outstride;
        xine_fast_memcpy(Dest, L3, stride);
        Dest += outstride;

        L1  += Pitch;
        L2  += Pitch;
        L3  += Pitch;
        L2P += Pitch;
    }

    if (InfoIsOdd) {
        // the loop above stops one line short; the bottom weave line is still pending
        xine_fast_memcpy(Dest, L2, stride);
    }

    // clear out the MMX registers ready for doing floating point again
#if defined(ARCH_X86)
    __asm__ __volatile__ ("emms\n\t");
#endif
}