1/*
2 *
3 * GStreamer
4 * Copyright (c) 2001 Tom Barry.  All rights reserved.
5 * Copyright (C) 2008,2010 Sebastian Dröge <slomo@collabora.co.uk>
6 *
7 * This library is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Library General Public
9 * License as published by the Free Software Foundation; either
10 * version 2 of the License, or (at your option) any later version.
11 *
12 * This library is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15 * Library General Public License for more details.
16 *
17 * You should have received a copy of the GNU Library General Public
 * License along with this library; if not, write to the
19 * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
20 * Boston, MA 02110-1301, USA.
21 */
22
23
24/*
25 * Relicensed for GStreamer from GPL to LGPL with permit from Tom Barry.
26 * See: http://bugzilla.gnome.org/show_bug.cgi?id=163578
27 */
28
29
30#include "x86-64_macros.inc"
31
32static void
33FUNCT_NAME_YUY2 (GstDeinterlaceMethodGreedyH *self, const guint8 * L1, const guint8 * L2, const guint8 * L3, const guint8 * L2P, guint8 * Dest, gint width)
34{
35
36  // in tight loop some vars are accessed faster in local storage
37  gint64 YMask = 0x00ff00ff00ff00ffull;        // to keep only luma
38  gint64 UVMask = 0xff00ff00ff00ff00ull;       // to keep only chroma
39  gint64 ShiftMask = 0xfefefefefefefefeull;    // to avoid shifting chroma to luma
40  gint64 QW256 = 0x0100010001000100ull;        // 4 256's
41  gint64 MaxComb;
42  gint64 MotionThreshold;
43  gint64 MotionSense;
44  gint64 i;
45  glong LoopCtr;
46  glong oldbx = 0;
47
48  gint64 QW256B;
49  gint64 LastAvg = 0;          //interp value from left qword
50
51  // FIXME: Use C implementation if the width is not a multiple of 4
52  // Do something more optimal later
53  if (width % 4 != 0)
54    C_FUNCT_YUY2 (self, L1, L2, L3, L2P, Dest, width);
55
56  // Set up our two parms that are actually evaluated for each pixel
57  i = self->max_comb;
58  MaxComb =
59      i << 56 | i << 48 | i << 40 | i << 32 | i << 24 | i << 16 | i << 8 | i;
60
61  i = self->motion_threshold;    // scale to range of 0-257
62  MotionThreshold = i << 48 | i << 32 | i << 16 | i | UVMask;
63
64  i = self->motion_sense;        // scale to range of 0-257
65  MotionSense = i << 48 | i << 32 | i << 16 | i;
66
67  i = 0xffffffff - 256;
68  QW256B = i << 48 | i << 32 | i << 16 | i;     // save a couple instr on PMINSW instruct.
69
70  LoopCtr = width / 8 - 1;       // there are LineLength / 4 qwords per line but do 1 less, adj at end of loop
71
72  // For ease of reading, the comments below assume that we're operating on an odd
73  // field (i.e., that InfoIsOdd is true).  Assume the obvious for even lines..
74  __asm__ __volatile__ (
75      // save ebx (-fPIC)
76      MOVX " %%" XBX ", %[oldbx]\n\t"
77      MOVX "  %[L1],          %%" XAX "\n\t"
78      LEAX "  8(%%" XAX "),     %%" XBX "\n\t"   // next qword needed by DJR
79      MOVX "  %[L3],          %%" XCX "\n\t"
80      SUBX "  %%" XAX ",        %%" XCX "\n\t"   // carry L3 addr as an offset
81      MOVX "  %[L2P],         %%" XDX "\n\t"
82      MOVX "  %[L2],          %%" XSI "\n\t"
83      MOVX "  %[Dest],        %%" XDI "\n\t"      // DL1 if Odd or DL2 if Even
84
85      ".align 8\n\t"
86      "1:\n\t"
87      "movq  (%%" XSI "),      %%mm0\n\t"       // L2 - the newest weave pixel value
88      "movq  (%%" XAX "),      %%mm1\n\t"       // L1 - the top pixel
89      "movq  (%%" XDX "),      %%mm2\n\t"       // L2P - the prev weave pixel
90      "movq  (%%" XAX ", %%" XCX "), %%mm3\n\t" // L3, next odd row
91      "movq  %%mm1,          %%mm6\n\t"         // L1 - get simple single pixel interp
92
93      //        pavgb   mm6, mm3                    // use macro below
94      V_PAVGB ("%%mm6", "%%mm3", "%%mm4", "%[ShiftMask]")
95
96      // DJR - Diagonal Jaggie Reduction
97      // In the event that we are going to use an average (Bob) pixel we do not want a jagged
98      // stair step effect.  To combat this we avg in the 2 horizontally adjacen pixels into the
99      // interpolated Bob mix. This will do horizontal smoothing for only the Bob'd pixels.
100
101      "movq  %[LastAvg],     %%mm4\n\t" // the bob value from prev qword in row
102      "movq  %%mm6,          %[LastAvg]\n\t"    // save for next pass
103      "psrlq $48,            %%mm4\n\t" // right justify 1 pixel
104      "movq  %%mm6,          %%mm7\n\t" // copy of simple bob pixel
105      "psllq $16,            %%mm7\n\t" // left justify 3 pixels
106      "por   %%mm7,          %%mm4\n\t" // and combine
107      "movq  (%%" XBX "),      %%mm5\n\t"       // next horiz qword from L1
108      // pavgb   mm5, qword ptr[ebx+ecx] // next horiz qword from L3, use macro below
109
110      V_PAVGB ("%%mm5", "(%%" XBX ",%%" XCX ")", "%%mm7", "%[ShiftMask]")
111      "psllq $48,            %%mm5\n\t" // left just 1 pixel
112      "movq  %%mm6,          %%mm7\n\t" // another copy of simple bob pixel
113      "psrlq $16,            %%mm7\n\t" // right just 3 pixels
114      "por   %%mm7,          %%mm5\n\t" // combine
115      // pavgb        mm4, mm5                        // avg of forward and prev by 1 pixel, use macro
116      V_PAVGB ("%%mm4", "%%mm5", "%%mm5", "%[ShiftMask]")       // mm5 gets modified if MMX
117      //                        pavgb        mm6, mm4                        // avg of center and surround interp vals, use macro
118      V_PAVGB ("%%mm6", "%%mm4", "%%mm7", "%[ShiftMask]")
119
120      // Don't do any more averaging than needed for mmx. It hurts performance and causes rounding errors.
121#ifndef IS_MMX
122      //          pavgb        mm4, mm6                        // 1/4 center, 3/4 adjacent
123      V_PAVGB ("%%mm4", "%%mm6", "%%mm7", "%[ShiftMask]")
124      //                    pavgb        mm6, mm4                        // 3/8 center, 5/8 adjacent
125      V_PAVGB ("%%mm6", "%%mm4", "%%mm7", "%[ShiftMask]")
126#endif
127
128      // get abs value of possible L2 comb
129      "movq    %%mm6,        %%mm4\n\t" // work copy of interp val
130      "movq    %%mm2,        %%mm7\n\t" // L2
131      "psubusb %%mm4,        %%mm7\n\t" // L2 - avg
132      "movq    %%mm4,        %%mm5\n\t" // avg
133      "psubusb %%mm2,        %%mm5\n\t" // avg - L2
134      "por     %%mm7,        %%mm5\n\t" // abs(avg-L2)
135
136      // get abs value of possible L2P comb
137      "movq    %%mm0,        %%mm7\n\t" // L2P
138      "psubusb %%mm4,        %%mm7\n\t" // L2P - avg
139      "psubusb %%mm0,        %%mm4\n\t" // avg - L2P
140      "por     %%mm7,        %%mm4\n\t" // abs(avg-L2P)
141
142      // use L2 or L2P depending upon which makes smaller comb
143      "psubusb %%mm5,        %%mm4\n\t" // see if it goes to zero
144      "psubusb %%mm5,        %%mm5\n\t" // 0
145      "pcmpeqb %%mm5,        %%mm4\n\t" // if (mm4=0) then FF else 0
146      "pcmpeqb %%mm4,        %%mm5\n\t" // opposite of mm4
147
148      // if Comb(L2P) <= Comb(L2) then mm4=ff, mm5=0 else mm4=0, mm5 = 55
149      "pand    %%mm2,        %%mm5\n\t" // use L2 if mm5 == ff, else 0
150      "pand    %%mm0,        %%mm4\n\t" // use L2P if mm4 = ff, else 0
151      "por     %%mm5,        %%mm4\n\t" // may the best win
152
153      // Inventory: at this point we have the following values:
154      // mm0 = L2P (or L2)
155      // mm1 = L1
156      // mm2 = L2 (or L2P)
157      // mm3 = L3
158      // mm4 = the best of L2,L2P weave pixel, base upon comb
159      // mm6 = the avg interpolated value, if we need to use it
160      // Let's measure movement, as how much the weave pixel has changed
161
162      "movq    %%mm2,        %%mm7\n\t"
163      "psubusb %%mm0,        %%mm2\n\t"
164      "psubusb %%mm7,        %%mm0\n\t"
165      "por     %%mm2,        %%mm0\n\t"   // abs value of change, used later
166
167      // Now lets clip our chosen value to be not outside of the range
168      // of the high/low range L1-L3 by more than MaxComb.
169      // This allows some comb but limits the damages and also allows more
170      // detail than a boring oversmoothed clip.
171
172      "movq    %%mm1,        %%mm2\n\t" // copy L1
173      // pmaxub mm2, mm3                     // use macro
174      V_PMAXUB ("%%mm2", "%%mm3")       // now = Max(L1,L3)
175      "movq    %%mm1,        %%mm5\n\t" // copy L1
176      // pminub        mm5, mm3                    // now = Min(L1,L3), use macro
177      V_PMINUB ("%%mm5", "%%mm3", "%%mm7")
178
179      // allow the value to be above the high or below the low by amt of MaxComb
180      "psubusb %[MaxComb],   %%mm5\n\t" // lower min by diff
181      "paddusb %[MaxComb],   %%mm2\n\t" // increase max by diff
182      // pmaxub        mm4, mm5         // now = Max(best,Min(L1,L3) use macro
183      V_PMAXUB ("%%mm4", "%%mm5")
184      // pminub        mm4, mm2         // now = Min( Max(best, Min(L1,L3), L2 )=L2 clipped
185      V_PMINUB ("%%mm4", "%%mm2", "%%mm7")
186
187      // Blend weave pixel with bob pixel, depending on motion val in mm0
188      "psubusb %[MotionThreshold], %%mm0\n\t"   // test Threshold, clear chroma change
189      "pmullw  %[MotionSense], %%mm0\n\t"       // mul by user factor, keep low 16 bits
190      "movq    %[QW256], %%mm7\n\t"
191#ifdef IS_MMXEXT
192      "pminsw  %%mm7,        %%mm0\n\t" // max = 256
193#else
194      "paddusw %[QW256B],    %%mm0\n\t" // add, may sat at fff..
195      "psubusw %[QW256B],    %%mm0\n\t" // now = Min(L1,256)
196#endif
197      "psubusw %%mm0,        %%mm7\n\t" // so the 2 sum to 256, weighted avg
198      "movq    %%mm4,        %%mm2\n\t" // save weave chroma info before trashing
199      "pand    %[YMask],     %%mm4\n\t" // keep only luma from calc'd value
200      "pmullw  %%mm7,        %%mm4\n\t" // use more weave for less motion
201      "pand    %[YMask],     %%mm6\n\t" // keep only luma from calc'd value
202      "pmullw  %%mm0,        %%mm6\n\t" // use more bob for large motion
203      "paddusw %%mm6,        %%mm4\n\t" // combine
204      "psrlw   $8,           %%mm4\n\t" // div by 256 to get weighted avg
205      // chroma comes from weave pixel
206      "pand    %[UVMask],    %%mm2\n\t" // keep chroma
207      "por     %%mm4,        %%mm2\n\t" // and combine
208      V_MOVNTQ ("(%%" XDI ")", "%%mm2") // move in our clipped best, use macro
209      // bump ptrs and loop
210      LEAX "    8(%%" XAX "),   %%" XAX "\n\t"
211      LEAX "    8(%%" XBX "),   %%" XBX "\n\t"
212      LEAX "    8(%%" XDX "),   %%" XDX "\n\t"
213      LEAX "    8(%%" XDI "),   %%" XDI "\n\t"
214      LEAX "    8(%%" XSI "),   %%" XSI "\n\t"
215      DECX "    %[LoopCtr]\n\t"
216
217      "jg      1b\n\t"   // loop if not to last line
218      // note P-III default assumes backward branches taken
219      "jl      1f\n\t"          // done
220      MOVX "    %%" XAX ",      %%" XBX "\n\t"  // sharpness lookahead 1 byte only, be wrong on 1
221      "jmp     1b\n\t"
222
223      "1:\n\t"
224      MOVX " %[oldbx], %%" XBX "\n\t"
225      "emms\n\t":     /* no outputs */
226
227      :[LastAvg] "m" (LastAvg),
228       [L1] "m" (L1),
229       [L3] "m" (L3),
230       [L2P] "m" (L2P),
231       [L2] "m" (L2),
232       [Dest] "m" (Dest),
233       [ShiftMask] "m" (ShiftMask),
234       [MaxComb] "m" (MaxComb),
235       [MotionThreshold] "m" (MotionThreshold),
236       [MotionSense] "m" (MotionSense),
237       [QW256B] "m" (QW256B),
238       [YMask] "m" (YMask),
239       [UVMask] "m" (UVMask),
240       [LoopCtr] "m" (LoopCtr),
241       [QW256] "m" (QW256),
242       [oldbx] "m" (oldbx)
243      : XAX, XCX, XDX, XSI, XDI,
244      "st", "st(1)", "st(2)", "st(3)", "st(4)", "st(5)", "st(6)", "st(7)",
245#ifdef __MMX__
246      "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7",
247#endif
248      "memory", "cc");
249}
250
251static void
252FUNCT_NAME_UYVY (GstDeinterlaceMethodGreedyH *self, const guint8 * L1, const guint8 * L2, const guint8 * L3, const guint8 * L2P, guint8 * Dest, gint width)
253{
254
255  // in tight loop some vars are accessed faster in local storage
256  gint64 YMask = 0xff00ff00ff00ff00ull;       // to keep only luma
257  gint64 UVMask = 0x00ff00ff00ff00ffull;        // to keep only chroma
258  gint64 ShiftMask = 0xfefefefefefefefeull;    // to avoid shifting chroma to luma
259  gint64 QW256 = 0x0100010001000100ull;        // 4 256's
260  gint64 MaxComb;
261  gint64 MotionThreshold;
262  gint64 MotionSense;
263  gint64 i;
264  glong LoopCtr;
265  glong oldbx = 0;
266
267  gint64 QW256B;
268  gint64 LastAvg = 0;          //interp value from left qword
269
270  // FIXME: Use C implementation if the width is not a multiple of 4
271  // Do something more optimal later
272  if (width % 4 != 0)
273    C_FUNCT_UYVY (self, L1, L2, L3, L2P, Dest, width);
274
275  // Set up our two parms that are actually evaluated for each pixel
276  i = self->max_comb;
277  MaxComb =
278      i << 56 | i << 48 | i << 40 | i << 32 | i << 24 | i << 16 | i << 8 | i;
279
280  i = self->motion_threshold;    // scale to range of 0-257
281  MotionThreshold = i << 48 | i << 32 | i << 16 | i | UVMask;
282
283  i = self->motion_sense;        // scale to range of 0-257
284  MotionSense = i << 48 | i << 32 | i << 16 | i;
285
286  i = 0xffffffff - 256;
287  QW256B = i << 48 | i << 32 | i << 16 | i;     // save a couple instr on PMINSW instruct.
288
289  LoopCtr = width / 8 - 1;       // there are LineLength / 4 qwords per line but do 1 less, adj at end of loop
290
291  // For ease of reading, the comments below assume that we're operating on an odd
292  // field (i.e., that InfoIsOdd is true).  Assume the obvious for even lines..
293  __asm__ __volatile__ (
294      // save ebx (-fPIC)
295      MOVX " %%" XBX ", %[oldbx]\n\t"
296      MOVX "  %[L1],          %%" XAX "\n\t"
297      LEAX "  8(%%" XAX "),     %%" XBX "\n\t"   // next qword needed by DJR
298      MOVX "  %[L3],          %%" XCX "\n\t"
299      SUBX "  %%" XAX ",        %%" XCX "\n\t"   // carry L3 addr as an offset
300      MOVX "  %[L2P],         %%" XDX "\n\t"
301      MOVX "  %[L2],          %%" XSI "\n\t"
302      MOVX "  %[Dest],        %%" XDI "\n\t"      // DL1 if Odd or DL2 if Even
303
304      ".align 8\n\t"
305      "1:\n\t"
306      "movq  (%%" XSI "),      %%mm0\n\t"       // L2 - the newest weave pixel value
307      "movq  (%%" XAX "),      %%mm1\n\t"       // L1 - the top pixel
308      "movq  (%%" XDX "),      %%mm2\n\t"       // L2P - the prev weave pixel
309      "movq  (%%" XAX ", %%" XCX "), %%mm3\n\t" // L3, next odd row
310      "movq  %%mm1,          %%mm6\n\t"         // L1 - get simple single pixel interp
311
312      //        pavgb   mm6, mm3                    // use macro below
313      V_PAVGB ("%%mm6", "%%mm3", "%%mm4", "%[ShiftMask]")
314
315      // DJR - Diagonal Jaggie Reduction
316      // In the event that we are going to use an average (Bob) pixel we do not want a jagged
317      // stair step effect.  To combat this we avg in the 2 horizontally adjacen pixels into the
318      // interpolated Bob mix. This will do horizontal smoothing for only the Bob'd pixels.
319
320      "movq  %[LastAvg],     %%mm4\n\t" // the bob value from prev qword in row
321      "movq  %%mm6,          %[LastAvg]\n\t"    // save for next pass
322      "psrlq $48,            %%mm4\n\t" // right justify 1 pixel
323      "movq  %%mm6,          %%mm7\n\t" // copy of simple bob pixel
324      "psllq $16,            %%mm7\n\t" // left justify 3 pixels
325      "por   %%mm7,          %%mm4\n\t" // and combine
326      "movq  (%%" XBX "),      %%mm5\n\t"       // next horiz qword from L1
327      // pavgb   mm5, qword ptr[ebx+ecx] // next horiz qword from L3, use macro below
328
329      V_PAVGB ("%%mm5", "(%%" XBX ",%%" XCX ")", "%%mm7", "%[ShiftMask]")
330      "psllq $48,            %%mm5\n\t" // left just 1 pixel
331      "movq  %%mm6,          %%mm7\n\t" // another copy of simple bob pixel
332      "psrlq $16,            %%mm7\n\t" // right just 3 pixels
333      "por   %%mm7,          %%mm5\n\t" // combine
334      // pavgb        mm4, mm5                        // avg of forward and prev by 1 pixel, use macro
335      V_PAVGB ("%%mm4", "%%mm5", "%%mm5", "%[ShiftMask]")       // mm5 gets modified if MMX
336      //                        pavgb        mm6, mm4                        // avg of center and surround interp vals, use macro
337      V_PAVGB ("%%mm6", "%%mm4", "%%mm7", "%[ShiftMask]")
338
339      // Don't do any more averaging than needed for mmx. It hurts performance and causes rounding errors.
340#ifndef IS_MMX
341      //          pavgb        mm4, mm6                        // 1/4 center, 3/4 adjacent
342      V_PAVGB ("%%mm4", "%%mm6", "%%mm7", "%[ShiftMask]")
343      //                    pavgb        mm6, mm4                        // 3/8 center, 5/8 adjacent
344      V_PAVGB ("%%mm6", "%%mm4", "%%mm7", "%[ShiftMask]")
345#endif
346
347      // get abs value of possible L2 comb
348      "movq    %%mm6,        %%mm4\n\t" // work copy of interp val
349      "movq    %%mm2,        %%mm7\n\t" // L2
350      "psubusb %%mm4,        %%mm7\n\t" // L2 - avg
351      "movq    %%mm4,        %%mm5\n\t" // avg
352      "psubusb %%mm2,        %%mm5\n\t" // avg - L2
353      "por     %%mm7,        %%mm5\n\t" // abs(avg-L2)
354
355      // get abs value of possible L2P comb
356      "movq    %%mm0,        %%mm7\n\t" // L2P
357      "psubusb %%mm4,        %%mm7\n\t" // L2P - avg
358      "psubusb %%mm0,        %%mm4\n\t" // avg - L2P
359      "por     %%mm7,        %%mm4\n\t" // abs(avg-L2P)
360
361      // use L2 or L2P depending upon which makes smaller comb
362      "psubusb %%mm5,        %%mm4\n\t" // see if it goes to zero
363      "psubusb %%mm5,        %%mm5\n\t" // 0
364      "pcmpeqb %%mm5,        %%mm4\n\t" // if (mm4=0) then FF else 0
365      "pcmpeqb %%mm4,        %%mm5\n\t" // opposite of mm4
366
367      // if Comb(L2P) <= Comb(L2) then mm4=ff, mm5=0 else mm4=0, mm5 = 55
368      "pand    %%mm2,        %%mm5\n\t" // use L2 if mm5 == ff, else 0
369      "pand    %%mm0,        %%mm4\n\t" // use L2P if mm4 = ff, else 0
370      "por     %%mm5,        %%mm4\n\t" // may the best win
371
372      // Inventory: at this point we have the following values:
373      // mm0 = L2P (or L2)
374      // mm1 = L1
375      // mm2 = L2 (or L2P)
376      // mm3 = L3
377      // mm4 = the best of L2,L2P weave pixel, base upon comb
378      // mm6 = the avg interpolated value, if we need to use it
379      // Let's measure movement, as how much the weave pixel has changed
380
381      "movq    %%mm2,        %%mm7\n\t"
382      "psubusb %%mm0,        %%mm2\n\t"
383      "psubusb %%mm7,        %%mm0\n\t"
384      "por     %%mm2,        %%mm0\n\t"   // abs value of change, used later
385
386      // Now lets clip our chosen value to be not outside of the range
387      // of the high/low range L1-L3 by more than MaxComb.
388      // This allows some comb but limits the damages and also allows more
389      // detail than a boring oversmoothed clip.
390
391      "movq    %%mm1,        %%mm2\n\t" // copy L1
392      // pmaxub mm2, mm3                     // use macro
393      V_PMAXUB ("%%mm2", "%%mm3")       // now = Max(L1,L3)
394      "movq    %%mm1,        %%mm5\n\t" // copy L1
395      // pminub        mm5, mm3                    // now = Min(L1,L3), use macro
396      V_PMINUB ("%%mm5", "%%mm3", "%%mm7")
397
398      // allow the value to be above the high or below the low by amt of MaxComb
399      "psubusb %[MaxComb],   %%mm5\n\t" // lower min by diff
400      "paddusb %[MaxComb],   %%mm2\n\t" // increase max by diff
401      // pmaxub        mm4, mm5         // now = Max(best,Min(L1,L3) use macro
402      V_PMAXUB ("%%mm4", "%%mm5")
403      // pminub        mm4, mm2         // now = Min( Max(best, Min(L1,L3), L2 )=L2 clipped
404      V_PMINUB ("%%mm4", "%%mm2", "%%mm7")
405
406      // Blend weave pixel with bob pixel, depending on motion val in mm0
407      "psubusb %[MotionThreshold], %%mm0\n\t"   // test Threshold, clear chroma change
408      "psrlw   $8,           %%mm0\n\t" // div by 256 to get weighted avg
409      "pmullw  %[MotionSense], %%mm0\n\t"       // mul by user factor, keep low 16 bits
410      "movq    %[QW256], %%mm7\n\t"
411#ifdef IS_MMXEXT
412      "pminsw  %%mm7,        %%mm0\n\t" // max = 256
413#else
414      "paddusw %[QW256B],    %%mm0\n\t" // add, may sat at fff..
415      "psubusw %[QW256B],    %%mm0\n\t" // now = Min(L1,256)
416#endif
417      "psubusw %%mm0,        %%mm7\n\t" // so the 2 sum to 256, weighted avg
418      "movq    %%mm4,        %%mm2\n\t" // save weave chroma info before trashing
419      "pand    %[YMask],     %%mm4\n\t" // keep only luma from calc'd value
420      "psrlw   $8,           %%mm4\n\t" // div by 256 to get weighted avg
421      "pmullw  %%mm7,        %%mm4\n\t" // use more weave for less motion
422      "pand    %[YMask],     %%mm6\n\t" // keep only luma from calc'd value
423      "psrlw   $8,           %%mm6\n\t" // div by 256 to get weighted avg
424      "pmullw  %%mm0,        %%mm6\n\t" // use more bob for large motion
425      "paddusw %%mm6,        %%mm4\n\t" // combine
426      "pand    %[YMask],     %%mm4\n\t" // keep only luma from calc'd value
427      // chroma comes from weave pixel
428      "pand    %[UVMask],    %%mm2\n\t" // keep chroma
429      "por     %%mm4,        %%mm2\n\t" // and combine
430      V_MOVNTQ ("(%%" XDI ")", "%%mm2") // move in our clipped best, use macro
431      // bump ptrs and loop
432      LEAX "    8(%%" XAX "),   %%" XAX "\n\t"
433      LEAX "    8(%%" XBX "),   %%" XBX "\n\t"
434      LEAX "    8(%%" XDX "),   %%" XDX "\n\t"
435      LEAX "    8(%%" XDI "),   %%" XDI "\n\t"
436      LEAX "    8(%%" XSI "),   %%" XSI "\n\t"
437      DECX "    %[LoopCtr]\n\t"
438
439      "jg      1b\n\t"   // loop if not to last line
440      // note P-III default assumes backward branches taken
441      "jl      1f\n\t"          // done
442      MOVX "    %%" XAX ",      %%" XBX "\n\t"  // sharpness lookahead 1 byte only, be wrong on 1
443      "jmp     1b\n\t"
444
445      "1:\n\t"
446      MOVX " %[oldbx], %%" XBX "\n\t"
447      "emms\n\t":     /* no outputs */
448
449      :[LastAvg] "m" (LastAvg),
450       [L1] "m" (L1),
451       [L3] "m" (L3),
452       [L2P] "m" (L2P),
453       [L2] "m" (L2),
454       [Dest] "m" (Dest),
455       [ShiftMask] "m" (ShiftMask),
456       [MaxComb] "m" (MaxComb),
457       [MotionThreshold] "m" (MotionThreshold),
458       [MotionSense] "m" (MotionSense),
459       [QW256B] "m" (QW256B),
460       [YMask] "m" (YMask),
461       [UVMask] "m" (UVMask),
462       [LoopCtr] "m" (LoopCtr),
463       [QW256] "m" (QW256),
464       [oldbx] "m" (oldbx)
465      : XAX, XCX, XDX, XSI, XDI,
466      "st", "st(1)", "st(2)", "st(3)", "st(4)", "st(5)", "st(6)", "st(7)",
467#ifdef __MMX__
468      "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7",
469#endif
470      "memory", "cc");
471}
472
473