1 /*****************************************************************************
2  *
3  *  XVID MPEG-4 VIDEO CODEC
4  *  - GMC interpolation module -
5  *
6  *  Copyright(C) 2002-2003 Pascal Massimino <skal@planet-d.net>
7  *
8  *  This program is free software ; you can redistribute it and/or modify
9  *  it under the terms of the GNU General Public License as published by
10  *  the Free Software Foundation ; either version 2 of the License, or
11  *  (at your option) any later version.
12  *
13  *  This program is distributed in the hope that it will be useful,
14  *  but WITHOUT ANY WARRANTY ; without even the implied warranty of
15  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16  *  GNU General Public License for more details.
17  *
18  *  You should have received a copy of the GNU General Public License
19  *  along with this program ; if not, write to the Free Software
20  *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
21  *
22  * $Id: gmc.c 2180 2019-11-12 14:48:35Z Isibaar $
23  *
24  ****************************************************************************/
25 
26 #include "../portab.h"
27 #include "../global.h"
28 #include "../encoder.h"
29 #include "gmc.h"
30 #include "../utils/emms.h"
31 
32 #include <stdio.h>
33 
34   /* initialized by init_GMC(), for 3points */
35 static
36 void (*Predict_16x16_func)(const NEW_GMC_DATA * const This,
37                            uint8_t *dst, const uint8_t *src,
38                            int dststride, int srcstride, int x, int y, int rounding) = 0;
39 static
40 void (*Predict_8x8_func)(const NEW_GMC_DATA * const This,
41                          uint8_t *uDst, const uint8_t *uSrc,
42                          uint8_t *vDst, const uint8_t *vSrc,
43                          int dststride, int srcstride, int x, int y, int rounding) = 0;
44 
45 /****************************************************************************/
46 /* this is borrowed from   bitstream.c  until we find a common solution */
/*
 * Returns the number of bits needed to represent `value`, i.e. the 1-based
 * position of its highest set bit.  log2bin(0) is 0.
 */
static uint32_t __inline
log2bin(uint32_t value)
{
/* Changed by Chenm001 */
#if !defined(_MSC_VER) || defined(ARCH_IS_X86_64)
	uint32_t bits = 0;

	for (; value != 0; value >>= 1)
		bits++;

	return bits;
#else
	/* BSR leaves its destination undefined when the source operand is zero,
	 * so answer that case here to agree with the portable branch above. */
	if (value == 0)
		return 0;

	/* result is left in eax, which is the return register */
	__asm {
		bsr eax, value
		inc eax
	}
#endif
}
66 
67 /* 16*sizeof(int) -> 1 or 2 cachelines */
68 /* table lookup might be faster!  (still to be benchmarked) */
69 
70 /*
71 static int log2bin_table[16] =
72 	{ 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4};
73 */
74 /*	1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 */
75 
/* a/b with half of b added (a > 0) or subtracted (a <= 0) before dividing.
 * Both arguments are evaluated more than once. */
#define RDIV(a,b)   ( ( (a)>0 ? (a) + ((b)>>1) : (a) - ((b)>>1) ) / (b) )

/* a >> b with a rounding term of 1<<(b-1) for a > 0 and (1<<(b-1))-1
 * otherwise.  b must be at least 1; arguments are evaluated more than once. */
#define RSHIFT(a,b) ( (a)>0 ? ((a) + (1<<((b)-1)))>>(b) : ((a) + (1<<((b)-1))-1)>>(b) )

/*
 * Packed interpolation weights for a 4-bit fraction f:
 * MTab[f] = ((16 - f) << 16) + f, i.e. (16 - f) in the upper half-word
 * and f in the lower one.
 */
static const uint32_t MTab[16] = {
	0x00100000, 0x000F0001, 0x000E0002, 0x000D0003,
	0x000C0004, 0x000B0005, 0x000A0006, 0x00090007,
	0x00080008, 0x00070009, 0x0006000A, 0x0005000B,
	0x0004000C, 0x0003000D, 0x0002000E, 0x0001000F
};
85 
/* ************************************************************
 * Pts = 2 or 3
 *
 * Warning! *src is the global frame pointer (that is: the address
 * of pixel 0,0), not the macroblock one.
 * Conversely, *dst is the address of the macroblock's top-left pixel.
 */
93 
94 static
Predict_16x16_C(const NEW_GMC_DATA * const This,uint8_t * dst,const uint8_t * src,int dststride,int srcstride,int x,int y,int rounding)95 void Predict_16x16_C(const NEW_GMC_DATA * const This,
96                      uint8_t *dst, const uint8_t *src,
97                      int dststride, int srcstride, int x, int y, int rounding)
98 {
99 	const int W = This->sW;
100 	const int H	= This->sH;
101 	const int rho = 3 - This->accuracy;
102 	const int Rounder = ( (1<<7) - (rounding<<(2*rho)) ) << 16;
103 
104 	const int dUx = This->dU[0];
105 	const int dVx = This->dV[0];
106 	const int dUy = This->dU[1];
107 	const int dVy = This->dV[1];
108 
109 	int Uo = This->Uo + 16*(dUy*y + dUx*x);
110 	int Vo = This->Vo + 16*(dVy*y + dVx*x);
111 
112 	int i, j;
113 
114 	dst += 16;
115 	for (j=16; j>0; --j) {
116 		int U = Uo, V = Vo;
117 		Uo += dUy; Vo += dVy;
118 		for (i=-16; i<0; ++i) {
119 			unsigned int f0, f1, ri = 16, rj = 16;
120 			int Offset;
121 			int u = ( U >> 16 ) << rho;
122 			int v = ( V >> 16 ) << rho;
123 
124 			U += dUx; V += dVx;
125 
126 			if (u > 0 && u <= W) { ri = MTab[u&15]; Offset = u>>4;	}
127 			else {
128 				if (u > W) Offset = W>>4;
129 				else Offset = 0;
130 				ri = MTab[0];
131 			}
132 
133 			if (v > 0 && v <= H) { rj = MTab[v&15]; Offset += (v>>4)*srcstride; }
134 			else {
135 				if (v > H) Offset += (H>>4)*srcstride;
136 				rj = MTab[0];
137 			}
138 
139 			f0	= src[Offset + 0];
140 			f0 |= src[Offset + 1] << 16;
141 			f1	= src[Offset + srcstride + 0];
142 			f1 |= src[Offset + srcstride + 1] << 16;
143 			f0 = (ri*f0)>>16;
144 			f1 = (ri*f1) & 0x0fff0000;
145 			f0 |= f1;
146 			f0 = (rj*f0 + Rounder) >> 24;
147 
148 			dst[i] = (uint8_t)f0;
149 		}
150 		dst += dststride;
151 	}
152 }
153 
154 static
Predict_8x8_C(const NEW_GMC_DATA * const This,uint8_t * uDst,const uint8_t * uSrc,uint8_t * vDst,const uint8_t * vSrc,int dststride,int srcstride,int x,int y,int rounding)155 void Predict_8x8_C(const NEW_GMC_DATA * const This,
156                    uint8_t *uDst, const uint8_t *uSrc,
157                    uint8_t *vDst, const uint8_t *vSrc,
158                    int dststride, int srcstride, int x, int y, int rounding)
159 {
160 	const int W	 = This->sW >> 1;
161 	const int H	 = This->sH >> 1;
162 	const int rho = 3-This->accuracy;
163 	const int32_t Rounder = ( 128 - (rounding<<(2*rho)) ) << 16;
164 
165 	const int32_t dUx = This->dU[0];
166 	const int32_t dVx = This->dV[0];
167 	const int32_t dUy = This->dU[1];
168 	const int32_t dVy = This->dV[1];
169 
170 	int32_t Uo = This->Uco + 8*(dUy*y + dUx*x);
171 	int32_t Vo = This->Vco + 8*(dVy*y + dVx*x);
172 
173 	int i, j;
174 
175 	uDst += 8;
176 	vDst += 8;
177 	for (j=8; j>0; --j) {
178 		int32_t U = Uo, V = Vo;
179 		Uo += dUy; Vo += dVy;
180 
181 		for (i=-8; i<0; ++i) {
182 			int Offset;
183 			uint32_t f0, f1, ri, rj;
184 			int32_t u, v;
185 
186 			u = ( U >> 16 ) << rho;
187 			v = ( V >> 16 ) << rho;
188 			U += dUx; V += dVx;
189 
190 			if (u > 0 && u <= W) {
191 				ri = MTab[u&15];
192 				Offset = u>>4;
193 			} else {
194 				if (u>W) Offset = W>>4;
195 				else Offset = 0;
196 				ri = MTab[0];
197 			}
198 
199 			if (v > 0 && v <= H) {
200 				rj = MTab[v&15];
201 				Offset += (v>>4)*srcstride;
202 			} else {
203 				if (v>H) Offset += (H>>4)*srcstride;
204 				rj = MTab[0];
205 			}
206 
207 			f0	= uSrc[Offset + 0];
208 			f0 |= uSrc[Offset + 1] << 16;
209 			f1	= uSrc[Offset + srcstride + 0];
210 			f1 |= uSrc[Offset + srcstride + 1] << 16;
211 			f0 = (ri*f0)>>16;
212 			f1 = (ri*f1) & 0x0fff0000;
213 			f0 |= f1;
214 			f0 = (rj*f0 + Rounder) >> 24;
215 
216 			uDst[i] = (uint8_t)f0;
217 
218 			f0	= vSrc[Offset + 0];
219 			f0 |= vSrc[Offset + 1] << 16;
220 			f1	= vSrc[Offset + srcstride + 0];
221 			f1 |= vSrc[Offset + srcstride + 1] << 16;
222 			f0 = (ri*f0)>>16;
223 			f1 = (ri*f1) & 0x0fff0000;
224 			f0 |= f1;
225 			f0 = (rj*f0 + Rounder) >> 24;
226 
227 			vDst[i] = (uint8_t)f0;
228 		}
229 		uDst += dststride;
230 		vDst += dststride;
231 	}
232 }
233 
234 static
get_average_mv_C(const NEW_GMC_DATA * const Dsp,VECTOR * const mv,int x,int y,int qpel)235 void get_average_mv_C(const NEW_GMC_DATA * const Dsp, VECTOR * const mv,
236                       int x, int y, int qpel)
237 {
238 	int i, j;
239 	int vx = 0, vy = 0;
240 	int32_t uo = Dsp->Uo + 16*(Dsp->dU[1]*y + Dsp->dU[0]*x);
241 	int32_t vo = Dsp->Vo + 16*(Dsp->dV[1]*y + Dsp->dV[0]*x);
242 	for (j=16; j>0; --j)
243 	{
244 	int32_t U, V;
245 	U = uo; uo += Dsp->dU[1];
246 	V = vo; vo += Dsp->dV[1];
247 	for (i=16; i>0; --i)
248 	{
249 		int32_t u,v;
250 		u = U >> 16; U += Dsp->dU[0]; vx += u;
251 		v = V >> 16; V += Dsp->dV[0]; vy += v;
252 	}
253 	}
254 	vx -= (256*x+120) << (5+Dsp->accuracy);	/* 120 = 15*16/2 */
255 	vy -= (256*y+120) << (5+Dsp->accuracy);
256 
257 	mv->x = RSHIFT( vx, 8+Dsp->accuracy - qpel );
258 	mv->y = RSHIFT( vy, 8+Dsp->accuracy - qpel );
259 }
260 
261 /* ************************************************************
262  * simplified version for 1 warp point
263  */
264 
265 static
Predict_1pt_16x16_C(const NEW_GMC_DATA * const This,uint8_t * Dst,const uint8_t * Src,int dststride,int srcstride,int x,int y,int rounding)266 void Predict_1pt_16x16_C(const NEW_GMC_DATA * const This,
267                          uint8_t *Dst, const uint8_t *Src,
268                          int dststride, int srcstride, int x, int y, int rounding)
269 {
270 	const int W	 = This->sW;
271 	const int H	 = This->sH;
272 	const int rho = 3-MIN(This->accuracy, 3);
273 	const int32_t Rounder = ( 128 - (rounding<<(2*rho)) ) << 16;
274 
275 
276 	int32_t uo = This->Uo + (x<<8);	 /* ((16*x)<<4) */
277 	int32_t vo = This->Vo + (y<<8);
278 	uint32_t ri = MTab[uo & 15];
279 	uint32_t rj = MTab[vo & 15];
280 	int i, j;
281 
282 	int32_t Offset;
283 	if (vo>=(-16<<4) && vo<=H) Offset = (vo>>4)*srcstride;
284 	else {
285 		if (vo>H) Offset = ( H>>4)*srcstride;
286 		else Offset =-16*srcstride;
287 		rj = MTab[0];
288 	}
289 	if (uo>=(-16<<4) && uo<=W) Offset += (uo>>4);
290 	else {
291 		if (uo>W) Offset += (W>>4);
292 		else Offset -= 16;
293 		ri = MTab[0];
294 	}
295 
296 	Dst += 16;
297 
298 	for(j=16; j>0; --j, Offset+=srcstride-16)
299 	{
300 	for(i=-16; i<0; ++i, ++Offset)
301 	{
302 		uint32_t f0, f1;
303 		f0	= Src[ Offset		+0 ];
304 		f0 |= Src[ Offset		+1 ] << 16;
305 		f1	= Src[ Offset+srcstride +0 ];
306 		f1 |= Src[ Offset+srcstride +1 ] << 16;
307 		f0 = (ri*f0)>>16;
308 		f1 = (ri*f1) & 0x0fff0000;
309 		f0 |= f1;
310 		f0 = ( rj*f0 + Rounder ) >> 24;
311 		Dst[i] = (uint8_t)f0;
312 	}
313 	Dst += dststride;
314 	}
315 }
316 
317 static
Predict_1pt_8x8_C(const NEW_GMC_DATA * const This,uint8_t * uDst,const uint8_t * uSrc,uint8_t * vDst,const uint8_t * vSrc,int dststride,int srcstride,int x,int y,int rounding)318 void Predict_1pt_8x8_C(const NEW_GMC_DATA * const This,
319                        uint8_t *uDst, const uint8_t *uSrc,
320                        uint8_t *vDst, const uint8_t *vSrc,
321                        int dststride, int srcstride, int x, int y, int rounding)
322 {
323 	const int W	 = This->sW >> 1;
324 	const int H	 = This->sH >> 1;
325 	const int rho = 3-This->accuracy;
326 	const int32_t Rounder = ( 128 - (rounding<<(2*rho)) ) << 16;
327 
328 	int32_t uo = This->Uco + (x<<7);
329 	int32_t vo = This->Vco + (y<<7);
330 	uint32_t rri = MTab[uo & 15];
331 	uint32_t rrj = MTab[vo & 15];
332 	int i, j;
333 
334 	int32_t Offset;
335 	if (vo>=(-8<<4) && vo<=H) Offset = (vo>>4)*srcstride;
336 	else {
337 		if (vo>H) Offset = ( H>>4)*srcstride;
338 		else Offset =-8*srcstride;
339 		rrj = MTab[0];
340 	}
341 	if (uo>=(-8<<4) && uo<=W) Offset += (uo>>4);
342 	else {
343 		if (uo>W) Offset += ( W>>4);
344 		else Offset -= 8;
345 		rri = MTab[0];
346 	}
347 
348 	uDst += 8;
349 	vDst += 8;
350 	for(j=8; j>0; --j, Offset+=srcstride-8)
351 	{
352 	for(i=-8; i<0; ++i, Offset++)
353 	{
354 		uint32_t f0, f1;
355 		f0	= uSrc[ Offset + 0 ];
356 		f0 |= uSrc[ Offset + 1 ] << 16;
357 		f1	= uSrc[ Offset + srcstride + 0 ];
358 		f1 |= uSrc[ Offset + srcstride + 1 ] << 16;
359 		f0 = (rri*f0)>>16;
360 		f1 = (rri*f1) & 0x0fff0000;
361 		f0 |= f1;
362 		f0 = ( rrj*f0 + Rounder ) >> 24;
363 		uDst[i] = (uint8_t)f0;
364 
365 		f0	= vSrc[ Offset + 0 ];
366 		f0 |= vSrc[ Offset + 1 ] << 16;
367 		f1	= vSrc[ Offset + srcstride + 0 ];
368 		f1 |= vSrc[ Offset + srcstride + 1 ] << 16;
369 		f0 = (rri*f0)>>16;
370 		f1 = (rri*f1) & 0x0fff0000;
371 		f0 |= f1;
372 		f0 = ( rrj*f0 + Rounder ) >> 24;
373 		vDst[i] = (uint8_t)f0;
374 	}
375 	uDst += dststride;
376 	vDst += dststride;
377 	}
378 }
379 
380 static
get_average_mv_1pt_C(const NEW_GMC_DATA * const Dsp,VECTOR * const mv,int x,int y,int qpel)381 void get_average_mv_1pt_C(const NEW_GMC_DATA * const Dsp, VECTOR * const mv,
382 							int x, int y, int qpel)
383 {
384 	mv->x = RSHIFT(Dsp->Uo<<qpel, 3);
385 	mv->y = RSHIFT(Dsp->Vo<<qpel, 3);
386 }
387 
388 #if defined(ARCH_IS_IA32) || defined(ARCH_IS_X86_64)
389 /* *************************************************************
390  * MMX core function
391  */
392 
/* Signature shared by the external "linear" 8-pixel core routines. */
typedef void (gmc_core_lin_8_fn)(uint8_t *Dst, const uint16_t * Offsets,
                                 const uint8_t * const Src0, const int BpS,
                                 const int Rounder);

/* Core used by the *_mmx predictors; null until init_GMC() selects one. */
static gmc_core_lin_8_fn *GMC_Core_Lin_8 = 0;

/* Implementations defined outside this file. */
extern gmc_core_lin_8_fn xvid_GMC_Core_Lin_8_mmx;
extern gmc_core_lin_8_fn xvid_GMC_Core_Lin_8_sse2;
extern gmc_core_lin_8_fn xvid_GMC_Core_Lin_8_sse41;
405 
406 /* *************************************************************/
407 
GMC_Core_Non_Lin_8(uint8_t * Dst,const uint16_t * Offsets,const uint8_t * const Src0,const int srcstride,const int Rounder)408 static void GMC_Core_Non_Lin_8(uint8_t *Dst,
409                                const uint16_t * Offsets,
410                                const uint8_t * const Src0, const int srcstride,
411                                const int Rounder)
412 {
413   int i;
414   for(i=0; i<8; ++i)
415   {
416     uint32_t u = Offsets[i   ];
417     uint32_t v = Offsets[i+16];
418     const uint32_t ri = MTab[u&0x0f];
419     const uint32_t rj = MTab[v&0x0f];
420     uint32_t f0, f1;
421     const uint8_t * const Src = Src0 + (u>>4) + (v>>4)*srcstride;
422     f0  = Src[0];
423     f0 |= Src[1] << 16;
424     f1  = Src[srcstride +0];
425     f1 |= Src[srcstride +1] << 16;
426     f0 = (ri*f0)>>16;
427     f1 = (ri*f1) & 0x0fff0000;
428     f0 |= f1;
429     f0 = ( rj*f0 + Rounder ) >> 24;
430     Dst[i] = (uint8_t)f0;
431   }
432 }
433 
434 //////////////////////////////////////////////////////////
435 
436 static
Predict_16x16_mmx(const NEW_GMC_DATA * const This,uint8_t * dst,const uint8_t * src,int dststride,int srcstride,int x,int y,int rounding)437 void Predict_16x16_mmx(const NEW_GMC_DATA * const This,
438                        uint8_t *dst, const uint8_t *src,
439                        int dststride, int srcstride, int x, int y, int rounding)
440 {
441   const int W = This->sW;
442   const int H = This->sH;
443   const int rho = 3 - This->accuracy;
444   const int Rounder = ( 128 - (rounding<<(2*rho)) ) << 16;
445   const uint32_t W2 = W<<(16-rho);
446   const uint32_t H2 = H<<(16-rho);
447 
448   const int dUx = This->dU[0];
449   const int dVx = This->dV[0];
450   const int dUy = This->dU[1];
451   const int dVy = This->dV[1];
452 
453   int Uo = This->Uo + 16*(dUy*y + dUx*x);
454   int Vo = This->Vo + 16*(dVy*y + dVx*x);
455 
456   int i, j;
457 
458   DECLARE_ALIGNED_MATRIX(Offsets, 2,16, uint16_t, CACHE_LINE);
459   for(j=16; j>0; --j)
460   {
461     int32_t U = Uo, V = Vo;
462     Uo += dUy; Vo += dVy;
463     if ( W2>(uint32_t)U && W2>(uint32_t)(U+15*dUx) &&
464          H2>(uint32_t)V && H2>(uint32_t)(V+15*dVx) )
465     {
466       uint32_t UV1, UV2;
467       for(i=0; i<16; ++i)
468       {
469         uint32_t u = ( U >> 16 ) << rho;
470         uint32_t v = ( V >> 16 ) << rho;
471         U += dUx;  V += dVx;
472         Offsets[   i] = u;
473         Offsets[16+i] = v;
474       }
475           // batch 8 input pixels when linearity says it's ok
476 
477       UV1 = (Offsets[0] | (Offsets[16]<<16)) & 0xfff0fff0U;
478       UV2 = (Offsets[7] | (Offsets[23]<<16)) & 0xfff0fff0U;
479       if (UV1+7*16==UV2)
480         GMC_Core_Lin_8(dst,    Offsets,    src + (Offsets[0]>>4) + (Offsets[16]>>4)*srcstride, srcstride, Rounder);
481       else
482         GMC_Core_Non_Lin_8(dst,   Offsets,   src, srcstride, Rounder);
483       UV1 = (Offsets[ 8] | (Offsets[24]<<16)) & 0xfff0fff0U;
484       UV2 = (Offsets[15] | (Offsets[31]<<16)) & 0xfff0fff0U;
485       if (UV1+7*16==UV2)
486         GMC_Core_Lin_8(dst+8,  Offsets+8,  src + (Offsets[8]>>4) + (Offsets[24]>>4)*srcstride, srcstride, Rounder);
487       else
488         GMC_Core_Non_Lin_8(dst+8, Offsets+8, src, srcstride, Rounder);
489     }
490     else
491     {
492       for(i=0; i<16; ++i)
493       {
494         int u = ( U >> 16 ) << rho;
495         int v = ( V >> 16 ) << rho;
496         U += dUx; V += dVx;
497 
498         Offsets[   i] = (u<0) ? 0 : (u>=W) ? W : u;
499         Offsets[16+i] = (v<0) ? 0 : (v>=H) ? H : v;
500       }
501         // due to boundary clipping, we cannot infer the 8-pixels batchability
502         // simply by using the linearity. Oh well, not a big deal...
503       GMC_Core_Non_Lin_8(dst,   Offsets,   src, srcstride, Rounder);
504       GMC_Core_Non_Lin_8(dst+8, Offsets+8, src, srcstride, Rounder);
505     }
506     dst += dststride;
507   }
508 }
509 
510 static
Predict_8x8_mmx(const NEW_GMC_DATA * const This,uint8_t * uDst,const uint8_t * uSrc,uint8_t * vDst,const uint8_t * vSrc,int dststride,int srcstride,int x,int y,int rounding)511 void Predict_8x8_mmx(const NEW_GMC_DATA * const This,
512                      uint8_t *uDst, const uint8_t *uSrc,
513                      uint8_t *vDst, const uint8_t *vSrc,
514                      int dststride, int srcstride, int x, int y, int rounding)
515 {
516   const int W   = This->sW >> 1;
517   const int H   = This->sH >> 1;
518   const int rho = 3-This->accuracy;
519   const int32_t Rounder = ( 128 - (rounding<<(2*rho)) ) << 16;
520   const uint32_t W2 = W<<(16-rho);
521   const uint32_t H2 = H<<(16-rho);
522 
523   const int dUx = This->dU[0];
524   const int dVx = This->dV[0];
525   const int dUy = This->dU[1];
526   const int dVy = This->dV[1];
527 
528   int Uo = This->Uco + 8*(dUy*y + dUx*x);
529   int Vo = This->Vco + 8*(dVy*y + dVx*x);
530 
531   DECLARE_ALIGNED_MATRIX(Offsets, 2,16, uint16_t, CACHE_LINE);
532   int i, j;
533   for(j=8; j>0; --j)
534   {
535     int32_t U = Uo, V = Vo;
536     Uo += dUy; Vo += dVy;
537     if ( W2>(uint32_t)U && W2>(uint32_t)(U+15*dUx) &&
538          H2>(uint32_t)V && H2>(uint32_t)(V+15*dVx) )
539     {
540       uint32_t UV1, UV2;
541       for(i=0; i<8; ++i)
542       {
543         int32_t u = ( U >> 16 ) << rho;
544         int32_t v = ( V >> 16 ) << rho;
545         U += dUx; V += dVx;
546         Offsets[   i] = u;
547         Offsets[16+i] = v;
548       }
549 
550           // batch 8 input pixels when linearity says it's ok
551 			UV1 = (Offsets[ 0] | (Offsets[16]<<16)) & 0xfff0fff0U;
552 			UV2 = (Offsets[ 7] | (Offsets[23]<<16)) & 0xfff0fff0U;
553 			if (UV1+7*16==UV2)
554       {
555 				const uint32_t Off = (Offsets[0]>>4) + (Offsets[16]>>4)*srcstride;
556 				GMC_Core_Lin_8(uDst, Offsets, uSrc+Off, srcstride, Rounder);
557 				GMC_Core_Lin_8(vDst, Offsets, vSrc+Off, srcstride, Rounder);
558       }
559       else {
560         GMC_Core_Non_Lin_8(uDst, Offsets, uSrc, srcstride, Rounder);
561         GMC_Core_Non_Lin_8(vDst, Offsets, vSrc, srcstride, Rounder);
562       }
563     }
564     else
565     {
566       for(i=0; i<8; ++i)
567       {
568         int u = ( U >> 16 ) << rho;
569         int v = ( V >> 16 ) << rho;
570         U += dUx; V += dVx;
571         Offsets[   i] = (u<0) ? 0 : (u>=W) ? W : u;
572         Offsets[16+i] = (v<0) ? 0 : (v>=H) ? H : v;
573       }
574       GMC_Core_Non_Lin_8(uDst, Offsets, uSrc, srcstride, Rounder);
575       GMC_Core_Non_Lin_8(vDst, Offsets, vSrc, srcstride, Rounder);
576     }
577     uDst += dststride;
578     vDst += dststride;
579   }
580 }
581 
582 #endif /* ARCH_IS_IA32 */
583 
584 /* *************************************************************
585  * will initialize internal pointers
586  */
587 
init_GMC(const unsigned int cpu_flags)588 void init_GMC(const unsigned int cpu_flags)
589 {
590       Predict_16x16_func = Predict_16x16_C;
591       Predict_8x8_func   = Predict_8x8_C;
592 
593 #if defined(ARCH_IS_IA32) || defined(ARCH_IS_X86_64)
594       if ((cpu_flags & XVID_CPU_MMX)   || (cpu_flags & XVID_CPU_MMXEXT)   ||
595           (cpu_flags & XVID_CPU_3DNOW) || (cpu_flags & XVID_CPU_3DNOWEXT) ||
596           (cpu_flags & XVID_CPU_SSE)   || (cpu_flags & XVID_CPU_SSE2) ||
597           (cpu_flags & XVID_CPU_SSE3)  || (cpu_flags & XVID_CPU_SSE41))
598 	{
599 	   Predict_16x16_func = Predict_16x16_mmx;
600 	   Predict_8x8_func   = Predict_8x8_mmx;
601 
602            if (cpu_flags & XVID_CPU_SSE41)
603 	     GMC_Core_Lin_8 = xvid_GMC_Core_Lin_8_sse41;
604 	   else if (cpu_flags & XVID_CPU_SSE2)
605 	     GMC_Core_Lin_8 = xvid_GMC_Core_Lin_8_sse2;
606 	   else
607              GMC_Core_Lin_8 = xvid_GMC_Core_Lin_8_mmx;
608 	}
609 #endif
610 }
611 
612 /* *************************************************************
613  * Warning! It's Accuracy being passed, not 'resolution'!
614  */
615 
/*
 * Fills *gmc from the warp points in *pts.
 *
 * nb_pts    number of warp points supplied; stored unchanged in num_wp
 * accuracy  accuracy value (not 'resolution'); the code below treats
 *           0..3 as the range, giving rho = 3..0
 * pts       warp point displacements duv[0..2]
 * width, height  picture size; stored as sW/sH with 4 fractional bits
 *
 * The point count is first reduced: when fewer than two points are given,
 * or duv[1] and duv[2] are both zero, the model collapses to one point
 * (duv[0] non-zero) or to zero points.  Zero or one point installs the
 * *_1pt_C routines with 4-bit fixed-point offsets; two or three points
 * install the routines chosen by init_GMC() together with 16-bit
 * fixed-point increments dU/dV and offsets Uo/Vo/Uco/Vco.
 */
void generate_GMCparameters( int nb_pts, const int accuracy,
                             const WARPPOINTS *const pts,
                             const int width, const int height,
                             NEW_GMC_DATA *const gmc)
{
	gmc->sW = width	<< 4;
	gmc->sH = height << 4;
	gmc->accuracy = accuracy;
	gmc->num_wp = nb_pts;

	/* reduce the number of points, if possible */
	if (nb_pts<2 ||
	    (pts->duv[2].x==0 && pts->duv[2].y==0 &&
	     pts->duv[1].x==0 && pts->duv[1].y==0)) {
		if (nb_pts<1 || (pts->duv[0].x==0 && pts->duv[0].y==0))
			nb_pts = 0;
		else
			nb_pts = 1;
	}

	/* now, nb_pts stores the actual number of points required for interpolation */

	if (nb_pts >= 2) {
		/* 2 or 3 points */
		const int rho = 3 - accuracy;	/* = {3,2,1,0} for Acc={0,1,2,3} */
		int Alpha = log2bin(width-1);
		const int Ws = 1 << Alpha;
		int upshift;

		gmc->dU[0] = 16*Ws + RDIV( 8*Ws*pts->duv[1].x, width );	/* dU/dx */
		gmc->dV[0] =         RDIV( 8*Ws*pts->duv[1].y, width );	/* dV/dx */

		if (nb_pts == 2) {
			gmc->dU[1] = -gmc->dV[0];	/* -Sin */
			gmc->dV[1] =  gmc->dU[0];	/* Cos */
		} else {
			const int Beta = log2bin(height-1);
			const int Hs = 1<<Beta;

			gmc->dU[1] =         RDIV( 8*Hs*pts->duv[2].x, height );	/* dU/dy */
			gmc->dV[1] = 16*Hs + RDIV( 8*Hs*pts->duv[2].y, height );	/* dV/dy */

			/* bring both axes to the larger of the two scales */
			if (Beta > Alpha) {
				gmc->dU[0] <<= (Beta-Alpha);
				gmc->dV[0] <<= (Beta-Alpha);
				Alpha = Beta;
			} else {
				gmc->dU[1] <<= Alpha - Beta;
				gmc->dV[1] <<= Alpha - Beta;
			}
		}

		/* upscale to 16b fixed-point */
		upshift = 16 - Alpha - rho;
		gmc->dU[0] <<= upshift;
		gmc->dU[1] <<= upshift;
		gmc->dV[0] <<= upshift;
		gmc->dV[1] <<= upshift;

		gmc->Uo  = ( pts->duv[0].x    <<(16+ accuracy)) + (1<<15);
		gmc->Vo  = ( pts->duv[0].y    <<(16+ accuracy)) + (1<<15);
		gmc->Uco = ((pts->duv[0].x-1)<<(17+ accuracy)) + (1<<17);
		gmc->Vco = ((pts->duv[0].y-1)<<(17+ accuracy)) + (1<<17);
		gmc->Uco = (gmc->Uco + gmc->dU[0] + gmc->dU[1])>>2;
		gmc->Vco = (gmc->Vco + gmc->dV[0] + gmc->dV[1])>>2;

		gmc->predict_16x16  = Predict_16x16_func;
		gmc->predict_8x8    = Predict_8x8_func;
		gmc->get_average_mv = get_average_mv_C;
	} else {
		if (nb_pts == 1) {
			/* store as 4b fixed point */
			gmc->Uo  = pts->duv[0].x << accuracy;
			gmc->Vo  = pts->duv[0].y << accuracy;
			gmc->Uco = ((pts->duv[0].x>>1) | (pts->duv[0].x&1)) << accuracy;	/* DIV2RND() */
			gmc->Vco = ((pts->duv[0].y>>1) | (pts->duv[0].y&1)) << accuracy;	/* DIV2RND() */
		} else {
			/* zero points?! */
			gmc->Vo  = 0;
			gmc->Uo  = 0;
			gmc->Vco = 0;
			gmc->Uco = 0;
		}

		gmc->predict_16x16  = Predict_1pt_16x16_C;
		gmc->predict_8x8    = Predict_1pt_8x8_C;
		gmc->get_average_mv = get_average_mv_1pt_C;
	}
}
704 
705 /* *******************************************************************
706  * quick and dirty routine to generate the full warped image
707  * (pGMC != NULL) or just all average Motion Vectors (pGMC == NULL) */
708 
709 void
generate_GMCimage(const NEW_GMC_DATA * const gmc_data,const IMAGE * const pRef,const int mb_width,const int mb_height,const int stride,const int stride2,const int fcode,const int32_t quarterpel,const int reduced_resolution,const int32_t rounding,MACROBLOCK * const pMBs,IMAGE * const pGMC)710 generate_GMCimage(	const NEW_GMC_DATA *const gmc_data, /* [input] precalculated data */
711 					const IMAGE *const pRef,		/* [input] */
712 					const int mb_width,
713 					const int mb_height,
714 					const int stride,
715 					const int stride2,
716 					const int fcode, 				/* [input] some parameters... */
717 						const int32_t quarterpel,		/* [input] for rounding avgMV */
718 					const int reduced_resolution,	/* [input] ignored */
719 					const int32_t rounding,			/* [input] for rounding image data */
720 					MACROBLOCK *const pMBs, 		/* [output] average motion vectors */
721 					IMAGE *const pGMC)				/* [output] full warped image */
722 {
723 
724 	unsigned int mj,mi;
725 	VECTOR avgMV;
726 
727 	for (mj = 0; mj < (unsigned int)mb_height; mj++)
728 		for (mi = 0; mi < (unsigned int)mb_width; mi++) {
729 			const int mbnum = mj*mb_width+mi;
730 			if (pGMC)
731 			{
732 				gmc_data->predict_16x16(gmc_data,
733 							pGMC->y + mj*16*stride + mi*16, pRef->y,
734 							stride, stride, mi, mj, rounding);
735 
736 				gmc_data->predict_8x8(gmc_data,
737 					pGMC->u + mj*8*stride2 + mi*8, pRef->u,
738 					pGMC->v + mj*8*stride2 + mi*8, pRef->v,
739 					stride2, stride2, mi, mj, rounding);
740 			}
741 			gmc_data->get_average_mv(gmc_data, &avgMV, mi, mj, quarterpel);
742 
743 			pMBs[mbnum].amv.x = gmc_sanitize(avgMV.x, quarterpel, fcode);
744 			pMBs[mbnum].amv.y = gmc_sanitize(avgMV.y, quarterpel, fcode);
745 
746 			pMBs[mbnum].mcsel = 0; /* until mode decision */
747 	}
748   emms();
749 }
750