1 /*****************************************************************************
2 *
3 * XVID MPEG-4 VIDEO CODEC
4 * - GMC interpolation module -
5 *
6 * Copyright(C) 2002-2003 Pascal Massimino <skal@planet-d.net>
7 *
8 * This program is free software ; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation ; either version 2 of the License, or
11 * (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY ; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program ; if not, write to the Free Software
20 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
21 *
22 * $Id: gmc.c 2180 2019-11-12 14:48:35Z Isibaar $
23 *
24 ****************************************************************************/
25
26 #include "../portab.h"
27 #include "../global.h"
28 #include "../encoder.h"
29 #include "gmc.h"
30 #include "../utils/emms.h"
31
32 #include <stdio.h>
33
34 /* initialized by init_GMC(), for 3points */
/*
 * Dispatch pointer for the 16x16 predictor of the 2/3 warp-point model.
 * init_GMC() selects the implementation and generate_GMCparameters()
 * copies the pointer into NEW_GMC_DATA::predict_16x16.
 * It stays 0 until init_GMC() has been called.
 */
static
void (*Predict_16x16_func)(const NEW_GMC_DATA * const This,
						   uint8_t *dst, const uint8_t *src,
						   int dststride, int srcstride, int x, int y, int rounding) = 0;
/*
 * Same dispatch scheme for the 8x8 predictor, which fills two destination
 * blocks (uDst/vDst) from two source planes (uSrc/vSrc) in a single call.
 * It stays 0 until init_GMC() has been called.
 */
static
void (*Predict_8x8_func)(const NEW_GMC_DATA * const This,
						 uint8_t *uDst, const uint8_t *uSrc,
						 uint8_t *vDst, const uint8_t *vSrc,
						 int dststride, int srcstride, int x, int y, int rounding) = 0;
44
45 /****************************************************************************/
46 /* this is borrowed from bitstream.c until we find a common solution */
/*
 * log2bin -- number of significant bits in 'value'.
 *
 * Returns 0 for 0, otherwise the 1-based position of the highest set bit
 * (1 -> 1, 2..3 -> 2, 4..7 -> 3, ...).
 *
 * The portable branch counts shifts; the 32-bit MSVC branch uses the BSR
 * instruction and leaves the result in EAX (original change by Chenm001).
 */
static uint32_t __inline
log2bin(uint32_t value)
{
#if !defined(_MSC_VER) || defined(ARCH_IS_X86_64)
	int bits;

	/* drop one bit per iteration until nothing is left */
	for (bits = 0; value != 0; value >>= 1)
		++bits;

	return bits;
#else
	__asm {
		bsr eax, value
		inc eax
	}
#endif
}
66
67 /* 16*sizeof(int) -> 1 or 2 cachelines */
68 /* table lookup might be faster! (still to be benchmarked) */
69
70 /*
71 static int log2bin_table[16] =
72 { 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4};
73 */
74 /* 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 */
75
/*
 * RDIV(a,b): a / b rounded to the nearest integer. Half of b is added to
 * positive values and subtracted from the others before the truncating
 * division, so ties move away from zero (for a positive b).
 *
 * RSHIFT(a,b): a >> b with rounding. Half of 2^b is added first; for
 * a <= 0 the bias is one less, so that ties also move away from zero.
 * Relies on >> of a negative value being an arithmetic shift.
 *
 * Both macros evaluate their arguments more than once: do not pass
 * expressions with side effects.
 */
#define RDIV(a,b) \
	( ( (a) > 0 ? ((a) + ((b) >> 1)) \
	            : ((a) - ((b) >> 1)) ) / (b) )
#define RSHIFT(a,b) \
	( ( (a) + (1 << ((b) - 1)) - ((a) > 0 ? 0 : 1) ) >> (b) )
78
/*
 * Packed bilinear weights, indexed by a 4-bit fractional position f:
 * the upper 16 bits hold (16 - f), the lower 16 bits hold f.
 * Multiplying such an entry by a word that carries two pixels in its two
 * 16-bit halves leaves the weighted sum of both pixels in the upper half.
 */
#define GMC_PACK_WEIGHTS(f) (((16-(f))<<16) + (f))
static const uint32_t MTab[16] = {
	GMC_PACK_WEIGHTS( 0), GMC_PACK_WEIGHTS( 1),
	GMC_PACK_WEIGHTS( 2), GMC_PACK_WEIGHTS( 3),
	GMC_PACK_WEIGHTS( 4), GMC_PACK_WEIGHTS( 5),
	GMC_PACK_WEIGHTS( 6), GMC_PACK_WEIGHTS( 7),
	GMC_PACK_WEIGHTS( 8), GMC_PACK_WEIGHTS( 9),
	GMC_PACK_WEIGHTS(10), GMC_PACK_WEIGHTS(11),
	GMC_PACK_WEIGHTS(12), GMC_PACK_WEIGHTS(13),
	GMC_PACK_WEIGHTS(14), GMC_PACK_WEIGHTS(15)
};
#undef GMC_PACK_WEIGHTS
85
/* ************************************************************
 * Pts = 2 or 3
 *
 * Warning! *src is the global frame pointer (that is: the address
 * of pixel 0,0), not the macroblock one.
 * Conversely, *dst is the macroblock top-left address.
 */
93
94 static
Predict_16x16_C(const NEW_GMC_DATA * const This,uint8_t * dst,const uint8_t * src,int dststride,int srcstride,int x,int y,int rounding)95 void Predict_16x16_C(const NEW_GMC_DATA * const This,
96 uint8_t *dst, const uint8_t *src,
97 int dststride, int srcstride, int x, int y, int rounding)
98 {
99 const int W = This->sW;
100 const int H = This->sH;
101 const int rho = 3 - This->accuracy;
102 const int Rounder = ( (1<<7) - (rounding<<(2*rho)) ) << 16;
103
104 const int dUx = This->dU[0];
105 const int dVx = This->dV[0];
106 const int dUy = This->dU[1];
107 const int dVy = This->dV[1];
108
109 int Uo = This->Uo + 16*(dUy*y + dUx*x);
110 int Vo = This->Vo + 16*(dVy*y + dVx*x);
111
112 int i, j;
113
114 dst += 16;
115 for (j=16; j>0; --j) {
116 int U = Uo, V = Vo;
117 Uo += dUy; Vo += dVy;
118 for (i=-16; i<0; ++i) {
119 unsigned int f0, f1, ri = 16, rj = 16;
120 int Offset;
121 int u = ( U >> 16 ) << rho;
122 int v = ( V >> 16 ) << rho;
123
124 U += dUx; V += dVx;
125
126 if (u > 0 && u <= W) { ri = MTab[u&15]; Offset = u>>4; }
127 else {
128 if (u > W) Offset = W>>4;
129 else Offset = 0;
130 ri = MTab[0];
131 }
132
133 if (v > 0 && v <= H) { rj = MTab[v&15]; Offset += (v>>4)*srcstride; }
134 else {
135 if (v > H) Offset += (H>>4)*srcstride;
136 rj = MTab[0];
137 }
138
139 f0 = src[Offset + 0];
140 f0 |= src[Offset + 1] << 16;
141 f1 = src[Offset + srcstride + 0];
142 f1 |= src[Offset + srcstride + 1] << 16;
143 f0 = (ri*f0)>>16;
144 f1 = (ri*f1) & 0x0fff0000;
145 f0 |= f1;
146 f0 = (rj*f0 + Rounder) >> 24;
147
148 dst[i] = (uint8_t)f0;
149 }
150 dst += dststride;
151 }
152 }
153
154 static
Predict_8x8_C(const NEW_GMC_DATA * const This,uint8_t * uDst,const uint8_t * uSrc,uint8_t * vDst,const uint8_t * vSrc,int dststride,int srcstride,int x,int y,int rounding)155 void Predict_8x8_C(const NEW_GMC_DATA * const This,
156 uint8_t *uDst, const uint8_t *uSrc,
157 uint8_t *vDst, const uint8_t *vSrc,
158 int dststride, int srcstride, int x, int y, int rounding)
159 {
160 const int W = This->sW >> 1;
161 const int H = This->sH >> 1;
162 const int rho = 3-This->accuracy;
163 const int32_t Rounder = ( 128 - (rounding<<(2*rho)) ) << 16;
164
165 const int32_t dUx = This->dU[0];
166 const int32_t dVx = This->dV[0];
167 const int32_t dUy = This->dU[1];
168 const int32_t dVy = This->dV[1];
169
170 int32_t Uo = This->Uco + 8*(dUy*y + dUx*x);
171 int32_t Vo = This->Vco + 8*(dVy*y + dVx*x);
172
173 int i, j;
174
175 uDst += 8;
176 vDst += 8;
177 for (j=8; j>0; --j) {
178 int32_t U = Uo, V = Vo;
179 Uo += dUy; Vo += dVy;
180
181 for (i=-8; i<0; ++i) {
182 int Offset;
183 uint32_t f0, f1, ri, rj;
184 int32_t u, v;
185
186 u = ( U >> 16 ) << rho;
187 v = ( V >> 16 ) << rho;
188 U += dUx; V += dVx;
189
190 if (u > 0 && u <= W) {
191 ri = MTab[u&15];
192 Offset = u>>4;
193 } else {
194 if (u>W) Offset = W>>4;
195 else Offset = 0;
196 ri = MTab[0];
197 }
198
199 if (v > 0 && v <= H) {
200 rj = MTab[v&15];
201 Offset += (v>>4)*srcstride;
202 } else {
203 if (v>H) Offset += (H>>4)*srcstride;
204 rj = MTab[0];
205 }
206
207 f0 = uSrc[Offset + 0];
208 f0 |= uSrc[Offset + 1] << 16;
209 f1 = uSrc[Offset + srcstride + 0];
210 f1 |= uSrc[Offset + srcstride + 1] << 16;
211 f0 = (ri*f0)>>16;
212 f1 = (ri*f1) & 0x0fff0000;
213 f0 |= f1;
214 f0 = (rj*f0 + Rounder) >> 24;
215
216 uDst[i] = (uint8_t)f0;
217
218 f0 = vSrc[Offset + 0];
219 f0 |= vSrc[Offset + 1] << 16;
220 f1 = vSrc[Offset + srcstride + 0];
221 f1 |= vSrc[Offset + srcstride + 1] << 16;
222 f0 = (ri*f0)>>16;
223 f1 = (ri*f1) & 0x0fff0000;
224 f0 |= f1;
225 f0 = (rj*f0 + Rounder) >> 24;
226
227 vDst[i] = (uint8_t)f0;
228 }
229 uDst += dststride;
230 vDst += dststride;
231 }
232 }
233
234 static
get_average_mv_C(const NEW_GMC_DATA * const Dsp,VECTOR * const mv,int x,int y,int qpel)235 void get_average_mv_C(const NEW_GMC_DATA * const Dsp, VECTOR * const mv,
236 int x, int y, int qpel)
237 {
238 int i, j;
239 int vx = 0, vy = 0;
240 int32_t uo = Dsp->Uo + 16*(Dsp->dU[1]*y + Dsp->dU[0]*x);
241 int32_t vo = Dsp->Vo + 16*(Dsp->dV[1]*y + Dsp->dV[0]*x);
242 for (j=16; j>0; --j)
243 {
244 int32_t U, V;
245 U = uo; uo += Dsp->dU[1];
246 V = vo; vo += Dsp->dV[1];
247 for (i=16; i>0; --i)
248 {
249 int32_t u,v;
250 u = U >> 16; U += Dsp->dU[0]; vx += u;
251 v = V >> 16; V += Dsp->dV[0]; vy += v;
252 }
253 }
254 vx -= (256*x+120) << (5+Dsp->accuracy); /* 120 = 15*16/2 */
255 vy -= (256*y+120) << (5+Dsp->accuracy);
256
257 mv->x = RSHIFT( vx, 8+Dsp->accuracy - qpel );
258 mv->y = RSHIFT( vy, 8+Dsp->accuracy - qpel );
259 }
260
261 /* ************************************************************
262 * simplified version for 1 warp point
263 */
264
265 static
Predict_1pt_16x16_C(const NEW_GMC_DATA * const This,uint8_t * Dst,const uint8_t * Src,int dststride,int srcstride,int x,int y,int rounding)266 void Predict_1pt_16x16_C(const NEW_GMC_DATA * const This,
267 uint8_t *Dst, const uint8_t *Src,
268 int dststride, int srcstride, int x, int y, int rounding)
269 {
270 const int W = This->sW;
271 const int H = This->sH;
272 const int rho = 3-MIN(This->accuracy, 3);
273 const int32_t Rounder = ( 128 - (rounding<<(2*rho)) ) << 16;
274
275
276 int32_t uo = This->Uo + (x<<8); /* ((16*x)<<4) */
277 int32_t vo = This->Vo + (y<<8);
278 uint32_t ri = MTab[uo & 15];
279 uint32_t rj = MTab[vo & 15];
280 int i, j;
281
282 int32_t Offset;
283 if (vo>=(-16<<4) && vo<=H) Offset = (vo>>4)*srcstride;
284 else {
285 if (vo>H) Offset = ( H>>4)*srcstride;
286 else Offset =-16*srcstride;
287 rj = MTab[0];
288 }
289 if (uo>=(-16<<4) && uo<=W) Offset += (uo>>4);
290 else {
291 if (uo>W) Offset += (W>>4);
292 else Offset -= 16;
293 ri = MTab[0];
294 }
295
296 Dst += 16;
297
298 for(j=16; j>0; --j, Offset+=srcstride-16)
299 {
300 for(i=-16; i<0; ++i, ++Offset)
301 {
302 uint32_t f0, f1;
303 f0 = Src[ Offset +0 ];
304 f0 |= Src[ Offset +1 ] << 16;
305 f1 = Src[ Offset+srcstride +0 ];
306 f1 |= Src[ Offset+srcstride +1 ] << 16;
307 f0 = (ri*f0)>>16;
308 f1 = (ri*f1) & 0x0fff0000;
309 f0 |= f1;
310 f0 = ( rj*f0 + Rounder ) >> 24;
311 Dst[i] = (uint8_t)f0;
312 }
313 Dst += dststride;
314 }
315 }
316
317 static
Predict_1pt_8x8_C(const NEW_GMC_DATA * const This,uint8_t * uDst,const uint8_t * uSrc,uint8_t * vDst,const uint8_t * vSrc,int dststride,int srcstride,int x,int y,int rounding)318 void Predict_1pt_8x8_C(const NEW_GMC_DATA * const This,
319 uint8_t *uDst, const uint8_t *uSrc,
320 uint8_t *vDst, const uint8_t *vSrc,
321 int dststride, int srcstride, int x, int y, int rounding)
322 {
323 const int W = This->sW >> 1;
324 const int H = This->sH >> 1;
325 const int rho = 3-This->accuracy;
326 const int32_t Rounder = ( 128 - (rounding<<(2*rho)) ) << 16;
327
328 int32_t uo = This->Uco + (x<<7);
329 int32_t vo = This->Vco + (y<<7);
330 uint32_t rri = MTab[uo & 15];
331 uint32_t rrj = MTab[vo & 15];
332 int i, j;
333
334 int32_t Offset;
335 if (vo>=(-8<<4) && vo<=H) Offset = (vo>>4)*srcstride;
336 else {
337 if (vo>H) Offset = ( H>>4)*srcstride;
338 else Offset =-8*srcstride;
339 rrj = MTab[0];
340 }
341 if (uo>=(-8<<4) && uo<=W) Offset += (uo>>4);
342 else {
343 if (uo>W) Offset += ( W>>4);
344 else Offset -= 8;
345 rri = MTab[0];
346 }
347
348 uDst += 8;
349 vDst += 8;
350 for(j=8; j>0; --j, Offset+=srcstride-8)
351 {
352 for(i=-8; i<0; ++i, Offset++)
353 {
354 uint32_t f0, f1;
355 f0 = uSrc[ Offset + 0 ];
356 f0 |= uSrc[ Offset + 1 ] << 16;
357 f1 = uSrc[ Offset + srcstride + 0 ];
358 f1 |= uSrc[ Offset + srcstride + 1 ] << 16;
359 f0 = (rri*f0)>>16;
360 f1 = (rri*f1) & 0x0fff0000;
361 f0 |= f1;
362 f0 = ( rrj*f0 + Rounder ) >> 24;
363 uDst[i] = (uint8_t)f0;
364
365 f0 = vSrc[ Offset + 0 ];
366 f0 |= vSrc[ Offset + 1 ] << 16;
367 f1 = vSrc[ Offset + srcstride + 0 ];
368 f1 |= vSrc[ Offset + srcstride + 1 ] << 16;
369 f0 = (rri*f0)>>16;
370 f1 = (rri*f1) & 0x0fff0000;
371 f0 |= f1;
372 f0 = ( rrj*f0 + Rounder ) >> 24;
373 vDst[i] = (uint8_t)f0;
374 }
375 uDst += dststride;
376 vDst += dststride;
377 }
378 }
379
380 static
get_average_mv_1pt_C(const NEW_GMC_DATA * const Dsp,VECTOR * const mv,int x,int y,int qpel)381 void get_average_mv_1pt_C(const NEW_GMC_DATA * const Dsp, VECTOR * const mv,
382 int x, int y, int qpel)
383 {
384 mv->x = RSHIFT(Dsp->Uo<<qpel, 3);
385 mv->y = RSHIFT(Dsp->Vo<<qpel, 3);
386 }
387
388 #if defined(ARCH_IS_IA32) || defined(ARCH_IS_X86_64)
389 /* *************************************************************
390 * MMX core function
391 */
392
/*
 * Pointer to the 8-pixel "linear" interpolation core. init_GMC() points it
 * at one of the external implementations declared below (SSE4.1 first, then
 * SSE2, then MMX). It stays 0 when no SIMD flag is reported.
 * The Predict_*_mmx() routines call it for runs of 8 pixels whose sample
 * positions were found to be contiguous.
 */
static
void (*GMC_Core_Lin_8)(uint8_t *Dst, const uint16_t * Offsets,
					   const uint8_t * const Src0, const int BpS, const int Rounder) = 0;

/* implementations defined outside this file (presumably assembly -- TODO confirm) */
extern void xvid_GMC_Core_Lin_8_mmx(uint8_t *Dst, const uint16_t * Offsets,
									const uint8_t * const Src0, const int BpS, const int Rounder);

extern void xvid_GMC_Core_Lin_8_sse2(uint8_t *Dst, const uint16_t * Offsets,
									 const uint8_t * const Src0, const int BpS, const int Rounder);

extern void xvid_GMC_Core_Lin_8_sse41(uint8_t *Dst, const uint16_t * Offsets,
									  const uint8_t * const Src0, const int BpS, const int Rounder);
405
406 /* *************************************************************/
407
GMC_Core_Non_Lin_8(uint8_t * Dst,const uint16_t * Offsets,const uint8_t * const Src0,const int srcstride,const int Rounder)408 static void GMC_Core_Non_Lin_8(uint8_t *Dst,
409 const uint16_t * Offsets,
410 const uint8_t * const Src0, const int srcstride,
411 const int Rounder)
412 {
413 int i;
414 for(i=0; i<8; ++i)
415 {
416 uint32_t u = Offsets[i ];
417 uint32_t v = Offsets[i+16];
418 const uint32_t ri = MTab[u&0x0f];
419 const uint32_t rj = MTab[v&0x0f];
420 uint32_t f0, f1;
421 const uint8_t * const Src = Src0 + (u>>4) + (v>>4)*srcstride;
422 f0 = Src[0];
423 f0 |= Src[1] << 16;
424 f1 = Src[srcstride +0];
425 f1 |= Src[srcstride +1] << 16;
426 f0 = (ri*f0)>>16;
427 f1 = (ri*f1) & 0x0fff0000;
428 f0 |= f1;
429 f0 = ( rj*f0 + Rounder ) >> 24;
430 Dst[i] = (uint8_t)f0;
431 }
432 }
433
434 //////////////////////////////////////////////////////////
435
436 static
Predict_16x16_mmx(const NEW_GMC_DATA * const This,uint8_t * dst,const uint8_t * src,int dststride,int srcstride,int x,int y,int rounding)437 void Predict_16x16_mmx(const NEW_GMC_DATA * const This,
438 uint8_t *dst, const uint8_t *src,
439 int dststride, int srcstride, int x, int y, int rounding)
440 {
441 const int W = This->sW;
442 const int H = This->sH;
443 const int rho = 3 - This->accuracy;
444 const int Rounder = ( 128 - (rounding<<(2*rho)) ) << 16;
445 const uint32_t W2 = W<<(16-rho);
446 const uint32_t H2 = H<<(16-rho);
447
448 const int dUx = This->dU[0];
449 const int dVx = This->dV[0];
450 const int dUy = This->dU[1];
451 const int dVy = This->dV[1];
452
453 int Uo = This->Uo + 16*(dUy*y + dUx*x);
454 int Vo = This->Vo + 16*(dVy*y + dVx*x);
455
456 int i, j;
457
458 DECLARE_ALIGNED_MATRIX(Offsets, 2,16, uint16_t, CACHE_LINE);
459 for(j=16; j>0; --j)
460 {
461 int32_t U = Uo, V = Vo;
462 Uo += dUy; Vo += dVy;
463 if ( W2>(uint32_t)U && W2>(uint32_t)(U+15*dUx) &&
464 H2>(uint32_t)V && H2>(uint32_t)(V+15*dVx) )
465 {
466 uint32_t UV1, UV2;
467 for(i=0; i<16; ++i)
468 {
469 uint32_t u = ( U >> 16 ) << rho;
470 uint32_t v = ( V >> 16 ) << rho;
471 U += dUx; V += dVx;
472 Offsets[ i] = u;
473 Offsets[16+i] = v;
474 }
475 // batch 8 input pixels when linearity says it's ok
476
477 UV1 = (Offsets[0] | (Offsets[16]<<16)) & 0xfff0fff0U;
478 UV2 = (Offsets[7] | (Offsets[23]<<16)) & 0xfff0fff0U;
479 if (UV1+7*16==UV2)
480 GMC_Core_Lin_8(dst, Offsets, src + (Offsets[0]>>4) + (Offsets[16]>>4)*srcstride, srcstride, Rounder);
481 else
482 GMC_Core_Non_Lin_8(dst, Offsets, src, srcstride, Rounder);
483 UV1 = (Offsets[ 8] | (Offsets[24]<<16)) & 0xfff0fff0U;
484 UV2 = (Offsets[15] | (Offsets[31]<<16)) & 0xfff0fff0U;
485 if (UV1+7*16==UV2)
486 GMC_Core_Lin_8(dst+8, Offsets+8, src + (Offsets[8]>>4) + (Offsets[24]>>4)*srcstride, srcstride, Rounder);
487 else
488 GMC_Core_Non_Lin_8(dst+8, Offsets+8, src, srcstride, Rounder);
489 }
490 else
491 {
492 for(i=0; i<16; ++i)
493 {
494 int u = ( U >> 16 ) << rho;
495 int v = ( V >> 16 ) << rho;
496 U += dUx; V += dVx;
497
498 Offsets[ i] = (u<0) ? 0 : (u>=W) ? W : u;
499 Offsets[16+i] = (v<0) ? 0 : (v>=H) ? H : v;
500 }
501 // due to boundary clipping, we cannot infer the 8-pixels batchability
502 // simply by using the linearity. Oh well, not a big deal...
503 GMC_Core_Non_Lin_8(dst, Offsets, src, srcstride, Rounder);
504 GMC_Core_Non_Lin_8(dst+8, Offsets+8, src, srcstride, Rounder);
505 }
506 dst += dststride;
507 }
508 }
509
510 static
Predict_8x8_mmx(const NEW_GMC_DATA * const This,uint8_t * uDst,const uint8_t * uSrc,uint8_t * vDst,const uint8_t * vSrc,int dststride,int srcstride,int x,int y,int rounding)511 void Predict_8x8_mmx(const NEW_GMC_DATA * const This,
512 uint8_t *uDst, const uint8_t *uSrc,
513 uint8_t *vDst, const uint8_t *vSrc,
514 int dststride, int srcstride, int x, int y, int rounding)
515 {
516 const int W = This->sW >> 1;
517 const int H = This->sH >> 1;
518 const int rho = 3-This->accuracy;
519 const int32_t Rounder = ( 128 - (rounding<<(2*rho)) ) << 16;
520 const uint32_t W2 = W<<(16-rho);
521 const uint32_t H2 = H<<(16-rho);
522
523 const int dUx = This->dU[0];
524 const int dVx = This->dV[0];
525 const int dUy = This->dU[1];
526 const int dVy = This->dV[1];
527
528 int Uo = This->Uco + 8*(dUy*y + dUx*x);
529 int Vo = This->Vco + 8*(dVy*y + dVx*x);
530
531 DECLARE_ALIGNED_MATRIX(Offsets, 2,16, uint16_t, CACHE_LINE);
532 int i, j;
533 for(j=8; j>0; --j)
534 {
535 int32_t U = Uo, V = Vo;
536 Uo += dUy; Vo += dVy;
537 if ( W2>(uint32_t)U && W2>(uint32_t)(U+15*dUx) &&
538 H2>(uint32_t)V && H2>(uint32_t)(V+15*dVx) )
539 {
540 uint32_t UV1, UV2;
541 for(i=0; i<8; ++i)
542 {
543 int32_t u = ( U >> 16 ) << rho;
544 int32_t v = ( V >> 16 ) << rho;
545 U += dUx; V += dVx;
546 Offsets[ i] = u;
547 Offsets[16+i] = v;
548 }
549
550 // batch 8 input pixels when linearity says it's ok
551 UV1 = (Offsets[ 0] | (Offsets[16]<<16)) & 0xfff0fff0U;
552 UV2 = (Offsets[ 7] | (Offsets[23]<<16)) & 0xfff0fff0U;
553 if (UV1+7*16==UV2)
554 {
555 const uint32_t Off = (Offsets[0]>>4) + (Offsets[16]>>4)*srcstride;
556 GMC_Core_Lin_8(uDst, Offsets, uSrc+Off, srcstride, Rounder);
557 GMC_Core_Lin_8(vDst, Offsets, vSrc+Off, srcstride, Rounder);
558 }
559 else {
560 GMC_Core_Non_Lin_8(uDst, Offsets, uSrc, srcstride, Rounder);
561 GMC_Core_Non_Lin_8(vDst, Offsets, vSrc, srcstride, Rounder);
562 }
563 }
564 else
565 {
566 for(i=0; i<8; ++i)
567 {
568 int u = ( U >> 16 ) << rho;
569 int v = ( V >> 16 ) << rho;
570 U += dUx; V += dVx;
571 Offsets[ i] = (u<0) ? 0 : (u>=W) ? W : u;
572 Offsets[16+i] = (v<0) ? 0 : (v>=H) ? H : v;
573 }
574 GMC_Core_Non_Lin_8(uDst, Offsets, uSrc, srcstride, Rounder);
575 GMC_Core_Non_Lin_8(vDst, Offsets, vSrc, srcstride, Rounder);
576 }
577 uDst += dststride;
578 vDst += dststride;
579 }
580 }
581
582 #endif /* ARCH_IS_IA32 */
583
584 /* *************************************************************
585 * will initialize internal pointers
586 */
587
init_GMC(const unsigned int cpu_flags)588 void init_GMC(const unsigned int cpu_flags)
589 {
590 Predict_16x16_func = Predict_16x16_C;
591 Predict_8x8_func = Predict_8x8_C;
592
593 #if defined(ARCH_IS_IA32) || defined(ARCH_IS_X86_64)
594 if ((cpu_flags & XVID_CPU_MMX) || (cpu_flags & XVID_CPU_MMXEXT) ||
595 (cpu_flags & XVID_CPU_3DNOW) || (cpu_flags & XVID_CPU_3DNOWEXT) ||
596 (cpu_flags & XVID_CPU_SSE) || (cpu_flags & XVID_CPU_SSE2) ||
597 (cpu_flags & XVID_CPU_SSE3) || (cpu_flags & XVID_CPU_SSE41))
598 {
599 Predict_16x16_func = Predict_16x16_mmx;
600 Predict_8x8_func = Predict_8x8_mmx;
601
602 if (cpu_flags & XVID_CPU_SSE41)
603 GMC_Core_Lin_8 = xvid_GMC_Core_Lin_8_sse41;
604 else if (cpu_flags & XVID_CPU_SSE2)
605 GMC_Core_Lin_8 = xvid_GMC_Core_Lin_8_sse2;
606 else
607 GMC_Core_Lin_8 = xvid_GMC_Core_Lin_8_mmx;
608 }
609 #endif
610 }
611
612 /* *************************************************************
613 * Warning! It's Accuracy being passed, not 'resolution'!
614 */
615
/*
 * generate_GMCparameters -- fill 'gmc' with everything the predictors need.
 *
 *   nb_pts    number of warp points announced for the frame
 *   accuracy  warping accuracy (NOT the resolution); rho = 3 - accuracy
 *   pts       warp point displacements (duv[0..2])
 *   width     frame width in pixels  (stored as sW = width<<4)
 *   height    frame height in pixels (stored as sH = height<<4)
 *   gmc       [out] precomputed data and function pointers
 *
 * gmc->num_wp keeps the announced number of points. Internally the model is
 * reduced to 0 or 1 point when the higher-order displacements are all zero,
 * which selects the cheaper Predict_1pt_* routines.
 */
void generate_GMCparameters( int nb_pts, const int accuracy,
							 const WARPPOINTS *const pts,
							 const int width, const int height,
							 NEW_GMC_DATA *const gmc)
{
	gmc->sW = width << 4;
	gmc->sH = height << 4;
	gmc->accuracy = accuracy;
	gmc->num_wp = nb_pts;

	/*
	 * Reduce the number of points, if possible. A reduction only happens
	 * when fewer than 2 points were given or when duv[1] and duv[2] are both
	 * zero, so the result is always 0 or 1 point here.
	 * NOTE(review): duv[2] is inspected even when nb_pts==2 -- confirm that
	 * callers keep it in a defined state.
	 */
	if (nb_pts<2 || (pts->duv[2].x==0 && pts->duv[2].y==0 && pts->duv[1].x==0 && pts->duv[1].y==0 )) {
		if (nb_pts<1 || (pts->duv[0].x==0 && pts->duv[0].y==0))
			nb_pts = 0;
		else
			nb_pts = 1;
	}

	/* now, nb_pts stores the actual number of points required for interpolation */

	if (nb_pts<=1)
	{
		if (nb_pts==1) {
			/* store as 4b fixed point */
			gmc->Uo = pts->duv[0].x << accuracy;
			gmc->Vo = pts->duv[0].y << accuracy;
			gmc->Uco = ((pts->duv[0].x>>1) | (pts->duv[0].x&1)) << accuracy;	/* DIV2RND() */
			gmc->Vco = ((pts->duv[0].y>>1) | (pts->duv[0].y&1)) << accuracy;	/* DIV2RND() */
		}
		else {	/* zero points?! */
			gmc->Uo = gmc->Vo = 0;
			gmc->Uco = gmc->Vco = 0;
		}

		gmc->predict_16x16 = Predict_1pt_16x16_C;
		gmc->predict_8x8 = Predict_1pt_8x8_C;
		gmc->get_average_mv = get_average_mv_1pt_C;
	}
	else {	/* 2 or 3 points */
		const int rho = 3 - accuracy;	/* = {3,2,1,0} for Acc={0,1,2,3} */
		int Alpha = log2bin(width-1);	/* Ws = 2^Alpha is the power-of-two scale for width */
		const int Ws = 1 << Alpha;

		gmc->dU[0] = 16*Ws + RDIV( 8*Ws*pts->duv[1].x, width );	/* dU/dx */
		gmc->dV[0] = RDIV( 8*Ws*pts->duv[1].y, width );		/* dV/dx */

		if (nb_pts==2) {
			/* the vertical gradients are the horizontal ones rotated by 90 degrees */
			gmc->dU[1] = -gmc->dV[0];	/* -Sin */
			gmc->dV[1] = gmc->dU[0] ;	/* Cos */
		}
		else
		{
			const int Beta = log2bin(height-1);
			const int Hs = 1<<Beta;
			gmc->dU[1] = RDIV( 8*Hs*pts->duv[2].x, height );		/* dU/dy */
			gmc->dV[1] = 16*Hs + RDIV( 8*Hs*pts->duv[2].y, height );	/* dV/dy */

			/* bring both gradient pairs to the larger of the two scales */
			if (Beta>Alpha) {
				gmc->dU[0] <<= (Beta-Alpha);
				gmc->dV[0] <<= (Beta-Alpha);
				Alpha = Beta;
			}
			else {
				gmc->dU[1] <<= Alpha - Beta;
				gmc->dV[1] <<= Alpha - Beta;
			}
		}

		/* upscale to 16b fixed-point */
		gmc->dU[0] <<= (16-Alpha - rho);
		gmc->dU[1] <<= (16-Alpha - rho);
		gmc->dV[0] <<= (16-Alpha - rho);
		gmc->dV[1] <<= (16-Alpha - rho);

		/* origins for the 16x16 predictor (Uo/Vo) and the 8x8 one (Uco/Vco) */
		gmc->Uo = ( pts->duv[0].x <<(16+ accuracy)) + (1<<15);
		gmc->Vo = ( pts->duv[0].y <<(16+ accuracy)) + (1<<15);
		gmc->Uco = ((pts->duv[0].x-1)<<(17+ accuracy)) + (1<<17);
		gmc->Vco = ((pts->duv[0].y-1)<<(17+ accuracy)) + (1<<17);
		gmc->Uco = (gmc->Uco + gmc->dU[0] + gmc->dU[1])>>2;
		gmc->Vco = (gmc->Vco + gmc->dV[0] + gmc->dV[1])>>2;

		gmc->predict_16x16 = Predict_16x16_func;
		gmc->predict_8x8 = Predict_8x8_func;
		gmc->get_average_mv = get_average_mv_C;
	}
}
704
705 /* *******************************************************************
706 * quick and dirty routine to generate the full warped image
707 * (pGMC != NULL) or just all average Motion Vectors (pGMC == NULL) */
708
709 void
generate_GMCimage(const NEW_GMC_DATA * const gmc_data,const IMAGE * const pRef,const int mb_width,const int mb_height,const int stride,const int stride2,const int fcode,const int32_t quarterpel,const int reduced_resolution,const int32_t rounding,MACROBLOCK * const pMBs,IMAGE * const pGMC)710 generate_GMCimage( const NEW_GMC_DATA *const gmc_data, /* [input] precalculated data */
711 const IMAGE *const pRef, /* [input] */
712 const int mb_width,
713 const int mb_height,
714 const int stride,
715 const int stride2,
716 const int fcode, /* [input] some parameters... */
717 const int32_t quarterpel, /* [input] for rounding avgMV */
718 const int reduced_resolution, /* [input] ignored */
719 const int32_t rounding, /* [input] for rounding image data */
720 MACROBLOCK *const pMBs, /* [output] average motion vectors */
721 IMAGE *const pGMC) /* [output] full warped image */
722 {
723
724 unsigned int mj,mi;
725 VECTOR avgMV;
726
727 for (mj = 0; mj < (unsigned int)mb_height; mj++)
728 for (mi = 0; mi < (unsigned int)mb_width; mi++) {
729 const int mbnum = mj*mb_width+mi;
730 if (pGMC)
731 {
732 gmc_data->predict_16x16(gmc_data,
733 pGMC->y + mj*16*stride + mi*16, pRef->y,
734 stride, stride, mi, mj, rounding);
735
736 gmc_data->predict_8x8(gmc_data,
737 pGMC->u + mj*8*stride2 + mi*8, pRef->u,
738 pGMC->v + mj*8*stride2 + mi*8, pRef->v,
739 stride2, stride2, mi, mj, rounding);
740 }
741 gmc_data->get_average_mv(gmc_data, &avgMV, mi, mj, quarterpel);
742
743 pMBs[mbnum].amv.x = gmc_sanitize(avgMV.x, quarterpel, fcode);
744 pMBs[mbnum].amv.y = gmc_sanitize(avgMV.y, quarterpel, fcode);
745
746 pMBs[mbnum].mcsel = 0; /* until mode decision */
747 }
748 emms();
749 }
750