1 /*
2  *  dct.c
3  *
4  *     Copyright (C) Charles 'Buck' Krasic - April 2000
5  *     Copyright (C) Erik Walthinsen - April 2000
6  *
7  *  This file is part of libdv, a free DV (IEC 61834/SMPTE 314M)
8  *  codec.
9  *
10  *  libdv is free software; you can redistribute it and/or modify it
11  *  under the terms of the GNU Lesser Public License as published by
12  *  the Free Software Foundation; either version 2.1, or (at your
13  *  option) any later version.
14  *
15  *  libdv is distributed in the hope that it will be useful, but
16  *  WITHOUT ANY WARRANTY; without even the implied warranty of
17  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18  *  Lesser Public License for more details.
19  *
20  *  You should have received a copy of the GNU Lesser Public License
21  *  along with libdv; see the file COPYING.  If not, write to
22  *  the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
23  *
24  *  The libdv homepage is http://libdv.sourceforge.net/.
25  */
26 
27 #ifdef HAVE_CONFIG_H
28 #include <config.h>
29 #endif
30 
31 #include <math.h>
32 #include <string.h>
33 
34 #include "dct.h"
35 #include "weighting.h"
36 
37 #if ARCH_X86 || ARCH_X86_64
38 #include "mmx.h"
39 #endif /* ARCH_X86 || ARCH_X86_64 */
40 
41 /** @file
42  *  @ingroup dct
43  *  @brief Implementation for @link dct Discrete Cosine Transform @endlink
44  */
45 
46 /** @weakgroup dct Discrete Cosine Transform
47  *
48  *  The Discrete Cosine Transform is the basis of most popular video
49  *  and image coding standards.  To oversimplify, the power of the
50  *  transform is that for most video, energy gets concentrated around
51  *  the lower frequency coefficients.  The human visual system is less
52  *  sensitive to higher frequency components.  Because of this it is
53  *  possible to achieve significant compression ratios in a hybrid
54  *  coder, where by reducing the precision of higher frequency terms,
55  *  causing many terms towards zero.  The result is highly amenable to
56  *  losseless entropy coders such as huffman encoding.
57  *
58  *  @{
59  */
60 
61 typedef short var;
62 
63 #if BRUTE_FORCE_DCT_248 || BRUTE_FORCE_DCT_88
64 static double KC248[8][4][4][8];
65 #endif /* BRUTE_FORCE_DCT_248 || BRUTE_FORCE_DCT_88 */
66 
67 #if ((!ARCH_X86) && (!ARCH_X86_64)) || BRUTE_FORCE_DCT_248 || BRUTE_FORCE_DCT_88
68 static double C[8];
69 static double KC88[8][8][8][8];
70 #endif /* ((!ARCH_X86) && (!ARCH_X86_64)) || BRUTE_FORCE_DCT_248 || BRUTE_FORCE_DCT_88 */
71 
72 #if ARCH_X86_64
73 void _dv_dct_88_block_mmx_x86_64(int16_t* block);
74 void _dv_dct_block_mmx_x86_64_postscale_88(int16_t* block, int16_t* postscale_matrix);
75 void _dv_dct_block_mmx_x86_64_postscale_248(int16_t* block, int16_t* postscale_matrix);
76 void _dv_dct_248_block_mmx_x86_64(int16_t* block);
77 void _dv_dct_248_block_mmx_x86_64_post_sum(int16_t* out_block);
78 void _dv_idct_block_mmx_x86_64(dv_coeff_t *block);
79 void _dv_transpose_mmx_x86_64(short * dst);
80 #endif
81 #if ARCH_X86
82 void _dv_idct_block_mmx(dv_coeff_t *block);
83 void _dv_dct_88_block_mmx(int16_t* block);
84 void _dv_dct_block_mmx_postscale_88(int16_t* block, int16_t* postscale_matrix);
85 void _dv_dct_block_mmx_postscale_248(int16_t* block, int16_t* postscale_matrix);
86 void _dv_dct_248_block_mmx(int16_t* block);
87 void _dv_dct_248_block_mmx_post_sum(int16_t* out_block);
88 void _dv_transpose_mmx(short * dst);
89 #endif /* ARCH_X86 */
90 
91 extern dv_coeff_t postSC88[64] ALIGN32;
92 extern dv_coeff_t postSC248[64] ALIGN32;
93 
_dv_dct_init(void)94 void _dv_dct_init(void) {
95 #if BRUTE_FORCE_DCT_248 || BRUTE_FORCE_DCT_88
96   int u, z;
97 #endif /* BRUTE_FORCE_DCT_248 || BRUTE_FORCE_DCT_88 */
98 #if ((!ARCH_X86) && (!ARCH_X86_64)) || BRUTE_FORCE_DCT_248 || BRUTE_FORCE_DCT_88
99   int x, y, h, v, i;
100   for (x = 0; x < 8; x++) {
101     for (y = 0; y < 8; y++) {
102       for (v = 0; v < 8; v++) {
103         for (h = 0; h < 8; h++) {
104           KC88[x][y][h][v] =
105             cos((M_PI * v * ((2.0 * y) + 1.0)) / 16.0) *
106             cos((M_PI * h * ((2.0 * x) + 1.0)) / 16.0);
107         }
108       }
109     }
110   }
111 #endif /* ((!ARCH_X86) && (!ARCH_X86_64)) || BRUTE_FORCE_DCT_248 || BRUTE_FORCE_DCT_88 */
112 #if BRUTE_FORCE_DCT_248 || BRUTE_FORCE_DCT_88
113   for (x = 0; x < 8; x++) {
114     for (z = 0; z < 4; z++) {
115       for (u = 0; u < 4; u++) {
116         for (h = 0; h < 8; h++) {
117 	  KC248[x][z][u][h] =
118 	    cos((M_PI * u * ((2.0 * z) + 1.0)) / 8.0) *
119 	    cos((M_PI * h * ((2.0 * x) + 1.0)) / 16.0);
120         }                       /* for h */
121       }                         /* for u */
122     }                           /* for z */
123   }                             /* for x */
124 #endif /* BRUTE_FORCE_DCT_248 || BRUTE_FORCE_DCT_88 */
125 #if ((!ARCH_X86) && (!ARCH_X86_64))  || BRUTE_FORCE_DCT_248 || BRUTE_FORCE_DCT_88
126   for (i = 0; i < 8; i++) {
127     C[i] = (i == 0 ? 0.5 / sqrt(2.0) : 0.5);
128   } /* for i */
129 #endif /* ((!ARCH_X86) && (!ARCH_X86_64)) || BRUTE_FORCE_DCT_248 || BRUTE_FORCE_DCT_88 */
130 }
131 
132 #if 0
133 /* Optimized out using integer fixpoint */
134 /* for DCT */
135 #define A1 0.70711
136 #define A2 0.54120
137 #define A3 0.70711
138 #define A4 1.30658
139 #define A5 0.38268
140 #endif
141 
dct88_aan_line(short * in,short * out)142 static void dct88_aan_line(short* in, short* out)
143 {
144 	var v0, v1, v2, v3, v4, v5, v6, v7;
145 	var v00,v01,v02,v03,v04,v05,v06,v07;
146 	var v10,v11,v12,v13,v14,v15,v16;
147 	var v20,v21,v22;
148 	var v32,v34,v35,v36;
149 	var v42,v43,v45,v47;
150 	var v54,v55,v56,v57,va0;
151 
152 	v0 = in[0];
153 	v1 = in[1];
154 	v2 = in[2];
155 	v3 = in[3];
156 	v4 = in[4];
157 	v5 = in[5];
158 	v6 = in[6];
159 	v7 = in[7];
160 
161 	/* first butterfly stage  */
162 	v00 = v0+v7;	  /* 0 */
163 	v07 = v0-v7;	  /* 7 */
164 	v01 = v1+v6;	  /* 1 */
165 	v06 = v1-v6;	  /* 6 */
166 	v02 = v2+v5;	  /* 2 */
167 	v05 = v2-v5;	  /* 5 */
168 	v03 = v3+v4;	  /* 3 */
169 	v04 = v3-v4;	  /* 4 */
170 
171 	/* second low butterfly */
172 	v10=v00+v03;	     /* 0 */
173 	v13=v00-v03;		 /* 3 */
174 	v11=v01+v02;		 /* 1 */
175 	v12=v01-v02;		 /* 2 */
176 
177 	/* second high */
178 	v16=v06+v07;		 /* 6 */
179 	v15=v05+v06;		 /* 5 */
180 	v14=-(v04+v05);	 /* 4 */
181 	/* 7 v77 without change */
182 
183 	/* third	(only 3 real new terms) */
184 	v20=v10+v11;			    /* 0 */
185 	v21=v10-v11;				/* 1 */
186 	v22=v12+v13;   			/* 2 */
187 #if 0
188 	va0=(v14+v16)*A5; /* temporary for A5 multiply */
189 #endif
190 	va0=((v14+v16)*25079) >> 16;	 /* temporary for A5 multiply */
191 
192 	/* fourth */
193 #if 0
194 	v32=v22*A1;                           /* 2 */
195 	v34=-(v14*A2+va0);                    /* 4 ? */
196 	v36=v16*A4-va0;                       /* 6 ? */
197 	v35=v15*A3;                           /* 5 */
198 #endif
199 
200 	v32=(v22*23171) >> 15;                /* 2 */
201 	v34=-(((v14*17734) >> 15)+va0);       /* 4 ? */
202 	v36=((v16*21407) >> 14)-va0;          /* 6 ? */
203 	v35=(v15*23171) >> 15;                /* 5 */
204 
205 	/* fifth */
206 	v42=v32+v13;                    /* 2 */
207 	v43=v13-v32;                    /* 3 */
208 	v45=v07+v35;                    /* 5 */
209 	v47=v07-v35;                    /* 7 */
210 
211 	/* last */
212 	v54=v34+v47;                         /* 4 */
213 	v57=v47-v34;                         /* 7 */
214 	v55=v45+v36;                         /* 5 */
215 	v56=v45-v36;                         /* 5 */
216 
217 	/* output butterfly */
218 
219 	out[0] = v20;
220 	out[1] = v55;
221 	out[2] = v42;
222 	out[3] = v57;
223 	out[4] = v21;
224 	out[5] = v54;
225 	out[6] = v43;
226 	out[7] = v56;
227 }
228 
229 
dct44_aan_line(short * in,short * out)230 static void dct44_aan_line(short* in, short* out)
231 {
232 	var v00,v01,v02,v03;
233 	var v10,v11,v12,v13;
234 	var v20,v21,v22;
235 	var v32;
236 	var v42,v43;
237 
238 	/* first butterfly stage  */
239 	v00 = in[0];
240 	v01 = in[2];
241 	v02 = in[4];
242 	v03 = in[6];
243 
244 	/* second low butterfly */
245 	v10=v00+v03;	     /* 0 */
246 	v13=v00-v03;		 /* 3 */
247 	v11=v01+v02;		 /* 1 */
248 	v12=v01-v02;		 /* 2 */
249 
250 	/* third	(only 3 real new terms) */
251 	v20=v10+v11;			    /* 0 */
252 	v21=v10-v11;				/* 1 */
253 	v22=v12+v13;   			/* 2 */
254 
255 	/* fourth */
256 
257 	v32=(v22*23171) >> 15;                /* 2 */
258 
259 	/* fifth */
260 	v42=v32+v13;                    /* 2 */
261 	v43=v13-v32;                    /* 3 */
262 
263 	/* output butterfly */
264 
265 	out[0] = v20;
266 	out[2] = v42;
267 	out[4] = v21;
268 	out[6] = v43;
269 }
270 
postscale88(var v[64])271 static void postscale88(var v[64])
272 {
273 	int i;
274 	int factor = pow(2, 16 + DCT_YUV_PRECISION);
275 	for( i = 0; i < 64; i++ ) {
276 		v[i] = ( v[i] * postSC88[i] ) / factor;
277 	}
278 }
279 
280 /* Input has to be transposed ! */
281 
dct88_aan(short * s_in)282 static inline void dct88_aan(short *s_in)
283 {
284 	int i,j,r,c;
285 	short s_out[64];
286 
287 	for(c = 0; c < 64; c += 8 ) /* columns */
288 		dct88_aan_line(s_in + c, s_in + c);
289 
290 	for(i = 0; i < 8; i++) {
291 		for (j = 0; j < 8; j++) {
292 			s_out[i * 8 + j] = s_in[j * 8 + i];
293 		}
294 	}
295 
296 	for(r = 0; r < 64; r += 8 ) /* then rows */
297 		dct88_aan_line(s_out + r, s_out + r);
298 
299 	memcpy(s_in, s_out, 64 * sizeof(short));
300 }
301 
302 /* Input has to be transposed ! */
303 
dct248_aan(short * s_in)304 static inline void dct248_aan(short *s_in)
305 {
306 	int i,j,r,c;
307 	short s_out[64];
308 
309 	for(c = 0; c < 64; c += 8 ) { /* columns */
310 		dct44_aan_line(s_in + c, s_in + c);
311 		dct44_aan_line(s_in + c + 1, s_in + c + 1);
312 	}
313 
314 	for(i = 0; i < 8; i++) {
315 		for (j = 0; j < 8; j++) {
316 			s_out[i * 8 + j] = s_in[j * 8 + i];
317 		}
318 	}
319 
320 	for(r = 0; r < 64; r += 8 ) { /* then rows */
321 		dct88_aan_line(s_out + r, s_out + r);
322 	}
323 
324 	for(i = 0; i < 4; i++) {
325 		for (j = 0; j < 8; j++) {
326 			s_in[i * 8 + j] = s_out[i * 16 + j]
327 				+ s_out[i * 16 + 8 + j];
328 			s_in[i * 8 + 32 + j] = s_out[i * 16 + j]
329 				- s_out[i * 16 + 8 + j];
330 		}
331 	}
332 }
333 
postscale248(var v[64])334 static void postscale248(var v[64])
335 {
336 	int i;
337 	int factor = pow(2, 16 + DCT_YUV_PRECISION);
338 	for( i=0; i<64; i++ ) {
339 		v[i] = ( v[i] * postSC248[i] ) / factor;
340 	}
341 }
342 
343 /* Input has to be transposed !!! */
344 
_dv_dct_88(dv_coeff_t * block)345 void _dv_dct_88(dv_coeff_t *block) {
346 #if ((!ARCH_X86) && (!ARCH_X86_64))
347 #if BRUTE_FORCE_DCT_88
348   int v,h,y,x,i;
349   double temp[64];
350   int factor = pow(2, DCT_YUV_PRECISION);
351 
352   memset(temp,0,sizeof(temp));
353   for (v = 0; v < 8; v++) {
354     for (h = 0; h < 8; h++) {
355       for (y = 0;y < 8; y++) {
356 	for (x = 0;x < 8; x++) {
357 	  temp[v * 8 + h] += block[x * 8 + y]
358 	    * KC88[x][y][h][v];
359 	}
360       }
361       temp[v * 8 + h] *= (C[h] * C[v]);
362     }
363   }
364 
365   for (i = 0; i < 64; i++) {
366     block[i] = temp[i] / factor;
367   }
368 #else /* BRUTE_FORCE_DCT_88 */
369   dct88_aan(block);
370   postscale88(block);
371 #endif /* BRUTE_FORCE_DCT_88 */
372 #elif ARCH_X86_64
373   _dv_dct_88_block_mmx_x86_64(block);
374   _dv_transpose_mmx_x86_64(block);
375   _dv_dct_88_block_mmx_x86_64(block);
376   _dv_dct_block_mmx_x86_64_postscale_88(block, postSC88);
377   emms();
378 #else /* ((!ARCH_X86) && (!ARCH_X86_64)) */
379   _dv_dct_88_block_mmx(block);
380   _dv_transpose_mmx(block);
381   _dv_dct_88_block_mmx(block);
382   _dv_dct_block_mmx_postscale_88(block, postSC88);
383   emms();
384 #endif /* ((!ARCH_X86) && (!ARCH_X86_64)) */
385 }
386 
387 /* Input has to be transposed !!! */
388 
_dv_dct_248(dv_coeff_t * block)389 void _dv_dct_248(dv_coeff_t *block)
390 {
391 #if ((!ARCH_X86) && (!ARCH_X86_64))
392 #if BRUTE_FORCE_DCT_248
393   int u,h,z,x,i;
394   double temp[64];
395   int factor = pow(2, DCT_YUV_PRECISION);
396 
397   memset(temp,0,sizeof(temp));
398   for (u=0;u<4;u++) {
399     for (h=0;h<8;h++) {
400       for (z=0;z<4;z++) {
401 	for (x=0;x<8;x++) {
402 	  temp[u*8+h] +=     (block[x*8+2*z] + block[x*8+(2*z+1)]) *
403 		  KC248[x][z][u][h];
404 	  temp[(u+4)*8+h] += (block[x*8+2*z] - block[x*8+(2*z+1)]) *
405 	    KC248[x][z][u][h];
406 	}
407       }
408       temp[u*8+h] *= (C[h] * C[u]);
409       temp[(u+4)*8+h] *= (C[h] * C[u]);
410     }
411   }
412 
413   for (i=0;i<64;i++)
414 	  block[i] = temp[i] / factor;
415 #else /* BRUTE_FORCE_DCT_248 */
416   dct248_aan(block);
417   postscale248(block);
418 #endif /* BRUTE_FORCE_DCT_248 */
419 #elif ARCH_X86_64
420   _dv_dct_88_block_mmx_x86_64(block);
421   _dv_transpose_mmx_x86_64(block);
422   _dv_dct_248_block_mmx_x86_64(block);
423   _dv_dct_248_block_mmx_x86_64_post_sum(block);
424   _dv_dct_block_mmx_x86_64_postscale_248(block, postSC248);
425 #else /* ((!ARCH_X86) && (!ARCH_X86_64)) */
426   _dv_dct_88_block_mmx(block);
427   _dv_transpose_mmx(block);
428   _dv_dct_248_block_mmx(block);
429   _dv_dct_248_block_mmx_post_sum(block);
430   _dv_dct_block_mmx_postscale_248(block, postSC248);
431   emms();
432 #endif /* ((!ARCH_X86) && (!ARCH_X86_64)) */
433 }
434 
_dv_idct_88(dv_coeff_t * block)435 void _dv_idct_88(dv_coeff_t *block)
436 {
437 #if ARCH_X86_64
438 
439   _dv_idct_block_mmx_x86_64(block);
440   emms();
441 
442 #elif ARCH_X86
443 
444   _dv_idct_block_mmx(block);
445   emms();
446 
447 #else /* ARCH_X86 */
448 
449   int v,h,y,x,i;
450   double temp[64];
451 
452   memset(temp,0,sizeof(temp));
453   for (v=0;v<8;v++) {
454     for (h=0;h<8;h++) {
455       for (y=0;y<8;y++){
456 	for (x=0;x<8;x++) {
457 	  temp[y*8+x] += C[v] * C[h] * block[v*8+h] * KC88[x][y][h][v];
458 	}
459       }
460     }
461   }
462 
463   for (i=0;i<64;i++)
464     block[i] = temp[i];
465 #endif
466 }
467 
468 #if BRUTE_FORCE_248
469 
_dv_idct_248(double * block)470 void _dv_idct_248(double *block)
471 {
472   int u,h,z,x,i;
473   double temp[64];
474   double temp2[64];
475   double b,c;
476   double (*in)[8][8], (*out)[8][8]; /* don't really need storage
477 				       -- fixup later */
478 
479 #if 0
480   /* This is to identify visually where 248 blocks are... */
481   for(i=0;i<64;i++) {
482     block[i] = 235 - 128;
483   }
484   return;
485 #endif
486 
487   memset(temp,0,sizeof(temp));
488 
489   out = &temp;
490   in = &temp2;
491 
492   for(z=0;z<8;z++) {
493     for(x=0;x<8;x++)
494       (*in)[z][x] = block[z*8+x];
495   }
496 
497   for (x = 0; x < 8; x++) {
498     for (z = 0; z < 4; z++) {
499       for (u = 0; u < 4; u++) {
500 	for (h = 0; h < 8; h++) {
501 	  b = (double)(*in)[u][h];
502 	  c = (double)(*in)[u+4][h];
503 	  (*out)[2*z][x] += C[u] * C[h] * (b + c) * KC248[x][z][u][h];
504 	  (*out)[2*z+1][x] += C[u] * C[h] * (b - c) * KC248[x][z][u][h];
505 	}                       /* for h */
506       }                         /* for u */
507     }                           /* for z */
508   }                             /* for x */
509 
510 #if 0
511   for (u=0;u<4;u++) {
512     for (h=0;h<8;h++) {
513       for (z=0;z<4;z++) {
514         for (x=0;x<8;x++) {
515           b = block[u*8+h];
516           c = block[(u+4)*8+h];
517           temp[(2*u)*8+h] += C[h] * C[u] * (b + c) * KC248[x][z][h][u];
518           temp[(2*u+1)*8+h] += C[h] * C[u] * (b - c) * KC248[x][z][h][u];
519         }
520       }
521     }
522   }
523 #endif
524 
525   for(z=0;z<8;z++) {
526     for(x=0;x<8;x++)
527       block[z*8+x] = (*out)[z][x];
528   }
529 }
530 
531 
532 #endif /* BRUTE_FORCE_248 */
533 
534 /*@}*/
535