1 /*
2 * dct.c
3 *
4 * Copyright (C) Charles 'Buck' Krasic - April 2000
5 * Copyright (C) Erik Walthinsen - April 2000
6 *
7 * This file is part of libdv, a free DV (IEC 61834/SMPTE 314M)
8 * codec.
9 *
10 * libdv is free software; you can redistribute it and/or modify it
11 * under the terms of the GNU Lesser Public License as published by
12 * the Free Software Foundation; either version 2.1, or (at your
13 * option) any later version.
14 *
15 * libdv is distributed in the hope that it will be useful, but
16 * WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser Public License for more details.
19 *
20 * You should have received a copy of the GNU Lesser Public License
21 * along with libdv; see the file COPYING. If not, write to
22 * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 * The libdv homepage is http://libdv.sourceforge.net/.
25 */
26
27 #ifdef HAVE_CONFIG_H
28 #include <config.h>
29 #endif
30
31 #include <math.h>
32 #include <string.h>
33
34 #include "dct.h"
35 #include "weighting.h"
36
37 #if ARCH_X86 || ARCH_X86_64
38 #include "mmx.h"
39 #endif /* ARCH_X86 || ARCH_X86_64 */
40
41 /** @file
42 * @ingroup dct
43 * @brief Implementation for @link dct Discrete Cosine Transform @endlink
44 */
45
46 /** @weakgroup dct Discrete Cosine Transform
47 *
48 * The Discrete Cosine Transform is the basis of most popular video
49 * and image coding standards. To oversimplify, the power of the
50 * transform is that for most video, energy gets concentrated around
51 * the lower frequency coefficients. The human visual system is less
52 * sensitive to higher frequency components. Because of this it is
53 * possible to achieve significant compression ratios in a hybrid
54 * coder, where by reducing the precision of higher frequency terms,
55 * causing many terms towards zero. The result is highly amenable to
56 * losseless entropy coders such as huffman encoding.
57 *
58 * @{
59 */
60
61 typedef short var;
62
63 #if BRUTE_FORCE_DCT_248 || BRUTE_FORCE_DCT_88
64 static double KC248[8][4][4][8];
65 #endif /* BRUTE_FORCE_DCT_248 || BRUTE_FORCE_DCT_88 */
66
67 #if ((!ARCH_X86) && (!ARCH_X86_64)) || BRUTE_FORCE_DCT_248 || BRUTE_FORCE_DCT_88
68 static double C[8];
69 static double KC88[8][8][8][8];
70 #endif /* ((!ARCH_X86) && (!ARCH_X86_64)) || BRUTE_FORCE_DCT_248 || BRUTE_FORCE_DCT_88 */
71
72 #if ARCH_X86_64
73 void _dv_dct_88_block_mmx_x86_64(int16_t* block);
74 void _dv_dct_block_mmx_x86_64_postscale_88(int16_t* block, int16_t* postscale_matrix);
75 void _dv_dct_block_mmx_x86_64_postscale_248(int16_t* block, int16_t* postscale_matrix);
76 void _dv_dct_248_block_mmx_x86_64(int16_t* block);
77 void _dv_dct_248_block_mmx_x86_64_post_sum(int16_t* out_block);
78 void _dv_idct_block_mmx_x86_64(dv_coeff_t *block);
79 void _dv_transpose_mmx_x86_64(short * dst);
80 #endif
81 #if ARCH_X86
82 void _dv_idct_block_mmx(dv_coeff_t *block);
83 void _dv_dct_88_block_mmx(int16_t* block);
84 void _dv_dct_block_mmx_postscale_88(int16_t* block, int16_t* postscale_matrix);
85 void _dv_dct_block_mmx_postscale_248(int16_t* block, int16_t* postscale_matrix);
86 void _dv_dct_248_block_mmx(int16_t* block);
87 void _dv_dct_248_block_mmx_post_sum(int16_t* out_block);
88 void _dv_transpose_mmx(short * dst);
89 #endif /* ARCH_X86 */
90
91 extern dv_coeff_t postSC88[64] ALIGN32;
92 extern dv_coeff_t postSC248[64] ALIGN32;
93
_dv_dct_init(void)94 void _dv_dct_init(void) {
95 #if BRUTE_FORCE_DCT_248 || BRUTE_FORCE_DCT_88
96 int u, z;
97 #endif /* BRUTE_FORCE_DCT_248 || BRUTE_FORCE_DCT_88 */
98 #if ((!ARCH_X86) && (!ARCH_X86_64)) || BRUTE_FORCE_DCT_248 || BRUTE_FORCE_DCT_88
99 int x, y, h, v, i;
100 for (x = 0; x < 8; x++) {
101 for (y = 0; y < 8; y++) {
102 for (v = 0; v < 8; v++) {
103 for (h = 0; h < 8; h++) {
104 KC88[x][y][h][v] =
105 cos((M_PI * v * ((2.0 * y) + 1.0)) / 16.0) *
106 cos((M_PI * h * ((2.0 * x) + 1.0)) / 16.0);
107 }
108 }
109 }
110 }
111 #endif /* ((!ARCH_X86) && (!ARCH_X86_64)) || BRUTE_FORCE_DCT_248 || BRUTE_FORCE_DCT_88 */
112 #if BRUTE_FORCE_DCT_248 || BRUTE_FORCE_DCT_88
113 for (x = 0; x < 8; x++) {
114 for (z = 0; z < 4; z++) {
115 for (u = 0; u < 4; u++) {
116 for (h = 0; h < 8; h++) {
117 KC248[x][z][u][h] =
118 cos((M_PI * u * ((2.0 * z) + 1.0)) / 8.0) *
119 cos((M_PI * h * ((2.0 * x) + 1.0)) / 16.0);
120 } /* for h */
121 } /* for u */
122 } /* for z */
123 } /* for x */
124 #endif /* BRUTE_FORCE_DCT_248 || BRUTE_FORCE_DCT_88 */
125 #if ((!ARCH_X86) && (!ARCH_X86_64)) || BRUTE_FORCE_DCT_248 || BRUTE_FORCE_DCT_88
126 for (i = 0; i < 8; i++) {
127 C[i] = (i == 0 ? 0.5 / sqrt(2.0) : 0.5);
128 } /* for i */
129 #endif /* ((!ARCH_X86) && (!ARCH_X86_64)) || BRUTE_FORCE_DCT_248 || BRUTE_FORCE_DCT_88 */
130 }
131
132 #if 0
133 /* Optimized out using integer fixpoint */
134 /* for DCT */
135 #define A1 0.70711
136 #define A2 0.54120
137 #define A3 0.70711
138 #define A4 1.30658
139 #define A5 0.38268
140 #endif
141
dct88_aan_line(short * in,short * out)142 static void dct88_aan_line(short* in, short* out)
143 {
144 var v0, v1, v2, v3, v4, v5, v6, v7;
145 var v00,v01,v02,v03,v04,v05,v06,v07;
146 var v10,v11,v12,v13,v14,v15,v16;
147 var v20,v21,v22;
148 var v32,v34,v35,v36;
149 var v42,v43,v45,v47;
150 var v54,v55,v56,v57,va0;
151
152 v0 = in[0];
153 v1 = in[1];
154 v2 = in[2];
155 v3 = in[3];
156 v4 = in[4];
157 v5 = in[5];
158 v6 = in[6];
159 v7 = in[7];
160
161 /* first butterfly stage */
162 v00 = v0+v7; /* 0 */
163 v07 = v0-v7; /* 7 */
164 v01 = v1+v6; /* 1 */
165 v06 = v1-v6; /* 6 */
166 v02 = v2+v5; /* 2 */
167 v05 = v2-v5; /* 5 */
168 v03 = v3+v4; /* 3 */
169 v04 = v3-v4; /* 4 */
170
171 /* second low butterfly */
172 v10=v00+v03; /* 0 */
173 v13=v00-v03; /* 3 */
174 v11=v01+v02; /* 1 */
175 v12=v01-v02; /* 2 */
176
177 /* second high */
178 v16=v06+v07; /* 6 */
179 v15=v05+v06; /* 5 */
180 v14=-(v04+v05); /* 4 */
181 /* 7 v77 without change */
182
183 /* third (only 3 real new terms) */
184 v20=v10+v11; /* 0 */
185 v21=v10-v11; /* 1 */
186 v22=v12+v13; /* 2 */
187 #if 0
188 va0=(v14+v16)*A5; /* temporary for A5 multiply */
189 #endif
190 va0=((v14+v16)*25079) >> 16; /* temporary for A5 multiply */
191
192 /* fourth */
193 #if 0
194 v32=v22*A1; /* 2 */
195 v34=-(v14*A2+va0); /* 4 ? */
196 v36=v16*A4-va0; /* 6 ? */
197 v35=v15*A3; /* 5 */
198 #endif
199
200 v32=(v22*23171) >> 15; /* 2 */
201 v34=-(((v14*17734) >> 15)+va0); /* 4 ? */
202 v36=((v16*21407) >> 14)-va0; /* 6 ? */
203 v35=(v15*23171) >> 15; /* 5 */
204
205 /* fifth */
206 v42=v32+v13; /* 2 */
207 v43=v13-v32; /* 3 */
208 v45=v07+v35; /* 5 */
209 v47=v07-v35; /* 7 */
210
211 /* last */
212 v54=v34+v47; /* 4 */
213 v57=v47-v34; /* 7 */
214 v55=v45+v36; /* 5 */
215 v56=v45-v36; /* 5 */
216
217 /* output butterfly */
218
219 out[0] = v20;
220 out[1] = v55;
221 out[2] = v42;
222 out[3] = v57;
223 out[4] = v21;
224 out[5] = v54;
225 out[6] = v43;
226 out[7] = v56;
227 }
228
229
dct44_aan_line(short * in,short * out)230 static void dct44_aan_line(short* in, short* out)
231 {
232 var v00,v01,v02,v03;
233 var v10,v11,v12,v13;
234 var v20,v21,v22;
235 var v32;
236 var v42,v43;
237
238 /* first butterfly stage */
239 v00 = in[0];
240 v01 = in[2];
241 v02 = in[4];
242 v03 = in[6];
243
244 /* second low butterfly */
245 v10=v00+v03; /* 0 */
246 v13=v00-v03; /* 3 */
247 v11=v01+v02; /* 1 */
248 v12=v01-v02; /* 2 */
249
250 /* third (only 3 real new terms) */
251 v20=v10+v11; /* 0 */
252 v21=v10-v11; /* 1 */
253 v22=v12+v13; /* 2 */
254
255 /* fourth */
256
257 v32=(v22*23171) >> 15; /* 2 */
258
259 /* fifth */
260 v42=v32+v13; /* 2 */
261 v43=v13-v32; /* 3 */
262
263 /* output butterfly */
264
265 out[0] = v20;
266 out[2] = v42;
267 out[4] = v21;
268 out[6] = v43;
269 }
270
postscale88(var v[64])271 static void postscale88(var v[64])
272 {
273 int i;
274 int factor = pow(2, 16 + DCT_YUV_PRECISION);
275 for( i = 0; i < 64; i++ ) {
276 v[i] = ( v[i] * postSC88[i] ) / factor;
277 }
278 }
279
280 /* Input has to be transposed ! */
281
dct88_aan(short * s_in)282 static inline void dct88_aan(short *s_in)
283 {
284 int i,j,r,c;
285 short s_out[64];
286
287 for(c = 0; c < 64; c += 8 ) /* columns */
288 dct88_aan_line(s_in + c, s_in + c);
289
290 for(i = 0; i < 8; i++) {
291 for (j = 0; j < 8; j++) {
292 s_out[i * 8 + j] = s_in[j * 8 + i];
293 }
294 }
295
296 for(r = 0; r < 64; r += 8 ) /* then rows */
297 dct88_aan_line(s_out + r, s_out + r);
298
299 memcpy(s_in, s_out, 64 * sizeof(short));
300 }
301
302 /* Input has to be transposed ! */
303
dct248_aan(short * s_in)304 static inline void dct248_aan(short *s_in)
305 {
306 int i,j,r,c;
307 short s_out[64];
308
309 for(c = 0; c < 64; c += 8 ) { /* columns */
310 dct44_aan_line(s_in + c, s_in + c);
311 dct44_aan_line(s_in + c + 1, s_in + c + 1);
312 }
313
314 for(i = 0; i < 8; i++) {
315 for (j = 0; j < 8; j++) {
316 s_out[i * 8 + j] = s_in[j * 8 + i];
317 }
318 }
319
320 for(r = 0; r < 64; r += 8 ) { /* then rows */
321 dct88_aan_line(s_out + r, s_out + r);
322 }
323
324 for(i = 0; i < 4; i++) {
325 for (j = 0; j < 8; j++) {
326 s_in[i * 8 + j] = s_out[i * 16 + j]
327 + s_out[i * 16 + 8 + j];
328 s_in[i * 8 + 32 + j] = s_out[i * 16 + j]
329 - s_out[i * 16 + 8 + j];
330 }
331 }
332 }
333
postscale248(var v[64])334 static void postscale248(var v[64])
335 {
336 int i;
337 int factor = pow(2, 16 + DCT_YUV_PRECISION);
338 for( i=0; i<64; i++ ) {
339 v[i] = ( v[i] * postSC248[i] ) / factor;
340 }
341 }
342
343 /* Input has to be transposed !!! */
344
_dv_dct_88(dv_coeff_t * block)345 void _dv_dct_88(dv_coeff_t *block) {
346 #if ((!ARCH_X86) && (!ARCH_X86_64))
347 #if BRUTE_FORCE_DCT_88
348 int v,h,y,x,i;
349 double temp[64];
350 int factor = pow(2, DCT_YUV_PRECISION);
351
352 memset(temp,0,sizeof(temp));
353 for (v = 0; v < 8; v++) {
354 for (h = 0; h < 8; h++) {
355 for (y = 0;y < 8; y++) {
356 for (x = 0;x < 8; x++) {
357 temp[v * 8 + h] += block[x * 8 + y]
358 * KC88[x][y][h][v];
359 }
360 }
361 temp[v * 8 + h] *= (C[h] * C[v]);
362 }
363 }
364
365 for (i = 0; i < 64; i++) {
366 block[i] = temp[i] / factor;
367 }
368 #else /* BRUTE_FORCE_DCT_88 */
369 dct88_aan(block);
370 postscale88(block);
371 #endif /* BRUTE_FORCE_DCT_88 */
372 #elif ARCH_X86_64
373 _dv_dct_88_block_mmx_x86_64(block);
374 _dv_transpose_mmx_x86_64(block);
375 _dv_dct_88_block_mmx_x86_64(block);
376 _dv_dct_block_mmx_x86_64_postscale_88(block, postSC88);
377 emms();
378 #else /* ((!ARCH_X86) && (!ARCH_X86_64)) */
379 _dv_dct_88_block_mmx(block);
380 _dv_transpose_mmx(block);
381 _dv_dct_88_block_mmx(block);
382 _dv_dct_block_mmx_postscale_88(block, postSC88);
383 emms();
384 #endif /* ((!ARCH_X86) && (!ARCH_X86_64)) */
385 }
386
387 /* Input has to be transposed !!! */
388
_dv_dct_248(dv_coeff_t * block)389 void _dv_dct_248(dv_coeff_t *block)
390 {
391 #if ((!ARCH_X86) && (!ARCH_X86_64))
392 #if BRUTE_FORCE_DCT_248
393 int u,h,z,x,i;
394 double temp[64];
395 int factor = pow(2, DCT_YUV_PRECISION);
396
397 memset(temp,0,sizeof(temp));
398 for (u=0;u<4;u++) {
399 for (h=0;h<8;h++) {
400 for (z=0;z<4;z++) {
401 for (x=0;x<8;x++) {
402 temp[u*8+h] += (block[x*8+2*z] + block[x*8+(2*z+1)]) *
403 KC248[x][z][u][h];
404 temp[(u+4)*8+h] += (block[x*8+2*z] - block[x*8+(2*z+1)]) *
405 KC248[x][z][u][h];
406 }
407 }
408 temp[u*8+h] *= (C[h] * C[u]);
409 temp[(u+4)*8+h] *= (C[h] * C[u]);
410 }
411 }
412
413 for (i=0;i<64;i++)
414 block[i] = temp[i] / factor;
415 #else /* BRUTE_FORCE_DCT_248 */
416 dct248_aan(block);
417 postscale248(block);
418 #endif /* BRUTE_FORCE_DCT_248 */
419 #elif ARCH_X86_64
420 _dv_dct_88_block_mmx_x86_64(block);
421 _dv_transpose_mmx_x86_64(block);
422 _dv_dct_248_block_mmx_x86_64(block);
423 _dv_dct_248_block_mmx_x86_64_post_sum(block);
424 _dv_dct_block_mmx_x86_64_postscale_248(block, postSC248);
425 #else /* ((!ARCH_X86) && (!ARCH_X86_64)) */
426 _dv_dct_88_block_mmx(block);
427 _dv_transpose_mmx(block);
428 _dv_dct_248_block_mmx(block);
429 _dv_dct_248_block_mmx_post_sum(block);
430 _dv_dct_block_mmx_postscale_248(block, postSC248);
431 emms();
432 #endif /* ((!ARCH_X86) && (!ARCH_X86_64)) */
433 }
434
_dv_idct_88(dv_coeff_t * block)435 void _dv_idct_88(dv_coeff_t *block)
436 {
437 #if ARCH_X86_64
438
439 _dv_idct_block_mmx_x86_64(block);
440 emms();
441
442 #elif ARCH_X86
443
444 _dv_idct_block_mmx(block);
445 emms();
446
447 #else /* ARCH_X86 */
448
449 int v,h,y,x,i;
450 double temp[64];
451
452 memset(temp,0,sizeof(temp));
453 for (v=0;v<8;v++) {
454 for (h=0;h<8;h++) {
455 for (y=0;y<8;y++){
456 for (x=0;x<8;x++) {
457 temp[y*8+x] += C[v] * C[h] * block[v*8+h] * KC88[x][y][h][v];
458 }
459 }
460 }
461 }
462
463 for (i=0;i<64;i++)
464 block[i] = temp[i];
465 #endif
466 }
467
468 #if BRUTE_FORCE_248
469
_dv_idct_248(double * block)470 void _dv_idct_248(double *block)
471 {
472 int u,h,z,x,i;
473 double temp[64];
474 double temp2[64];
475 double b,c;
476 double (*in)[8][8], (*out)[8][8]; /* don't really need storage
477 -- fixup later */
478
479 #if 0
480 /* This is to identify visually where 248 blocks are... */
481 for(i=0;i<64;i++) {
482 block[i] = 235 - 128;
483 }
484 return;
485 #endif
486
487 memset(temp,0,sizeof(temp));
488
489 out = &temp;
490 in = &temp2;
491
492 for(z=0;z<8;z++) {
493 for(x=0;x<8;x++)
494 (*in)[z][x] = block[z*8+x];
495 }
496
497 for (x = 0; x < 8; x++) {
498 for (z = 0; z < 4; z++) {
499 for (u = 0; u < 4; u++) {
500 for (h = 0; h < 8; h++) {
501 b = (double)(*in)[u][h];
502 c = (double)(*in)[u+4][h];
503 (*out)[2*z][x] += C[u] * C[h] * (b + c) * KC248[x][z][u][h];
504 (*out)[2*z+1][x] += C[u] * C[h] * (b - c) * KC248[x][z][u][h];
505 } /* for h */
506 } /* for u */
507 } /* for z */
508 } /* for x */
509
510 #if 0
511 for (u=0;u<4;u++) {
512 for (h=0;h<8;h++) {
513 for (z=0;z<4;z++) {
514 for (x=0;x<8;x++) {
515 b = block[u*8+h];
516 c = block[(u+4)*8+h];
517 temp[(2*u)*8+h] += C[h] * C[u] * (b + c) * KC248[x][z][h][u];
518 temp[(2*u+1)*8+h] += C[h] * C[u] * (b - c) * KC248[x][z][h][u];
519 }
520 }
521 }
522 }
523 #endif
524
525 for(z=0;z<8;z++) {
526 for(x=0;x<8;x++)
527 block[z*8+x] = (*out)[z][x];
528 }
529 }
530
531
532 #endif /* BRUTE_FORCE_248 */
533
534 /*@}*/
535