1 /*
2  * jrevdct.c
3  *
4  * Copyright (C) 1991, 1992, Thomas G. Lane.
5  * This file is part of the Independent JPEG Group's software.
6  * For conditions of distribution and use, see the accompanying README file.
7  *
8  * This file contains the basic inverse-DCT transformation subroutine.
9  *
10  * This implementation is based on an algorithm described in
11  *   C. Loeffler, A. Ligtenberg and G. Moschytz, "Practical Fast 1-D DCT
12  *   Algorithms with 11 Multiplications", Proc. Int'l. Conf. on Acoustics,
13  *   Speech, and Signal Processing 1989 (ICASSP '89), pp. 988-991.
14  * The primary algorithm described there uses 11 multiplies and 29 adds.
15  * We use their alternate method with 12 multiplies and 32 adds.
16  * The advantage of this method is that no data path contains more than one
17  * multiplication; this allows a very simple and accurate implementation in
18  * scaled fixed-point arithmetic, with a minimal number of shifts.
19  *
20  * I've made lots of modifications to attempt to take advantage of the
21  * sparse nature of the DCT matrices we're getting.  Although the logic
22  * is cumbersome, it's straightforward and the resulting code is much
23  * faster.
24  *
25  * A better way to do this would be to pass in the DCT block as a sparse
26  * matrix, perhaps with the difference cases encoded.
27  */
28 
29 #include <memory.h>
30 #include "all.h"
31 #include "ansi.h"
32 #include "dct.h"
33 
34 
35 #define CONST_BITS 13
36 
37 /*
38  * This routine is specialized to the case DCTSIZE = 8.
39  */
40 
41 #if DCTSIZE != 8
42   Sorry, this code only copes with 8x8 DCTs. /* deliberate syntax err */
43 #endif
44 
45 
46 /*
47  * A 2-D IDCT can be done by 1-D IDCT on each row followed by 1-D IDCT
48  * on each column.  Direct algorithms are also available, but they are
49  * much more complex and seem not to be any faster when reduced to code.
50  *
51  * The poop on this scaling stuff is as follows:
52  *
53  * Each 1-D IDCT step produces outputs which are a factor of sqrt(N)
54  * larger than the true IDCT outputs.  The final outputs are therefore
55  * a factor of N larger than desired; since N=8 this can be cured by
56  * a simple right shift at the end of the algorithm.  The advantage of
57  * this arrangement is that we save two multiplications per 1-D IDCT,
58  * because the y0 and y4 inputs need not be divided by sqrt(N).
59  *
60  * We have to do addition and subtraction of the integer inputs, which
61  * is no problem, and multiplication by fractional constants, which is
62  * a problem to do in integer arithmetic.  We multiply all the constants
63  * by CONST_SCALE and convert them to integer constants (thus retaining
64  * CONST_BITS bits of precision in the constants).  After doing a
65  * multiplication we have to divide the product by CONST_SCALE, with proper
66  * rounding, to produce the correct output.  This division can be done
67  * cheaply as a right shift of CONST_BITS bits.  We postpone shifting
68  * as long as possible so that partial sums can be added together with
69  * full fractional precision.
70  *
71  * The outputs of the first pass are scaled up by PASS1_BITS bits so that
72  * they are represented to better-than-integral precision.  These outputs
73  * require BITS_IN_JSAMPLE + PASS1_BITS + 3 bits; this fits in a 16-bit word
74  * with the recommended scaling.  (To scale up 12-bit sample data further, an
75  * intermediate int32 array would be needed.)
76  *
77  * To avoid overflow of the 32-bit intermediate results in pass 2, we must
78  * have BITS_IN_JSAMPLE + CONST_BITS + PASS1_BITS <= 26.  Error analysis
79  * shows that the values given below are the most effective.
80  */
81 
82 #ifdef EIGHT_BIT_SAMPLES
83 #define PASS1_BITS  2
84 #else
85 #define PASS1_BITS  1		/* lose a little precision to avoid overflow */
86 #endif
87 
88 #define ONE	((int32) 1)
89 
90 #define CONST_SCALE (ONE << CONST_BITS)
91 
92 /* Convert a positive real constant to an integer scaled by CONST_SCALE.
93  * IMPORTANT: if your compiler doesn't do this arithmetic at compile time,
94  * you will pay a significant penalty in run time.  In that case, figure
95  * the correct integer constant values and insert them by hand.
96  */
97 
98 /* Actually FIX is no longer used, we precomputed them all */
99 #define FIX(x)	((int32) ((x) * CONST_SCALE + 0.5))
100 
101 /* Descale and correctly round an int32 value that's scaled by N bits.
102  * We assume RIGHT_SHIFT rounds towards minus infinity, so adding
103  * the fudge factor is correct for either sign of X.
104  */
105 
106 #define DESCALE(x,n)  RIGHT_SHIFT((x) + (ONE << ((n)-1)), n)
107 
108 /* Multiply an int32 variable by an int32 constant to yield an int32 result.
109  * For 8-bit samples with the recommended scaling, all the variable
110  * and constant values involved are no more than 16 bits wide, so a
111  * 16x16->32 bit multiply can be used instead of a full 32x32 multiply;
112  * this provides a useful speedup on many machines.
113  * There is no way to specify a 16x16->32 multiply in portable C, but
114  * some C compilers will do the right thing if you provide the correct
115  * combination of casts.
116  * NB: for 12-bit samples, a full 32-bit multiplication will be needed.
117  */
118 
119 #ifdef EIGHT_BIT_SAMPLES
120 #ifdef SHORTxSHORT_32		/* may work if 'int' is 32 bits */
121 #define MULTIPLY(var,const)  (((INT16) (var)) * ((INT16) (const)))
122 #endif
123 #ifdef SHORTxLCONST_32		/* known to work with Microsoft C 6.0 */
124 #define MULTIPLY(var,const)  (((INT16) (var)) * ((int32) (const)))
125 #endif
126 #endif
127 
128 #ifndef MULTIPLY		/* default definition */
129 #define MULTIPLY(var,const)  ((var) * (const))
130 #endif
131 
132 
133 /*
134   Unlike our decoder where we approximate the FIXes, we need to use exact
135 ones here or successive P-frames will drift too much with Reference frame coding
136 */
137 #define FIX_0_211164243 1730
138 #define FIX_0_275899380 2260
139 #define FIX_0_298631336 2446
140 #define FIX_0_390180644 3196
141 #define FIX_0_509795579 4176
142 #define FIX_0_541196100 4433
143 #define FIX_0_601344887 4926
144 #define FIX_0_765366865 6270
145 #define FIX_0_785694958 6436
146 #define FIX_0_899976223 7373
147 #define FIX_1_061594337 8697
148 #define FIX_1_111140466 9102
149 #define FIX_1_175875602 9633
150 #define FIX_1_306562965 10703
151 #define FIX_1_387039845 11363
152 #define FIX_1_451774981 11893
153 #define FIX_1_501321110 12299
154 #define FIX_1_662939225 13623
155 #define FIX_1_847759065 15137
156 #define FIX_1_961570560 16069
157 #define FIX_2_053119869 16819
158 #define FIX_2_172734803 17799
159 #define FIX_2_562915447 20995
160 #define FIX_3_072711026 25172
161 
162 /*
163   Switch on reverse_dct choices
164 */
165 void reference_rev_dct _ANSI_ARGS_((int16 *block));
166 void mpeg_jrevdct_quick _ANSI_ARGS_((int16 *block));
167 void init_idctref _ANSI_ARGS_((void));
168 
169 extern boolean pureDCT;
170 
171 void
mpeg_jrevdct(DCTBLOCK data)172 mpeg_jrevdct(DCTBLOCK data)
173 {
174   if (pureDCT) reference_rev_dct(data);
175   else mpeg_jrevdct_quick(data);
176 }
177 
178 /*
179  * Perform the inverse DCT on one block of coefficients.
180  */
181 
182 void
mpeg_jrevdct_quick(DCTBLOCK data)183 mpeg_jrevdct_quick(DCTBLOCK data)
184 {
185   int32 tmp0, tmp1, tmp2, tmp3;
186   int32 tmp10, tmp11, tmp12, tmp13;
187   int32 z1, z2, z3, z4, z5;
188   int32 d0, d1, d2, d3, d4, d5, d6, d7;
189   register DCTELEM *dataptr;
190   int rowctr;
191   SHIFT_TEMPS
192 
193   /* Pass 1: process rows. */
194   /* Note results are scaled up by sqrt(8) compared to a true IDCT; */
195   /* furthermore, we scale the results by 2**PASS1_BITS. */
196 
197   dataptr = data;
198 
199   for (rowctr = DCTSIZE-1; rowctr >= 0; rowctr--) {
200     /* Due to quantization, we will usually find that many of the input
201      * coefficients are zero, especially the AC terms.  We can exploit this
202      * by short-circuiting the IDCT calculation for any row in which all
203      * the AC terms are zero.  In that case each output is equal to the
204      * DC coefficient (with scale factor as needed).
205      * With typical images and quantization tables, half or more of the
206      * row DCT calculations can be simplified this way.
207      */
208 
209     register int *idataptr = (int*)dataptr;
210     d0 = dataptr[0];
211     d1 = dataptr[1];
212     if ((d1 == 0) && (idataptr[1] | idataptr[2] | idataptr[3]) == 0) {
213       /* AC terms all zero */
214       if (d0) {
215 	  /* Compute a 32 bit value to assign. */
216 	  DCTELEM dcval = (DCTELEM) (d0 << PASS1_BITS);
217 	  register int v = (dcval & 0xffff) | ((dcval << 16) & 0xffff0000);
218 
219 	  idataptr[0] = v;
220 	  idataptr[1] = v;
221 	  idataptr[2] = v;
222 	  idataptr[3] = v;
223       }
224 
225       dataptr += DCTSIZE;	/* advance pointer to next row */
226       continue;
227     }
228     d2 = dataptr[2];
229     d3 = dataptr[3];
230     d4 = dataptr[4];
231     d5 = dataptr[5];
232     d6 = dataptr[6];
233     d7 = dataptr[7];
234 
235     /* Even part: reverse the even part of the forward DCT. */
236     /* The rotator is sqrt(2)*c(-6). */
237 {
238     if (d6) {
239 	if (d4) {
240 	    if (d2) {
241 		if (d0) {
242 		    /* d0 != 0, d2 != 0, d4 != 0, d6 != 0 */
243 		    z1 = MULTIPLY(d2 + d6, FIX_0_541196100);
244 		    tmp2 = z1 + MULTIPLY(-d6, FIX_1_847759065);
245 		    tmp3 = z1 + MULTIPLY(d2, FIX_0_765366865);
246 
247 		    tmp0 = (d0 + d4) << CONST_BITS;
248 		    tmp1 = (d0 - d4) << CONST_BITS;
249 
250 		    tmp10 = tmp0 + tmp3;
251 		    tmp13 = tmp0 - tmp3;
252 		    tmp11 = tmp1 + tmp2;
253 		    tmp12 = tmp1 - tmp2;
254 		} else {
255 		    /* d0 == 0, d2 != 0, d4 != 0, d6 != 0 */
256 		    z1 = MULTIPLY(d2 + d6, FIX_0_541196100);
257 		    tmp2 = z1 + MULTIPLY(-d6, FIX_1_847759065);
258 		    tmp3 = z1 + MULTIPLY(d2, FIX_0_765366865);
259 
260 		    tmp0 = d4 << CONST_BITS;
261 
262 		    tmp10 = tmp0 + tmp3;
263 		    tmp13 = tmp0 - tmp3;
264 		    tmp11 = tmp2 - tmp0;
265 		    tmp12 = -(tmp0 + tmp2);
266 		}
267 	    } else {
268 		if (d0) {
269 		    /* d0 != 0, d2 == 0, d4 != 0, d6 != 0 */
270 		    tmp2 = MULTIPLY(-d6, FIX_1_306562965);
271 		    tmp3 = MULTIPLY(d6, FIX_0_541196100);
272 
273 		    tmp0 = (d0 + d4) << CONST_BITS;
274 		    tmp1 = (d0 - d4) << CONST_BITS;
275 
276 		    tmp10 = tmp0 + tmp3;
277 		    tmp13 = tmp0 - tmp3;
278 		    tmp11 = tmp1 + tmp2;
279 		    tmp12 = tmp1 - tmp2;
280 		} else {
281 		    /* d0 == 0, d2 == 0, d4 != 0, d6 != 0 */
282 		    tmp2 = MULTIPLY(-d6, FIX_1_306562965);
283 		    tmp3 = MULTIPLY(d6, FIX_0_541196100);
284 
285 		    tmp0 = d4 << CONST_BITS;
286 
287 		    tmp10 = tmp0 + tmp3;
288 		    tmp13 = tmp0 - tmp3;
289 		    tmp11 = tmp2 - tmp0;
290 		    tmp12 = -(tmp0 + tmp2);
291 		}
292 	    }
293 	} else {
294 	    if (d2) {
295 		if (d0) {
296 		    /* d0 != 0, d2 != 0, d4 == 0, d6 != 0 */
297 		    z1 = MULTIPLY(d2 + d6, FIX_0_541196100);
298 		    tmp2 = z1 + MULTIPLY(-d6, FIX_1_847759065);
299 		    tmp3 = z1 + MULTIPLY(d2, FIX_0_765366865);
300 
301 		    tmp0 = d0 << CONST_BITS;
302 
303 		    tmp10 = tmp0 + tmp3;
304 		    tmp13 = tmp0 - tmp3;
305 		    tmp11 = tmp0 + tmp2;
306 		    tmp12 = tmp0 - tmp2;
307 		} else {
308 		    /* d0 == 0, d2 != 0, d4 == 0, d6 != 0 */
309 		    z1 = MULTIPLY(d2 + d6, FIX_0_541196100);
310 		    tmp2 = z1 + MULTIPLY(-d6, FIX_1_847759065);
311 		    tmp3 = z1 + MULTIPLY(d2, FIX_0_765366865);
312 
313 		    tmp10 = tmp3;
314 		    tmp13 = -tmp3;
315 		    tmp11 = tmp2;
316 		    tmp12 = -tmp2;
317 		}
318 	    } else {
319 		if (d0) {
320 		    /* d0 != 0, d2 == 0, d4 == 0, d6 != 0 */
321 		    tmp2 = MULTIPLY(-d6, FIX_1_306562965);
322 		    tmp3 = MULTIPLY(d6, FIX_0_541196100);
323 
324 		    tmp0 = d0 << CONST_BITS;
325 
326 		    tmp10 = tmp0 + tmp3;
327 		    tmp13 = tmp0 - tmp3;
328 		    tmp11 = tmp0 + tmp2;
329 		    tmp12 = tmp0 - tmp2;
330 		} else {
331 		    /* d0 == 0, d2 == 0, d4 == 0, d6 != 0 */
332 		    tmp2 = MULTIPLY(-d6, FIX_1_306562965);
333 		    tmp3 = MULTIPLY(d6, FIX_0_541196100);
334 
335 		    tmp10 = tmp3;
336 		    tmp13 = -tmp3;
337 		    tmp11 = tmp2;
338 		    tmp12 = -tmp2;
339 		}
340 	    }
341 	}
342     } else {
343 	if (d4) {
344 	    if (d2) {
345 		if (d0) {
346 		    /* d0 != 0, d2 != 0, d4 != 0, d6 == 0 */
347 		    tmp2 = MULTIPLY(d2, FIX_0_541196100);
348 		    tmp3 = MULTIPLY(d2, FIX_1_306562965);
349 
350 		    tmp0 = (d0 + d4) << CONST_BITS;
351 		    tmp1 = (d0 - d4) << CONST_BITS;
352 
353 		    tmp10 = tmp0 + tmp3;
354 		    tmp13 = tmp0 - tmp3;
355 		    tmp11 = tmp1 + tmp2;
356 		    tmp12 = tmp1 - tmp2;
357 		} else {
358 		    /* d0 == 0, d2 != 0, d4 != 0, d6 == 0 */
359 		    tmp2 = MULTIPLY(d2, FIX_0_541196100);
360 		    tmp3 = MULTIPLY(d2, FIX_1_306562965);
361 
362 		    tmp0 = d4 << CONST_BITS;
363 
364 		    tmp10 = tmp0 + tmp3;
365 		    tmp13 = tmp0 - tmp3;
366 		    tmp11 = tmp2 - tmp0;
367 		    tmp12 = -(tmp0 + tmp2);
368 		}
369 	    } else {
370 		if (d0) {
371 		    /* d0 != 0, d2 == 0, d4 != 0, d6 == 0 */
372 		    tmp10 = tmp13 = (d0 + d4) << CONST_BITS;
373 		    tmp11 = tmp12 = (d0 - d4) << CONST_BITS;
374 		} else {
375 		    /* d0 == 0, d2 == 0, d4 != 0, d6 == 0 */
376 		    tmp10 = tmp13 = d4 << CONST_BITS;
377 		    tmp11 = tmp12 = -tmp10;
378 		}
379 	    }
380 	} else {
381 	    if (d2) {
382 		if (d0) {
383 		    /* d0 != 0, d2 != 0, d4 == 0, d6 == 0 */
384 		    tmp2 = MULTIPLY(d2, FIX_0_541196100);
385 		    tmp3 = MULTIPLY(d2, FIX_1_306562965);
386 
387 		    tmp0 = d0 << CONST_BITS;
388 
389 		    tmp10 = tmp0 + tmp3;
390 		    tmp13 = tmp0 - tmp3;
391 		    tmp11 = tmp0 + tmp2;
392 		    tmp12 = tmp0 - tmp2;
393 		} else {
394 		    /* d0 == 0, d2 != 0, d4 == 0, d6 == 0 */
395 		    tmp2 = MULTIPLY(d2, FIX_0_541196100);
396 		    tmp3 = MULTIPLY(d2, FIX_1_306562965);
397 
398 		    tmp10 = tmp3;
399 		    tmp13 = -tmp3;
400 		    tmp11 = tmp2;
401 		    tmp12 = -tmp2;
402 		}
403 	    } else {
404 		if (d0) {
405 		    /* d0 != 0, d2 == 0, d4 == 0, d6 == 0 */
406 		    tmp10 = tmp13 = tmp11 = tmp12 = d0 << CONST_BITS;
407 		} else {
408 		    /* d0 == 0, d2 == 0, d4 == 0, d6 == 0 */
409 		    tmp10 = tmp13 = tmp11 = tmp12 = 0;
410 		}
411 	    }
412 	}
413       }
414 
415     /* Odd part per figure 8; the matrix is unitary and hence its
416      * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
417      */
418 
419     if (d7) {
420 	if (d5) {
421 	    if (d3) {
422 		if (d1) {
423 		    /* d1 != 0, d3 != 0, d5 != 0, d7 != 0 */
424 		    z1 = d7 + d1;
425 		    z2 = d5 + d3;
426 		    z3 = d7 + d3;
427 		    z4 = d5 + d1;
428 		    z5 = MULTIPLY(z3 + z4, FIX_1_175875602);
429 
430 		    tmp0 = MULTIPLY(d7, FIX_0_298631336);
431 		    tmp1 = MULTIPLY(d5, FIX_2_053119869);
432 		    tmp2 = MULTIPLY(d3, FIX_3_072711026);
433 		    tmp3 = MULTIPLY(d1, FIX_1_501321110);
434 		    z1 = MULTIPLY(-z1, FIX_0_899976223);
435 		    z2 = MULTIPLY(-z2, FIX_2_562915447);
436 		    z3 = MULTIPLY(-z3, FIX_1_961570560);
437 		    z4 = MULTIPLY(-z4, FIX_0_390180644);
438 
439 		    z3 += z5;
440 		    z4 += z5;
441 
442 		    tmp0 += z1 + z3;
443 		    tmp1 += z2 + z4;
444 		    tmp2 += z2 + z3;
445 		    tmp3 += z1 + z4;
446 		} else {
447 		    /* d1 == 0, d3 != 0, d5 != 0, d7 != 0 */
448 		    z2 = d5 + d3;
449 		    z3 = d7 + d3;
450 		    z5 = MULTIPLY(z3 + d5, FIX_1_175875602);
451 
452 		    tmp0 = MULTIPLY(d7, FIX_0_298631336);
453 		    tmp1 = MULTIPLY(d5, FIX_2_053119869);
454 		    tmp2 = MULTIPLY(d3, FIX_3_072711026);
455 		    z1 = MULTIPLY(-d7, FIX_0_899976223);
456 		    z2 = MULTIPLY(-z2, FIX_2_562915447);
457 		    z3 = MULTIPLY(-z3, FIX_1_961570560);
458 		    z4 = MULTIPLY(-d5, FIX_0_390180644);
459 
460 		    z3 += z5;
461 		    z4 += z5;
462 
463 		    tmp0 += z1 + z3;
464 		    tmp1 += z2 + z4;
465 		    tmp2 += z2 + z3;
466 		    tmp3 = z1 + z4;
467 		}
468 	    } else {
469 		if (d1) {
470 		    /* d1 != 0, d3 == 0, d5 != 0, d7 != 0 */
471 		    z1 = d7 + d1;
472 		    z4 = d5 + d1;
473 		    z5 = MULTIPLY(d7 + z4, FIX_1_175875602);
474 
475 		    tmp0 = MULTIPLY(d7, FIX_0_298631336);
476 		    tmp1 = MULTIPLY(d5, FIX_2_053119869);
477 		    tmp3 = MULTIPLY(d1, FIX_1_501321110);
478 		    z1 = MULTIPLY(-z1, FIX_0_899976223);
479 		    z2 = MULTIPLY(-d5, FIX_2_562915447);
480 		    z3 = MULTIPLY(-d7, FIX_1_961570560);
481 		    z4 = MULTIPLY(-z4, FIX_0_390180644);
482 
483 		    z3 += z5;
484 		    z4 += z5;
485 
486 		    tmp0 += z1 + z3;
487 		    tmp1 += z2 + z4;
488 		    tmp2 = z2 + z3;
489 		    tmp3 += z1 + z4;
490 		} else {
491 		    /* d1 == 0, d3 == 0, d5 != 0, d7 != 0 */
492 		    tmp0 = MULTIPLY(-d7, FIX_0_601344887);
493 		    z1 = MULTIPLY(-d7, FIX_0_899976223);
494 		    z3 = MULTIPLY(-d7, FIX_1_961570560);
495 		    tmp1 = MULTIPLY(-d5, FIX_0_509795579);
496 		    z2 = MULTIPLY(-d5, FIX_2_562915447);
497 		    z4 = MULTIPLY(-d5, FIX_0_390180644);
498 		    z5 = MULTIPLY(d5 + d7, FIX_1_175875602);
499 
500 		    z3 += z5;
501 		    z4 += z5;
502 
503 		    tmp0 += z3;
504 		    tmp1 += z4;
505 		    tmp2 = z2 + z3;
506 		    tmp3 = z1 + z4;
507 		}
508 	    }
509 	} else {
510 	    if (d3) {
511 		if (d1) {
512 		    /* d1 != 0, d3 != 0, d5 == 0, d7 != 0 */
513 		    z1 = d7 + d1;
514 		    z3 = d7 + d3;
515 		    z5 = MULTIPLY(z3 + d1, FIX_1_175875602);
516 
517 		    tmp0 = MULTIPLY(d7, FIX_0_298631336);
518 		    tmp2 = MULTIPLY(d3, FIX_3_072711026);
519 		    tmp3 = MULTIPLY(d1, FIX_1_501321110);
520 		    z1 = MULTIPLY(-z1, FIX_0_899976223);
521 		    z2 = MULTIPLY(-d3, FIX_2_562915447);
522 		    z3 = MULTIPLY(-z3, FIX_1_961570560);
523 		    z4 = MULTIPLY(-d1, FIX_0_390180644);
524 
525 		    z3 += z5;
526 		    z4 += z5;
527 
528 		    tmp0 += z1 + z3;
529 		    tmp1 = z2 + z4;
530 		    tmp2 += z2 + z3;
531 		    tmp3 += z1 + z4;
532 		} else {
533 		    /* d1 == 0, d3 != 0, d5 == 0, d7 != 0 */
534 		    z3 = d7 + d3;
535 
536 		    tmp0 = MULTIPLY(-d7, FIX_0_601344887);
537 		    z1 = MULTIPLY(-d7, FIX_0_899976223);
538 		    tmp2 = MULTIPLY(d3, FIX_0_509795579);
539 		    z2 = MULTIPLY(-d3, FIX_2_562915447);
540 		    z5 = MULTIPLY(z3, FIX_1_175875602);
541 		    z3 = MULTIPLY(-z3, FIX_0_785694958);
542 
543 		    tmp0 += z3;
544 		    tmp1 = z2 + z5;
545 		    tmp2 += z3;
546 		    tmp3 = z1 + z5;
547 		}
548 	    } else {
549 		if (d1) {
550 		    /* d1 != 0, d3 == 0, d5 == 0, d7 != 0 */
551 		    z1 = d7 + d1;
552 		    z5 = MULTIPLY(z1, FIX_1_175875602);
553 
554 		    z1 = MULTIPLY(z1, FIX_0_275899380);
555 		    z3 = MULTIPLY(-d7, FIX_1_961570560);
556 		    tmp0 = MULTIPLY(-d7, FIX_1_662939225);
557 		    z4 = MULTIPLY(-d1, FIX_0_390180644);
558 		    tmp3 = MULTIPLY(d1, FIX_1_111140466);
559 
560 		    tmp0 += z1;
561 		    tmp1 = z4 + z5;
562 		    tmp2 = z3 + z5;
563 		    tmp3 += z1;
564 		} else {
565 		    /* d1 == 0, d3 == 0, d5 == 0, d7 != 0 */
566 		    tmp0 = MULTIPLY(-d7, FIX_1_387039845);
567 		    tmp1 = MULTIPLY(d7, FIX_1_175875602);
568 		    tmp2 = MULTIPLY(-d7, FIX_0_785694958);
569 		    tmp3 = MULTIPLY(d7, FIX_0_275899380);
570 		}
571 	    }
572 	}
573     } else {
574 	if (d5) {
575 	    if (d3) {
576 		if (d1) {
577 		    /* d1 != 0, d3 != 0, d5 != 0, d7 == 0 */
578 		    z2 = d5 + d3;
579 		    z4 = d5 + d1;
580 		    z5 = MULTIPLY(d3 + z4, FIX_1_175875602);
581 
582 		    tmp1 = MULTIPLY(d5, FIX_2_053119869);
583 		    tmp2 = MULTIPLY(d3, FIX_3_072711026);
584 		    tmp3 = MULTIPLY(d1, FIX_1_501321110);
585 		    z1 = MULTIPLY(-d1, FIX_0_899976223);
586 		    z2 = MULTIPLY(-z2, FIX_2_562915447);
587 		    z3 = MULTIPLY(-d3, FIX_1_961570560);
588 		    z4 = MULTIPLY(-z4, FIX_0_390180644);
589 
590 		    z3 += z5;
591 		    z4 += z5;
592 
593 		    tmp0 = z1 + z3;
594 		    tmp1 += z2 + z4;
595 		    tmp2 += z2 + z3;
596 		    tmp3 += z1 + z4;
597 		} else {
598 		    /* d1 == 0, d3 != 0, d5 != 0, d7 == 0 */
599 		    z2 = d5 + d3;
600 
601 		    z5 = MULTIPLY(z2, FIX_1_175875602);
602 		    tmp1 = MULTIPLY(d5, FIX_1_662939225);
603 		    z4 = MULTIPLY(-d5, FIX_0_390180644);
604 		    z2 = MULTIPLY(-z2, FIX_1_387039845);
605 		    tmp2 = MULTIPLY(d3, FIX_1_111140466);
606 		    z3 = MULTIPLY(-d3, FIX_1_961570560);
607 
608 		    tmp0 = z3 + z5;
609 		    tmp1 += z2;
610 		    tmp2 += z2;
611 		    tmp3 = z4 + z5;
612 		}
613 	    } else {
614 		if (d1) {
615 		    /* d1 != 0, d3 == 0, d5 != 0, d7 == 0 */
616 		    z4 = d5 + d1;
617 
618 		    z5 = MULTIPLY(z4, FIX_1_175875602);
619 		    z1 = MULTIPLY(-d1, FIX_0_899976223);
620 		    tmp3 = MULTIPLY(d1, FIX_0_601344887);
621 		    tmp1 = MULTIPLY(-d5, FIX_0_509795579);
622 		    z2 = MULTIPLY(-d5, FIX_2_562915447);
623 		    z4 = MULTIPLY(z4, FIX_0_785694958);
624 
625 		    tmp0 = z1 + z5;
626 		    tmp1 += z4;
627 		    tmp2 = z2 + z5;
628 		    tmp3 += z4;
629 		} else {
630 		    /* d1 == 0, d3 == 0, d5 != 0, d7 == 0 */
631 		    tmp0 = MULTIPLY(d5, FIX_1_175875602);
632 		    tmp1 = MULTIPLY(d5, FIX_0_275899380);
633 		    tmp2 = MULTIPLY(-d5, FIX_1_387039845);
634 		    tmp3 = MULTIPLY(d5, FIX_0_785694958);
635 		}
636 	    }
637 	} else {
638 	    if (d3) {
639 		if (d1) {
640 		    /* d1 != 0, d3 != 0, d5 == 0, d7 == 0 */
641 		    z5 = d1 + d3;
642 		    tmp3 = MULTIPLY(d1, FIX_0_211164243);
643 		    tmp2 = MULTIPLY(-d3, FIX_1_451774981);
644 		    z1 = MULTIPLY(d1, FIX_1_061594337);
645 		    z2 = MULTIPLY(-d3, FIX_2_172734803);
646 		    z4 = MULTIPLY(z5, FIX_0_785694958);
647 		    z5 = MULTIPLY(z5, FIX_1_175875602);
648 
649 		    tmp0 = z1 - z4;
650 		    tmp1 = z2 + z4;
651 		    tmp2 += z5;
652 		    tmp3 += z5;
653 		} else {
654 		    /* d1 == 0, d3 != 0, d5 == 0, d7 == 0 */
655 		    tmp0 = MULTIPLY(-d3, FIX_0_785694958);
656 		    tmp1 = MULTIPLY(-d3, FIX_1_387039845);
657 		    tmp2 = MULTIPLY(-d3, FIX_0_275899380);
658 		    tmp3 = MULTIPLY(d3, FIX_1_175875602);
659 		}
660 	    } else {
661 		if (d1) {
662 		    /* d1 != 0, d3 == 0, d5 == 0, d7 == 0 */
663 		    tmp0 = MULTIPLY(d1, FIX_0_275899380);
664 		    tmp1 = MULTIPLY(d1, FIX_0_785694958);
665 		    tmp2 = MULTIPLY(d1, FIX_1_175875602);
666 		    tmp3 = MULTIPLY(d1, FIX_1_387039845);
667 		} else {
668 		    /* d1 == 0, d3 == 0, d5 == 0, d7 == 0 */
669 		    tmp0 = tmp1 = tmp2 = tmp3 = 0;
670 		}
671 	    }
672 	}
673     }
674 }
675     /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
676 
677     dataptr[0] = (DCTELEM) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS);
678     dataptr[7] = (DCTELEM) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS);
679     dataptr[1] = (DCTELEM) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS);
680     dataptr[6] = (DCTELEM) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS);
681     dataptr[2] = (DCTELEM) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS);
682     dataptr[5] = (DCTELEM) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS);
683     dataptr[3] = (DCTELEM) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS);
684     dataptr[4] = (DCTELEM) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS);
685 
686     dataptr += DCTSIZE;		/* advance pointer to next row */
687   }
688 
689   /* Pass 2: process columns. */
690   /* Note that we must descale the results by a factor of 8 == 2**3, */
691   /* and also undo the PASS1_BITS scaling. */
692 
693   dataptr = data;
694   for (rowctr = DCTSIZE-1; rowctr >= 0; rowctr--) {
695     /* Columns of zeroes can be exploited in the same way as we did with rows.
696      * However, the row calculation has created many nonzero AC terms, so the
697      * simplification applies less often (typically 5% to 10% of the time).
698      * On machines with very fast multiplication, it's possible that the
699      * test takes more time than it's worth.  In that case this section
700      * may be commented out.
701      */
702 
703     d0 = dataptr[DCTSIZE*0];
704     d1 = dataptr[DCTSIZE*1];
705     d2 = dataptr[DCTSIZE*2];
706     d3 = dataptr[DCTSIZE*3];
707     d4 = dataptr[DCTSIZE*4];
708     d5 = dataptr[DCTSIZE*5];
709     d6 = dataptr[DCTSIZE*6];
710     d7 = dataptr[DCTSIZE*7];
711 
712     /* Even part: reverse the even part of the forward DCT. */
713     /* The rotator is sqrt(2)*c(-6). */
714     if (d6) {
715 	if (d4) {
716 	    if (d2) {
717 		if (d0) {
718 		    /* d0 != 0, d2 != 0, d4 != 0, d6 != 0 */
719 		    z1 = MULTIPLY(d2 + d6, FIX_0_541196100);
720 		    tmp2 = z1 + MULTIPLY(-d6, FIX_1_847759065);
721 		    tmp3 = z1 + MULTIPLY(d2, FIX_0_765366865);
722 
723 		    tmp0 = (d0 + d4) << CONST_BITS;
724 		    tmp1 = (d0 - d4) << CONST_BITS;
725 
726 		    tmp10 = tmp0 + tmp3;
727 		    tmp13 = tmp0 - tmp3;
728 		    tmp11 = tmp1 + tmp2;
729 		    tmp12 = tmp1 - tmp2;
730 		} else {
731 		    /* d0 == 0, d2 != 0, d4 != 0, d6 != 0 */
732 		    z1 = MULTIPLY(d2 + d6, FIX_0_541196100);
733 		    tmp2 = z1 + MULTIPLY(-d6, FIX_1_847759065);
734 		    tmp3 = z1 + MULTIPLY(d2, FIX_0_765366865);
735 
736 		    tmp0 = d4 << CONST_BITS;
737 
738 		    tmp10 = tmp0 + tmp3;
739 		    tmp13 = tmp0 - tmp3;
740 		    tmp11 = tmp2 - tmp0;
741 		    tmp12 = -(tmp0 + tmp2);
742 		}
743 	    } else {
744 		if (d0) {
745 		    /* d0 != 0, d2 == 0, d4 != 0, d6 != 0 */
746 		    tmp2 = MULTIPLY(-d6, FIX_1_306562965);
747 		    tmp3 = MULTIPLY(d6, FIX_0_541196100);
748 
749 		    tmp0 = (d0 + d4) << CONST_BITS;
750 		    tmp1 = (d0 - d4) << CONST_BITS;
751 
752 		    tmp10 = tmp0 + tmp3;
753 		    tmp13 = tmp0 - tmp3;
754 		    tmp11 = tmp1 + tmp2;
755 		    tmp12 = tmp1 - tmp2;
756 		} else {
757 		    /* d0 == 0, d2 == 0, d4 != 0, d6 != 0 */
758 		    tmp2 = MULTIPLY(-d6, FIX_1_306562965);
759 		    tmp3 = MULTIPLY(d6, FIX_0_541196100);
760 
761 		    tmp0 = d4 << CONST_BITS;
762 
763 		    tmp10 = tmp0 + tmp3;
764 		    tmp13 = tmp0 - tmp3;
765 		    tmp11 = tmp2 - tmp0;
766 		    tmp12 = -(tmp0 + tmp2);
767 		}
768 	    }
769 	} else {
770 	    if (d2) {
771 		if (d0) {
772 		    /* d0 != 0, d2 != 0, d4 == 0, d6 != 0 */
773 		    z1 = MULTIPLY(d2 + d6, FIX_0_541196100);
774 		    tmp2 = z1 + MULTIPLY(-d6, FIX_1_847759065);
775 		    tmp3 = z1 + MULTIPLY(d2, FIX_0_765366865);
776 
777 		    tmp0 = d0 << CONST_BITS;
778 
779 		    tmp10 = tmp0 + tmp3;
780 		    tmp13 = tmp0 - tmp3;
781 		    tmp11 = tmp0 + tmp2;
782 		    tmp12 = tmp0 - tmp2;
783 		} else {
784 		    /* d0 == 0, d2 != 0, d4 == 0, d6 != 0 */
785 		    z1 = MULTIPLY(d2 + d6, FIX_0_541196100);
786 		    tmp2 = z1 + MULTIPLY(-d6, FIX_1_847759065);
787 		    tmp3 = z1 + MULTIPLY(d2, FIX_0_765366865);
788 
789 		    tmp10 = tmp3;
790 		    tmp13 = -tmp3;
791 		    tmp11 = tmp2;
792 		    tmp12 = -tmp2;
793 		}
794 	    } else {
795 		if (d0) {
796 		    /* d0 != 0, d2 == 0, d4 == 0, d6 != 0 */
797 		    tmp2 = MULTIPLY(-d6, FIX_1_306562965);
798 		    tmp3 = MULTIPLY(d6, FIX_0_541196100);
799 
800 		    tmp0 = d0 << CONST_BITS;
801 
802 		    tmp10 = tmp0 + tmp3;
803 		    tmp13 = tmp0 - tmp3;
804 		    tmp11 = tmp0 + tmp2;
805 		    tmp12 = tmp0 - tmp2;
806 		} else {
807 		    /* d0 == 0, d2 == 0, d4 == 0, d6 != 0 */
808 		    tmp2 = MULTIPLY(-d6, FIX_1_306562965);
809 		    tmp3 = MULTIPLY(d6, FIX_0_541196100);
810 
811 		    tmp10 = tmp3;
812 		    tmp13 = -tmp3;
813 		    tmp11 = tmp2;
814 		    tmp12 = -tmp2;
815 		}
816 	    }
817 	}
818     } else {
819 	if (d4) {
820 	    if (d2) {
821 		if (d0) {
822 		    /* d0 != 0, d2 != 0, d4 != 0, d6 == 0 */
823 		    tmp2 = MULTIPLY(d2, FIX_0_541196100);
824 		    tmp3 = MULTIPLY(d2, FIX_1_306562965);
825 
826 		    tmp0 = (d0 + d4) << CONST_BITS;
827 		    tmp1 = (d0 - d4) << CONST_BITS;
828 
829 		    tmp10 = tmp0 + tmp3;
830 		    tmp13 = tmp0 - tmp3;
831 		    tmp11 = tmp1 + tmp2;
832 		    tmp12 = tmp1 - tmp2;
833 		} else {
834 		    /* d0 == 0, d2 != 0, d4 != 0, d6 == 0 */
835 		    tmp2 = MULTIPLY(d2, FIX_0_541196100);
836 		    tmp3 = MULTIPLY(d2, FIX_1_306562965);
837 
838 		    tmp0 = d4 << CONST_BITS;
839 
840 		    tmp10 = tmp0 + tmp3;
841 		    tmp13 = tmp0 - tmp3;
842 		    tmp11 = tmp2 - tmp0;
843 		    tmp12 = -(tmp0 + tmp2);
844 		}
845 	    } else {
846 		if (d0) {
847 		    /* d0 != 0, d2 == 0, d4 != 0, d6 == 0 */
848 		    tmp10 = tmp13 = (d0 + d4) << CONST_BITS;
849 		    tmp11 = tmp12 = (d0 - d4) << CONST_BITS;
850 		} else {
851 		    /* d0 == 0, d2 == 0, d4 != 0, d6 == 0 */
852 		    tmp10 = tmp13 = d4 << CONST_BITS;
853 		    tmp11 = tmp12 = -tmp10;
854 		}
855 	    }
856 	} else {
857 	    if (d2) {
858 		if (d0) {
859 		    /* d0 != 0, d2 != 0, d4 == 0, d6 == 0 */
860 		    tmp2 = MULTIPLY(d2, FIX_0_541196100);
861 		    tmp3 = MULTIPLY(d2, FIX_1_306562965);
862 
863 		    tmp0 = d0 << CONST_BITS;
864 
865 		    tmp10 = tmp0 + tmp3;
866 		    tmp13 = tmp0 - tmp3;
867 		    tmp11 = tmp0 + tmp2;
868 		    tmp12 = tmp0 - tmp2;
869 		} else {
870 		    /* d0 == 0, d2 != 0, d4 == 0, d6 == 0 */
871 		    tmp2 = MULTIPLY(d2, FIX_0_541196100);
872 		    tmp3 = MULTIPLY(d2, FIX_1_306562965);
873 
874 		    tmp10 = tmp3;
875 		    tmp13 = -tmp3;
876 		    tmp11 = tmp2;
877 		    tmp12 = -tmp2;
878 		}
879 	    } else {
880 		if (d0) {
881 		    /* d0 != 0, d2 == 0, d4 == 0, d6 == 0 */
882 		    tmp10 = tmp13 = tmp11 = tmp12 = d0 << CONST_BITS;
883 		} else {
884 		    /* d0 == 0, d2 == 0, d4 == 0, d6 == 0 */
885 		    tmp10 = tmp13 = tmp11 = tmp12 = 0;
886 		}
887 	    }
888 	}
889     }
890 
891     /* Odd part per figure 8; the matrix is unitary and hence its
892      * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
893      */
894     if (d7) {
895 	if (d5) {
896 	    if (d3) {
897 		if (d1) {
898 		    /* d1 != 0, d3 != 0, d5 != 0, d7 != 0 */
899 		    z1 = d7 + d1;
900 		    z2 = d5 + d3;
901 		    z3 = d7 + d3;
902 		    z4 = d5 + d1;
903 		    z5 = MULTIPLY(z3 + z4, FIX_1_175875602);
904 
905 		    tmp0 = MULTIPLY(d7, FIX_0_298631336);
906 		    tmp1 = MULTIPLY(d5, FIX_2_053119869);
907 		    tmp2 = MULTIPLY(d3, FIX_3_072711026);
908 		    tmp3 = MULTIPLY(d1, FIX_1_501321110);
909 		    z1 = MULTIPLY(-z1, FIX_0_899976223);
910 		    z2 = MULTIPLY(-z2, FIX_2_562915447);
911 		    z3 = MULTIPLY(-z3, FIX_1_961570560);
912 		    z4 = MULTIPLY(-z4, FIX_0_390180644);
913 
914 		    z3 += z5;
915 		    z4 += z5;
916 
917 		    tmp0 += z1 + z3;
918 		    tmp1 += z2 + z4;
919 		    tmp2 += z2 + z3;
920 		    tmp3 += z1 + z4;
921 		} else {
922 		    /* d1 == 0, d3 != 0, d5 != 0, d7 != 0 */
923 		    z1 = d7;
924 		    z2 = d5 + d3;
925 		    z3 = d7 + d3;
926 		    z5 = MULTIPLY(z3 + d5, FIX_1_175875602);
927 
928 		    tmp0 = MULTIPLY(d7, FIX_0_298631336);
929 		    tmp1 = MULTIPLY(d5, FIX_2_053119869);
930 		    tmp2 = MULTIPLY(d3, FIX_3_072711026);
931 		    z1 = MULTIPLY(-d7, FIX_0_899976223);
932 		    z2 = MULTIPLY(-z2, FIX_2_562915447);
933 		    z3 = MULTIPLY(-z3, FIX_1_961570560);
934 		    z4 = MULTIPLY(-d5, FIX_0_390180644);
935 
936 		    z3 += z5;
937 		    z4 += z5;
938 
939 		    tmp0 += z1 + z3;
940 		    tmp1 += z2 + z4;
941 		    tmp2 += z2 + z3;
942 		    tmp3 = z1 + z4;
943 		}
944 	    } else {
945 		if (d1) {
946 		    /* d1 != 0, d3 == 0, d5 != 0, d7 != 0 */
947 		    z1 = d7 + d1;
948 		    z2 = d5;
949 		    z3 = d7;
950 		    z4 = d5 + d1;
951 		    z5 = MULTIPLY(z3 + z4, FIX_1_175875602);
952 
953 		    tmp0 = MULTIPLY(d7, FIX_0_298631336);
954 		    tmp1 = MULTIPLY(d5, FIX_2_053119869);
955 		    tmp3 = MULTIPLY(d1, FIX_1_501321110);
956 		    z1 = MULTIPLY(-z1, FIX_0_899976223);
957 		    z2 = MULTIPLY(-d5, FIX_2_562915447);
958 		    z3 = MULTIPLY(-d7, FIX_1_961570560);
959 		    z4 = MULTIPLY(-z4, FIX_0_390180644);
960 
961 		    z3 += z5;
962 		    z4 += z5;
963 
964 		    tmp0 += z1 + z3;
965 		    tmp1 += z2 + z4;
966 		    tmp2 = z2 + z3;
967 		    tmp3 += z1 + z4;
968 		} else {
969 		    /* d1 == 0, d3 == 0, d5 != 0, d7 != 0 */
970 		    tmp0 = MULTIPLY(-d7, FIX_0_601344887);
971 		    z1 = MULTIPLY(-d7, FIX_0_899976223);
972 		    z3 = MULTIPLY(-d7, FIX_1_961570560);
973 		    tmp1 = MULTIPLY(-d5, FIX_0_509795579);
974 		    z2 = MULTIPLY(-d5, FIX_2_562915447);
975 		    z4 = MULTIPLY(-d5, FIX_0_390180644);
976 		    z5 = MULTIPLY(d5 + d7, FIX_1_175875602);
977 
978 		    z3 += z5;
979 		    z4 += z5;
980 
981 		    tmp0 += z3;
982 		    tmp1 += z4;
983 		    tmp2 = z2 + z3;
984 		    tmp3 = z1 + z4;
985 		}
986 	    }
987 	} else {
988 	    if (d3) {
989 		if (d1) {
990 		    /* d1 != 0, d3 != 0, d5 == 0, d7 != 0 */
991 		    z1 = d7 + d1;
992 		    z3 = d7 + d3;
993 		    z5 = MULTIPLY(z3 + d1, FIX_1_175875602);
994 
995 		    tmp0 = MULTIPLY(d7, FIX_0_298631336);
996 		    tmp2 = MULTIPLY(d3, FIX_3_072711026);
997 		    tmp3 = MULTIPLY(d1, FIX_1_501321110);
998 		    z1 = MULTIPLY(-z1, FIX_0_899976223);
999 		    z2 = MULTIPLY(-d3, FIX_2_562915447);
1000 		    z3 = MULTIPLY(-z3, FIX_1_961570560);
1001 		    z4 = MULTIPLY(-d1, FIX_0_390180644);
1002 
1003 		    z3 += z5;
1004 		    z4 += z5;
1005 
1006 		    tmp0 += z1 + z3;
1007 		    tmp1 = z2 + z4;
1008 		    tmp2 += z2 + z3;
1009 		    tmp3 += z1 + z4;
1010 		} else {
1011 		    /* d1 == 0, d3 != 0, d5 == 0, d7 != 0 */
1012 		    z3 = d7 + d3;
1013 
1014 		    tmp0 = MULTIPLY(-d7, FIX_0_601344887);
1015 		    z1 = MULTIPLY(-d7, FIX_0_899976223);
1016 		    tmp2 = MULTIPLY(d3, FIX_0_509795579);
1017 		    z2 = MULTIPLY(-d3, FIX_2_562915447);
1018 		    z5 = MULTIPLY(z3, FIX_1_175875602);
1019 		    z3 = MULTIPLY(-z3, FIX_0_785694958);
1020 
1021 		    tmp0 += z3;
1022 		    tmp1 = z2 + z5;
1023 		    tmp2 += z3;
1024 		    tmp3 = z1 + z5;
1025 		}
1026 	    } else {
1027 		if (d1) {
1028 		    /* d1 != 0, d3 == 0, d5 == 0, d7 != 0 */
1029 		    z1 = d7 + d1;
1030 		    z5 = MULTIPLY(z1, FIX_1_175875602);
1031 
1032 		    z1 = MULTIPLY(z1, FIX_0_275899380);
1033 		    z3 = MULTIPLY(-d7, FIX_1_961570560);
1034 		    tmp0 = MULTIPLY(-d7, FIX_1_662939225);
1035 		    z4 = MULTIPLY(-d1, FIX_0_390180644);
1036 		    tmp3 = MULTIPLY(d1, FIX_1_111140466);
1037 
1038 		    tmp0 += z1;
1039 		    tmp1 = z4 + z5;
1040 		    tmp2 = z3 + z5;
1041 		    tmp3 += z1;
1042 		} else {
1043 		    /* d1 == 0, d3 == 0, d5 == 0, d7 != 0 */
1044 		    tmp0 = MULTIPLY(-d7, FIX_1_387039845);
1045 		    tmp1 = MULTIPLY(d7, FIX_1_175875602);
1046 		    tmp2 = MULTIPLY(-d7, FIX_0_785694958);
1047 		    tmp3 = MULTIPLY(d7, FIX_0_275899380);
1048 		}
1049 	    }
1050 	}
1051     } else {
1052 	if (d5) {
1053 	    if (d3) {
1054 		if (d1) {
1055 		    /* d1 != 0, d3 != 0, d5 != 0, d7 == 0 */
1056 		    z2 = d5 + d3;
1057 		    z4 = d5 + d1;
1058 		    z5 = MULTIPLY(d3 + z4, FIX_1_175875602);
1059 
1060 		    tmp1 = MULTIPLY(d5, FIX_2_053119869);
1061 		    tmp2 = MULTIPLY(d3, FIX_3_072711026);
1062 		    tmp3 = MULTIPLY(d1, FIX_1_501321110);
1063 		    z1 = MULTIPLY(-d1, FIX_0_899976223);
1064 		    z2 = MULTIPLY(-z2, FIX_2_562915447);
1065 		    z3 = MULTIPLY(-d3, FIX_1_961570560);
1066 		    z4 = MULTIPLY(-z4, FIX_0_390180644);
1067 
1068 		    z3 += z5;
1069 		    z4 += z5;
1070 
1071 		    tmp0 = z1 + z3;
1072 		    tmp1 += z2 + z4;
1073 		    tmp2 += z2 + z3;
1074 		    tmp3 += z1 + z4;
1075 		} else {
1076 		    /* d1 == 0, d3 != 0, d5 != 0, d7 == 0 */
1077 		    z2 = d5 + d3;
1078 
1079 		    z5 = MULTIPLY(z2, FIX_1_175875602);
1080 		    tmp1 = MULTIPLY(d5, FIX_1_662939225);
1081 		    z4 = MULTIPLY(-d5, FIX_0_390180644);
1082 		    z2 = MULTIPLY(-z2, FIX_1_387039845);
1083 		    tmp2 = MULTIPLY(d3, FIX_1_111140466);
1084 		    z3 = MULTIPLY(-d3, FIX_1_961570560);
1085 
1086 		    tmp0 = z3 + z5;
1087 		    tmp1 += z2;
1088 		    tmp2 += z2;
1089 		    tmp3 = z4 + z5;
1090 		}
1091 	    } else {
1092 		if (d1) {
1093 		    /* d1 != 0, d3 == 0, d5 != 0, d7 == 0 */
1094 		    z4 = d5 + d1;
1095 
1096 		    z5 = MULTIPLY(z4, FIX_1_175875602);
1097 		    z1 = MULTIPLY(-d1, FIX_0_899976223);
1098 		    tmp3 = MULTIPLY(d1, FIX_0_601344887);
1099 		    tmp1 = MULTIPLY(-d5, FIX_0_509795579);
1100 		    z2 = MULTIPLY(-d5, FIX_2_562915447);
1101 		    z4 = MULTIPLY(z4, FIX_0_785694958);
1102 
1103 		    tmp0 = z1 + z5;
1104 		    tmp1 += z4;
1105 		    tmp2 = z2 + z5;
1106 		    tmp3 += z4;
1107 		} else {
1108 		    /* d1 == 0, d3 == 0, d5 != 0, d7 == 0 */
1109 		    tmp0 = MULTIPLY(d5, FIX_1_175875602);
1110 		    tmp1 = MULTIPLY(d5, FIX_0_275899380);
1111 		    tmp2 = MULTIPLY(-d5, FIX_1_387039845);
1112 		    tmp3 = MULTIPLY(d5, FIX_0_785694958);
1113 		}
1114 	    }
1115 	} else {
1116 	    if (d3) {
1117 		if (d1) {
1118 		    /* d1 != 0, d3 != 0, d5 == 0, d7 == 0 */
1119 		    z5 = d1 + d3;
1120 		    tmp3 = MULTIPLY(d1, FIX_0_211164243);
1121 		    tmp2 = MULTIPLY(-d3, FIX_1_451774981);
1122 		    z1 = MULTIPLY(d1, FIX_1_061594337);
1123 		    z2 = MULTIPLY(-d3, FIX_2_172734803);
1124 		    z4 = MULTIPLY(z5, FIX_0_785694958);
1125 		    z5 = MULTIPLY(z5, FIX_1_175875602);
1126 
1127 		    tmp0 = z1 - z4;
1128 		    tmp1 = z2 + z4;
1129 		    tmp2 += z5;
1130 		    tmp3 += z5;
1131 		} else {
1132 		    /* d1 == 0, d3 != 0, d5 == 0, d7 == 0 */
1133 		    tmp0 = MULTIPLY(-d3, FIX_0_785694958);
1134 		    tmp1 = MULTIPLY(-d3, FIX_1_387039845);
1135 		    tmp2 = MULTIPLY(-d3, FIX_0_275899380);
1136 		    tmp3 = MULTIPLY(d3, FIX_1_175875602);
1137 		}
1138 	    } else {
1139 		if (d1) {
1140 		    /* d1 != 0, d3 == 0, d5 == 0, d7 == 0 */
1141 		    tmp0 = MULTIPLY(d1, FIX_0_275899380);
1142 		    tmp1 = MULTIPLY(d1, FIX_0_785694958);
1143 		    tmp2 = MULTIPLY(d1, FIX_1_175875602);
1144 		    tmp3 = MULTIPLY(d1, FIX_1_387039845);
1145 		} else {
1146 		    /* d1 == 0, d3 == 0, d5 == 0, d7 == 0 */
1147 		    tmp0 = tmp1 = tmp2 = tmp3 = 0;
1148 		}
1149 	    }
1150 	}
1151     }
1152 
1153     /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
1154 
1155     dataptr[DCTSIZE*0] = (DCTELEM) DESCALE(tmp10 + tmp3,
1156 					   CONST_BITS+PASS1_BITS+3);
1157     dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp10 - tmp3,
1158 					   CONST_BITS+PASS1_BITS+3);
1159     dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp11 + tmp2,
1160 					   CONST_BITS+PASS1_BITS+3);
1161     dataptr[DCTSIZE*6] = (DCTELEM) DESCALE(tmp11 - tmp2,
1162 					   CONST_BITS+PASS1_BITS+3);
1163     dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(tmp12 + tmp1,
1164 					   CONST_BITS+PASS1_BITS+3);
1165     dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp12 - tmp1,
1166 					   CONST_BITS+PASS1_BITS+3);
1167     dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp13 + tmp0,
1168 					   CONST_BITS+PASS1_BITS+3);
1169     dataptr[DCTSIZE*4] = (DCTELEM) DESCALE(tmp13 - tmp0,
1170 					   CONST_BITS+PASS1_BITS+3);
1171 
1172     dataptr++;			/* advance pointer to next column */
1173   }
1174 }
1175 
1176 
1177 /* here is the reference one, in case of problems with the normal one */
1178 
1179 /* idctref.c, Inverse Discrete Fourier Transform, double precision          */
1180 
1181 /* Copyright (C) 1994, MPEG Software Simulation Group. All Rights Reserved. */
1182 
1183 /*
1184  * Disclaimer of Warranty
1185  *
1186  * These software programs are available to the user without any license fee or
1187  * royalty on an "as is" basis.  The MPEG Software Simulation Group disclaims
1188  * any and all warranties, whether express, implied, or statuary, including any
1189  * implied warranties or merchantability or of fitness for a particular
1190  * purpose.  In no event shall the copyright-holder be liable for any
1191  * incidental, punitive, or consequential damages of any kind whatsoever
1192  * arising from the use of these programs.
1193  *
1194  * This disclaimer of warranty extends to the user of these programs and user's
1195  * customers, employees, agents, transferees, successors, and assigns.
1196  *
1197  * The MPEG Software Simulation Group does not represent or warrant that the
1198  * programs furnished hereunder are free of infringement of any third-party
1199  * patents.
1200  *
1201  * Commercial implementations of MPEG-1 and MPEG-2 video, including shareware,
1202  * are subject to royalty fees to patent holders.  Many of these patents are
1203  * general enough such that they are unavoidable regardless of implementation
1204  * design.
1205  *
1206  */
1207 
1208 /*  Perform IEEE 1180 reference (64-bit floating point, separable 8x1
1209  *  direct matrix multiply) Inverse Discrete Cosine Transform
1210 */
1211 
1212 
1213 /* Here we use math.h to generate constants.  Compiler results may
1214    vary a little */
1215 
1216 #ifndef PI
1217 #ifdef M_PI
1218 #define PI M_PI
1219 #else
1220 #define PI 3.14159265358979323846
1221 #endif
1222 #endif
1223 
1224 /* cosine transform matrix for 8x1 IDCT */
1225 static double itrans_coef[8][8];
1226 
1227 /* initialize DCT coefficient matrix */
1228 
init_idctref()1229 void init_idctref()
1230 {
1231   int freq, time;
1232   double scale;
1233 
1234   for (freq=0; freq < 8; freq++)
1235   {
1236     scale = (freq == 0) ? sqrt(0.125) : 0.5;
1237     for (time=0; time<8; time++)
1238       itrans_coef[freq][time] = scale*cos((PI/8.0)*freq*(time + 0.5));
1239   }
1240 }
1241 
1242 /* perform IDCT matrix multiply for 8x8 coefficient block */
1243 
reference_rev_dct(int16 * block)1244 void reference_rev_dct(int16 *block)
1245 {
1246   int i, j, k, v;
1247   double partial_product;
1248   double tmp[64];
1249 
1250   for (i=0; i<8; i++)
1251     for (j=0; j<8; j++)
1252     {
1253       partial_product = 0.0;
1254 
1255       for (k=0; k<8; k++)
1256         partial_product+= itrans_coef[k][j]*block[8*i+k];
1257 
1258       tmp[8*i+j] = partial_product;
1259     }
1260 
1261   /* Transpose operation is integrated into address mapping by switching
1262      loop order of i and j */
1263 
1264   for (j=0; j<8; j++)
1265     for (i=0; i<8; i++)
1266     {
1267       partial_product = 0.0;
1268 
1269       for (k=0; k<8; k++)
1270         partial_product+= itrans_coef[k][i]*tmp[8*k+j];
1271 
1272       v = (int)floor(partial_product+0.5);
1273       block[8*i+j] = (v<-256) ? -256 : ((v>255) ? 255 : v);
1274     }
1275 }
1276