xref: /reactos/dll/3rdparty/libjpeg/jidctint.c (revision 50cf16b3)
1 /*
2  * jidctint.c
3  *
4  * Copyright (C) 1991-1998, Thomas G. Lane.
5  * Modification developed 2002-2016 by Guido Vollbeding.
6  * This file is part of the Independent JPEG Group's software.
7  * For conditions of distribution and use, see the accompanying README file.
8  *
9  * This file contains a slow-but-accurate integer implementation of the
10  * inverse DCT (Discrete Cosine Transform).  In the IJG code, this routine
11  * must also perform dequantization of the input coefficients.
12  *
13  * A 2-D IDCT can be done by 1-D IDCT on each column followed by 1-D IDCT
14  * on each row (or vice versa, but it's more convenient to emit a row at
15  * a time).  Direct algorithms are also available, but they are much more
16  * complex and seem not to be any faster when reduced to code.
17  *
18  * This implementation is based on an algorithm described in
19  *   C. Loeffler, A. Ligtenberg and G. Moschytz, "Practical Fast 1-D DCT
20  *   Algorithms with 11 Multiplications", Proc. Int'l. Conf. on Acoustics,
21  *   Speech, and Signal Processing 1989 (ICASSP '89), pp. 988-991.
22  * The primary algorithm described there uses 11 multiplies and 29 adds.
23  * We use their alternate method with 12 multiplies and 32 adds.
24  * The advantage of this method is that no data path contains more than one
25  * multiplication; this allows a very simple and accurate implementation in
26  * scaled fixed-point arithmetic, with a minimal number of shifts.
27  *
28  * We also provide IDCT routines with various output sample block sizes for
29  * direct resolution reduction or enlargement and for directly resolving the
30  * common 2x1 and 1x2 subsampling cases without additional resampling: NxN
31  * (N=1...16), 2NxN, and Nx2N (N=1...8) pixels for one 8x8 input DCT block.
32  *
33  * For N<8 we simply take the corresponding low-frequency coefficients of
34  * the 8x8 input DCT block and apply an NxN point IDCT on the sub-block
35  * to yield the downscaled outputs.
36  * This can be seen as direct low-pass downsampling from the DCT domain
37  * point of view rather than the usual spatial domain point of view,
38  * yielding significant computational savings and results at least
39  * as good as common bilinear (averaging) spatial downsampling.
40  *
41  * For N>8 we apply a partial NxN IDCT, treating the 8 input coefficients as
42  * the lower frequencies and assuming the higher frequencies to be zero.
43  * It turns out that the computational effort is similar to the 8x8 IDCT
44  * regarding the output size.
45  * Furthermore, the scaling and descaling is the same for all IDCT sizes.
46  *
47  * CAUTION: We rely on the FIX() macro except for the N=1,2,4,8 cases
48  * since there would be too many additional constants to pre-calculate.
49  */
50 
51 #define JPEG_INTERNALS
52 #include "jinclude.h"
53 #include "jpeglib.h"
54 #include "jdct.h"		/* Private declarations for DCT subsystem */
55 
56 #ifdef DCT_ISLOW_SUPPORTED
57 
58 
59 /*
60  * This module is specialized to the case DCTSIZE = 8.
61  */
62 
63 #if DCTSIZE != 8	/* compile-time guard: any other DCTSIZE reaches the next line and the build fails */
64   Sorry, this code only copes with 8x8 DCT blocks. /* deliberate syntax err */
65 #endif
66 
67 
68 /*
69  * The poop on this scaling stuff is as follows:
70  *
71  * Each 1-D IDCT step produces outputs which are a factor of sqrt(N)
72  * larger than the true IDCT outputs.  The final outputs are therefore
73  * a factor of N larger than desired; since N=8 this can be cured by
74  * a simple right shift at the end of the algorithm.  The advantage of
75  * this arrangement is that we save two multiplications per 1-D IDCT,
76  * because the y0 and y4 inputs need not be divided by sqrt(N).
77  *
78  * We have to do addition and subtraction of the integer inputs, which
79  * is no problem, and multiplication by fractional constants, which is
80  * a problem to do in integer arithmetic.  We multiply all the constants
81  * by CONST_SCALE and convert them to integer constants (thus retaining
82  * CONST_BITS bits of precision in the constants).  After doing a
83  * multiplication we have to divide the product by CONST_SCALE, with proper
84  * rounding, to produce the correct output.  This division can be done
85  * cheaply as a right shift of CONST_BITS bits.  We postpone shifting
86  * as long as possible so that partial sums can be added together with
87  * full fractional precision.
88  *
89  * The outputs of the first pass are scaled up by PASS1_BITS bits so that
90  * they are represented to better-than-integral precision.  These outputs
91  * require BITS_IN_JSAMPLE + PASS1_BITS + 3 bits; this fits in a 16-bit word
92  * with the recommended scaling.  (To scale up 12-bit sample data further, an
93  * intermediate INT32 array would be needed.)
94  *
95  * To avoid overflow of the 32-bit intermediate results in pass 2, we must
96  * have BITS_IN_JSAMPLE + CONST_BITS + PASS1_BITS <= 26.  Error analysis
97  * shows that the values given below are the most effective.
98  */
99 
100 #if BITS_IN_JSAMPLE == 8
101 #define CONST_BITS  13	/* fractional bits kept in the fixed-point constants */
102 #define PASS1_BITS  2	/* extra scaling bits carried by the pass-1 outputs */
103 #else
104 #define CONST_BITS  13	/* same constant precision as the 8-bit case */
105 #define PASS1_BITS  1		/* lose a little precision to avoid overflow */
106 #endif
107 
108 /* Some C compilers fail to reduce "FIX(constant)" at compile time, thus
109  * causing a lot of useless floating-point operations at run time.
110  * To get around this we use the following pre-calculated constants.
111  * If you change CONST_BITS you may want to add appropriate values.
112  * (With a reasonable C compiler, you can just rely on the FIX() macro...)
113  */
114 
115 #if CONST_BITS == 13	/* pre-computed table: each value is the constant times 2^13, rounded to nearest */
116 #define FIX_0_298631336  ((INT32)  2446)	/* FIX(0.298631336) */
117 #define FIX_0_390180644  ((INT32)  3196)	/* FIX(0.390180644) */
118 #define FIX_0_541196100  ((INT32)  4433)	/* FIX(0.541196100) */
119 #define FIX_0_765366865  ((INT32)  6270)	/* FIX(0.765366865) */
120 #define FIX_0_899976223  ((INT32)  7373)	/* FIX(0.899976223) */
121 #define FIX_1_175875602  ((INT32)  9633)	/* FIX(1.175875602) */
122 #define FIX_1_501321110  ((INT32)  12299)	/* FIX(1.501321110) */
123 #define FIX_1_847759065  ((INT32)  15137)	/* FIX(1.847759065) */
124 #define FIX_1_961570560  ((INT32)  16069)	/* FIX(1.961570560) */
125 #define FIX_2_053119869  ((INT32)  16819)	/* FIX(2.053119869) */
126 #define FIX_2_562915447  ((INT32)  20995)	/* FIX(2.562915447) */
127 #define FIX_3_072711026  ((INT32)  25172)	/* FIX(3.072711026) */
128 #else	/* any other CONST_BITS: defer to the FIX() macro for the scaled values */
129 #define FIX_0_298631336  FIX(0.298631336)
130 #define FIX_0_390180644  FIX(0.390180644)
131 #define FIX_0_541196100  FIX(0.541196100)
132 #define FIX_0_765366865  FIX(0.765366865)
133 #define FIX_0_899976223  FIX(0.899976223)
134 #define FIX_1_175875602  FIX(1.175875602)
135 #define FIX_1_501321110  FIX(1.501321110)
136 #define FIX_1_847759065  FIX(1.847759065)
137 #define FIX_1_961570560  FIX(1.961570560)
138 #define FIX_2_053119869  FIX(2.053119869)
139 #define FIX_2_562915447  FIX(2.562915447)
140 #define FIX_3_072711026  FIX(3.072711026)
141 #endif
142 
143 
144 /* Multiply an INT32 variable by an INT32 constant to yield an INT32 result.
145  * For 8-bit samples with the recommended scaling, all the variable
146  * and constant values involved are no more than 16 bits wide, so a
147  * 16x16->32 bit multiply can be used instead of a full 32x32 multiply.
148  * For 12-bit samples, a full 32-bit multiplication will be needed.
149  */
150 
151 #if BITS_IN_JSAMPLE == 8	/* operands fit in 16 bits here, so the 16x16->32 helper is enough */
152 #define MULTIPLY(var,const)  MULTIPLY16C16(var,const)
153 #else	/* wider samples: use a plain full-width multiply */
154 #define MULTIPLY(var,const)  ((var) * (const))
155 #endif
156 
157 
158 /* Dequantize a coefficient by multiplying it by the multiplier-table
159  * entry; produce an int result.  In this module, both inputs and result
160  * are 16 bits or less, so either int or short multiply will work.
161  */
162 
163 #define DEQUANTIZE(coef,quantval)  (((ISLOW_MULT_TYPE) (coef)) * (quantval))	/* coefficient is cast to ISLOW_MULT_TYPE before the multiply */
164 
165 
166 /*
167  * Perform dequantization and inverse DCT on one block of coefficients.
168  *
169  * Optimized algorithm with 12 multiplications in the 1-D kernel.
170  * cK represents sqrt(2) * cos(K*pi/16).
     *
     * Parameters, as used by the code below:
     *   cinfo      - handed to IDCT_range_limit() to obtain the range-limit table.
     *   compptr    - its dct_table is read as an array of ISLOW_MULT_TYPE
     *                dequantization multipliers.
     *   coef_block - input coefficients; each column is walked with stride DCTSIZE.
     *   output_buf - output row pointers; rows 0..DCTSIZE-1 are written.
     *   output_col - offset added to every row pointer; 8 samples are stored
     *                starting at that offset.
     *
     * Pass 1 works column by column into a local DCTSIZE2-entry workspace,
     * pass 2 works row by row from that workspace into the output rows.
171  */
172 
173 GLOBAL(void)
174 jpeg_idct_islow (j_decompress_ptr cinfo, jpeg_component_info * compptr,
175 		 JCOEFPTR coef_block,
176 		 JSAMPARRAY output_buf, JDIMENSION output_col)
177 {
178   INT32 tmp0, tmp1, tmp2, tmp3;
179   INT32 tmp10, tmp11, tmp12, tmp13;
180   INT32 z1, z2, z3;
181   JCOEFPTR inptr;
182   ISLOW_MULT_TYPE * quantptr;
183   int * wsptr;
184   JSAMPROW outptr;
185   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
186   int ctr;
187   int workspace[DCTSIZE2];	/* buffers data between passes */
188   SHIFT_TEMPS
189 
190   /* Pass 1: process columns from input, store into work array.
191    * Note results are scaled up by sqrt(8) compared to a true IDCT;
192    * furthermore, we scale the results by 2**PASS1_BITS.
193    */
194 
195   inptr = coef_block;
196   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
197   wsptr = workspace;
198   for (ctr = DCTSIZE; ctr > 0; ctr--) {
199     /* Due to quantization, we will usually find that many of the input
200      * coefficients are zero, especially the AC terms.  We can exploit this
201      * by short-circuiting the IDCT calculation for any column in which all
202      * the AC terms are zero.  In that case each output is equal to the
203      * DC coefficient (with scale factor as needed).
204      * With typical images and quantization tables, half or more of the
205      * column DCT calculations can be simplified this way.
206      */
207 
208     if (inptr[DCTSIZE*1] == 0 && inptr[DCTSIZE*2] == 0 &&
209 	inptr[DCTSIZE*3] == 0 && inptr[DCTSIZE*4] == 0 &&
210 	inptr[DCTSIZE*5] == 0 && inptr[DCTSIZE*6] == 0 &&
211 	inptr[DCTSIZE*7] == 0) {
212       /* AC terms all zero */
213       int dcval = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) << PASS1_BITS;
214 
215       wsptr[DCTSIZE*0] = dcval;
216       wsptr[DCTSIZE*1] = dcval;
217       wsptr[DCTSIZE*2] = dcval;
218       wsptr[DCTSIZE*3] = dcval;
219       wsptr[DCTSIZE*4] = dcval;
220       wsptr[DCTSIZE*5] = dcval;
221       wsptr[DCTSIZE*6] = dcval;
222       wsptr[DCTSIZE*7] = dcval;
223 
224       inptr++;			/* advance pointers to next column */
225       quantptr++;
226       wsptr++;
227       continue;
228     }
229 
230     /* Even part: reverse the even part of the forward DCT.
231      * The rotator is c(-6).
232      */
233 
234     z2 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
235     z3 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
        /* These two terms are not multiplied by a constant, so shift them up
         * to the CONST_BITS scale that the MULTIPLY products below carry.
         */
236     z2 <<= CONST_BITS;
237     z3 <<= CONST_BITS;
238     /* Add fudge factor here for final descale. */
239     z2 += ONE << (CONST_BITS-PASS1_BITS-1);
240 
241     tmp0 = z2 + z3;
242     tmp1 = z2 - z3;
243 
244     z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
245     z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
246 
247     z1 = MULTIPLY(z2 + z3, FIX_0_541196100);       /* c6 */
248     tmp2 = z1 + MULTIPLY(z2, FIX_0_765366865);     /* c2-c6 */
249     tmp3 = z1 - MULTIPLY(z3, FIX_1_847759065);     /* c2+c6 */
250 
251     tmp10 = tmp0 + tmp2;
252     tmp13 = tmp0 - tmp2;
253     tmp11 = tmp1 + tmp3;
254     tmp12 = tmp1 - tmp3;
255 
256     /* Odd part per figure 8; the matrix is unitary and hence its
257      * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
258      */
259 
        /* tmp0..tmp3 are reused from here on for the odd-part terms. */
260     tmp0 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
261     tmp1 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
262     tmp2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
263     tmp3 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
264 
265     z2 = tmp0 + tmp2;
266     z3 = tmp1 + tmp3;
267 
268     z1 = MULTIPLY(z2 + z3, FIX_1_175875602);       /*  c3 */
269     z2 = MULTIPLY(z2, - FIX_1_961570560);          /* -c3-c5 */
270     z3 = MULTIPLY(z3, - FIX_0_390180644);          /* -c3+c5 */
271     z2 += z1;
272     z3 += z1;
273 
274     z1 = MULTIPLY(tmp0 + tmp3, - FIX_0_899976223); /* -c3+c7 */
275     tmp0 = MULTIPLY(tmp0, FIX_0_298631336);        /* -c1+c3+c5-c7 */
276     tmp3 = MULTIPLY(tmp3, FIX_1_501321110);        /*  c1+c3-c5-c7 */
277     tmp0 += z1 + z2;
278     tmp3 += z1 + z3;
279 
280     z1 = MULTIPLY(tmp1 + tmp2, - FIX_2_562915447); /* -c1-c3 */
281     tmp1 = MULTIPLY(tmp1, FIX_2_053119869);        /*  c1+c3-c5+c7 */
282     tmp2 = MULTIPLY(tmp2, FIX_3_072711026);        /*  c1+c3+c5-c7 */
283     tmp1 += z1 + z3;
284     tmp2 += z1 + z2;
285 
286     /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
287 
        /* Shifting by CONST_BITS-PASS1_BITS drops the constant scaling but
         * leaves the results scaled up by 2**PASS1_BITS for pass 2.
         */
288     wsptr[DCTSIZE*0] = (int) RIGHT_SHIFT(tmp10 + tmp3, CONST_BITS-PASS1_BITS);
289     wsptr[DCTSIZE*7] = (int) RIGHT_SHIFT(tmp10 - tmp3, CONST_BITS-PASS1_BITS);
290     wsptr[DCTSIZE*1] = (int) RIGHT_SHIFT(tmp11 + tmp2, CONST_BITS-PASS1_BITS);
291     wsptr[DCTSIZE*6] = (int) RIGHT_SHIFT(tmp11 - tmp2, CONST_BITS-PASS1_BITS);
292     wsptr[DCTSIZE*2] = (int) RIGHT_SHIFT(tmp12 + tmp1, CONST_BITS-PASS1_BITS);
293     wsptr[DCTSIZE*5] = (int) RIGHT_SHIFT(tmp12 - tmp1, CONST_BITS-PASS1_BITS);
294     wsptr[DCTSIZE*3] = (int) RIGHT_SHIFT(tmp13 + tmp0, CONST_BITS-PASS1_BITS);
295     wsptr[DCTSIZE*4] = (int) RIGHT_SHIFT(tmp13 - tmp0, CONST_BITS-PASS1_BITS);
296 
297     inptr++;			/* advance pointers to next column */
298     quantptr++;
299     wsptr++;
300   }
301 
302   /* Pass 2: process rows from work array, store into output array.
303    * Note that we must descale the results by a factor of 8 == 2**3,
304    * and also undo the PASS1_BITS scaling.
305    */
306 
307   wsptr = workspace;
308   for (ctr = 0; ctr < DCTSIZE; ctr++) {
309     outptr = output_buf[ctr] + output_col;
310 
311     /* Add range center and fudge factor for final descale and range-limit. */
        /* RANGE_CENTER is pre-shifted by PASS1_BITS+3 so that it survives the
         * final descale; ONE << (PASS1_BITS+2) is half of that shift's divisor
         * (assuming ONE is the constant 1 - it is defined outside this file).
         */
312     z2 = (INT32) wsptr[0] +
313 	   ((((INT32) RANGE_CENTER) << (PASS1_BITS+3)) +
314 	    (ONE << (PASS1_BITS+2)));
315 
316     /* Rows of zeroes can be exploited in the same way as we did with columns.
317      * However, the column calculation has created many nonzero AC terms, so
318      * the simplification applies less often (typically 5% to 10% of the time).
319      * On machines with very fast multiplication, it's possible that the
320      * test takes more time than it's worth.  In that case this section
321      * may be commented out.
322      */
323 
324 #ifndef NO_ZERO_ROW_TEST
325     if (wsptr[1] == 0 && wsptr[2] == 0 && wsptr[3] == 0 && wsptr[4] == 0 &&
326 	wsptr[5] == 0 && wsptr[6] == 0 && wsptr[7] == 0) {
327       /* AC terms all zero */
          /* NOTE(review): RANGE_MASK is defined outside this file; presumably
           * it wraps the index into the range-limit table - confirm there.
           */
328       JSAMPLE dcval = range_limit[(int) RIGHT_SHIFT(z2, PASS1_BITS+3)
329 				  & RANGE_MASK];
330 
331       outptr[0] = dcval;
332       outptr[1] = dcval;
333       outptr[2] = dcval;
334       outptr[3] = dcval;
335       outptr[4] = dcval;
336       outptr[5] = dcval;
337       outptr[6] = dcval;
338       outptr[7] = dcval;
339 
340       wsptr += DCTSIZE;		/* advance pointer to next row */
341       continue;
342     }
343 #endif
344 
345     /* Even part: reverse the even part of the forward DCT.
346      * The rotator is c(-6).
347      */
348 
349     z3 = (INT32) wsptr[4];
350 
351     tmp0 = (z2 + z3) << CONST_BITS;
352     tmp1 = (z2 - z3) << CONST_BITS;
353 
354     z2 = (INT32) wsptr[2];
355     z3 = (INT32) wsptr[6];
356 
357     z1 = MULTIPLY(z2 + z3, FIX_0_541196100);       /* c6 */
358     tmp2 = z1 + MULTIPLY(z2, FIX_0_765366865);     /* c2-c6 */
359     tmp3 = z1 - MULTIPLY(z3, FIX_1_847759065);     /* c2+c6 */
360 
361     tmp10 = tmp0 + tmp2;
362     tmp13 = tmp0 - tmp2;
363     tmp11 = tmp1 + tmp3;
364     tmp12 = tmp1 - tmp3;
365 
366     /* Odd part per figure 8; the matrix is unitary and hence its
367      * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
368      */
369 
370     tmp0 = (INT32) wsptr[7];
371     tmp1 = (INT32) wsptr[5];
372     tmp2 = (INT32) wsptr[3];
373     tmp3 = (INT32) wsptr[1];
374 
375     z2 = tmp0 + tmp2;
376     z3 = tmp1 + tmp3;
377 
378     z1 = MULTIPLY(z2 + z3, FIX_1_175875602);       /*  c3 */
379     z2 = MULTIPLY(z2, - FIX_1_961570560);          /* -c3-c5 */
380     z3 = MULTIPLY(z3, - FIX_0_390180644);          /* -c3+c5 */
381     z2 += z1;
382     z3 += z1;
383 
384     z1 = MULTIPLY(tmp0 + tmp3, - FIX_0_899976223); /* -c3+c7 */
385     tmp0 = MULTIPLY(tmp0, FIX_0_298631336);        /* -c1+c3+c5-c7 */
386     tmp3 = MULTIPLY(tmp3, FIX_1_501321110);        /*  c1+c3-c5-c7 */
387     tmp0 += z1 + z2;
388     tmp3 += z1 + z3;
389 
390     z1 = MULTIPLY(tmp1 + tmp2, - FIX_2_562915447); /* -c1-c3 */
391     tmp1 = MULTIPLY(tmp1, FIX_2_053119869);        /*  c1+c3-c5+c7 */
392     tmp2 = MULTIPLY(tmp2, FIX_3_072711026);        /*  c1+c3+c5-c7 */
393     tmp1 += z1 + z3;
394     tmp2 += z1 + z2;
395 
396     /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
397 
        /* The shift count removes three things at once: CONST_BITS (constant
         * scaling), PASS1_BITS (pass-1 scaling) and 3 bits (the factor of 8).
         */
398     outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp3,
399 					      CONST_BITS+PASS1_BITS+3)
400 			    & RANGE_MASK];
401     outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp3,
402 					      CONST_BITS+PASS1_BITS+3)
403 			    & RANGE_MASK];
404     outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp2,
405 					      CONST_BITS+PASS1_BITS+3)
406 			    & RANGE_MASK];
407     outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp2,
408 					      CONST_BITS+PASS1_BITS+3)
409 			    & RANGE_MASK];
410     outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp1,
411 					      CONST_BITS+PASS1_BITS+3)
412 			    & RANGE_MASK];
413     outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp1,
414 					      CONST_BITS+PASS1_BITS+3)
415 			    & RANGE_MASK];
416     outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp13 + tmp0,
417 					      CONST_BITS+PASS1_BITS+3)
418 			    & RANGE_MASK];
419     outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp13 - tmp0,
420 					      CONST_BITS+PASS1_BITS+3)
421 			    & RANGE_MASK];
422 
423     wsptr += DCTSIZE;		/* advance pointer to next row */
424   }
425 }
426 
427 #ifdef IDCT_SCALING_SUPPORTED
428 
429 
430 /*
431  * Perform dequantization and inverse DCT on one block of coefficients,
432  * producing a reduced-size 7x7 output block.
433  *
434  * Optimized algorithm with 12 multiplications in the 1-D kernel.
435  * cK represents sqrt(2) * cos(K*pi/14).
     *
     * Parameters, as used by the code below:
     *   cinfo      - handed to IDCT_range_limit() to obtain the range-limit table.
     *   compptr    - its dct_table is read as an array of ISLOW_MULT_TYPE
     *                dequantization multipliers.
     *   coef_block - input coefficients; only columns 0..6 and row offsets
     *                DCTSIZE*0..DCTSIZE*6 are read.
     *   output_buf - output row pointers; rows 0..6 are written.
     *   output_col - offset added to every row pointer; 7 samples are stored
     *                starting at that offset.
     *
     * No zero-coefficient shortcut is taken here: every column and every row
     * goes through the full kernel.
436  */
437 
438 GLOBAL(void)
439 jpeg_idct_7x7 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
440 	       JCOEFPTR coef_block,
441 	       JSAMPARRAY output_buf, JDIMENSION output_col)
442 {
443   INT32 tmp0, tmp1, tmp2, tmp10, tmp11, tmp12, tmp13;
444   INT32 z1, z2, z3;
445   JCOEFPTR inptr;
446   ISLOW_MULT_TYPE * quantptr;
447   int * wsptr;
448   JSAMPROW outptr;
449   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
450   int ctr;
451   int workspace[7*7];	/* buffers data between passes */
452   SHIFT_TEMPS
453 
454   /* Pass 1: process columns from input, store into work array. */
455 
456   inptr = coef_block;
457   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
458   wsptr = workspace;
      /* The input is indexed with stride DCTSIZE, the workspace with stride 7. */
459   for (ctr = 0; ctr < 7; ctr++, inptr++, quantptr++, wsptr++) {
460     /* Even part */
461 
462     tmp13 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
        /* The DC term is not multiplied by a constant; shift it to the
         * CONST_BITS scale carried by the MULTIPLY products below.
         */
463     tmp13 <<= CONST_BITS;
464     /* Add fudge factor here for final descale. */
465     tmp13 += ONE << (CONST_BITS-PASS1_BITS-1);
466 
467     z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
468     z2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
469     z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
470 
471     tmp10 = MULTIPLY(z2 - z3, FIX(0.881747734));     /* c4 */
472     tmp12 = MULTIPLY(z1 - z2, FIX(0.314692123));     /* c6 */
473     tmp11 = tmp10 + tmp12 + tmp13 - MULTIPLY(z2, FIX(1.841218003)); /* c2+c4-c6 */
474     tmp0 = z1 + z3;
475     z2 -= tmp0;
476     tmp0 = MULTIPLY(tmp0, FIX(1.274162392)) + tmp13; /* c2 */
477     tmp10 += tmp0 - MULTIPLY(z3, FIX(0.077722536));  /* c2-c4-c6 */
478     tmp12 += tmp0 - MULTIPLY(z1, FIX(2.470602249));  /* c2+c4+c6 */
479     tmp13 += MULTIPLY(z2, FIX(1.414213562));         /* c0 */
480 
481     /* Odd part */
482 
        /* z1..z3 and tmp0..tmp2 are reused from here on for the odd-part terms. */
483     z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
484     z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
485     z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
486 
487     tmp1 = MULTIPLY(z1 + z2, FIX(0.935414347));      /* (c3+c1-c5)/2 */
488     tmp2 = MULTIPLY(z1 - z2, FIX(0.170262339));      /* (c3+c5-c1)/2 */
489     tmp0 = tmp1 - tmp2;
490     tmp1 += tmp2;
491     tmp2 = MULTIPLY(z2 + z3, - FIX(1.378756276));    /* -c1 */
492     tmp1 += tmp2;
493     z2 = MULTIPLY(z1 + z3, FIX(0.613604268));        /* c5 */
494     tmp0 += z2;
495     tmp2 += z2 + MULTIPLY(z3, FIX(1.870828693));     /* c3+c1-c5 */
496 
497     /* Final output stage */
498 
        /* The middle output (index 3) takes the even part only. */
499     wsptr[7*0] = (int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS-PASS1_BITS);
500     wsptr[7*6] = (int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS-PASS1_BITS);
501     wsptr[7*1] = (int) RIGHT_SHIFT(tmp11 + tmp1, CONST_BITS-PASS1_BITS);
502     wsptr[7*5] = (int) RIGHT_SHIFT(tmp11 - tmp1, CONST_BITS-PASS1_BITS);
503     wsptr[7*2] = (int) RIGHT_SHIFT(tmp12 + tmp2, CONST_BITS-PASS1_BITS);
504     wsptr[7*4] = (int) RIGHT_SHIFT(tmp12 - tmp2, CONST_BITS-PASS1_BITS);
505     wsptr[7*3] = (int) RIGHT_SHIFT(tmp13, CONST_BITS-PASS1_BITS);
506   }
507 
508   /* Pass 2: process 7 rows from work array, store into output array. */
509 
510   wsptr = workspace;
511   for (ctr = 0; ctr < 7; ctr++) {
512     outptr = output_buf[ctr] + output_col;
513 
514     /* Even part */
515 
516     /* Add range center and fudge factor for final descale and range-limit. */
517     tmp13 = (INT32) wsptr[0] +
518 	      ((((INT32) RANGE_CENTER) << (PASS1_BITS+3)) +
519 	       (ONE << (PASS1_BITS+2)));
520     tmp13 <<= CONST_BITS;
521 
522     z1 = (INT32) wsptr[2];
523     z2 = (INT32) wsptr[4];
524     z3 = (INT32) wsptr[6];
525 
526     tmp10 = MULTIPLY(z2 - z3, FIX(0.881747734));     /* c4 */
527     tmp12 = MULTIPLY(z1 - z2, FIX(0.314692123));     /* c6 */
528     tmp11 = tmp10 + tmp12 + tmp13 - MULTIPLY(z2, FIX(1.841218003)); /* c2+c4-c6 */
529     tmp0 = z1 + z3;
530     z2 -= tmp0;
531     tmp0 = MULTIPLY(tmp0, FIX(1.274162392)) + tmp13; /* c2 */
532     tmp10 += tmp0 - MULTIPLY(z3, FIX(0.077722536));  /* c2-c4-c6 */
533     tmp12 += tmp0 - MULTIPLY(z1, FIX(2.470602249));  /* c2+c4+c6 */
534     tmp13 += MULTIPLY(z2, FIX(1.414213562));         /* c0 */
535 
536     /* Odd part */
537 
538     z1 = (INT32) wsptr[1];
539     z2 = (INT32) wsptr[3];
540     z3 = (INT32) wsptr[5];
541 
542     tmp1 = MULTIPLY(z1 + z2, FIX(0.935414347));      /* (c3+c1-c5)/2 */
543     tmp2 = MULTIPLY(z1 - z2, FIX(0.170262339));      /* (c3+c5-c1)/2 */
544     tmp0 = tmp1 - tmp2;
545     tmp1 += tmp2;
546     tmp2 = MULTIPLY(z2 + z3, - FIX(1.378756276));    /* -c1 */
547     tmp1 += tmp2;
548     z2 = MULTIPLY(z1 + z3, FIX(0.613604268));        /* c5 */
549     tmp0 += z2;
550     tmp2 += z2 + MULTIPLY(z3, FIX(1.870828693));     /* c3+c1-c5 */
551 
552     /* Final output stage */
553 
        /* NOTE(review): RANGE_MASK is defined outside this file; presumably it
         * wraps the index into the range-limit table - confirm there.
         */
554     outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
555 					      CONST_BITS+PASS1_BITS+3)
556 			    & RANGE_MASK];
557     outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
558 					      CONST_BITS+PASS1_BITS+3)
559 			    & RANGE_MASK];
560     outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp1,
561 					      CONST_BITS+PASS1_BITS+3)
562 			    & RANGE_MASK];
563     outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp1,
564 					      CONST_BITS+PASS1_BITS+3)
565 			    & RANGE_MASK];
566     outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp2,
567 					      CONST_BITS+PASS1_BITS+3)
568 			    & RANGE_MASK];
569     outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp2,
570 					      CONST_BITS+PASS1_BITS+3)
571 			    & RANGE_MASK];
572     outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp13,
573 					      CONST_BITS+PASS1_BITS+3)
574 			    & RANGE_MASK];
575 
576     wsptr += 7;		/* advance pointer to next row */
577   }
578 }
579 
580 
581 /*
582  * Perform dequantization and inverse DCT on one block of coefficients,
583  * producing a reduced-size 6x6 output block.
584  *
585  * Optimized algorithm with 3 multiplications in the 1-D kernel.
586  * cK represents sqrt(2) * cos(K*pi/12).
     *
     * Parameters, as used by the code below:
     *   cinfo      - handed to IDCT_range_limit() to obtain the range-limit table.
     *   compptr    - its dct_table is read as an array of ISLOW_MULT_TYPE
     *                dequantization multipliers.
     *   coef_block - input coefficients; only columns 0..5 and row offsets
     *                DCTSIZE*0..DCTSIZE*5 are read.
     *   output_buf - output row pointers; rows 0..5 are written.
     *   output_col - offset added to every row pointer; 6 samples are stored
     *                starting at that offset.
     *
     * The two passes differ in how outputs 1 and 4 are scaled; see the notes
     * in the body.
587  */
588 
589 GLOBAL(void)
590 jpeg_idct_6x6 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
591 	       JCOEFPTR coef_block,
592 	       JSAMPARRAY output_buf, JDIMENSION output_col)
593 {
594   INT32 tmp0, tmp1, tmp2, tmp10, tmp11, tmp12;
595   INT32 z1, z2, z3;
596   JCOEFPTR inptr;
597   ISLOW_MULT_TYPE * quantptr;
598   int * wsptr;
599   JSAMPROW outptr;
600   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
601   int ctr;
602   int workspace[6*6];	/* buffers data between passes */
603   SHIFT_TEMPS
604 
605   /* Pass 1: process columns from input, store into work array. */
606 
607   inptr = coef_block;
608   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
609   wsptr = workspace;
      /* The input is indexed with stride DCTSIZE, the workspace with stride 6. */
610   for (ctr = 0; ctr < 6; ctr++, inptr++, quantptr++, wsptr++) {
611     /* Even part */
612 
613     tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
614     tmp0 <<= CONST_BITS;
615     /* Add fudge factor here for final descale. */
616     tmp0 += ONE << (CONST_BITS-PASS1_BITS-1);
617     tmp2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
618     tmp10 = MULTIPLY(tmp2, FIX(0.707106781));   /* c4 */
619     tmp1 = tmp0 + tmp10;
        /* tmp11 is descaled right here.  It is later combined with tmp1, which
         * the odd part builds at PASS1_BITS scale only, so outputs 1 and 4 are
         * stored without a further shift.
         */
620     tmp11 = RIGHT_SHIFT(tmp0 - tmp10 - tmp10, CONST_BITS-PASS1_BITS);
621     tmp10 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
622     tmp0 = MULTIPLY(tmp10, FIX(1.224744871));   /* c2 */
623     tmp10 = tmp1 + tmp0;
624     tmp12 = tmp1 - tmp0;
625 
626     /* Odd part */
627 
628     z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
629     z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
630     z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
631     tmp1 = MULTIPLY(z1 + z3, FIX(0.366025404)); /* c5 */
632     tmp0 = tmp1 + ((z1 + z2) << CONST_BITS);
633     tmp2 = tmp1 + ((z3 - z2) << CONST_BITS);
        /* No multiply is involved in this term, so only PASS1_BITS are applied. */
634     tmp1 = (z1 - z2 - z3) << PASS1_BITS;
635 
636     /* Final output stage */
637 
638     wsptr[6*0] = (int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS-PASS1_BITS);
639     wsptr[6*5] = (int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS-PASS1_BITS);
640     wsptr[6*1] = (int) (tmp11 + tmp1);
641     wsptr[6*4] = (int) (tmp11 - tmp1);
642     wsptr[6*2] = (int) RIGHT_SHIFT(tmp12 + tmp2, CONST_BITS-PASS1_BITS);
643     wsptr[6*3] = (int) RIGHT_SHIFT(tmp12 - tmp2, CONST_BITS-PASS1_BITS);
644   }
645 
646   /* Pass 2: process 6 rows from work array, store into output array. */
647 
648   wsptr = workspace;
649   for (ctr = 0; ctr < 6; ctr++) {
650     outptr = output_buf[ctr] + output_col;
651 
652     /* Even part */
653 
654     /* Add range center and fudge factor for final descale and range-limit. */
655     tmp0 = (INT32) wsptr[0] +
656 	     ((((INT32) RANGE_CENTER) << (PASS1_BITS+3)) +
657 	      (ONE << (PASS1_BITS+2)));
658     tmp0 <<= CONST_BITS;
659     tmp2 = (INT32) wsptr[4];
660     tmp10 = MULTIPLY(tmp2, FIX(0.707106781));   /* c4 */
661     tmp1 = tmp0 + tmp10;
        /* Unlike pass 1, tmp11 stays at full scale here and is descaled in the
         * output stage together with the other terms.
         */
662     tmp11 = tmp0 - tmp10 - tmp10;
663     tmp10 = (INT32) wsptr[2];
664     tmp0 = MULTIPLY(tmp10, FIX(1.224744871));   /* c2 */
665     tmp10 = tmp1 + tmp0;
666     tmp12 = tmp1 - tmp0;
667 
668     /* Odd part */
669 
670     z1 = (INT32) wsptr[1];
671     z2 = (INT32) wsptr[3];
672     z3 = (INT32) wsptr[5];
673     tmp1 = MULTIPLY(z1 + z3, FIX(0.366025404)); /* c5 */
674     tmp0 = tmp1 + ((z1 + z2) << CONST_BITS);
675     tmp2 = tmp1 + ((z3 - z2) << CONST_BITS);
        /* Shifted by CONST_BITS here (pass 1 used PASS1_BITS) to match tmp11. */
676     tmp1 = (z1 - z2 - z3) << CONST_BITS;
677 
678     /* Final output stage */
679 
        /* NOTE(review): RANGE_MASK is defined outside this file; presumably it
         * wraps the index into the range-limit table - confirm there.
         */
680     outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
681 					      CONST_BITS+PASS1_BITS+3)
682 			    & RANGE_MASK];
683     outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
684 					      CONST_BITS+PASS1_BITS+3)
685 			    & RANGE_MASK];
686     outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp1,
687 					      CONST_BITS+PASS1_BITS+3)
688 			    & RANGE_MASK];
689     outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp1,
690 					      CONST_BITS+PASS1_BITS+3)
691 			    & RANGE_MASK];
692     outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp2,
693 					      CONST_BITS+PASS1_BITS+3)
694 			    & RANGE_MASK];
695     outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp2,
696 					      CONST_BITS+PASS1_BITS+3)
697 			    & RANGE_MASK];
698 
699     wsptr += 6;		/* advance pointer to next row */
700   }
701 }
702 
703 
704 /*
705  * Perform dequantization and inverse DCT on one block of coefficients,
706  * producing a reduced-size 5x5 output block.
707  *
708  * Optimized algorithm with 5 multiplications in the 1-D kernel.
709  * cK represents sqrt(2) * cos(K*pi/10).
     *
     * Parameters, as used by the code below:
     *   cinfo      - handed to IDCT_range_limit() to obtain the range-limit table.
     *   compptr    - its dct_table is read as an array of ISLOW_MULT_TYPE
     *                dequantization multipliers.
     *   coef_block - input coefficients; only columns 0..4 and row offsets
     *                DCTSIZE*0..DCTSIZE*4 are read.
     *   output_buf - output row pointers; rows 0..4 are written.
     *   output_col - offset added to every row pointer; 5 samples are stored
     *                starting at that offset.
710  */
711 
712 GLOBAL(void)
713 jpeg_idct_5x5 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
714 	       JCOEFPTR coef_block,
715 	       JSAMPARRAY output_buf, JDIMENSION output_col)
716 {
717   INT32 tmp0, tmp1, tmp10, tmp11, tmp12;
718   INT32 z1, z2, z3;
719   JCOEFPTR inptr;
720   ISLOW_MULT_TYPE * quantptr;
721   int * wsptr;
722   JSAMPROW outptr;
723   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
724   int ctr;
725   int workspace[5*5];	/* buffers data between passes */
726   SHIFT_TEMPS
727 
728   /* Pass 1: process columns from input, store into work array. */
729 
730   inptr = coef_block;
731   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
732   wsptr = workspace;
      /* The input is indexed with stride DCTSIZE, the workspace with stride 5. */
733   for (ctr = 0; ctr < 5; ctr++, inptr++, quantptr++, wsptr++) {
734     /* Even part */
735 
736     tmp12 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
737     tmp12 <<= CONST_BITS;
738     /* Add fudge factor here for final descale. */
739     tmp12 += ONE << (CONST_BITS-PASS1_BITS-1);
740     tmp0 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
741     tmp1 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
742     z1 = MULTIPLY(tmp0 + tmp1, FIX(0.790569415)); /* (c2+c4)/2 */
743     z2 = MULTIPLY(tmp0 - tmp1, FIX(0.353553391)); /* (c2-c4)/2 */
744     z3 = tmp12 + z2;
745     tmp10 = z3 + z1;
746     tmp11 = z3 - z1;
        /* Middle output: subtract 4*z2, done as a shift of the existing product
         * rather than with another multiply.
         */
747     tmp12 -= z2 << 2;
748 
749     /* Odd part */
750 
        /* z1..z3 and tmp0, tmp1 are reused from here on for the odd-part terms. */
751     z2 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
752     z3 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
753 
754     z1 = MULTIPLY(z2 + z3, FIX(0.831253876));     /* c3 */
755     tmp0 = z1 + MULTIPLY(z2, FIX(0.513743148));   /* c1-c3 */
756     tmp1 = z1 - MULTIPLY(z3, FIX(2.176250899));   /* c1+c3 */
757 
758     /* Final output stage */
759 
        /* The middle output (index 2) takes the even part only. */
760     wsptr[5*0] = (int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS-PASS1_BITS);
761     wsptr[5*4] = (int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS-PASS1_BITS);
762     wsptr[5*1] = (int) RIGHT_SHIFT(tmp11 + tmp1, CONST_BITS-PASS1_BITS);
763     wsptr[5*3] = (int) RIGHT_SHIFT(tmp11 - tmp1, CONST_BITS-PASS1_BITS);
764     wsptr[5*2] = (int) RIGHT_SHIFT(tmp12, CONST_BITS-PASS1_BITS);
765   }
766 
767   /* Pass 2: process 5 rows from work array, store into output array. */
768 
769   wsptr = workspace;
770   for (ctr = 0; ctr < 5; ctr++) {
771     outptr = output_buf[ctr] + output_col;
772 
773     /* Even part */
774 
775     /* Add range center and fudge factor for final descale and range-limit. */
776     tmp12 = (INT32) wsptr[0] +
777 	      ((((INT32) RANGE_CENTER) << (PASS1_BITS+3)) +
778 	       (ONE << (PASS1_BITS+2)));
779     tmp12 <<= CONST_BITS;
780     tmp0 = (INT32) wsptr[2];
781     tmp1 = (INT32) wsptr[4];
782     z1 = MULTIPLY(tmp0 + tmp1, FIX(0.790569415)); /* (c2+c4)/2 */
783     z2 = MULTIPLY(tmp0 - tmp1, FIX(0.353553391)); /* (c2-c4)/2 */
784     z3 = tmp12 + z2;
785     tmp10 = z3 + z1;
786     tmp11 = z3 - z1;
787     tmp12 -= z2 << 2;
788 
789     /* Odd part */
790 
791     z2 = (INT32) wsptr[1];
792     z3 = (INT32) wsptr[3];
793 
794     z1 = MULTIPLY(z2 + z3, FIX(0.831253876));     /* c3 */
795     tmp0 = z1 + MULTIPLY(z2, FIX(0.513743148));   /* c1-c3 */
796     tmp1 = z1 - MULTIPLY(z3, FIX(2.176250899));   /* c1+c3 */
797 
798     /* Final output stage */
799 
        /* NOTE(review): RANGE_MASK is defined outside this file; presumably it
         * wraps the index into the range-limit table - confirm there.
         */
800     outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
801 					      CONST_BITS+PASS1_BITS+3)
802 			    & RANGE_MASK];
803     outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
804 					      CONST_BITS+PASS1_BITS+3)
805 			    & RANGE_MASK];
806     outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp1,
807 					      CONST_BITS+PASS1_BITS+3)
808 			    & RANGE_MASK];
809     outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp1,
810 					      CONST_BITS+PASS1_BITS+3)
811 			    & RANGE_MASK];
812     outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12,
813 					      CONST_BITS+PASS1_BITS+3)
814 			    & RANGE_MASK];
815 
816     wsptr += 5;		/* advance pointer to next row */
817   }
818 }
819 
820 
821 /*
822  * Perform dequantization and inverse DCT on one block of coefficients,
823  * producing a reduced-size 4x4 output block.
824  *
825  * Optimized algorithm with 3 multiplications in the 1-D kernel.
826  * cK represents sqrt(2) * cos(K*pi/16) [refers to 8-point IDCT].
827  */
828 
829 GLOBAL(void)
830 jpeg_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
831 	       JCOEFPTR coef_block,
832 	       JSAMPARRAY output_buf, JDIMENSION output_col)
833 {
834   INT32 tmp0, tmp2, tmp10, tmp12;
835   INT32 z1, z2, z3;
836   JCOEFPTR inptr;
837   ISLOW_MULT_TYPE * quantptr;
838   int * wsptr;
839   JSAMPROW outptr;
840   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
841   int ctr;
842   int workspace[4*4];	/* buffers data between passes */
843   SHIFT_TEMPS
844 
845   /* Pass 1: process columns from input, store into work array. */
846 
847   inptr = coef_block;
848   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
849   wsptr = workspace;
850   for (ctr = 0; ctr < 4; ctr++, inptr++, quantptr++, wsptr++) {
851     /* Even part */
852 
853     tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
854     tmp2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
855 
856     tmp10 = (tmp0 + tmp2) << PASS1_BITS;
857     tmp12 = (tmp0 - tmp2) << PASS1_BITS;
858 
859     /* Odd part */
860     /* Same rotation as in the even part of the 8x8 LL&M IDCT */
861 
862     z2 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
863     z3 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
864 
865     z1 = MULTIPLY(z2 + z3, FIX_0_541196100);               /* c6 */
866     /* Add fudge factor here for final descale. */
867     z1 += ONE << (CONST_BITS-PASS1_BITS-1);
868     tmp0 = RIGHT_SHIFT(z1 + MULTIPLY(z2, FIX_0_765366865), /* c2-c6 */
869 		       CONST_BITS-PASS1_BITS);
870     tmp2 = RIGHT_SHIFT(z1 - MULTIPLY(z3, FIX_1_847759065), /* c2+c6 */
871 		       CONST_BITS-PASS1_BITS);
872 
873     /* Final output stage */
874 
875     wsptr[4*0] = (int) (tmp10 + tmp0);
876     wsptr[4*3] = (int) (tmp10 - tmp0);
877     wsptr[4*1] = (int) (tmp12 + tmp2);
878     wsptr[4*2] = (int) (tmp12 - tmp2);
879   }
880 
881   /* Pass 2: process 4 rows from work array, store into output array. */
882 
883   wsptr = workspace;
884   for (ctr = 0; ctr < 4; ctr++) {
885     outptr = output_buf[ctr] + output_col;
886 
887     /* Even part */
888 
889     /* Add range center and fudge factor for final descale and range-limit. */
890     tmp0 = (INT32) wsptr[0] +
891 	     ((((INT32) RANGE_CENTER) << (PASS1_BITS+3)) +
892 	      (ONE << (PASS1_BITS+2)));
893     tmp2 = (INT32) wsptr[2];
894 
895     tmp10 = (tmp0 + tmp2) << CONST_BITS;
896     tmp12 = (tmp0 - tmp2) << CONST_BITS;
897 
898     /* Odd part */
899     /* Same rotation as in the even part of the 8x8 LL&M IDCT */
900 
901     z2 = (INT32) wsptr[1];
902     z3 = (INT32) wsptr[3];
903 
904     z1 = MULTIPLY(z2 + z3, FIX_0_541196100);   /* c6 */
905     tmp0 = z1 + MULTIPLY(z2, FIX_0_765366865); /* c2-c6 */
906     tmp2 = z1 - MULTIPLY(z3, FIX_1_847759065); /* c2+c6 */
907 
908     /* Final output stage */
909 
910     outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
911 					      CONST_BITS+PASS1_BITS+3)
912 			    & RANGE_MASK];
913     outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
914 					      CONST_BITS+PASS1_BITS+3)
915 			    & RANGE_MASK];
916     outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp2,
917 					      CONST_BITS+PASS1_BITS+3)
918 			    & RANGE_MASK];
919     outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp2,
920 					      CONST_BITS+PASS1_BITS+3)
921 			    & RANGE_MASK];
922 
923     wsptr += 4;		/* advance pointer to next row */
924   }
925 }
926 
927 
928 /*
929  * Perform dequantization and inverse DCT on one block of coefficients,
930  * producing a reduced-size 3x3 output block.
931  *
932  * Optimized algorithm with 2 multiplications in the 1-D kernel.
933  * cK represents sqrt(2) * cos(K*pi/6).
934  */
935 
936 GLOBAL(void)
937 jpeg_idct_3x3 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
938 	       JCOEFPTR coef_block,
939 	       JSAMPARRAY output_buf, JDIMENSION output_col)
940 {
941   INT32 tmp0, tmp2, tmp10, tmp12;
942   JCOEFPTR inptr;
943   ISLOW_MULT_TYPE * quantptr;
944   int * wsptr;
945   JSAMPROW outptr;
946   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
947   int ctr;
948   int workspace[3*3];	/* buffers data between passes */
949   SHIFT_TEMPS
950 
951   /* Pass 1: process columns from input, store into work array. */
952 
953   inptr = coef_block;
954   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
955   wsptr = workspace;
956   for (ctr = 0; ctr < 3; ctr++, inptr++, quantptr++, wsptr++) {
957     /* Even part */
958 
959     tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
960     tmp0 <<= CONST_BITS;
961     /* Add fudge factor here for final descale. */
962     tmp0 += ONE << (CONST_BITS-PASS1_BITS-1);
963     tmp2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
964     tmp12 = MULTIPLY(tmp2, FIX(0.707106781)); /* c2 */
965     tmp10 = tmp0 + tmp12;
966     tmp2 = tmp0 - tmp12 - tmp12;
967 
968     /* Odd part */
969 
970     tmp12 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
971     tmp0 = MULTIPLY(tmp12, FIX(1.224744871)); /* c1 */
972 
973     /* Final output stage */
974 
975     wsptr[3*0] = (int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS-PASS1_BITS);
976     wsptr[3*2] = (int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS-PASS1_BITS);
977     wsptr[3*1] = (int) RIGHT_SHIFT(tmp2, CONST_BITS-PASS1_BITS);
978   }
979 
980   /* Pass 2: process 3 rows from work array, store into output array. */
981 
982   wsptr = workspace;
983   for (ctr = 0; ctr < 3; ctr++) {
984     outptr = output_buf[ctr] + output_col;
985 
986     /* Even part */
987 
988     /* Add range center and fudge factor for final descale and range-limit. */
989     tmp0 = (INT32) wsptr[0] +
990 	     ((((INT32) RANGE_CENTER) << (PASS1_BITS+3)) +
991 	      (ONE << (PASS1_BITS+2)));
992     tmp0 <<= CONST_BITS;
993     tmp2 = (INT32) wsptr[2];
994     tmp12 = MULTIPLY(tmp2, FIX(0.707106781)); /* c2 */
995     tmp10 = tmp0 + tmp12;
996     tmp2 = tmp0 - tmp12 - tmp12;
997 
998     /* Odd part */
999 
1000     tmp12 = (INT32) wsptr[1];
1001     tmp0 = MULTIPLY(tmp12, FIX(1.224744871)); /* c1 */
1002 
1003     /* Final output stage */
1004 
1005     outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
1006 					      CONST_BITS+PASS1_BITS+3)
1007 			    & RANGE_MASK];
1008     outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
1009 					      CONST_BITS+PASS1_BITS+3)
1010 			    & RANGE_MASK];
1011     outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp2,
1012 					      CONST_BITS+PASS1_BITS+3)
1013 			    & RANGE_MASK];
1014 
1015     wsptr += 3;		/* advance pointer to next row */
1016   }
1017 }
1018 
1019 
1020 /*
1021  * Perform dequantization and inverse DCT on one block of coefficients,
1022  * producing a reduced-size 2x2 output block.
1023  *
1024  * Multiplication-less algorithm.
1025  */
1026 
1027 GLOBAL(void)
1028 jpeg_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
1029 	       JCOEFPTR coef_block,
1030 	       JSAMPARRAY output_buf, JDIMENSION output_col)
1031 {
1032   DCTELEM tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
1033   ISLOW_MULT_TYPE * quantptr;
1034   JSAMPROW outptr;
1035   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
1036   ISHIFT_TEMPS
1037 
1038   /* Pass 1: process columns from input. */
1039 
1040   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
1041 
1042   /* Column 0 */
1043   tmp4 = DEQUANTIZE(coef_block[DCTSIZE*0], quantptr[DCTSIZE*0]);
1044   tmp5 = DEQUANTIZE(coef_block[DCTSIZE*1], quantptr[DCTSIZE*1]);
1045   /* Add range center and fudge factor for final descale and range-limit. */
1046   tmp4 += (((DCTELEM) RANGE_CENTER) << 3) + (1 << 2);
1047 
1048   tmp0 = tmp4 + tmp5;
1049   tmp2 = tmp4 - tmp5;
1050 
1051   /* Column 1 */
1052   tmp4 = DEQUANTIZE(coef_block[DCTSIZE*0+1], quantptr[DCTSIZE*0+1]);
1053   tmp5 = DEQUANTIZE(coef_block[DCTSIZE*1+1], quantptr[DCTSIZE*1+1]);
1054 
1055   tmp1 = tmp4 + tmp5;
1056   tmp3 = tmp4 - tmp5;
1057 
1058   /* Pass 2: process 2 rows, store into output array. */
1059 
1060   /* Row 0 */
1061   outptr = output_buf[0] + output_col;
1062 
1063   outptr[0] = range_limit[(int) IRIGHT_SHIFT(tmp0 + tmp1, 3) & RANGE_MASK];
1064   outptr[1] = range_limit[(int) IRIGHT_SHIFT(tmp0 - tmp1, 3) & RANGE_MASK];
1065 
1066   /* Row 1 */
1067   outptr = output_buf[1] + output_col;
1068 
1069   outptr[0] = range_limit[(int) IRIGHT_SHIFT(tmp2 + tmp3, 3) & RANGE_MASK];
1070   outptr[1] = range_limit[(int) IRIGHT_SHIFT(tmp2 - tmp3, 3) & RANGE_MASK];
1071 }
1072 
1073 
1074 /*
1075  * Perform dequantization and inverse DCT on one block of coefficients,
1076  * producing a reduced-size 1x1 output block.
1077  *
1078  * We hardly need an inverse DCT routine for this: just take the
1079  * average pixel value, which is one-eighth of the DC coefficient.
1080  */
1081 
1082 GLOBAL(void)
1083 jpeg_idct_1x1 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
1084 	       JCOEFPTR coef_block,
1085 	       JSAMPARRAY output_buf, JDIMENSION output_col)
1086 {
1087   DCTELEM dcval;
1088   ISLOW_MULT_TYPE * quantptr;
1089   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
1090   ISHIFT_TEMPS
1091 
1092   /* 1x1 is trivial: just take the DC coefficient divided by 8. */
1093 
1094   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
1095 
1096   dcval = DEQUANTIZE(coef_block[0], quantptr[0]);
1097   /* Add range center and fudge factor for descale and range-limit. */
1098   dcval += (((DCTELEM) RANGE_CENTER) << 3) + (1 << 2);
1099 
1100   output_buf[0][output_col] =
1101     range_limit[(int) IRIGHT_SHIFT(dcval, 3) & RANGE_MASK];
1102 }
1103 
1104 
1105 /*
1106  * Perform dequantization and inverse DCT on one block of coefficients,
1107  * producing a 9x9 output block.
1108  *
1109  * Optimized algorithm with 10 multiplications in the 1-D kernel.
1110  * cK represents sqrt(2) * cos(K*pi/18).
1111  */
1112 
1113 GLOBAL(void)
1114 jpeg_idct_9x9 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
1115 	       JCOEFPTR coef_block,
1116 	       JSAMPARRAY output_buf, JDIMENSION output_col)
1117 {
1118   INT32 tmp0, tmp1, tmp2, tmp3, tmp10, tmp11, tmp12, tmp13, tmp14;
1119   INT32 z1, z2, z3, z4;
1120   JCOEFPTR inptr;
1121   ISLOW_MULT_TYPE * quantptr;
1122   int * wsptr;
1123   JSAMPROW outptr;
1124   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
1125   int ctr;
1126   int workspace[8*9];	/* buffers data between passes */
1127   SHIFT_TEMPS
1128 
1129   /* Pass 1: process columns from input, store into work array. */
1130 
1131   inptr = coef_block;
1132   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
1133   wsptr = workspace;
1134   for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
1135     /* Even part */
1136 
1137     tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
1138     tmp0 <<= CONST_BITS;
1139     /* Add fudge factor here for final descale. */
1140     tmp0 += ONE << (CONST_BITS-PASS1_BITS-1);
1141 
1142     z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
1143     z2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
1144     z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
1145 
1146     tmp3 = MULTIPLY(z3, FIX(0.707106781));      /* c6 */
1147     tmp1 = tmp0 + tmp3;
1148     tmp2 = tmp0 - tmp3 - tmp3;
1149 
1150     tmp0 = MULTIPLY(z1 - z2, FIX(0.707106781)); /* c6 */
1151     tmp11 = tmp2 + tmp0;
1152     tmp14 = tmp2 - tmp0 - tmp0;
1153 
1154     tmp0 = MULTIPLY(z1 + z2, FIX(1.328926049)); /* c2 */
1155     tmp2 = MULTIPLY(z1, FIX(1.083350441));      /* c4 */
1156     tmp3 = MULTIPLY(z2, FIX(0.245575608));      /* c8 */
1157 
1158     tmp10 = tmp1 + tmp0 - tmp3;
1159     tmp12 = tmp1 - tmp0 + tmp2;
1160     tmp13 = tmp1 - tmp2 + tmp3;
1161 
1162     /* Odd part */
1163 
1164     z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
1165     z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
1166     z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
1167     z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
1168 
1169     z2 = MULTIPLY(z2, - FIX(1.224744871));           /* -c3 */
1170 
1171     tmp2 = MULTIPLY(z1 + z3, FIX(0.909038955));      /* c5 */
1172     tmp3 = MULTIPLY(z1 + z4, FIX(0.483689525));      /* c7 */
1173     tmp0 = tmp2 + tmp3 - z2;
1174     tmp1 = MULTIPLY(z3 - z4, FIX(1.392728481));      /* c1 */
1175     tmp2 += z2 - tmp1;
1176     tmp3 += z2 + tmp1;
1177     tmp1 = MULTIPLY(z1 - z3 - z4, FIX(1.224744871)); /* c3 */
1178 
1179     /* Final output stage */
1180 
1181     wsptr[8*0] = (int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS-PASS1_BITS);
1182     wsptr[8*8] = (int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS-PASS1_BITS);
1183     wsptr[8*1] = (int) RIGHT_SHIFT(tmp11 + tmp1, CONST_BITS-PASS1_BITS);
1184     wsptr[8*7] = (int) RIGHT_SHIFT(tmp11 - tmp1, CONST_BITS-PASS1_BITS);
1185     wsptr[8*2] = (int) RIGHT_SHIFT(tmp12 + tmp2, CONST_BITS-PASS1_BITS);
1186     wsptr[8*6] = (int) RIGHT_SHIFT(tmp12 - tmp2, CONST_BITS-PASS1_BITS);
1187     wsptr[8*3] = (int) RIGHT_SHIFT(tmp13 + tmp3, CONST_BITS-PASS1_BITS);
1188     wsptr[8*5] = (int) RIGHT_SHIFT(tmp13 - tmp3, CONST_BITS-PASS1_BITS);
1189     wsptr[8*4] = (int) RIGHT_SHIFT(tmp14, CONST_BITS-PASS1_BITS);
1190   }
1191 
1192   /* Pass 2: process 9 rows from work array, store into output array. */
1193 
1194   wsptr = workspace;
1195   for (ctr = 0; ctr < 9; ctr++) {
1196     outptr = output_buf[ctr] + output_col;
1197 
1198     /* Even part */
1199 
1200     /* Add range center and fudge factor for final descale and range-limit. */
1201     tmp0 = (INT32) wsptr[0] +
1202 	     ((((INT32) RANGE_CENTER) << (PASS1_BITS+3)) +
1203 	      (ONE << (PASS1_BITS+2)));
1204     tmp0 <<= CONST_BITS;
1205 
1206     z1 = (INT32) wsptr[2];
1207     z2 = (INT32) wsptr[4];
1208     z3 = (INT32) wsptr[6];
1209 
1210     tmp3 = MULTIPLY(z3, FIX(0.707106781));      /* c6 */
1211     tmp1 = tmp0 + tmp3;
1212     tmp2 = tmp0 - tmp3 - tmp3;
1213 
1214     tmp0 = MULTIPLY(z1 - z2, FIX(0.707106781)); /* c6 */
1215     tmp11 = tmp2 + tmp0;
1216     tmp14 = tmp2 - tmp0 - tmp0;
1217 
1218     tmp0 = MULTIPLY(z1 + z2, FIX(1.328926049)); /* c2 */
1219     tmp2 = MULTIPLY(z1, FIX(1.083350441));      /* c4 */
1220     tmp3 = MULTIPLY(z2, FIX(0.245575608));      /* c8 */
1221 
1222     tmp10 = tmp1 + tmp0 - tmp3;
1223     tmp12 = tmp1 - tmp0 + tmp2;
1224     tmp13 = tmp1 - tmp2 + tmp3;
1225 
1226     /* Odd part */
1227 
1228     z1 = (INT32) wsptr[1];
1229     z2 = (INT32) wsptr[3];
1230     z3 = (INT32) wsptr[5];
1231     z4 = (INT32) wsptr[7];
1232 
1233     z2 = MULTIPLY(z2, - FIX(1.224744871));           /* -c3 */
1234 
1235     tmp2 = MULTIPLY(z1 + z3, FIX(0.909038955));      /* c5 */
1236     tmp3 = MULTIPLY(z1 + z4, FIX(0.483689525));      /* c7 */
1237     tmp0 = tmp2 + tmp3 - z2;
1238     tmp1 = MULTIPLY(z3 - z4, FIX(1.392728481));      /* c1 */
1239     tmp2 += z2 - tmp1;
1240     tmp3 += z2 + tmp1;
1241     tmp1 = MULTIPLY(z1 - z3 - z4, FIX(1.224744871)); /* c3 */
1242 
1243     /* Final output stage */
1244 
1245     outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
1246 					      CONST_BITS+PASS1_BITS+3)
1247 			    & RANGE_MASK];
1248     outptr[8] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
1249 					      CONST_BITS+PASS1_BITS+3)
1250 			    & RANGE_MASK];
1251     outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp1,
1252 					      CONST_BITS+PASS1_BITS+3)
1253 			    & RANGE_MASK];
1254     outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp1,
1255 					      CONST_BITS+PASS1_BITS+3)
1256 			    & RANGE_MASK];
1257     outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp2,
1258 					      CONST_BITS+PASS1_BITS+3)
1259 			    & RANGE_MASK];
1260     outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp2,
1261 					      CONST_BITS+PASS1_BITS+3)
1262 			    & RANGE_MASK];
1263     outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp13 + tmp3,
1264 					      CONST_BITS+PASS1_BITS+3)
1265 			    & RANGE_MASK];
1266     outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp13 - tmp3,
1267 					      CONST_BITS+PASS1_BITS+3)
1268 			    & RANGE_MASK];
1269     outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp14,
1270 					      CONST_BITS+PASS1_BITS+3)
1271 			    & RANGE_MASK];
1272 
1273     wsptr += 8;		/* advance pointer to next row */
1274   }
1275 }
1276 
1277 
1278 /*
1279  * Perform dequantization and inverse DCT on one block of coefficients,
1280  * producing a 10x10 output block.
1281  *
1282  * Optimized algorithm with 12 multiplications in the 1-D kernel.
1283  * cK represents sqrt(2) * cos(K*pi/20).
1284  */
1285 
1286 GLOBAL(void)
1287 jpeg_idct_10x10 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
1288 		 JCOEFPTR coef_block,
1289 		 JSAMPARRAY output_buf, JDIMENSION output_col)
1290 {
1291   INT32 tmp10, tmp11, tmp12, tmp13, tmp14;
1292   INT32 tmp20, tmp21, tmp22, tmp23, tmp24;
1293   INT32 z1, z2, z3, z4, z5;
1294   JCOEFPTR inptr;
1295   ISLOW_MULT_TYPE * quantptr;
1296   int * wsptr;
1297   JSAMPROW outptr;
1298   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
1299   int ctr;
1300   int workspace[8*10];	/* buffers data between passes */
1301   SHIFT_TEMPS
1302 
1303   /* Pass 1: process columns from input, store into work array. */
1304 
1305   inptr = coef_block;
1306   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
1307   wsptr = workspace;
1308   for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
1309     /* Even part */
1310 
1311     z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
1312     z3 <<= CONST_BITS;
1313     /* Add fudge factor here for final descale. */
1314     z3 += ONE << (CONST_BITS-PASS1_BITS-1);
1315     z4 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
1316     z1 = MULTIPLY(z4, FIX(1.144122806));         /* c4 */
1317     z2 = MULTIPLY(z4, FIX(0.437016024));         /* c8 */
1318     tmp10 = z3 + z1;
1319     tmp11 = z3 - z2;
1320 
1321     tmp22 = RIGHT_SHIFT(z3 - ((z1 - z2) << 1),   /* c0 = (c4-c8)*2 */
1322 			CONST_BITS-PASS1_BITS);
1323 
1324     z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
1325     z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
1326 
1327     z1 = MULTIPLY(z2 + z3, FIX(0.831253876));    /* c6 */
1328     tmp12 = z1 + MULTIPLY(z2, FIX(0.513743148)); /* c2-c6 */
1329     tmp13 = z1 - MULTIPLY(z3, FIX(2.176250899)); /* c2+c6 */
1330 
1331     tmp20 = tmp10 + tmp12;
1332     tmp24 = tmp10 - tmp12;
1333     tmp21 = tmp11 + tmp13;
1334     tmp23 = tmp11 - tmp13;
1335 
1336     /* Odd part */
1337 
1338     z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
1339     z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
1340     z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
1341     z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
1342 
1343     tmp11 = z2 + z4;
1344     tmp13 = z2 - z4;
1345 
1346     tmp12 = MULTIPLY(tmp13, FIX(0.309016994));        /* (c3-c7)/2 */
1347     z5 = z3 << CONST_BITS;
1348 
1349     z2 = MULTIPLY(tmp11, FIX(0.951056516));           /* (c3+c7)/2 */
1350     z4 = z5 + tmp12;
1351 
1352     tmp10 = MULTIPLY(z1, FIX(1.396802247)) + z2 + z4; /* c1 */
1353     tmp14 = MULTIPLY(z1, FIX(0.221231742)) - z2 + z4; /* c9 */
1354 
1355     z2 = MULTIPLY(tmp11, FIX(0.587785252));           /* (c1-c9)/2 */
1356     z4 = z5 - tmp12 - (tmp13 << (CONST_BITS - 1));
1357 
1358     tmp12 = (z1 - tmp13 - z3) << PASS1_BITS;
1359 
1360     tmp11 = MULTIPLY(z1, FIX(1.260073511)) - z2 - z4; /* c3 */
1361     tmp13 = MULTIPLY(z1, FIX(0.642039522)) - z2 + z4; /* c7 */
1362 
1363     /* Final output stage */
1364 
1365     wsptr[8*0] = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
1366     wsptr[8*9] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
1367     wsptr[8*1] = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
1368     wsptr[8*8] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
1369     wsptr[8*2] = (int) (tmp22 + tmp12);
1370     wsptr[8*7] = (int) (tmp22 - tmp12);
1371     wsptr[8*3] = (int) RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS-PASS1_BITS);
1372     wsptr[8*6] = (int) RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS-PASS1_BITS);
1373     wsptr[8*4] = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
1374     wsptr[8*5] = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
1375   }
1376 
1377   /* Pass 2: process 10 rows from work array, store into output array. */
1378 
1379   wsptr = workspace;
1380   for (ctr = 0; ctr < 10; ctr++) {
1381     outptr = output_buf[ctr] + output_col;
1382 
1383     /* Even part */
1384 
1385     /* Add range center and fudge factor for final descale and range-limit. */
1386     z3 = (INT32) wsptr[0] +
1387 	   ((((INT32) RANGE_CENTER) << (PASS1_BITS+3)) +
1388 	    (ONE << (PASS1_BITS+2)));
1389     z3 <<= CONST_BITS;
1390     z4 = (INT32) wsptr[4];
1391     z1 = MULTIPLY(z4, FIX(1.144122806));         /* c4 */
1392     z2 = MULTIPLY(z4, FIX(0.437016024));         /* c8 */
1393     tmp10 = z3 + z1;
1394     tmp11 = z3 - z2;
1395 
1396     tmp22 = z3 - ((z1 - z2) << 1);               /* c0 = (c4-c8)*2 */
1397 
1398     z2 = (INT32) wsptr[2];
1399     z3 = (INT32) wsptr[6];
1400 
1401     z1 = MULTIPLY(z2 + z3, FIX(0.831253876));    /* c6 */
1402     tmp12 = z1 + MULTIPLY(z2, FIX(0.513743148)); /* c2-c6 */
1403     tmp13 = z1 - MULTIPLY(z3, FIX(2.176250899)); /* c2+c6 */
1404 
1405     tmp20 = tmp10 + tmp12;
1406     tmp24 = tmp10 - tmp12;
1407     tmp21 = tmp11 + tmp13;
1408     tmp23 = tmp11 - tmp13;
1409 
1410     /* Odd part */
1411 
1412     z1 = (INT32) wsptr[1];
1413     z2 = (INT32) wsptr[3];
1414     z3 = (INT32) wsptr[5];
1415     z3 <<= CONST_BITS;
1416     z4 = (INT32) wsptr[7];
1417 
1418     tmp11 = z2 + z4;
1419     tmp13 = z2 - z4;
1420 
1421     tmp12 = MULTIPLY(tmp13, FIX(0.309016994));        /* (c3-c7)/2 */
1422 
1423     z2 = MULTIPLY(tmp11, FIX(0.951056516));           /* (c3+c7)/2 */
1424     z4 = z3 + tmp12;
1425 
1426     tmp10 = MULTIPLY(z1, FIX(1.396802247)) + z2 + z4; /* c1 */
1427     tmp14 = MULTIPLY(z1, FIX(0.221231742)) - z2 + z4; /* c9 */
1428 
1429     z2 = MULTIPLY(tmp11, FIX(0.587785252));           /* (c1-c9)/2 */
1430     z4 = z3 - tmp12 - (tmp13 << (CONST_BITS - 1));
1431 
1432     tmp12 = ((z1 - tmp13) << CONST_BITS) - z3;
1433 
1434     tmp11 = MULTIPLY(z1, FIX(1.260073511)) - z2 - z4; /* c3 */
1435     tmp13 = MULTIPLY(z1, FIX(0.642039522)) - z2 + z4; /* c7 */
1436 
1437     /* Final output stage */
1438 
1439     outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
1440 					      CONST_BITS+PASS1_BITS+3)
1441 			    & RANGE_MASK];
1442     outptr[9] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
1443 					      CONST_BITS+PASS1_BITS+3)
1444 			    & RANGE_MASK];
1445     outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
1446 					      CONST_BITS+PASS1_BITS+3)
1447 			    & RANGE_MASK];
1448     outptr[8] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
1449 					      CONST_BITS+PASS1_BITS+3)
1450 			    & RANGE_MASK];
1451     outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
1452 					      CONST_BITS+PASS1_BITS+3)
1453 			    & RANGE_MASK];
1454     outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
1455 					      CONST_BITS+PASS1_BITS+3)
1456 			    & RANGE_MASK];
1457     outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
1458 					      CONST_BITS+PASS1_BITS+3)
1459 			    & RANGE_MASK];
1460     outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
1461 					      CONST_BITS+PASS1_BITS+3)
1462 			    & RANGE_MASK];
1463     outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
1464 					      CONST_BITS+PASS1_BITS+3)
1465 			    & RANGE_MASK];
1466     outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
1467 					      CONST_BITS+PASS1_BITS+3)
1468 			    & RANGE_MASK];
1469 
1470     wsptr += 8;		/* advance pointer to next row */
1471   }
1472 }
1473 
1474 
1475 /*
1476  * Perform dequantization and inverse DCT on one block of coefficients,
1477  * producing a 11x11 output block.
1478  *
1479  * Optimized algorithm with 24 multiplications in the 1-D kernel.
1480  * cK represents sqrt(2) * cos(K*pi/22).
1481  */
1482 
1483 GLOBAL(void)
1484 jpeg_idct_11x11 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
1485 		 JCOEFPTR coef_block,
1486 		 JSAMPARRAY output_buf, JDIMENSION output_col)
1487 {
1488   INT32 tmp10, tmp11, tmp12, tmp13, tmp14;
1489   INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25;
1490   INT32 z1, z2, z3, z4;
1491   JCOEFPTR inptr;
1492   ISLOW_MULT_TYPE * quantptr;
1493   int * wsptr;
1494   JSAMPROW outptr;
1495   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
1496   int ctr;
1497   int workspace[8*11];	/* buffers data between passes */
1498   SHIFT_TEMPS
1499 
1500   /* Pass 1: process columns from input, store into work array. */
1501 
1502   inptr = coef_block;
1503   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
1504   wsptr = workspace;
1505   for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
1506     /* Even part */
1507 
1508     tmp10 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
1509     tmp10 <<= CONST_BITS;
1510     /* Add fudge factor here for final descale. */
1511     tmp10 += ONE << (CONST_BITS-PASS1_BITS-1);
1512 
1513     z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
1514     z2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
1515     z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
1516 
1517     tmp20 = MULTIPLY(z2 - z3, FIX(2.546640132));     /* c2+c4 */
1518     tmp23 = MULTIPLY(z2 - z1, FIX(0.430815045));     /* c2-c6 */
1519     z4 = z1 + z3;
1520     tmp24 = MULTIPLY(z4, - FIX(1.155664402));        /* -(c2-c10) */
1521     z4 -= z2;
1522     tmp25 = tmp10 + MULTIPLY(z4, FIX(1.356927976));  /* c2 */
1523     tmp21 = tmp20 + tmp23 + tmp25 -
1524 	    MULTIPLY(z2, FIX(1.821790775));          /* c2+c4+c10-c6 */
1525     tmp20 += tmp25 + MULTIPLY(z3, FIX(2.115825087)); /* c4+c6 */
1526     tmp23 += tmp25 - MULTIPLY(z1, FIX(1.513598477)); /* c6+c8 */
1527     tmp24 += tmp25;
1528     tmp22 = tmp24 - MULTIPLY(z3, FIX(0.788749120));  /* c8+c10 */
1529     tmp24 += MULTIPLY(z2, FIX(1.944413522)) -        /* c2+c8 */
1530 	     MULTIPLY(z1, FIX(1.390975730));         /* c4+c10 */
1531     tmp25 = tmp10 - MULTIPLY(z4, FIX(1.414213562));  /* c0 */
1532 
1533     /* Odd part */
1534 
1535     z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
1536     z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
1537     z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
1538     z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
1539 
1540     tmp11 = z1 + z2;
1541     tmp14 = MULTIPLY(tmp11 + z3 + z4, FIX(0.398430003)); /* c9 */
1542     tmp11 = MULTIPLY(tmp11, FIX(0.887983902));           /* c3-c9 */
1543     tmp12 = MULTIPLY(z1 + z3, FIX(0.670361295));         /* c5-c9 */
1544     tmp13 = tmp14 + MULTIPLY(z1 + z4, FIX(0.366151574)); /* c7-c9 */
1545     tmp10 = tmp11 + tmp12 + tmp13 -
1546 	    MULTIPLY(z1, FIX(0.923107866));              /* c7+c5+c3-c1-2*c9 */
1547     z1    = tmp14 - MULTIPLY(z2 + z3, FIX(1.163011579)); /* c7+c9 */
1548     tmp11 += z1 + MULTIPLY(z2, FIX(2.073276588));        /* c1+c7+3*c9-c3 */
1549     tmp12 += z1 - MULTIPLY(z3, FIX(1.192193623));        /* c3+c5-c7-c9 */
1550     z1    = MULTIPLY(z2 + z4, - FIX(1.798248910));       /* -(c1+c9) */
1551     tmp11 += z1;
1552     tmp13 += z1 + MULTIPLY(z4, FIX(2.102458632));        /* c1+c5+c9-c7 */
1553     tmp14 += MULTIPLY(z2, - FIX(1.467221301)) +          /* -(c5+c9) */
1554 	     MULTIPLY(z3, FIX(1.001388905)) -            /* c1-c9 */
1555 	     MULTIPLY(z4, FIX(1.684843907));             /* c3+c9 */
1556 
1557     /* Final output stage */
1558 
1559     wsptr[8*0]  = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
1560     wsptr[8*10] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
1561     wsptr[8*1]  = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
1562     wsptr[8*9]  = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
1563     wsptr[8*2]  = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
1564     wsptr[8*8]  = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
1565     wsptr[8*3]  = (int) RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS-PASS1_BITS);
1566     wsptr[8*7]  = (int) RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS-PASS1_BITS);
1567     wsptr[8*4]  = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
1568     wsptr[8*6]  = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
1569     wsptr[8*5]  = (int) RIGHT_SHIFT(tmp25, CONST_BITS-PASS1_BITS);
1570   }
1571 
1572   /* Pass 2: process 11 rows from work array, store into output array. */
1573 
1574   wsptr = workspace;
1575   for (ctr = 0; ctr < 11; ctr++) {
1576     outptr = output_buf[ctr] + output_col;
1577 
1578     /* Even part */
1579 
1580     /* Add range center and fudge factor for final descale and range-limit. */
1581     tmp10 = (INT32) wsptr[0] +
1582 	      ((((INT32) RANGE_CENTER) << (PASS1_BITS+3)) +
1583 	       (ONE << (PASS1_BITS+2)));
1584     tmp10 <<= CONST_BITS;
1585 
1586     z1 = (INT32) wsptr[2];
1587     z2 = (INT32) wsptr[4];
1588     z3 = (INT32) wsptr[6];
1589 
1590     tmp20 = MULTIPLY(z2 - z3, FIX(2.546640132));     /* c2+c4 */
1591     tmp23 = MULTIPLY(z2 - z1, FIX(0.430815045));     /* c2-c6 */
1592     z4 = z1 + z3;
1593     tmp24 = MULTIPLY(z4, - FIX(1.155664402));        /* -(c2-c10) */
1594     z4 -= z2;
1595     tmp25 = tmp10 + MULTIPLY(z4, FIX(1.356927976));  /* c2 */
1596     tmp21 = tmp20 + tmp23 + tmp25 -
1597 	    MULTIPLY(z2, FIX(1.821790775));          /* c2+c4+c10-c6 */
1598     tmp20 += tmp25 + MULTIPLY(z3, FIX(2.115825087)); /* c4+c6 */
1599     tmp23 += tmp25 - MULTIPLY(z1, FIX(1.513598477)); /* c6+c8 */
1600     tmp24 += tmp25;
1601     tmp22 = tmp24 - MULTIPLY(z3, FIX(0.788749120));  /* c8+c10 */
1602     tmp24 += MULTIPLY(z2, FIX(1.944413522)) -        /* c2+c8 */
1603 	     MULTIPLY(z1, FIX(1.390975730));         /* c4+c10 */
1604     tmp25 = tmp10 - MULTIPLY(z4, FIX(1.414213562));  /* c0 */
1605 
1606     /* Odd part */
1607 
1608     z1 = (INT32) wsptr[1];
1609     z2 = (INT32) wsptr[3];
1610     z3 = (INT32) wsptr[5];
1611     z4 = (INT32) wsptr[7];
1612 
1613     tmp11 = z1 + z2;
1614     tmp14 = MULTIPLY(tmp11 + z3 + z4, FIX(0.398430003)); /* c9 */
1615     tmp11 = MULTIPLY(tmp11, FIX(0.887983902));           /* c3-c9 */
1616     tmp12 = MULTIPLY(z1 + z3, FIX(0.670361295));         /* c5-c9 */
1617     tmp13 = tmp14 + MULTIPLY(z1 + z4, FIX(0.366151574)); /* c7-c9 */
1618     tmp10 = tmp11 + tmp12 + tmp13 -
1619 	    MULTIPLY(z1, FIX(0.923107866));              /* c7+c5+c3-c1-2*c9 */
1620     z1    = tmp14 - MULTIPLY(z2 + z3, FIX(1.163011579)); /* c7+c9 */
1621     tmp11 += z1 + MULTIPLY(z2, FIX(2.073276588));        /* c1+c7+3*c9-c3 */
1622     tmp12 += z1 - MULTIPLY(z3, FIX(1.192193623));        /* c3+c5-c7-c9 */
1623     z1    = MULTIPLY(z2 + z4, - FIX(1.798248910));       /* -(c1+c9) */
1624     tmp11 += z1;
1625     tmp13 += z1 + MULTIPLY(z4, FIX(2.102458632));        /* c1+c5+c9-c7 */
1626     tmp14 += MULTIPLY(z2, - FIX(1.467221301)) +          /* -(c5+c9) */
1627 	     MULTIPLY(z3, FIX(1.001388905)) -            /* c1-c9 */
1628 	     MULTIPLY(z4, FIX(1.684843907));             /* c3+c9 */
1629 
1630     /* Final output stage */
1631 
1632     outptr[0]  = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
1633 					       CONST_BITS+PASS1_BITS+3)
1634 			     & RANGE_MASK];
1635     outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
1636 					       CONST_BITS+PASS1_BITS+3)
1637 			     & RANGE_MASK];
1638     outptr[1]  = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
1639 					       CONST_BITS+PASS1_BITS+3)
1640 			     & RANGE_MASK];
1641     outptr[9]  = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
1642 					       CONST_BITS+PASS1_BITS+3)
1643 			     & RANGE_MASK];
1644     outptr[2]  = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
1645 					       CONST_BITS+PASS1_BITS+3)
1646 			     & RANGE_MASK];
1647     outptr[8]  = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
1648 					       CONST_BITS+PASS1_BITS+3)
1649 			     & RANGE_MASK];
1650     outptr[3]  = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
1651 					       CONST_BITS+PASS1_BITS+3)
1652 			     & RANGE_MASK];
1653     outptr[7]  = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
1654 					       CONST_BITS+PASS1_BITS+3)
1655 			     & RANGE_MASK];
1656     outptr[4]  = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
1657 					       CONST_BITS+PASS1_BITS+3)
1658 			     & RANGE_MASK];
1659     outptr[6]  = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
1660 					       CONST_BITS+PASS1_BITS+3)
1661 			     & RANGE_MASK];
1662     outptr[5]  = range_limit[(int) RIGHT_SHIFT(tmp25,
1663 					       CONST_BITS+PASS1_BITS+3)
1664 			     & RANGE_MASK];
1665 
1666     wsptr += 8;		/* advance pointer to next row */
1667   }
1668 }
1669 
1670 
1671 /*
1672  * Perform dequantization and inverse DCT on one block of coefficients,
1673  * producing a 12x12 output block.
1674  *
1675  * Optimized algorithm with 15 multiplications in the 1-D kernel.
1676  * cK represents sqrt(2) * cos(K*pi/24).
      *
      * Inputs and outputs, as used by the code below:
      *   cinfo      - only passed to IDCT_range_limit() to obtain the
      *                range-limit lookup table.
      *   compptr    - its dct_table member is read as the table of
      *                ISLOW_MULT_TYPE values handed to DEQUANTIZE.
      *   coef_block - input coefficients; entries [DCTSIZE*r + c] are read
      *                for r = 0..7 and c = 0..7.
      *   output_buf - array of sample rows; rows 0..11 are written.
      *   output_col - starting column; 12 samples are stored in each row
      *                at output_buf[row] + output_col.
      *
      * Pass 1 runs the 1-D kernel down each of the 8 input columns and
      * stores 12 values per column in the workspace.  Pass 2 runs the same
      * kernel along each of the 12 workspace rows (8 inputs each) and emits
      * 12 range-limited samples per row.
1677  */
1678 
1679 GLOBAL(void)
1680 jpeg_idct_12x12 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
1681 		 JCOEFPTR coef_block,
1682 		 JSAMPARRAY output_buf, JDIMENSION output_col)
1683 {
1684   INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
1685   INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25;
1686   INT32 z1, z2, z3, z4;
1687   JCOEFPTR inptr;
1688   ISLOW_MULT_TYPE * quantptr;
1689   int * wsptr;
1690   JSAMPROW outptr;
1691   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
1692   int ctr;
1693   int workspace[8*12];	/* 8 columns x 12 rows; buffers data between passes */
1694   SHIFT_TEMPS
1695 
1696   /* Pass 1: process columns from input, store into work array. */
       /* Each iteration consumes one input column (stride DCTSIZE) and fills
        * one workspace column (12 entries, stride 8).
        */
1697 
1698   inptr = coef_block;
1699   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
1700   wsptr = workspace;
1701   for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
1702     /* Even part */
         /* tmp20..tmp25 receive the even-part results; tmp10..tmp12 are
          * scratch here and are reused for the odd part further down.
          */
1703 
1704     z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
1705     z3 <<= CONST_BITS;
1706     /* Add fudge factor here for final descale. */
         /* The term is half of the amount removed by the
          * RIGHT_SHIFT(..., CONST_BITS-PASS1_BITS) in the output stage
          * (assuming ONE is the constant 1); since z3 feeds every tmp2x,
          * it is applied to all 12 outputs of this column.
          */
1707     z3 += ONE << (CONST_BITS-PASS1_BITS-1);
1708 
1709     z4 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
1710     z4 = MULTIPLY(z4, FIX(1.224744871)); /* c4 */
1711 
1712     tmp10 = z3 + z4;
1713     tmp11 = z3 - z4;
1714 
1715     z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
1716     z4 = MULTIPLY(z1, FIX(1.366025404)); /* c2 */
         /* z1 and z2 are used with a plain CONST_BITS shift (no multiply). */
1717     z1 <<= CONST_BITS;
1718     z2 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
1719     z2 <<= CONST_BITS;
1720 
         /* tmp12 is recomputed three times below, once per output pair. */
1721     tmp12 = z1 - z2;
1722 
1723     tmp21 = z3 + tmp12;
1724     tmp24 = z3 - tmp12;
1725 
1726     tmp12 = z4 + z2;
1727 
1728     tmp20 = tmp10 + tmp12;
1729     tmp25 = tmp10 - tmp12;
1730 
1731     tmp12 = z4 - z1 - z2;
1732 
1733     tmp22 = tmp11 + tmp12;
1734     tmp23 = tmp11 - tmp12;
1735 
1736     /* Odd part */
         /* From here on tmp10..tmp15 hold the odd-part terms. */
1737 
1738     z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
1739     z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
1740     z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
1741     z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
1742 
         /* tmp11 and tmp14 are temporary z2 products here; both are
          * overwritten with their final values after tmp10, tmp12, tmp13
          * and tmp15 have been completed.
          */
1743     tmp11 = MULTIPLY(z2, FIX(1.306562965));                  /* c3 */
1744     tmp14 = MULTIPLY(z2, - FIX_0_541196100);                 /* -c9 */
1745 
1746     tmp10 = z1 + z3;
1747     tmp15 = MULTIPLY(tmp10 + z4, FIX(0.860918669));          /* c7 */
1748     tmp12 = tmp15 + MULTIPLY(tmp10, FIX(0.261052384));       /* c5-c7 */
1749     tmp10 = tmp12 + tmp11 + MULTIPLY(z1, FIX(0.280143716));  /* c1-c5 */
1750     tmp13 = MULTIPLY(z3 + z4, - FIX(1.045510580));           /* -(c7+c11) */
1751     tmp12 += tmp13 + tmp14 - MULTIPLY(z3, FIX(1.478575242)); /* c1+c5-c7-c11 */
1752     tmp13 += tmp15 - tmp11 + MULTIPLY(z4, FIX(1.586706681)); /* c1+c11 */
1753     tmp15 += tmp14 - MULTIPLY(z1, FIX(0.676326758)) -        /* c7-c11 */
1754 	     MULTIPLY(z4, FIX(1.982889723));                 /* c5+c7 */
1755 
         /* z1, z2 and z3 are modified in place from here; the raw
          * coefficient values are not used again in this iteration.
          */
1756     z1 -= z4;
1757     z2 -= z3;
1758     z3 = MULTIPLY(z1 + z2, FIX_0_541196100);                 /* c9 */
1759     tmp11 = z3 + MULTIPLY(z1, FIX_0_765366865);              /* c3-c9 */
1760     tmp14 = z3 - MULTIPLY(z2, FIX_1_847759065);              /* c3+c9 */
1761 
1762     /* Final output stage */
         /* Workspace rows k and 11-k are the sum and the difference of the
          * same even/odd pair (tmp2k, tmp1k).
          */
1763 
1764     wsptr[8*0]  = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
1765     wsptr[8*11] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
1766     wsptr[8*1]  = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
1767     wsptr[8*10] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
1768     wsptr[8*2]  = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
1769     wsptr[8*9]  = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
1770     wsptr[8*3]  = (int) RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS-PASS1_BITS);
1771     wsptr[8*8]  = (int) RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS-PASS1_BITS);
1772     wsptr[8*4]  = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
1773     wsptr[8*7]  = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
1774     wsptr[8*5]  = (int) RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS-PASS1_BITS);
1775     wsptr[8*6]  = (int) RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS-PASS1_BITS);
1776   }
1777 
1778   /* Pass 2: process 12 rows from work array, store into output array. */
       /* Same kernel as pass 1, reading wsptr[0..7] of the current row
        * instead of dequantized coefficients.
        */
1779 
1780   wsptr = workspace;
1781   for (ctr = 0; ctr < 12; ctr++) {
1782     outptr = output_buf[ctr] + output_col;
1783 
1784     /* Even part */
1785 
1786     /* Add range center and fudge factor for final descale and range-limit. */
         /* Both terms are added before the CONST_BITS shift; the second one,
          * ONE << (PASS1_BITS+2), is half of the remaining PASS1_BITS+3
          * descale (assuming ONE is the constant 1).
          */
1787     z3 = (INT32) wsptr[0] +
1788 	   ((((INT32) RANGE_CENTER) << (PASS1_BITS+3)) +
1789 	    (ONE << (PASS1_BITS+2)));
1790     z3 <<= CONST_BITS;
1791 
1792     z4 = (INT32) wsptr[4];
1793     z4 = MULTIPLY(z4, FIX(1.224744871)); /* c4 */
1794 
1795     tmp10 = z3 + z4;
1796     tmp11 = z3 - z4;
1797 
1798     z1 = (INT32) wsptr[2];
1799     z4 = MULTIPLY(z1, FIX(1.366025404)); /* c2 */
1800     z1 <<= CONST_BITS;
1801     z2 = (INT32) wsptr[6];
1802     z2 <<= CONST_BITS;
1803 
1804     tmp12 = z1 - z2;
1805 
1806     tmp21 = z3 + tmp12;
1807     tmp24 = z3 - tmp12;
1808 
1809     tmp12 = z4 + z2;
1810 
1811     tmp20 = tmp10 + tmp12;
1812     tmp25 = tmp10 - tmp12;
1813 
1814     tmp12 = z4 - z1 - z2;
1815 
1816     tmp22 = tmp11 + tmp12;
1817     tmp23 = tmp11 - tmp12;
1818 
1819     /* Odd part */
1820 
1821     z1 = (INT32) wsptr[1];
1822     z2 = (INT32) wsptr[3];
1823     z3 = (INT32) wsptr[5];
1824     z4 = (INT32) wsptr[7];
1825 
1826     tmp11 = MULTIPLY(z2, FIX(1.306562965));                  /* c3 */
1827     tmp14 = MULTIPLY(z2, - FIX_0_541196100);                 /* -c9 */
1828 
1829     tmp10 = z1 + z3;
1830     tmp15 = MULTIPLY(tmp10 + z4, FIX(0.860918669));          /* c7 */
1831     tmp12 = tmp15 + MULTIPLY(tmp10, FIX(0.261052384));       /* c5-c7 */
1832     tmp10 = tmp12 + tmp11 + MULTIPLY(z1, FIX(0.280143716));  /* c1-c5 */
1833     tmp13 = MULTIPLY(z3 + z4, - FIX(1.045510580));           /* -(c7+c11) */
1834     tmp12 += tmp13 + tmp14 - MULTIPLY(z3, FIX(1.478575242)); /* c1+c5-c7-c11 */
1835     tmp13 += tmp15 - tmp11 + MULTIPLY(z4, FIX(1.586706681)); /* c1+c11 */
1836     tmp15 += tmp14 - MULTIPLY(z1, FIX(0.676326758)) -        /* c7-c11 */
1837 	     MULTIPLY(z4, FIX(1.982889723));                 /* c5+c7 */
1838 
1839     z1 -= z4;
1840     z2 -= z3;
1841     z3 = MULTIPLY(z1 + z2, FIX_0_541196100);                 /* c9 */
1842     tmp11 = z3 + MULTIPLY(z1, FIX_0_765366865);              /* c3-c9 */
1843     tmp14 = z3 - MULTIPLY(z2, FIX_1_847759065);              /* c3+c9 */
1844 
1845     /* Final output stage */
         /* Each result is shifted right by CONST_BITS+PASS1_BITS+3, masked
          * with RANGE_MASK, and used as an index into range_limit[];
          * samples k and 11-k come from the same even/odd pair.
          */
1846 
1847     outptr[0]  = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
1848 					       CONST_BITS+PASS1_BITS+3)
1849 			     & RANGE_MASK];
1850     outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
1851 					       CONST_BITS+PASS1_BITS+3)
1852 			     & RANGE_MASK];
1853     outptr[1]  = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
1854 					       CONST_BITS+PASS1_BITS+3)
1855 			     & RANGE_MASK];
1856     outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
1857 					       CONST_BITS+PASS1_BITS+3)
1858 			     & RANGE_MASK];
1859     outptr[2]  = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
1860 					       CONST_BITS+PASS1_BITS+3)
1861 			     & RANGE_MASK];
1862     outptr[9]  = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
1863 					       CONST_BITS+PASS1_BITS+3)
1864 			     & RANGE_MASK];
1865     outptr[3]  = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
1866 					       CONST_BITS+PASS1_BITS+3)
1867 			     & RANGE_MASK];
1868     outptr[8]  = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
1869 					       CONST_BITS+PASS1_BITS+3)
1870 			     & RANGE_MASK];
1871     outptr[4]  = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
1872 					       CONST_BITS+PASS1_BITS+3)
1873 			     & RANGE_MASK];
1874     outptr[7]  = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
1875 					       CONST_BITS+PASS1_BITS+3)
1876 			     & RANGE_MASK];
1877     outptr[5]  = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp15,
1878 					       CONST_BITS+PASS1_BITS+3)
1879 			     & RANGE_MASK];
1880     outptr[6]  = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp15,
1881 					       CONST_BITS+PASS1_BITS+3)
1882 			     & RANGE_MASK];
1883 
1884     wsptr += 8;		/* advance pointer to next row */
1885   }
1886 }
1887 
1888 
1889 /*
1890  * Perform dequantization and inverse DCT on one block of coefficients,
1891  * producing a 13x13 output block.
1892  *
1893  * Optimized algorithm with 29 multiplications in the 1-D kernel.
1894  * cK represents sqrt(2) * cos(K*pi/26).
      *
      * Inputs and outputs, as used by the code below:
      *   cinfo      - only passed to IDCT_range_limit() to obtain the
      *                range-limit lookup table.
      *   compptr    - its dct_table member is read as the table of
      *                ISLOW_MULT_TYPE values handed to DEQUANTIZE.
      *   coef_block - input coefficients; entries [DCTSIZE*r + c] are read
      *                for r = 0..7 and c = 0..7.
      *   output_buf - array of sample rows; rows 0..12 are written.
      *   output_col - starting column; 13 samples are stored in each row
      *                at output_buf[row] + output_col.
      *
      * Pass 1 runs the 1-D kernel down each of the 8 input columns and
      * stores 13 values per column in the workspace.  Pass 2 runs the same
      * kernel along each of the 13 workspace rows (8 inputs each) and emits
      * 13 range-limited samples per row.  Output 6 of the kernel is the
      * centre element and is built from the even part only.
1895  */
1896 
1897 GLOBAL(void)
1898 jpeg_idct_13x13 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
1899 		 JCOEFPTR coef_block,
1900 		 JSAMPARRAY output_buf, JDIMENSION output_col)
1901 {
1902   INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
1903   INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26;
1904   INT32 z1, z2, z3, z4;
1905   JCOEFPTR inptr;
1906   ISLOW_MULT_TYPE * quantptr;
1907   int * wsptr;
1908   JSAMPROW outptr;
1909   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
1910   int ctr;
1911   int workspace[8*13];	/* 8 columns x 13 rows; buffers data between passes */
1912   SHIFT_TEMPS
1913 
1914   /* Pass 1: process columns from input, store into work array. */
       /* Each iteration consumes one input column (stride DCTSIZE) and fills
        * one workspace column (13 entries, stride 8).
        */
1915 
1916   inptr = coef_block;
1917   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
1918   wsptr = workspace;
1919   for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
1920     /* Even part */
         /* tmp20..tmp26 receive the even-part results; tmp10..tmp13 are
          * scratch here and are reused for the odd part further down.
          */
1921 
1922     z1 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
1923     z1 <<= CONST_BITS;
1924     /* Add fudge factor here for final descale. */
         /* The term is half of the amount removed by the
          * RIGHT_SHIFT(..., CONST_BITS-PASS1_BITS) in the output stage
          * (assuming ONE is the constant 1); z1 enters every tmp2x below.
          */
1925     z1 += ONE << (CONST_BITS-PASS1_BITS-1);
1926 
1927     z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
1928     z3 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
1929     z4 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
1930 
1931     tmp10 = z3 + z4;
1932     tmp11 = z3 - z4;
1933 
         /* tmp12/tmp13 are recomputed three times; each pair feeds two of
          * the tmp2x results.
          */
1934     tmp12 = MULTIPLY(tmp10, FIX(1.155388986));                /* (c4+c6)/2 */
1935     tmp13 = MULTIPLY(tmp11, FIX(0.096834934)) + z1;           /* (c4-c6)/2 */
1936 
1937     tmp20 = MULTIPLY(z2, FIX(1.373119086)) + tmp12 + tmp13;   /* c2 */
1938     tmp22 = MULTIPLY(z2, FIX(0.501487041)) - tmp12 + tmp13;   /* c10 */
1939 
1940     tmp12 = MULTIPLY(tmp10, FIX(0.316450131));                /* (c8-c12)/2 */
1941     tmp13 = MULTIPLY(tmp11, FIX(0.486914739)) + z1;           /* (c8+c12)/2 */
1942 
1943     tmp21 = MULTIPLY(z2, FIX(1.058554052)) - tmp12 + tmp13;   /* c6 */
1944     tmp25 = MULTIPLY(z2, - FIX(1.252223920)) + tmp12 + tmp13; /* c4 */
1945 
1946     tmp12 = MULTIPLY(tmp10, FIX(0.435816023));                /* (c2-c10)/2 */
         /* z1 is subtracted here because tmp13 itself is subtracted in the
          * two assignments that follow, so z1 still contributes positively.
          */
1947     tmp13 = MULTIPLY(tmp11, FIX(0.937303064)) - z1;           /* (c2+c10)/2 */
1948 
1949     tmp23 = MULTIPLY(z2, - FIX(0.170464608)) - tmp12 - tmp13; /* c12 */
1950     tmp24 = MULTIPLY(z2, - FIX(0.803364869)) + tmp12 - tmp13; /* c8 */
1951 
1952     tmp26 = MULTIPLY(tmp11 - z2, FIX(1.414213562)) + z1;      /* c0 */
1953 
1954     /* Odd part */
         /* From here on tmp10..tmp15 hold the odd-part terms. */
1955 
1956     z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
1957     z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
1958     z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
1959     z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
1960 
1961     tmp11 = MULTIPLY(z1 + z2, FIX(1.322312651));     /* c3 */
1962     tmp12 = MULTIPLY(z1 + z3, FIX(1.163874945));     /* c5 */
         /* tmp15 temporarily holds the plain sum z1 + z4; it is replaced by
          * a product of that sum further down.
          */
1963     tmp15 = z1 + z4;
1964     tmp13 = MULTIPLY(tmp15, FIX(0.937797057));       /* c7 */
1965     tmp10 = tmp11 + tmp12 + tmp13 -
1966 	    MULTIPLY(z1, FIX(2.020082300));          /* c7+c5+c3-c1 */
         /* tmp14 serves as a scratch value for three shared cross terms
          * before it is assigned its own result.
          */
1967     tmp14 = MULTIPLY(z2 + z3, - FIX(0.338443458));   /* -c11 */
1968     tmp11 += tmp14 + MULTIPLY(z2, FIX(0.837223564)); /* c5+c9+c11-c3 */
1969     tmp12 += tmp14 - MULTIPLY(z3, FIX(1.572116027)); /* c1+c5-c9-c11 */
1970     tmp14 = MULTIPLY(z2 + z4, - FIX(1.163874945));   /* -c5 */
1971     tmp11 += tmp14;
1972     tmp13 += tmp14 + MULTIPLY(z4, FIX(2.205608352)); /* c3+c5+c9-c7 */
1973     tmp14 = MULTIPLY(z3 + z4, - FIX(0.657217813));   /* -c9 */
1974     tmp12 += tmp14;
1975     tmp13 += tmp14;
1976     tmp15 = MULTIPLY(tmp15, FIX(0.338443458));       /* c11 */
1977     tmp14 = tmp15 + MULTIPLY(z1, FIX(0.318774355)) - /* c9-c11 */
1978 	    MULTIPLY(z2, FIX(0.466105296));          /* c1-c7 */
         /* z1 is overwritten with a scratch product; the raw coefficient is
          * not used again in this iteration.
          */
1979     z1    = MULTIPLY(z3 - z2, FIX(0.937797057));     /* c7 */
1980     tmp14 += z1;
1981     tmp15 += z1 + MULTIPLY(z3, FIX(0.384515595)) -   /* c3-c7 */
1982 	     MULTIPLY(z4, FIX(1.742345811));         /* c1+c11 */
1983 
1984     /* Final output stage */
         /* Workspace rows k and 12-k are the sum and the difference of the
          * same even/odd pair; row 6 takes tmp26 alone.
          */
1985 
1986     wsptr[8*0]  = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
1987     wsptr[8*12] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
1988     wsptr[8*1]  = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
1989     wsptr[8*11] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
1990     wsptr[8*2]  = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
1991     wsptr[8*10] = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
1992     wsptr[8*3]  = (int) RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS-PASS1_BITS);
1993     wsptr[8*9]  = (int) RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS-PASS1_BITS);
1994     wsptr[8*4]  = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
1995     wsptr[8*8]  = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
1996     wsptr[8*5]  = (int) RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS-PASS1_BITS);
1997     wsptr[8*7]  = (int) RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS-PASS1_BITS);
1998     wsptr[8*6]  = (int) RIGHT_SHIFT(tmp26, CONST_BITS-PASS1_BITS);
1999   }
2000 
2001   /* Pass 2: process 13 rows from work array, store into output array. */
       /* Same kernel as pass 1, reading wsptr[0..7] of the current row
        * instead of dequantized coefficients.
        */
2002 
2003   wsptr = workspace;
2004   for (ctr = 0; ctr < 13; ctr++) {
2005     outptr = output_buf[ctr] + output_col;
2006 
2007     /* Even part */
2008 
2009     /* Add range center and fudge factor for final descale and range-limit. */
         /* Both terms are added before the CONST_BITS shift; the second one,
          * ONE << (PASS1_BITS+2), is half of the remaining PASS1_BITS+3
          * descale (assuming ONE is the constant 1).
          */
2010     z1 = (INT32) wsptr[0] +
2011 	   ((((INT32) RANGE_CENTER) << (PASS1_BITS+3)) +
2012 	    (ONE << (PASS1_BITS+2)));
2013     z1 <<= CONST_BITS;
2014 
2015     z2 = (INT32) wsptr[2];
2016     z3 = (INT32) wsptr[4];
2017     z4 = (INT32) wsptr[6];
2018 
2019     tmp10 = z3 + z4;
2020     tmp11 = z3 - z4;
2021 
2022     tmp12 = MULTIPLY(tmp10, FIX(1.155388986));                /* (c4+c6)/2 */
2023     tmp13 = MULTIPLY(tmp11, FIX(0.096834934)) + z1;           /* (c4-c6)/2 */
2024 
2025     tmp20 = MULTIPLY(z2, FIX(1.373119086)) + tmp12 + tmp13;   /* c2 */
2026     tmp22 = MULTIPLY(z2, FIX(0.501487041)) - tmp12 + tmp13;   /* c10 */
2027 
2028     tmp12 = MULTIPLY(tmp10, FIX(0.316450131));                /* (c8-c12)/2 */
2029     tmp13 = MULTIPLY(tmp11, FIX(0.486914739)) + z1;           /* (c8+c12)/2 */
2030 
2031     tmp21 = MULTIPLY(z2, FIX(1.058554052)) - tmp12 + tmp13;   /* c6 */
2032     tmp25 = MULTIPLY(z2, - FIX(1.252223920)) + tmp12 + tmp13; /* c4 */
2033 
2034     tmp12 = MULTIPLY(tmp10, FIX(0.435816023));                /* (c2-c10)/2 */
2035     tmp13 = MULTIPLY(tmp11, FIX(0.937303064)) - z1;           /* (c2+c10)/2 */
2036 
2037     tmp23 = MULTIPLY(z2, - FIX(0.170464608)) - tmp12 - tmp13; /* c12 */
2038     tmp24 = MULTIPLY(z2, - FIX(0.803364869)) + tmp12 - tmp13; /* c8 */
2039 
2040     tmp26 = MULTIPLY(tmp11 - z2, FIX(1.414213562)) + z1;      /* c0 */
2041 
2042     /* Odd part */
2043 
2044     z1 = (INT32) wsptr[1];
2045     z2 = (INT32) wsptr[3];
2046     z3 = (INT32) wsptr[5];
2047     z4 = (INT32) wsptr[7];
2048 
2049     tmp11 = MULTIPLY(z1 + z2, FIX(1.322312651));     /* c3 */
2050     tmp12 = MULTIPLY(z1 + z3, FIX(1.163874945));     /* c5 */
2051     tmp15 = z1 + z4;
2052     tmp13 = MULTIPLY(tmp15, FIX(0.937797057));       /* c7 */
2053     tmp10 = tmp11 + tmp12 + tmp13 -
2054 	    MULTIPLY(z1, FIX(2.020082300));          /* c7+c5+c3-c1 */
2055     tmp14 = MULTIPLY(z2 + z3, - FIX(0.338443458));   /* -c11 */
2056     tmp11 += tmp14 + MULTIPLY(z2, FIX(0.837223564)); /* c5+c9+c11-c3 */
2057     tmp12 += tmp14 - MULTIPLY(z3, FIX(1.572116027)); /* c1+c5-c9-c11 */
2058     tmp14 = MULTIPLY(z2 + z4, - FIX(1.163874945));   /* -c5 */
2059     tmp11 += tmp14;
2060     tmp13 += tmp14 + MULTIPLY(z4, FIX(2.205608352)); /* c3+c5+c9-c7 */
2061     tmp14 = MULTIPLY(z3 + z4, - FIX(0.657217813));   /* -c9 */
2062     tmp12 += tmp14;
2063     tmp13 += tmp14;
2064     tmp15 = MULTIPLY(tmp15, FIX(0.338443458));       /* c11 */
2065     tmp14 = tmp15 + MULTIPLY(z1, FIX(0.318774355)) - /* c9-c11 */
2066 	    MULTIPLY(z2, FIX(0.466105296));          /* c1-c7 */
2067     z1    = MULTIPLY(z3 - z2, FIX(0.937797057));     /* c7 */
2068     tmp14 += z1;
2069     tmp15 += z1 + MULTIPLY(z3, FIX(0.384515595)) -   /* c3-c7 */
2070 	     MULTIPLY(z4, FIX(1.742345811));         /* c1+c11 */
2071 
2072     /* Final output stage */
         /* Each result is shifted right by CONST_BITS+PASS1_BITS+3, masked
          * with RANGE_MASK, and used as an index into range_limit[];
          * samples k and 12-k come from the same even/odd pair, sample 6
          * from tmp26 alone.
          */
2073 
2074     outptr[0]  = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
2075 					       CONST_BITS+PASS1_BITS+3)
2076 			     & RANGE_MASK];
2077     outptr[12] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
2078 					       CONST_BITS+PASS1_BITS+3)
2079 			     & RANGE_MASK];
2080     outptr[1]  = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
2081 					       CONST_BITS+PASS1_BITS+3)
2082 			     & RANGE_MASK];
2083     outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
2084 					       CONST_BITS+PASS1_BITS+3)
2085 			     & RANGE_MASK];
2086     outptr[2]  = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
2087 					       CONST_BITS+PASS1_BITS+3)
2088 			     & RANGE_MASK];
2089     outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
2090 					       CONST_BITS+PASS1_BITS+3)
2091 			     & RANGE_MASK];
2092     outptr[3]  = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
2093 					       CONST_BITS+PASS1_BITS+3)
2094 			     & RANGE_MASK];
2095     outptr[9]  = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
2096 					       CONST_BITS+PASS1_BITS+3)
2097 			     & RANGE_MASK];
2098     outptr[4]  = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
2099 					       CONST_BITS+PASS1_BITS+3)
2100 			     & RANGE_MASK];
2101     outptr[8]  = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
2102 					       CONST_BITS+PASS1_BITS+3)
2103 			     & RANGE_MASK];
2104     outptr[5]  = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp15,
2105 					       CONST_BITS+PASS1_BITS+3)
2106 			     & RANGE_MASK];
2107     outptr[7]  = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp15,
2108 					       CONST_BITS+PASS1_BITS+3)
2109 			     & RANGE_MASK];
2110     outptr[6]  = range_limit[(int) RIGHT_SHIFT(tmp26,
2111 					       CONST_BITS+PASS1_BITS+3)
2112 			     & RANGE_MASK];
2113 
2114     wsptr += 8;		/* advance pointer to next row */
2115   }
2116 }
2117 
2118 
2119 /*
2120  * Perform dequantization and inverse DCT on one block of coefficients,
2121  * producing a 14x14 output block.
2122  *
2123  * Optimized algorithm with 20 multiplications in the 1-D kernel.
2124  * cK represents sqrt(2) * cos(K*pi/28).
      *
      * Inputs and outputs, as used by the code below:
      *   cinfo      - only passed to IDCT_range_limit() to obtain the
      *                range-limit lookup table.
      *   compptr    - its dct_table member is read as the table of
      *                ISLOW_MULT_TYPE values handed to DEQUANTIZE.
      *   coef_block - input coefficients; entries [DCTSIZE*r + c] are read
      *                for r = 0..7 and c = 0..7.
      *   output_buf - array of sample rows; rows 0..13 are written.
      *   output_col - starting column; 14 samples are stored in each row
      *                at output_buf[row] + output_col.
      *
      * Pass 1 runs the 1-D kernel down each of the 8 input columns and
      * stores 14 values per column in the workspace.  Pass 2 runs the same
      * kernel along each of the 14 workspace rows (8 inputs each) and emits
      * 14 range-limited samples per row.  Outputs 3 and 10 of the kernel
      * are handled at a different scale from the others in pass 1; see the
      * notes at tmp23 and tmp13 below.
2125  */
2126 
2127 GLOBAL(void)
2128 jpeg_idct_14x14 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
2129 		 JCOEFPTR coef_block,
2130 		 JSAMPARRAY output_buf, JDIMENSION output_col)
2131 {
2132   INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
2133   INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26;
2134   INT32 z1, z2, z3, z4;
2135   JCOEFPTR inptr;
2136   ISLOW_MULT_TYPE * quantptr;
2137   int * wsptr;
2138   JSAMPROW outptr;
2139   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
2140   int ctr;
2141   int workspace[8*14];	/* 8 columns x 14 rows; buffers data between passes */
2142   SHIFT_TEMPS
2143 
2144   /* Pass 1: process columns from input, store into work array. */
       /* Each iteration consumes one input column (stride DCTSIZE) and fills
        * one workspace column (14 entries, stride 8).
        */
2145 
2146   inptr = coef_block;
2147   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
2148   wsptr = workspace;
2149   for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
2150     /* Even part */
         /* tmp20..tmp26 receive the even-part results; tmp10..tmp15 are
          * scratch here and are reused for the odd part further down.
          */
2151 
2152     z1 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
2153     z1 <<= CONST_BITS;
2154     /* Add fudge factor here for final descale. */
         /* The term is half of the amount removed by the
          * RIGHT_SHIFT(..., CONST_BITS-PASS1_BITS) descale (assuming ONE is
          * the constant 1); z1 enters every tmp2x below.
          */
2155     z1 += ONE << (CONST_BITS-PASS1_BITS-1);
         /* Coefficient 4 is multiplied by three different constants. */
2156     z4 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
2157     z2 = MULTIPLY(z4, FIX(1.274162392));         /* c4 */
2158     z3 = MULTIPLY(z4, FIX(0.314692123));         /* c12 */
2159     z4 = MULTIPLY(z4, FIX(0.881747734));         /* c8 */
2160 
2161     tmp10 = z1 + z2;
2162     tmp11 = z1 + z3;
2163     tmp12 = z1 - z4;
2164 
         /* Unlike the other tmp2x values, tmp23 is descaled right here, so
          * it is stored to the workspace without a further shift.
          */
2165     tmp23 = RIGHT_SHIFT(z1 - ((z2 + z3 - z4) << 1), /* c0 = (c4+c12-c8)*2 */
2166 			CONST_BITS-PASS1_BITS);
2167 
2168     z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
2169     z2 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
2170 
2171     z3 = MULTIPLY(z1 + z2, FIX(1.105676686));    /* c6 */
2172 
2173     tmp13 = z3 + MULTIPLY(z1, FIX(0.273079590)); /* c2-c6 */
2174     tmp14 = z3 - MULTIPLY(z2, FIX(1.719280954)); /* c6+c10 */
2175     tmp15 = MULTIPLY(z1, FIX(0.613604268)) -     /* c10 */
2176 	    MULTIPLY(z2, FIX(1.378756276));      /* c2 */
2177 
2178     tmp20 = tmp10 + tmp13;
2179     tmp26 = tmp10 - tmp13;
2180     tmp21 = tmp11 + tmp14;
2181     tmp25 = tmp11 - tmp14;
2182     tmp22 = tmp12 + tmp15;
2183     tmp24 = tmp12 - tmp15;
2184 
2185     /* Odd part */
         /* From here on tmp10..tmp16 hold the odd-part terms. */
2186 
2187     z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
2188     z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
2189     z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
2190     z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
         /* tmp13 temporarily holds coefficient 7 shifted by CONST_BITS; it
          * is replaced by the final odd term for outputs 3/10 at the end of
          * the odd part.
          */
2191     tmp13 = z4 << CONST_BITS;
2192 
2193     tmp14 = z1 + z3;
2194     tmp11 = MULTIPLY(z1 + z2, FIX(1.334852607));           /* c3 */
2195     tmp12 = MULTIPLY(tmp14, FIX(1.197448846));             /* c5 */
2196     tmp10 = tmp11 + tmp12 + tmp13 - MULTIPLY(z1, FIX(1.126980169)); /* c3+c5-c1 */
2197     tmp14 = MULTIPLY(tmp14, FIX(0.752406978));             /* c9 */
2198     tmp16 = tmp14 - MULTIPLY(z1, FIX(1.061150426));        /* c9+c11-c13 */
2199     z1    -= z2;
2200     tmp15 = MULTIPLY(z1, FIX(0.467085129)) - tmp13;        /* c11 */
2201     tmp16 += tmp15;
         /* The raw z4 must be folded into z1 here: z4 is overwritten as a
          * scratch variable on the next line.
          */
2202     z1    += z4;
2203     z4    = MULTIPLY(z2 + z3, - FIX(0.158341681)) - tmp13; /* -c13 */
2204     tmp11 += z4 - MULTIPLY(z2, FIX(0.424103948));          /* c3-c9-c13 */
2205     tmp12 += z4 - MULTIPLY(z3, FIX(2.373959773));          /* c3+c5-c13 */
2206     z4    = MULTIPLY(z3 - z2, FIX(1.405321284));           /* c1 */
2207     tmp14 += z4 + tmp13 - MULTIPLY(z3, FIX(1.6906431334)); /* c1+c9-c11 */
2208     tmp15 += z4 + MULTIPLY(z2, FIX(0.674957567));          /* c1+c11-c5 */
2209 
         /* z1 now holds (coef1 - coef3 + coef7), so this is
          * (coef1 - coef3 - coef5 + coef7) << PASS1_BITS, with no CONST_BITS
          * scaling; it is combined with the already descaled tmp23 below.
          */
2210     tmp13 = (z1 - z3) << PASS1_BITS;
2211 
2212     /* Final output stage */
         /* Workspace rows k and 13-k are the sum and the difference of the
          * same even/odd pair.  Rows 3 and 10 are stored without RIGHT_SHIFT
          * because tmp23 and tmp13 were brought to workspace scale above.
          */
2213 
2214     wsptr[8*0]  = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
2215     wsptr[8*13] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
2216     wsptr[8*1]  = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
2217     wsptr[8*12] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
2218     wsptr[8*2]  = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
2219     wsptr[8*11] = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
2220     wsptr[8*3]  = (int) (tmp23 + tmp13);
2221     wsptr[8*10] = (int) (tmp23 - tmp13);
2222     wsptr[8*4]  = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
2223     wsptr[8*9]  = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
2224     wsptr[8*5]  = (int) RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS-PASS1_BITS);
2225     wsptr[8*8]  = (int) RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS-PASS1_BITS);
2226     wsptr[8*6]  = (int) RIGHT_SHIFT(tmp26 + tmp16, CONST_BITS-PASS1_BITS);
2227     wsptr[8*7]  = (int) RIGHT_SHIFT(tmp26 - tmp16, CONST_BITS-PASS1_BITS);
2228   }
2229 
2230   /* Pass 2: process 14 rows from work array, store into output array. */
       /* Same kernel as pass 1, reading wsptr[0..7] of the current row
        * instead of dequantized coefficients.  Here every output, including
        * 3 and 10, goes through the same final shift.
        */
2231 
2232   wsptr = workspace;
2233   for (ctr = 0; ctr < 14; ctr++) {
2234     outptr = output_buf[ctr] + output_col;
2235 
2236     /* Even part */
2237 
2238     /* Add range center and fudge factor for final descale and range-limit. */
         /* Both terms are added before the CONST_BITS shift; the second one,
          * ONE << (PASS1_BITS+2), is half of the remaining PASS1_BITS+3
          * descale (assuming ONE is the constant 1).
          */
2239     z1 = (INT32) wsptr[0] +
2240 	   ((((INT32) RANGE_CENTER) << (PASS1_BITS+3)) +
2241 	    (ONE << (PASS1_BITS+2)));
2242     z1 <<= CONST_BITS;
2243     z4 = (INT32) wsptr[4];
2244     z2 = MULTIPLY(z4, FIX(1.274162392));         /* c4 */
2245     z3 = MULTIPLY(z4, FIX(0.314692123));         /* c12 */
2246     z4 = MULTIPLY(z4, FIX(0.881747734));         /* c8 */
2247 
2248     tmp10 = z1 + z2;
2249     tmp11 = z1 + z3;
2250     tmp12 = z1 - z4;
2251 
         /* No early descale here, in contrast to pass 1. */
2252     tmp23 = z1 - ((z2 + z3 - z4) << 1);          /* c0 = (c4+c12-c8)*2 */
2253 
2254     z1 = (INT32) wsptr[2];
2255     z2 = (INT32) wsptr[6];
2256 
2257     z3 = MULTIPLY(z1 + z2, FIX(1.105676686));    /* c6 */
2258 
2259     tmp13 = z3 + MULTIPLY(z1, FIX(0.273079590)); /* c2-c6 */
2260     tmp14 = z3 - MULTIPLY(z2, FIX(1.719280954)); /* c6+c10 */
2261     tmp15 = MULTIPLY(z1, FIX(0.613604268)) -     /* c10 */
2262 	    MULTIPLY(z2, FIX(1.378756276));      /* c2 */
2263 
2264     tmp20 = tmp10 + tmp13;
2265     tmp26 = tmp10 - tmp13;
2266     tmp21 = tmp11 + tmp14;
2267     tmp25 = tmp11 - tmp14;
2268     tmp22 = tmp12 + tmp15;
2269     tmp24 = tmp12 - tmp15;
2270 
2271     /* Odd part */
2272 
2273     z1 = (INT32) wsptr[1];
2274     z2 = (INT32) wsptr[3];
2275     z3 = (INT32) wsptr[5];
2276     z4 = (INT32) wsptr[7];
         /* In this pass z4 itself carries the CONST_BITS-shifted input 7
          * (pass 1 kept that value in tmp13), and tmp13 is the scratch
          * variable instead.
          */
2277     z4 <<= CONST_BITS;
2278 
2279     tmp14 = z1 + z3;
2280     tmp11 = MULTIPLY(z1 + z2, FIX(1.334852607));           /* c3 */
2281     tmp12 = MULTIPLY(tmp14, FIX(1.197448846));             /* c5 */
2282     tmp10 = tmp11 + tmp12 + z4 - MULTIPLY(z1, FIX(1.126980169)); /* c3+c5-c1 */
2283     tmp14 = MULTIPLY(tmp14, FIX(0.752406978));             /* c9 */
2284     tmp16 = tmp14 - MULTIPLY(z1, FIX(1.061150426));        /* c9+c11-c13 */
2285     z1    -= z2;
2286     tmp15 = MULTIPLY(z1, FIX(0.467085129)) - z4;           /* c11 */
2287     tmp16 += tmp15;
2288     tmp13 = MULTIPLY(z2 + z3, - FIX(0.158341681)) - z4;    /* -c13 */
2289     tmp11 += tmp13 - MULTIPLY(z2, FIX(0.424103948));       /* c3-c9-c13 */
2290     tmp12 += tmp13 - MULTIPLY(z3, FIX(2.373959773));       /* c3+c5-c13 */
2291     tmp13 = MULTIPLY(z3 - z2, FIX(1.405321284));           /* c1 */
2292     tmp14 += tmp13 + z4 - MULTIPLY(z3, FIX(1.6906431334)); /* c1+c9-c11 */
2293     tmp15 += tmp13 + MULTIPLY(z2, FIX(0.674957567));       /* c1+c11-c5 */
2294 
         /* z1 holds (in1 - in3) and z4 is already shifted by CONST_BITS, so
          * this equals (in1 - in3 - in5 + in7) << CONST_BITS.
          */
2295     tmp13 = ((z1 - z3) << CONST_BITS) + z4;
2296 
2297     /* Final output stage */
         /* Each result is shifted right by CONST_BITS+PASS1_BITS+3, masked
          * with RANGE_MASK, and used as an index into range_limit[];
          * samples k and 13-k come from the same even/odd pair.
          */
2298 
2299     outptr[0]  = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
2300 					       CONST_BITS+PASS1_BITS+3)
2301 			     & RANGE_MASK];
2302     outptr[13] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
2303 					       CONST_BITS+PASS1_BITS+3)
2304 			     & RANGE_MASK];
2305     outptr[1]  = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
2306 					       CONST_BITS+PASS1_BITS+3)
2307 			     & RANGE_MASK];
2308     outptr[12] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
2309 					       CONST_BITS+PASS1_BITS+3)
2310 			     & RANGE_MASK];
2311     outptr[2]  = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
2312 					       CONST_BITS+PASS1_BITS+3)
2313 			     & RANGE_MASK];
2314     outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
2315 					       CONST_BITS+PASS1_BITS+3)
2316 			     & RANGE_MASK];
2317     outptr[3]  = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
2318 					       CONST_BITS+PASS1_BITS+3)
2319 			     & RANGE_MASK];
2320     outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
2321 					       CONST_BITS+PASS1_BITS+3)
2322 			     & RANGE_MASK];
2323     outptr[4]  = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
2324 					       CONST_BITS+PASS1_BITS+3)
2325 			     & RANGE_MASK];
2326     outptr[9]  = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
2327 					       CONST_BITS+PASS1_BITS+3)
2328 			     & RANGE_MASK];
2329     outptr[5]  = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp15,
2330 					       CONST_BITS+PASS1_BITS+3)
2331 			     & RANGE_MASK];
2332     outptr[8]  = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp15,
2333 					       CONST_BITS+PASS1_BITS+3)
2334 			     & RANGE_MASK];
2335     outptr[6]  = range_limit[(int) RIGHT_SHIFT(tmp26 + tmp16,
2336 					       CONST_BITS+PASS1_BITS+3)
2337 			     & RANGE_MASK];
2338     outptr[7]  = range_limit[(int) RIGHT_SHIFT(tmp26 - tmp16,
2339 					       CONST_BITS+PASS1_BITS+3)
2340 			     & RANGE_MASK];
2341 
2342     wsptr += 8;		/* advance pointer to next row */
2343   }
2344 }
2345 
2346 
2347 /*
2348  * Perform dequantization and inverse DCT on one block of coefficients,
2349  * producing a 15x15 output block.
2350  *
2351  * Optimized algorithm with 22 multiplications in the 1-D kernel.
2352  * cK represents sqrt(2) * cos(K*pi/30).
      *
      * Parameters:
      *   cinfo      - decompressor state; used here only to obtain the
      *                range-limit table through IDCT_range_limit().
      *   compptr    - component info; its dct_table supplies the
      *                dequantization multipliers (read as ISLOW_MULT_TYPE).
      *   coef_block - input coefficients; for each of 8 columns, entries
      *                0..7 are read at a stride of DCTSIZE.
      *   output_buf - array of output row pointers; rows 0..14 are written.
      *   output_col - offset of the first written sample within each row.
      *
      * The work is done in two passes over an int workspace of 8*15 entries:
      * pass 1 expands each of the 8 input columns into 15 values, pass 2
      * expands each of the resulting 15 rows (8 values wide) into 15 output
      * samples.  Both passes use the same 15-point kernel; they differ only
      * in where the inputs come from and how the result is descaled.
      *
      * NOTE(review): the kernel left-shifts INT32 values that can presumably
      * be negative (e.g. "z1 <<= CONST_BITS"), which ISO C leaves undefined;
      * this looks like it relies on two's-complement shift behavior of the
      * supported compilers -- confirm.
2353  */
2354 
2355 GLOBAL(void)
2356 jpeg_idct_15x15 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
2357 		 JCOEFPTR coef_block,
2358 		 JSAMPARRAY output_buf, JDIMENSION output_col)
2359 {
2360   INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
2361   INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27;
2362   INT32 z1, z2, z3, z4;
2363   JCOEFPTR inptr;
2364   ISLOW_MULT_TYPE * quantptr;
2365   int * wsptr;
2366   JSAMPROW outptr;
2367   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
2368   int ctr;
2369   int workspace[8*15];	/* pass 1 results: 15 rows of 8 values each */
2370   SHIFT_TEMPS
2371 
2372   /* Pass 1: process columns from input, store into work array.
        * Each iteration handles one input column and writes 15 values into
        * the same column of the workspace (stride 8 between rows).
        * The DC term is shifted left by CONST_BITS and the results are
        * shifted right by only CONST_BITS-PASS1_BITS, so workspace values
        * keep PASS1_BITS extra bits for pass 2 to remove.
        */
2373 
2374   inptr = coef_block;
2375   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
2376   wsptr = workspace;
2377   for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
2378     /* Even part: built from coefficients 0, 2, 4 and 6 of this column. */
2379 
2380     z1 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
2381     z1 <<= CONST_BITS;
2382     /* Add fudge factor here for final descale: this is half of
          * 2**(CONST_BITS-PASS1_BITS), so the final right shift rounds
          * instead of truncating.
          */
2383     z1 += ONE << (CONST_BITS-PASS1_BITS-1);
2384 
2385     z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
2386     z3 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
2387     z4 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
2388 
2389     tmp10 = MULTIPLY(z4, FIX(0.437016024)); /* c12 */
2390     tmp11 = MULTIPLY(z4, FIX(1.144122806)); /* c6 */
2391 
2392     tmp12 = z1 - tmp10;
2393     tmp13 = z1 + tmp11;
2394     z1 -= (tmp11 - tmp10) << 1;             /* c0 = (c6-c12)*2 */
2395 
          /* From here on z3 holds the sum and z4 the difference of
           * coefficients 2 and 4; tmp10/tmp11 are scratch values that are
           * recomputed for each of the three constant pairs below.
           */
2396     z4 = z2 - z3;
2397     z3 += z2;
2398     tmp10 = MULTIPLY(z3, FIX(1.337628990)); /* (c2+c4)/2 */
2399     tmp11 = MULTIPLY(z4, FIX(0.045680613)); /* (c2-c4)/2 */
2400     z2 = MULTIPLY(z2, FIX(1.439773946));    /* c4+c14 */
2401 
2402     tmp20 = tmp13 + tmp10 + tmp11;
2403     tmp23 = tmp12 - tmp10 + tmp11 + z2;
2404 
2405     tmp10 = MULTIPLY(z3, FIX(0.547059574)); /* (c8+c14)/2 */
2406     tmp11 = MULTIPLY(z4, FIX(0.399234004)); /* (c8-c14)/2 */
2407 
2408     tmp25 = tmp13 - tmp10 - tmp11;
2409     tmp26 = tmp12 + tmp10 - tmp11 - z2;
2410 
2411     tmp10 = MULTIPLY(z3, FIX(0.790569415)); /* (c6+c12)/2 */
2412     tmp11 = MULTIPLY(z4, FIX(0.353553391)); /* (c6-c12)/2 */
2413 
2414     tmp21 = tmp12 + tmp10 + tmp11;
2415     tmp24 = tmp13 - tmp10 + tmp11;
2416     tmp11 += tmp11;                         /* doubled: z4 * (c6-c12) */
2417     tmp22 = z1 + tmp11;                     /* c10 = c6-c12 */
2418     tmp27 = z1 - tmp11 - tmp11;             /* c0 = (c6-c12)*2 */
2419 
2420     /* Odd part: built from coefficients 1, 3, 5 and 7 of this column.
          * z4 first holds coefficient 5 only long enough to form its
          * product with c5 (kept in z3); it is then reloaded with
          * coefficient 7.
          */
2421 
2422     z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
2423     z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
2424     z4 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
2425     z3 = MULTIPLY(z4, FIX(1.224744871));                    /* c5 */
2426     z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
2427 
2428     tmp13 = z2 - z4;
2429     tmp15 = MULTIPLY(z1 + tmp13, FIX(0.831253876));         /* c9 */
2430     tmp11 = tmp15 + MULTIPLY(z1, FIX(0.513743148));         /* c3-c9 */
2431     tmp14 = tmp15 - MULTIPLY(tmp13, FIX(2.176250899));      /* c3+c9 */
2432 
2433     tmp13 = MULTIPLY(z2, - FIX(0.831253876));               /* -c9 */
2434     tmp15 = MULTIPLY(z2, - FIX(1.344997024));               /* -c3 */
2435     z2 = z1 - z4;
2436     tmp12 = z3 + MULTIPLY(z2, FIX(1.406466353));            /* c1 */
2437 
2438     tmp10 = tmp12 + MULTIPLY(z4, FIX(2.457431844)) - tmp15; /* c1+c7 */
2439     tmp16 = tmp12 - MULTIPLY(z1, FIX(1.112434820)) + tmp13; /* c1-c13 */
2440     tmp12 = MULTIPLY(z2, FIX(1.224744871)) - z3;            /* c5 */
2441     z2 = MULTIPLY(z1 + z4, FIX(0.575212477));               /* c11 */
2442     tmp13 += z2 + MULTIPLY(z1, FIX(0.475753014)) - z3;      /* c7-c11 */
2443     tmp15 += z2 - MULTIPLY(z4, FIX(0.869244010)) + z3;      /* c11+c13 */
2444 
2445     /* Final output stage: workspace rows k and 14-k receive even+odd
          * and even-odd respectively; the middle row 7 has no odd term.
          */
2446 
2447     wsptr[8*0]  = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
2448     wsptr[8*14] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
2449     wsptr[8*1]  = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
2450     wsptr[8*13] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
2451     wsptr[8*2]  = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
2452     wsptr[8*12] = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
2453     wsptr[8*3]  = (int) RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS-PASS1_BITS);
2454     wsptr[8*11] = (int) RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS-PASS1_BITS);
2455     wsptr[8*4]  = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
2456     wsptr[8*10] = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
2457     wsptr[8*5]  = (int) RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS-PASS1_BITS);
2458     wsptr[8*9]  = (int) RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS-PASS1_BITS);
2459     wsptr[8*6]  = (int) RIGHT_SHIFT(tmp26 + tmp16, CONST_BITS-PASS1_BITS);
2460     wsptr[8*8]  = (int) RIGHT_SHIFT(tmp26 - tmp16, CONST_BITS-PASS1_BITS);
2461     wsptr[8*7]  = (int) RIGHT_SHIFT(tmp27, CONST_BITS-PASS1_BITS);
2462   }
2463 
2464   /* Pass 2: process 15 rows from work array, store into output array.
        * Each workspace row holds 8 values and is expanded into 15 output
        * samples written at output_buf[ctr] + output_col.
        */
2465 
2466   wsptr = workspace;
2467   for (ctr = 0; ctr < 15; ctr++) {
2468     outptr = output_buf[ctr] + output_col;
2469 
2470     /* Even part: built from workspace entries 0, 2, 4 and 6 of this row. */
2471 
2472     /* Add range center and fudge factor for final descale and range-limit.
          * After the final right shift by CONST_BITS+PASS1_BITS+3 the first
          * term contributes RANGE_CENTER to the table index and the second
          * is one half, so the shift rounds.
          */
2473     z1 = (INT32) wsptr[0] +
2474 	   ((((INT32) RANGE_CENTER) << (PASS1_BITS+3)) +
2475 	    (ONE << (PASS1_BITS+2)));
2476     z1 <<= CONST_BITS;
2477 
2478     z2 = (INT32) wsptr[2];
2479     z3 = (INT32) wsptr[4];
2480     z4 = (INT32) wsptr[6];
2481 
2482     tmp10 = MULTIPLY(z4, FIX(0.437016024)); /* c12 */
2483     tmp11 = MULTIPLY(z4, FIX(1.144122806)); /* c6 */
2484 
2485     tmp12 = z1 - tmp10;
2486     tmp13 = z1 + tmp11;
2487     z1 -= (tmp11 - tmp10) << 1;             /* c0 = (c6-c12)*2 */
2488 
          /* z3 becomes the sum and z4 the difference of entries 2 and 4;
           * tmp10/tmp11 are scratch, recomputed for each constant pair.
           */
2489     z4 = z2 - z3;
2490     z3 += z2;
2491     tmp10 = MULTIPLY(z3, FIX(1.337628990)); /* (c2+c4)/2 */
2492     tmp11 = MULTIPLY(z4, FIX(0.045680613)); /* (c2-c4)/2 */
2493     z2 = MULTIPLY(z2, FIX(1.439773946));    /* c4+c14 */
2494 
2495     tmp20 = tmp13 + tmp10 + tmp11;
2496     tmp23 = tmp12 - tmp10 + tmp11 + z2;
2497 
2498     tmp10 = MULTIPLY(z3, FIX(0.547059574)); /* (c8+c14)/2 */
2499     tmp11 = MULTIPLY(z4, FIX(0.399234004)); /* (c8-c14)/2 */
2500 
2501     tmp25 = tmp13 - tmp10 - tmp11;
2502     tmp26 = tmp12 + tmp10 - tmp11 - z2;
2503 
2504     tmp10 = MULTIPLY(z3, FIX(0.790569415)); /* (c6+c12)/2 */
2505     tmp11 = MULTIPLY(z4, FIX(0.353553391)); /* (c6-c12)/2 */
2506 
2507     tmp21 = tmp12 + tmp10 + tmp11;
2508     tmp24 = tmp13 - tmp10 + tmp11;
2509     tmp11 += tmp11;                         /* doubled: z4 * (c6-c12) */
2510     tmp22 = z1 + tmp11;                     /* c10 = c6-c12 */
2511     tmp27 = z1 - tmp11 - tmp11;             /* c0 = (c6-c12)*2 */
2512 
2513     /* Odd part: built from workspace entries 1, 3, 5 and 7 of this row.
          * z4 holds entry 5 only until its c5 product is saved in z3, then
          * it is reloaded with entry 7.
          */
2514 
2515     z1 = (INT32) wsptr[1];
2516     z2 = (INT32) wsptr[3];
2517     z4 = (INT32) wsptr[5];
2518     z3 = MULTIPLY(z4, FIX(1.224744871));                    /* c5 */
2519     z4 = (INT32) wsptr[7];
2520 
2521     tmp13 = z2 - z4;
2522     tmp15 = MULTIPLY(z1 + tmp13, FIX(0.831253876));         /* c9 */
2523     tmp11 = tmp15 + MULTIPLY(z1, FIX(0.513743148));         /* c3-c9 */
2524     tmp14 = tmp15 - MULTIPLY(tmp13, FIX(2.176250899));      /* c3+c9 */
2525 
2526     tmp13 = MULTIPLY(z2, - FIX(0.831253876));               /* -c9 */
2527     tmp15 = MULTIPLY(z2, - FIX(1.344997024));               /* -c3 */
2528     z2 = z1 - z4;
2529     tmp12 = z3 + MULTIPLY(z2, FIX(1.406466353));            /* c1 */
2530 
2531     tmp10 = tmp12 + MULTIPLY(z4, FIX(2.457431844)) - tmp15; /* c1+c7 */
2532     tmp16 = tmp12 - MULTIPLY(z1, FIX(1.112434820)) + tmp13; /* c1-c13 */
2533     tmp12 = MULTIPLY(z2, FIX(1.224744871)) - z3;            /* c5 */
2534     z2 = MULTIPLY(z1 + z4, FIX(0.575212477));               /* c11 */
2535     tmp13 += z2 + MULTIPLY(z1, FIX(0.475753014)) - z3;      /* c7-c11 */
2536     tmp15 += z2 - MULTIPLY(z4, FIX(0.869244010)) + z3;      /* c11+c13 */
2537 
2538     /* Final output stage: samples k and 14-k are even+odd and even-odd;
          * the middle sample 7 has no odd term.  Each descaled value is
          * masked with RANGE_MASK and used as an index into range_limit.
          */
2539 
2540     outptr[0]  = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
2541 					       CONST_BITS+PASS1_BITS+3)
2542 			     & RANGE_MASK];
2543     outptr[14] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
2544 					       CONST_BITS+PASS1_BITS+3)
2545 			     & RANGE_MASK];
2546     outptr[1]  = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
2547 					       CONST_BITS+PASS1_BITS+3)
2548 			     & RANGE_MASK];
2549     outptr[13] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
2550 					       CONST_BITS+PASS1_BITS+3)
2551 			     & RANGE_MASK];
2552     outptr[2]  = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
2553 					       CONST_BITS+PASS1_BITS+3)
2554 			     & RANGE_MASK];
2555     outptr[12] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
2556 					       CONST_BITS+PASS1_BITS+3)
2557 			     & RANGE_MASK];
2558     outptr[3]  = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
2559 					       CONST_BITS+PASS1_BITS+3)
2560 			     & RANGE_MASK];
2561     outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
2562 					       CONST_BITS+PASS1_BITS+3)
2563 			     & RANGE_MASK];
2564     outptr[4]  = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
2565 					       CONST_BITS+PASS1_BITS+3)
2566 			     & RANGE_MASK];
2567     outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
2568 					       CONST_BITS+PASS1_BITS+3)
2569 			     & RANGE_MASK];
2570     outptr[5]  = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp15,
2571 					       CONST_BITS+PASS1_BITS+3)
2572 			     & RANGE_MASK];
2573     outptr[9]  = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp15,
2574 					       CONST_BITS+PASS1_BITS+3)
2575 			     & RANGE_MASK];
2576     outptr[6]  = range_limit[(int) RIGHT_SHIFT(tmp26 + tmp16,
2577 					       CONST_BITS+PASS1_BITS+3)
2578 			     & RANGE_MASK];
2579     outptr[8]  = range_limit[(int) RIGHT_SHIFT(tmp26 - tmp16,
2580 					       CONST_BITS+PASS1_BITS+3)
2581 			     & RANGE_MASK];
2582     outptr[7]  = range_limit[(int) RIGHT_SHIFT(tmp27,
2583 					       CONST_BITS+PASS1_BITS+3)
2584 			     & RANGE_MASK];
2585 
2586     wsptr += 8;		/* advance pointer to next row */
2587   }
2588 }
2589 
2590 
2591 /*
2592  * Perform dequantization and inverse DCT on one block of coefficients,
2593  * producing a 16x16 output block.
2594  *
2595  * Optimized algorithm with 28 multiplications in the 1-D kernel.
2596  * cK represents sqrt(2) * cos(K*pi/32).
      *
      * Parameters:
      *   cinfo      - decompressor state; used here only to obtain the
      *                range-limit table through IDCT_range_limit().
      *   compptr    - component info; its dct_table supplies the
      *                dequantization multipliers (read as ISLOW_MULT_TYPE).
      *   coef_block - input coefficients; for each of 8 columns, entries
      *                0..7 are read at a stride of DCTSIZE.
      *   output_buf - array of output row pointers; rows 0..15 are written.
      *   output_col - offset of the first written sample within each row.
      *
      * Pass 1 expands each of the 8 input columns into 16 workspace values;
      * pass 2 expands each of the 16 workspace rows (8 values wide) into 16
      * output samples.  Both passes use the same 16-point kernel.
      *
      * NOTE(review): "tmp0 <<= CONST_BITS" left-shifts a value that can
      * presumably be negative, which ISO C leaves undefined; this looks like
      * it relies on two's-complement shift behavior -- confirm.
2597  */
2598 
2599 GLOBAL(void)
2600 jpeg_idct_16x16 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
2601 		 JCOEFPTR coef_block,
2602 		 JSAMPARRAY output_buf, JDIMENSION output_col)
2603 {
2604   INT32 tmp0, tmp1, tmp2, tmp3, tmp10, tmp11, tmp12, tmp13;
2605   INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27;
2606   INT32 z1, z2, z3, z4;
2607   JCOEFPTR inptr;
2608   ISLOW_MULT_TYPE * quantptr;
2609   int * wsptr;
2610   JSAMPROW outptr;
2611   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
2612   int ctr;
2613   int workspace[8*16];	/* pass 1 results: 16 rows of 8 values each */
2614   SHIFT_TEMPS
2615 
2616   /* Pass 1: process columns from input, store into work array.
        * Each iteration handles one input column and writes 16 values into
        * the same column of the workspace (stride 8 between rows).
        * The DC term is shifted left by CONST_BITS and the results are
        * shifted right by only CONST_BITS-PASS1_BITS, so workspace values
        * keep PASS1_BITS extra bits for pass 2 to remove.
        */
2617 
2618   inptr = coef_block;
2619   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
2620   wsptr = workspace;
2621   for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
2622     /* Even part: coefficients 0 and 4 produce tmp10..tmp13, coefficients
          * 2 and 6 produce tmp0..tmp3; their sums and differences are kept
          * in tmp20..tmp27.
          */
2623 
2624     tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
2625     tmp0 <<= CONST_BITS;
2626     /* Add fudge factor here for final descale: this is half of
          * 2**(CONST_BITS-PASS1_BITS), so the final right shift rounds
          * instead of truncating.
          */
2627     tmp0 += ONE << (CONST_BITS-PASS1_BITS-1);
2628 
2629     z1 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
2630     tmp1 = MULTIPLY(z1, FIX(1.306562965));      /* c4[16] = c2[8] */
2631     tmp2 = MULTIPLY(z1, FIX_0_541196100);       /* c12[16] = c6[8] */
2632 
2633     tmp10 = tmp0 + tmp1;
2634     tmp11 = tmp0 - tmp1;
2635     tmp12 = tmp0 + tmp2;
2636     tmp13 = tmp0 - tmp2;
2637 
2638     z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
2639     z2 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
2640     z3 = z1 - z2;
2641     z4 = MULTIPLY(z3, FIX(0.275899379));        /* c14[16] = c7[8] */
2642     z3 = MULTIPLY(z3, FIX(1.387039845));        /* c2[16] = c1[8] */
2643 
2644     tmp0 = z3 + MULTIPLY(z2, FIX_2_562915447);  /* (c6+c2)[16] = (c3+c1)[8] */
2645     tmp1 = z4 + MULTIPLY(z1, FIX_0_899976223);  /* (c6-c14)[16] = (c3-c7)[8] */
2646     tmp2 = z3 - MULTIPLY(z1, FIX(0.601344887)); /* (c2-c10)[16] = (c1-c5)[8] */
2647     tmp3 = z4 - MULTIPLY(z2, FIX(0.509795579)); /* (c10-c14)[16] = (c5-c7)[8] */
2648 
2649     tmp20 = tmp10 + tmp0;
2650     tmp27 = tmp10 - tmp0;
2651     tmp21 = tmp12 + tmp1;
2652     tmp26 = tmp12 - tmp1;
2653     tmp22 = tmp13 + tmp2;
2654     tmp25 = tmp13 - tmp2;
2655     tmp23 = tmp11 + tmp3;
2656     tmp24 = tmp11 - tmp3;
2657 
2658     /* Odd part: built from coefficients 1, 3, 5 and 7.  The even part
          * now lives entirely in tmp20..tmp27, so tmp0..tmp3 and
          * tmp10..tmp13 are reused here for the eight odd-part results.
          */
2659 
2660     z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
2661     z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
2662     z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
2663     z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
2664 
2665     tmp11 = z1 + z3;	/* shared input of the c5 and c11 products below */
2666 
2667     tmp1  = MULTIPLY(z1 + z2, FIX(1.353318001));   /* c3 */
2668     tmp2  = MULTIPLY(tmp11,   FIX(1.247225013));   /* c5 */
2669     tmp3  = MULTIPLY(z1 + z4, FIX(1.093201867));   /* c7 */
2670     tmp10 = MULTIPLY(z1 - z4, FIX(0.897167586));   /* c9 */
2671     tmp11 = MULTIPLY(tmp11,   FIX(0.666655658));   /* c11 */
2672     tmp12 = MULTIPLY(z1 - z2, FIX(0.410524528));   /* c13 */
2673     tmp0  = tmp1 + tmp2 + tmp3 -
2674 	    MULTIPLY(z1, FIX(2.286341144));        /* c7+c5+c3-c1 */
2675     tmp13 = tmp10 + tmp11 + tmp12 -
2676 	    MULTIPLY(z1, FIX(1.835730603));        /* c9+c11+c13-c15 */
          /* z1 is no longer needed as an input and serves as scratch below. */
2677     z1    = MULTIPLY(z2 + z3, FIX(0.138617169));   /* c15 */
2678     tmp1  += z1 + MULTIPLY(z2, FIX(0.071888074));  /* c9+c11-c3-c15 */
2679     tmp2  += z1 - MULTIPLY(z3, FIX(1.125726048));  /* c5+c7+c15-c3 */
2680     z1    = MULTIPLY(z3 - z2, FIX(1.407403738));   /* c1 */
2681     tmp11 += z1 - MULTIPLY(z3, FIX(0.766367282));  /* c1+c11-c9-c13 */
2682     tmp12 += z1 + MULTIPLY(z2, FIX(1.971951411));  /* c1+c5+c13-c7 */
2683     z2    += z4;	/* z2 now holds the sum of coefficients 3 and 7 */
2684     z1    = MULTIPLY(z2, - FIX(0.666655658));      /* -c11 */
2685     tmp1  += z1;
2686     tmp3  += z1 + MULTIPLY(z4, FIX(1.065388962));  /* c3+c11+c15-c7 */
2687     z2    = MULTIPLY(z2, - FIX(1.247225013));      /* -c5 */
2688     tmp10 += z2 + MULTIPLY(z4, FIX(3.141271809));  /* c1+c5+c9-c13 */
2689     tmp12 += z2;
2690     z2    = MULTIPLY(z3 + z4, - FIX(1.353318001)); /* -c3 */
2691     tmp2  += z2;
2692     tmp3  += z2;
2693     z2    = MULTIPLY(z4 - z3, FIX(0.410524528));   /* c13 */
2694     tmp10 += z2;
2695     tmp11 += z2;
2696 
2697     /* Final output stage: workspace rows k and 15-k receive even+odd
          * and even-odd respectively.
          */
2698 
2699     wsptr[8*0]  = (int) RIGHT_SHIFT(tmp20 + tmp0,  CONST_BITS-PASS1_BITS);
2700     wsptr[8*15] = (int) RIGHT_SHIFT(tmp20 - tmp0,  CONST_BITS-PASS1_BITS);
2701     wsptr[8*1]  = (int) RIGHT_SHIFT(tmp21 + tmp1,  CONST_BITS-PASS1_BITS);
2702     wsptr[8*14] = (int) RIGHT_SHIFT(tmp21 - tmp1,  CONST_BITS-PASS1_BITS);
2703     wsptr[8*2]  = (int) RIGHT_SHIFT(tmp22 + tmp2,  CONST_BITS-PASS1_BITS);
2704     wsptr[8*13] = (int) RIGHT_SHIFT(tmp22 - tmp2,  CONST_BITS-PASS1_BITS);
2705     wsptr[8*3]  = (int) RIGHT_SHIFT(tmp23 + tmp3,  CONST_BITS-PASS1_BITS);
2706     wsptr[8*12] = (int) RIGHT_SHIFT(tmp23 - tmp3,  CONST_BITS-PASS1_BITS);
2707     wsptr[8*4]  = (int) RIGHT_SHIFT(tmp24 + tmp10, CONST_BITS-PASS1_BITS);
2708     wsptr[8*11] = (int) RIGHT_SHIFT(tmp24 - tmp10, CONST_BITS-PASS1_BITS);
2709     wsptr[8*5]  = (int) RIGHT_SHIFT(tmp25 + tmp11, CONST_BITS-PASS1_BITS);
2710     wsptr[8*10] = (int) RIGHT_SHIFT(tmp25 - tmp11, CONST_BITS-PASS1_BITS);
2711     wsptr[8*6]  = (int) RIGHT_SHIFT(tmp26 + tmp12, CONST_BITS-PASS1_BITS);
2712     wsptr[8*9]  = (int) RIGHT_SHIFT(tmp26 - tmp12, CONST_BITS-PASS1_BITS);
2713     wsptr[8*7]  = (int) RIGHT_SHIFT(tmp27 + tmp13, CONST_BITS-PASS1_BITS);
2714     wsptr[8*8]  = (int) RIGHT_SHIFT(tmp27 - tmp13, CONST_BITS-PASS1_BITS);
2715   }
2716 
2717   /* Pass 2: process 16 rows from work array, store into output array.
        * Each workspace row holds 8 values and is expanded into 16 output
        * samples written at output_buf[ctr] + output_col.
        */
2718 
2719   wsptr = workspace;
2720   for (ctr = 0; ctr < 16; ctr++) {
2721     outptr = output_buf[ctr] + output_col;
2722 
2723     /* Even part: workspace entries 0 and 4 produce tmp10..tmp13, entries
          * 2 and 6 produce tmp0..tmp3; results are kept in tmp20..tmp27.
          */
2724 
2725     /* Add range center and fudge factor for final descale and range-limit.
          * After the final right shift by CONST_BITS+PASS1_BITS+3 the first
          * term contributes RANGE_CENTER to the table index and the second
          * is one half, so the shift rounds.
          */
2726     tmp0 = (INT32) wsptr[0] +
2727 	     ((((INT32) RANGE_CENTER) << (PASS1_BITS+3)) +
2728 	      (ONE << (PASS1_BITS+2)));
2729     tmp0 <<= CONST_BITS;
2730 
2731     z1 = (INT32) wsptr[4];
2732     tmp1 = MULTIPLY(z1, FIX(1.306562965));      /* c4[16] = c2[8] */
2733     tmp2 = MULTIPLY(z1, FIX_0_541196100);       /* c12[16] = c6[8] */
2734 
2735     tmp10 = tmp0 + tmp1;
2736     tmp11 = tmp0 - tmp1;
2737     tmp12 = tmp0 + tmp2;
2738     tmp13 = tmp0 - tmp2;
2739 
2740     z1 = (INT32) wsptr[2];
2741     z2 = (INT32) wsptr[6];
2742     z3 = z1 - z2;
2743     z4 = MULTIPLY(z3, FIX(0.275899379));        /* c14[16] = c7[8] */
2744     z3 = MULTIPLY(z3, FIX(1.387039845));        /* c2[16] = c1[8] */
2745 
2746     tmp0 = z3 + MULTIPLY(z2, FIX_2_562915447);  /* (c6+c2)[16] = (c3+c1)[8] */
2747     tmp1 = z4 + MULTIPLY(z1, FIX_0_899976223);  /* (c6-c14)[16] = (c3-c7)[8] */
2748     tmp2 = z3 - MULTIPLY(z1, FIX(0.601344887)); /* (c2-c10)[16] = (c1-c5)[8] */
2749     tmp3 = z4 - MULTIPLY(z2, FIX(0.509795579)); /* (c10-c14)[16] = (c5-c7)[8] */
2750 
2751     tmp20 = tmp10 + tmp0;
2752     tmp27 = tmp10 - tmp0;
2753     tmp21 = tmp12 + tmp1;
2754     tmp26 = tmp12 - tmp1;
2755     tmp22 = tmp13 + tmp2;
2756     tmp25 = tmp13 - tmp2;
2757     tmp23 = tmp11 + tmp3;
2758     tmp24 = tmp11 - tmp3;
2759 
2760     /* Odd part: built from workspace entries 1, 3, 5 and 7.  tmp0..tmp3
          * and tmp10..tmp13 are reused for the eight odd-part results.
          */
2761 
2762     z1 = (INT32) wsptr[1];
2763     z2 = (INT32) wsptr[3];
2764     z3 = (INT32) wsptr[5];
2765     z4 = (INT32) wsptr[7];
2766 
2767     tmp11 = z1 + z3;	/* shared input of the c5 and c11 products below */
2768 
2769     tmp1  = MULTIPLY(z1 + z2, FIX(1.353318001));   /* c3 */
2770     tmp2  = MULTIPLY(tmp11,   FIX(1.247225013));   /* c5 */
2771     tmp3  = MULTIPLY(z1 + z4, FIX(1.093201867));   /* c7 */
2772     tmp10 = MULTIPLY(z1 - z4, FIX(0.897167586));   /* c9 */
2773     tmp11 = MULTIPLY(tmp11,   FIX(0.666655658));   /* c11 */
2774     tmp12 = MULTIPLY(z1 - z2, FIX(0.410524528));   /* c13 */
2775     tmp0  = tmp1 + tmp2 + tmp3 -
2776 	    MULTIPLY(z1, FIX(2.286341144));        /* c7+c5+c3-c1 */
2777     tmp13 = tmp10 + tmp11 + tmp12 -
2778 	    MULTIPLY(z1, FIX(1.835730603));        /* c9+c11+c13-c15 */
          /* z1 is no longer needed as an input and serves as scratch below. */
2779     z1    = MULTIPLY(z2 + z3, FIX(0.138617169));   /* c15 */
2780     tmp1  += z1 + MULTIPLY(z2, FIX(0.071888074));  /* c9+c11-c3-c15 */
2781     tmp2  += z1 - MULTIPLY(z3, FIX(1.125726048));  /* c5+c7+c15-c3 */
2782     z1    = MULTIPLY(z3 - z2, FIX(1.407403738));   /* c1 */
2783     tmp11 += z1 - MULTIPLY(z3, FIX(0.766367282));  /* c1+c11-c9-c13 */
2784     tmp12 += z1 + MULTIPLY(z2, FIX(1.971951411));  /* c1+c5+c13-c7 */
2785     z2    += z4;	/* z2 now holds the sum of entries 3 and 7 */
2786     z1    = MULTIPLY(z2, - FIX(0.666655658));      /* -c11 */
2787     tmp1  += z1;
2788     tmp3  += z1 + MULTIPLY(z4, FIX(1.065388962));  /* c3+c11+c15-c7 */
2789     z2    = MULTIPLY(z2, - FIX(1.247225013));      /* -c5 */
2790     tmp10 += z2 + MULTIPLY(z4, FIX(3.141271809));  /* c1+c5+c9-c13 */
2791     tmp12 += z2;
2792     z2    = MULTIPLY(z3 + z4, - FIX(1.353318001)); /* -c3 */
2793     tmp2  += z2;
2794     tmp3  += z2;
2795     z2    = MULTIPLY(z4 - z3, FIX(0.410524528));   /* c13 */
2796     tmp10 += z2;
2797     tmp11 += z2;
2798 
2799     /* Final output stage: samples k and 15-k are even+odd and even-odd.
          * Each descaled value is masked with RANGE_MASK and used as an
          * index into range_limit.
          */
2800 
2801     outptr[0]  = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp0,
2802 					       CONST_BITS+PASS1_BITS+3)
2803 			     & RANGE_MASK];
2804     outptr[15] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp0,
2805 					       CONST_BITS+PASS1_BITS+3)
2806 			     & RANGE_MASK];
2807     outptr[1]  = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp1,
2808 					       CONST_BITS+PASS1_BITS+3)
2809 			     & RANGE_MASK];
2810     outptr[14] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp1,
2811 					       CONST_BITS+PASS1_BITS+3)
2812 			     & RANGE_MASK];
2813     outptr[2]  = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp2,
2814 					       CONST_BITS+PASS1_BITS+3)
2815 			     & RANGE_MASK];
2816     outptr[13] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp2,
2817 					       CONST_BITS+PASS1_BITS+3)
2818 			     & RANGE_MASK];
2819     outptr[3]  = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp3,
2820 					       CONST_BITS+PASS1_BITS+3)
2821 			     & RANGE_MASK];
2822     outptr[12] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp3,
2823 					       CONST_BITS+PASS1_BITS+3)
2824 			     & RANGE_MASK];
2825     outptr[4]  = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp10,
2826 					       CONST_BITS+PASS1_BITS+3)
2827 			     & RANGE_MASK];
2828     outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp10,
2829 					       CONST_BITS+PASS1_BITS+3)
2830 			     & RANGE_MASK];
2831     outptr[5]  = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp11,
2832 					       CONST_BITS+PASS1_BITS+3)
2833 			     & RANGE_MASK];
2834     outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp11,
2835 					       CONST_BITS+PASS1_BITS+3)
2836 			     & RANGE_MASK];
2837     outptr[6]  = range_limit[(int) RIGHT_SHIFT(tmp26 + tmp12,
2838 					       CONST_BITS+PASS1_BITS+3)
2839 			     & RANGE_MASK];
2840     outptr[9]  = range_limit[(int) RIGHT_SHIFT(tmp26 - tmp12,
2841 					       CONST_BITS+PASS1_BITS+3)
2842 			     & RANGE_MASK];
2843     outptr[7]  = range_limit[(int) RIGHT_SHIFT(tmp27 + tmp13,
2844 					       CONST_BITS+PASS1_BITS+3)
2845 			     & RANGE_MASK];
2846     outptr[8]  = range_limit[(int) RIGHT_SHIFT(tmp27 - tmp13,
2847 					       CONST_BITS+PASS1_BITS+3)
2848 			     & RANGE_MASK];
2849 
2850     wsptr += 8;		/* advance pointer to next row */
2851   }
2852 }
2853 
2854 
2855 /*
2856  * Perform dequantization and inverse DCT on one block of coefficients,
2857  * producing a 16x8 output block.
2858  *
2859  * 8-point IDCT in pass 1 (columns), 16-point in pass 2 (rows).
      *
      * Parameters:
      *   cinfo      - decompressor state; used here only to obtain the
      *                range-limit table through IDCT_range_limit().
      *   compptr    - component info; its dct_table supplies the
      *                dequantization multipliers (read as ISLOW_MULT_TYPE).
      *   coef_block - input coefficients; for each column, entries 0..7 are
      *                read at a stride of DCTSIZE.
      *   output_buf - array of output row pointers; rows 0..7 are written.
      *   output_col - offset of the first written sample within each row;
      *                16 samples are written per row.
      *
      * NOTE(review): pass 1 indexes the workspace with a stride of DCTSIZE
      * while the workspace is declared as 8*8 and pass 2 steps through it
      * with a stride of 8; this is consistent only if DCTSIZE is 8, which
      * is defined outside this function -- confirm.
      *
      * NOTE(review): left shifts such as "z2 <<= CONST_BITS" and the
      * "<< PASS1_BITS" in the DC-only shortcut operate on values that can
      * presumably be negative, which ISO C leaves undefined; this looks
      * like it relies on two's-complement shift behavior -- confirm.
2860  */
2861 
2862 GLOBAL(void)
2863 jpeg_idct_16x8 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
2864 		JCOEFPTR coef_block,
2865 		JSAMPARRAY output_buf, JDIMENSION output_col)
2866 {
2867   INT32 tmp0, tmp1, tmp2, tmp3, tmp10, tmp11, tmp12, tmp13;
2868   INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27;
2869   INT32 z1, z2, z3, z4;
2870   JCOEFPTR inptr;
2871   ISLOW_MULT_TYPE * quantptr;
2872   int * wsptr;
2873   JSAMPROW outptr;
2874   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
2875   int ctr;
2876   int workspace[8*8];	/* pass 1 results: 8 rows of 8 values each */
2877   SHIFT_TEMPS
2878 
2879   /* Pass 1: process columns from input, store into work array.
2880    * Note results are scaled up by sqrt(8) compared to a true IDCT;
2881    * furthermore, we scale the results by 2**PASS1_BITS.
2882    * 8-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/16).
        * The loop counts DCTSIZE columns down to zero; the pointers are
        * advanced by hand at the end of the body and also inside the
        * shortcut branch, because that branch leaves via "continue".
2883    */
2884 
2885   inptr = coef_block;
2886   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
2887   wsptr = workspace;
2888   for (ctr = DCTSIZE; ctr > 0; ctr--) {
2889     /* Due to quantization, we will usually find that many of the input
2890      * coefficients are zero, especially the AC terms.  We can exploit this
2891      * by short-circuiting the IDCT calculation for any column in which all
2892      * the AC terms are zero.  In that case each output is equal to the
2893      * DC coefficient (with scale factor as needed).
2894      * With typical images and quantization tables, half or more of the
2895      * column DCT calculations can be simplified this way.
2896      */
2897 
2898     if (inptr[DCTSIZE*1] == 0 && inptr[DCTSIZE*2] == 0 &&
2899 	inptr[DCTSIZE*3] == 0 && inptr[DCTSIZE*4] == 0 &&
2900 	inptr[DCTSIZE*5] == 0 && inptr[DCTSIZE*6] == 0 &&
2901 	inptr[DCTSIZE*7] == 0) {
2902       /* AC terms all zero: replicate the dequantized DC value, scaled
            * up by 2**PASS1_BITS, into all eight rows of this column.
            */
2903       int dcval = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) << PASS1_BITS;
2904 
2905       wsptr[DCTSIZE*0] = dcval;
2906       wsptr[DCTSIZE*1] = dcval;
2907       wsptr[DCTSIZE*2] = dcval;
2908       wsptr[DCTSIZE*3] = dcval;
2909       wsptr[DCTSIZE*4] = dcval;
2910       wsptr[DCTSIZE*5] = dcval;
2911       wsptr[DCTSIZE*6] = dcval;
2912       wsptr[DCTSIZE*7] = dcval;
2913 
2914       inptr++;			/* advance pointers to next column */
2915       quantptr++;
2916       wsptr++;
2917       continue;
2918     }
2919 
2920     /* Even part: reverse the even part of the forward DCT.
2921      * The rotator is c(-6).
          * Coefficients 0 and 4 give tmp0/tmp1, coefficients 2 and 6 give
          * tmp2/tmp3; the combined results are tmp10..tmp13.
2922      */
2923 
2924     z2 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
2925     z3 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
2926     z2 <<= CONST_BITS;
2927     z3 <<= CONST_BITS;
2928     /* Add fudge factor here for final descale: this is half of
          * 2**(CONST_BITS-PASS1_BITS), so the final right shift rounds
          * instead of truncating.
          */
2929     z2 += ONE << (CONST_BITS-PASS1_BITS-1);
2930 
2931     tmp0 = z2 + z3;
2932     tmp1 = z2 - z3;
2933 
2934     z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
2935     z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
2936 
2937     z1 = MULTIPLY(z2 + z3, FIX_0_541196100);       /* c6 */
2938     tmp2 = z1 + MULTIPLY(z2, FIX_0_765366865);     /* c2-c6 */
2939     tmp3 = z1 - MULTIPLY(z3, FIX_1_847759065);     /* c2+c6 */
2940 
2941     tmp10 = tmp0 + tmp2;
2942     tmp13 = tmp0 - tmp2;
2943     tmp11 = tmp1 + tmp3;
2944     tmp12 = tmp1 - tmp3;
2945 
2946     /* Odd part per figure 8; the matrix is unitary and hence its
2947      * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
          * tmp0..tmp3 are reused here: the even part is now held in
          * tmp10..tmp13.
2948      */
2949 
2950     tmp0 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
2951     tmp1 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
2952     tmp2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
2953     tmp3 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
2954 
2955     z2 = tmp0 + tmp2;
2956     z3 = tmp1 + tmp3;
2957 
2958     z1 = MULTIPLY(z2 + z3, FIX_1_175875602);       /*  c3 */
2959     z2 = MULTIPLY(z2, - FIX_1_961570560);          /* -c3-c5 */
2960     z3 = MULTIPLY(z3, - FIX_0_390180644);          /* -c3+c5 */
2961     z2 += z1;
2962     z3 += z1;
2963 
2964     z1 = MULTIPLY(tmp0 + tmp3, - FIX_0_899976223); /* -c3+c7 */
2965     tmp0 = MULTIPLY(tmp0, FIX_0_298631336);        /* -c1+c3+c5-c7 */
2966     tmp3 = MULTIPLY(tmp3, FIX_1_501321110);        /*  c1+c3-c5-c7 */
2967     tmp0 += z1 + z2;
2968     tmp3 += z1 + z3;
2969 
2970     z1 = MULTIPLY(tmp1 + tmp2, - FIX_2_562915447); /* -c1-c3 */
2971     tmp1 = MULTIPLY(tmp1, FIX_2_053119869);        /*  c1+c3-c5+c7 */
2972     tmp2 = MULTIPLY(tmp2, FIX_3_072711026);        /*  c1+c3+c5-c7 */
2973     tmp1 += z1 + z3;
2974     tmp2 += z1 + z2;
2975 
2976     /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3.
          * Workspace rows k and 7-k receive even+odd and even-odd.
          */
2977 
2978     wsptr[DCTSIZE*0] = (int) RIGHT_SHIFT(tmp10 + tmp3, CONST_BITS-PASS1_BITS);
2979     wsptr[DCTSIZE*7] = (int) RIGHT_SHIFT(tmp10 - tmp3, CONST_BITS-PASS1_BITS);
2980     wsptr[DCTSIZE*1] = (int) RIGHT_SHIFT(tmp11 + tmp2, CONST_BITS-PASS1_BITS);
2981     wsptr[DCTSIZE*6] = (int) RIGHT_SHIFT(tmp11 - tmp2, CONST_BITS-PASS1_BITS);
2982     wsptr[DCTSIZE*2] = (int) RIGHT_SHIFT(tmp12 + tmp1, CONST_BITS-PASS1_BITS);
2983     wsptr[DCTSIZE*5] = (int) RIGHT_SHIFT(tmp12 - tmp1, CONST_BITS-PASS1_BITS);
2984     wsptr[DCTSIZE*3] = (int) RIGHT_SHIFT(tmp13 + tmp0, CONST_BITS-PASS1_BITS);
2985     wsptr[DCTSIZE*4] = (int) RIGHT_SHIFT(tmp13 - tmp0, CONST_BITS-PASS1_BITS);
2986 
2987     inptr++;			/* advance pointers to next column */
2988     quantptr++;
2989     wsptr++;
2990   }
2991 
2992   /* Pass 2: process 8 rows from work array, store into output array.
2993    * 16-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/32).
        * Each workspace row holds 8 values and is expanded into 16 output
        * samples written at output_buf[ctr] + output_col.
2994    */
2995 
2996   wsptr = workspace;
2997   for (ctr = 0; ctr < 8; ctr++) {
2998     outptr = output_buf[ctr] + output_col;
2999 
3000     /* Even part: workspace entries 0 and 4 produce tmp10..tmp13, entries
          * 2 and 6 produce tmp0..tmp3; results are kept in tmp20..tmp27.
          */
3001 
3002     /* Add range center and fudge factor for final descale and range-limit.
          * After the final right shift by CONST_BITS+PASS1_BITS+3 the first
          * term contributes RANGE_CENTER to the table index and the second
          * is one half, so the shift rounds.
          */
3003     tmp0 = (INT32) wsptr[0] +
3004 	     ((((INT32) RANGE_CENTER) << (PASS1_BITS+3)) +
3005 	      (ONE << (PASS1_BITS+2)));
3006     tmp0 <<= CONST_BITS;
3007 
3008     z1 = (INT32) wsptr[4];
3009     tmp1 = MULTIPLY(z1, FIX(1.306562965));      /* c4[16] = c2[8] */
3010     tmp2 = MULTIPLY(z1, FIX_0_541196100);       /* c12[16] = c6[8] */
3011 
3012     tmp10 = tmp0 + tmp1;
3013     tmp11 = tmp0 - tmp1;
3014     tmp12 = tmp0 + tmp2;
3015     tmp13 = tmp0 - tmp2;
3016 
3017     z1 = (INT32) wsptr[2];
3018     z2 = (INT32) wsptr[6];
3019     z3 = z1 - z2;
3020     z4 = MULTIPLY(z3, FIX(0.275899379));        /* c14[16] = c7[8] */
3021     z3 = MULTIPLY(z3, FIX(1.387039845));        /* c2[16] = c1[8] */
3022 
3023     tmp0 = z3 + MULTIPLY(z2, FIX_2_562915447);  /* (c6+c2)[16] = (c3+c1)[8] */
3024     tmp1 = z4 + MULTIPLY(z1, FIX_0_899976223);  /* (c6-c14)[16] = (c3-c7)[8] */
3025     tmp2 = z3 - MULTIPLY(z1, FIX(0.601344887)); /* (c2-c10)[16] = (c1-c5)[8] */
3026     tmp3 = z4 - MULTIPLY(z2, FIX(0.509795579)); /* (c10-c14)[16] = (c5-c7)[8] */
3027 
3028     tmp20 = tmp10 + tmp0;
3029     tmp27 = tmp10 - tmp0;
3030     tmp21 = tmp12 + tmp1;
3031     tmp26 = tmp12 - tmp1;
3032     tmp22 = tmp13 + tmp2;
3033     tmp25 = tmp13 - tmp2;
3034     tmp23 = tmp11 + tmp3;
3035     tmp24 = tmp11 - tmp3;
3036 
3037     /* Odd part: built from workspace entries 1, 3, 5 and 7.  tmp0..tmp3
          * and tmp10..tmp13 are reused for the eight odd-part results.
          */
3038 
3039     z1 = (INT32) wsptr[1];
3040     z2 = (INT32) wsptr[3];
3041     z3 = (INT32) wsptr[5];
3042     z4 = (INT32) wsptr[7];
3043 
3044     tmp11 = z1 + z3;	/* shared input of the c5 and c11 products below */
3045 
3046     tmp1  = MULTIPLY(z1 + z2, FIX(1.353318001));   /* c3 */
3047     tmp2  = MULTIPLY(tmp11,   FIX(1.247225013));   /* c5 */
3048     tmp3  = MULTIPLY(z1 + z4, FIX(1.093201867));   /* c7 */
3049     tmp10 = MULTIPLY(z1 - z4, FIX(0.897167586));   /* c9 */
3050     tmp11 = MULTIPLY(tmp11,   FIX(0.666655658));   /* c11 */
3051     tmp12 = MULTIPLY(z1 - z2, FIX(0.410524528));   /* c13 */
3052     tmp0  = tmp1 + tmp2 + tmp3 -
3053 	    MULTIPLY(z1, FIX(2.286341144));        /* c7+c5+c3-c1 */
3054     tmp13 = tmp10 + tmp11 + tmp12 -
3055 	    MULTIPLY(z1, FIX(1.835730603));        /* c9+c11+c13-c15 */
          /* z1 is no longer needed as an input and serves as scratch below. */
3056     z1    = MULTIPLY(z2 + z3, FIX(0.138617169));   /* c15 */
3057     tmp1  += z1 + MULTIPLY(z2, FIX(0.071888074));  /* c9+c11-c3-c15 */
3058     tmp2  += z1 - MULTIPLY(z3, FIX(1.125726048));  /* c5+c7+c15-c3 */
3059     z1    = MULTIPLY(z3 - z2, FIX(1.407403738));   /* c1 */
3060     tmp11 += z1 - MULTIPLY(z3, FIX(0.766367282));  /* c1+c11-c9-c13 */
3061     tmp12 += z1 + MULTIPLY(z2, FIX(1.971951411));  /* c1+c5+c13-c7 */
3062     z2    += z4;	/* z2 now holds the sum of entries 3 and 7 */
3063     z1    = MULTIPLY(z2, - FIX(0.666655658));      /* -c11 */
3064     tmp1  += z1;
3065     tmp3  += z1 + MULTIPLY(z4, FIX(1.065388962));  /* c3+c11+c15-c7 */
3066     z2    = MULTIPLY(z2, - FIX(1.247225013));      /* -c5 */
3067     tmp10 += z2 + MULTIPLY(z4, FIX(3.141271809));  /* c1+c5+c9-c13 */
3068     tmp12 += z2;
3069     z2    = MULTIPLY(z3 + z4, - FIX(1.353318001)); /* -c3 */
3070     tmp2  += z2;
3071     tmp3  += z2;
3072     z2    = MULTIPLY(z4 - z3, FIX(0.410524528));   /* c13 */
3073     tmp10 += z2;
3074     tmp11 += z2;
3075 
3076     /* Final output stage: samples k and 15-k are even+odd and even-odd.
          * Each descaled value is masked with RANGE_MASK and used as an
          * index into range_limit.
          */
3077 
3078     outptr[0]  = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp0,
3079 					       CONST_BITS+PASS1_BITS+3)
3080 			     & RANGE_MASK];
3081     outptr[15] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp0,
3082 					       CONST_BITS+PASS1_BITS+3)
3083 			     & RANGE_MASK];
3084     outptr[1]  = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp1,
3085 					       CONST_BITS+PASS1_BITS+3)
3086 			     & RANGE_MASK];
3087     outptr[14] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp1,
3088 					       CONST_BITS+PASS1_BITS+3)
3089 			     & RANGE_MASK];
3090     outptr[2]  = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp2,
3091 					       CONST_BITS+PASS1_BITS+3)
3092 			     & RANGE_MASK];
3093     outptr[13] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp2,
3094 					       CONST_BITS+PASS1_BITS+3)
3095 			     & RANGE_MASK];
3096     outptr[3]  = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp3,
3097 					       CONST_BITS+PASS1_BITS+3)
3098 			     & RANGE_MASK];
3099     outptr[12] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp3,
3100 					       CONST_BITS+PASS1_BITS+3)
3101 			     & RANGE_MASK];
3102     outptr[4]  = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp10,
3103 					       CONST_BITS+PASS1_BITS+3)
3104 			     & RANGE_MASK];
3105     outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp10,
3106 					       CONST_BITS+PASS1_BITS+3)
3107 			     & RANGE_MASK];
3108     outptr[5]  = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp11,
3109 					       CONST_BITS+PASS1_BITS+3)
3110 			     & RANGE_MASK];
3111     outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp11,
3112 					       CONST_BITS+PASS1_BITS+3)
3113 			     & RANGE_MASK];
3114     outptr[6]  = range_limit[(int) RIGHT_SHIFT(tmp26 + tmp12,
3115 					       CONST_BITS+PASS1_BITS+3)
3116 			     & RANGE_MASK];
3117     outptr[9]  = range_limit[(int) RIGHT_SHIFT(tmp26 - tmp12,
3118 					       CONST_BITS+PASS1_BITS+3)
3119 			     & RANGE_MASK];
3120     outptr[7]  = range_limit[(int) RIGHT_SHIFT(tmp27 + tmp13,
3121 					       CONST_BITS+PASS1_BITS+3)
3122 			     & RANGE_MASK];
3123     outptr[8]  = range_limit[(int) RIGHT_SHIFT(tmp27 - tmp13,
3124 					       CONST_BITS+PASS1_BITS+3)
3125 			     & RANGE_MASK];
3126 
3127     wsptr += 8;		/* advance pointer to next row */
3128   }
3129 }
3130 
3131 
3132 /*
3133  * Perform dequantization and inverse DCT on one block of coefficients,
3134  * producing a 14x7 output block.
3135  *
3136  * 7-point IDCT in pass 1 (columns), 14-point in pass 2 (rows).
      *
      * Arguments, as used by this routine:
      *   cinfo       only consulted to obtain the range-limit table.
      *   compptr     supplies dct_table, the per-coefficient multipliers
      *               applied by DEQUANTIZE.
      *   coef_block  input coefficients; rows 0..6 of all 8 columns are
      *               read, row 7 is never referenced.
      *   output_buf  array of output row pointers; rows 0..6 are written.
      *   output_col  offset added to every row pointer; 14 samples are
      *               stored per row starting at that offset.
3137  */
3138 
3139 GLOBAL(void)
3140 jpeg_idct_14x7 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
3141 		JCOEFPTR coef_block,
3142 		JSAMPARRAY output_buf, JDIMENSION output_col)
3143 {
3144   INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
3145   INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26;
3146   INT32 z1, z2, z3, z4;
3147   JCOEFPTR inptr;
3148   ISLOW_MULT_TYPE * quantptr;
3149   int * wsptr;
3150   JSAMPROW outptr;
3151   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
3152   int ctr;
       /* Workspace layout: 7 rows of 8 values, row stride 8. */
3153   int workspace[8*7];	/* buffers data between passes */
3154   SHIFT_TEMPS
3155 
3156   /* Pass 1: process columns from input, store into work array.
3157    * 7-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/14).
3158    */
3159 
3160   inptr = coef_block;
3161   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
3162   wsptr = workspace;
       /* One iteration per input column: inptr, quantptr and wsptr advance
        * together by one element, DCTSIZE*k selects input row k and
        * 8*k selects workspace row k.
        */
3163   for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
3164     /* Even part */
3165 
         /* tmp23 holds the DC term at CONST_BITS scale plus the rounding
          * constant; it is folded into every even-part result below.
          */
3166     tmp23 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
3167     tmp23 <<= CONST_BITS;
3168     /* Add fudge factor here for final descale. */
3169     tmp23 += ONE << (CONST_BITS-PASS1_BITS-1);
3170 
3171     z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
3172     z2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
3173     z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
3174 
3175     tmp20 = MULTIPLY(z2 - z3, FIX(0.881747734));       /* c4 */
3176     tmp22 = MULTIPLY(z1 - z2, FIX(0.314692123));       /* c6 */
3177     tmp21 = tmp20 + tmp22 + tmp23 - MULTIPLY(z2, FIX(1.841218003)); /* c2+c4-c6 */
3178     tmp10 = z1 + z3;
         /* z2 becomes z2 - (z1 + z3); it only feeds the centre row (tmp23). */
3179     z2 -= tmp10;
3180     tmp10 = MULTIPLY(tmp10, FIX(1.274162392)) + tmp23; /* c2 */
3181     tmp20 += tmp10 - MULTIPLY(z3, FIX(0.077722536));   /* c2-c4-c6 */
3182     tmp22 += tmp10 - MULTIPLY(z1, FIX(2.470602249));   /* c2+c4+c6 */
3183     tmp23 += MULTIPLY(z2, FIX(1.414213562));           /* c0 */
3184 
3185     /* Odd part */
3186 
         /* z1..z3 and tmp10..tmp12 are reused here for the odd terms. */
3187     z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
3188     z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
3189     z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
3190 
3191     tmp11 = MULTIPLY(z1 + z2, FIX(0.935414347));       /* (c3+c1-c5)/2 */
3192     tmp12 = MULTIPLY(z1 - z2, FIX(0.170262339));       /* (c3+c5-c1)/2 */
3193     tmp10 = tmp11 - tmp12;
3194     tmp11 += tmp12;
3195     tmp12 = MULTIPLY(z2 + z3, - FIX(1.378756276));     /* -c1 */
3196     tmp11 += tmp12;
3197     z2 = MULTIPLY(z1 + z3, FIX(0.613604268));          /* c5 */
3198     tmp10 += z2;
3199     tmp12 += z2 + MULTIPLY(z3, FIX(1.870828693));      /* c3+c1-c5 */
3200 
3201     /* Final output stage */
3202 
         /* Rows k and 6-k are the sum and difference of even part tmp2k and
          * odd part tmp1k; the centre row 3 has no odd contribution.
          * Results keep PASS1_BITS extra fraction bits for pass 2.
          */
3203     wsptr[8*0] = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
3204     wsptr[8*6] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
3205     wsptr[8*1] = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
3206     wsptr[8*5] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
3207     wsptr[8*2] = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
3208     wsptr[8*4] = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
3209     wsptr[8*3] = (int) RIGHT_SHIFT(tmp23, CONST_BITS-PASS1_BITS);
3210   }
3211 
3212   /* Pass 2: process 7 rows from work array, store into output array.
3213    * 14-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/28).
3214    */
3215 
3216   wsptr = workspace;
       /* One iteration per workspace row: 8 inputs wsptr[0..7] are expanded
        * into 14 output samples outptr[0..13].
        */
3217   for (ctr = 0; ctr < 7; ctr++) {
3218     outptr = output_buf[ctr] + output_col;
3219 
3220     /* Even part */
3221 
3222     /* Add range center and fudge factor for final descale and range-limit. */
3223     z1 = (INT32) wsptr[0] +
3224 	   ((((INT32) RANGE_CENTER) << (PASS1_BITS+3)) +
3225 	    (ONE << (PASS1_BITS+2)));
3226     z1 <<= CONST_BITS;
3227     z4 = (INT32) wsptr[4];
3228     z2 = MULTIPLY(z4, FIX(1.274162392));         /* c4 */
3229     z3 = MULTIPLY(z4, FIX(0.314692123));         /* c12 */
3230     z4 = MULTIPLY(z4, FIX(0.881747734));         /* c8 */
3231 
3232     tmp10 = z1 + z2;
3233     tmp11 = z1 + z3;
3234     tmp12 = z1 - z4;
3235 
3236     tmp23 = z1 - ((z2 + z3 - z4) << 1);          /* c0 = (c4+c12-c8)*2 */
3237 
3238     z1 = (INT32) wsptr[2];
3239     z2 = (INT32) wsptr[6];
3240 
3241     z3 = MULTIPLY(z1 + z2, FIX(1.105676686));    /* c6 */
3242 
3243     tmp13 = z3 + MULTIPLY(z1, FIX(0.273079590)); /* c2-c6 */
3244     tmp14 = z3 - MULTIPLY(z2, FIX(1.719280954)); /* c6+c10 */
3245     tmp15 = MULTIPLY(z1, FIX(0.613604268)) -     /* c10 */
3246 	    MULTIPLY(z2, FIX(1.378756276));      /* c2 */
3247 
         /* tmp20..tmp26 are the seven even-part values consumed by the
          * output stage (tmp23 was set above).
          */
3248     tmp20 = tmp10 + tmp13;
3249     tmp26 = tmp10 - tmp13;
3250     tmp21 = tmp11 + tmp14;
3251     tmp25 = tmp11 - tmp14;
3252     tmp22 = tmp12 + tmp15;
3253     tmp24 = tmp12 - tmp15;
3254 
3255     /* Odd part */
3256 
3257     z1 = (INT32) wsptr[1];
3258     z2 = (INT32) wsptr[3];
3259     z3 = (INT32) wsptr[5];
3260     z4 = (INT32) wsptr[7];
         /* z4 is used below without a multiply, so it is only brought to
          * CONST_BITS scale here.
          */
3261     z4 <<= CONST_BITS;
3262 
3263     tmp14 = z1 + z3;
3264     tmp11 = MULTIPLY(z1 + z2, FIX(1.334852607));           /* c3 */
3265     tmp12 = MULTIPLY(tmp14, FIX(1.197448846));             /* c5 */
3266     tmp10 = tmp11 + tmp12 + z4 - MULTIPLY(z1, FIX(1.126980169)); /* c3+c5-c1 */
3267     tmp14 = MULTIPLY(tmp14, FIX(0.752406978));             /* c9 */
3268     tmp16 = tmp14 - MULTIPLY(z1, FIX(1.061150426));        /* c9+c11-c13 */
         /* From here on z1 means wsptr[1] - wsptr[3]. */
3269     z1    -= z2;
3270     tmp15 = MULTIPLY(z1, FIX(0.467085129)) - z4;           /* c11 */
3271     tmp16 += tmp15;
3272     tmp13 = MULTIPLY(z2 + z3, - FIX(0.158341681)) - z4;    /* -c13 */
3273     tmp11 += tmp13 - MULTIPLY(z2, FIX(0.424103948));       /* c3-c9-c13 */
3274     tmp12 += tmp13 - MULTIPLY(z3, FIX(2.373959773));       /* c3+c5-c13 */
3275     tmp13 = MULTIPLY(z3 - z2, FIX(1.405321284));           /* c1 */
3276     tmp14 += tmp13 + z4 - MULTIPLY(z3, FIX(1.6906431334)); /* c1+c9-c11 */
3277     tmp15 += tmp13 + MULTIPLY(z2, FIX(0.674957567));       /* c1+c11-c5 */
3278 
         /* Odd term for outputs 3 and 10: (wsptr[1] - wsptr[3] - wsptr[5])
          * at CONST_BITS scale, plus the already scaled z4.
          */
3279     tmp13 = ((z1 - z3) << CONST_BITS) + z4;
3280 
3281     /* Final output stage */
3282 
         /* outptr[k] and outptr[13-k] are even part tmp2k plus/minus odd
          * part tmp1k.  The shift removes the CONST_BITS, PASS1_BITS and
          * 3 further scale bits; the masked value indexes range_limit.
          */
3283     outptr[0]  = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
3284 					       CONST_BITS+PASS1_BITS+3)
3285 			     & RANGE_MASK];
3286     outptr[13] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
3287 					       CONST_BITS+PASS1_BITS+3)
3288 			     & RANGE_MASK];
3289     outptr[1]  = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
3290 					       CONST_BITS+PASS1_BITS+3)
3291 			     & RANGE_MASK];
3292     outptr[12] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
3293 					       CONST_BITS+PASS1_BITS+3)
3294 			     & RANGE_MASK];
3295     outptr[2]  = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
3296 					       CONST_BITS+PASS1_BITS+3)
3297 			     & RANGE_MASK];
3298     outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
3299 					       CONST_BITS+PASS1_BITS+3)
3300 			     & RANGE_MASK];
3301     outptr[3]  = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
3302 					       CONST_BITS+PASS1_BITS+3)
3303 			     & RANGE_MASK];
3304     outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
3305 					       CONST_BITS+PASS1_BITS+3)
3306 			     & RANGE_MASK];
3307     outptr[4]  = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
3308 					       CONST_BITS+PASS1_BITS+3)
3309 			     & RANGE_MASK];
3310     outptr[9]  = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
3311 					       CONST_BITS+PASS1_BITS+3)
3312 			     & RANGE_MASK];
3313     outptr[5]  = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp15,
3314 					       CONST_BITS+PASS1_BITS+3)
3315 			     & RANGE_MASK];
3316     outptr[8]  = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp15,
3317 					       CONST_BITS+PASS1_BITS+3)
3318 			     & RANGE_MASK];
3319     outptr[6]  = range_limit[(int) RIGHT_SHIFT(tmp26 + tmp16,
3320 					       CONST_BITS+PASS1_BITS+3)
3321 			     & RANGE_MASK];
3322     outptr[7]  = range_limit[(int) RIGHT_SHIFT(tmp26 - tmp16,
3323 					       CONST_BITS+PASS1_BITS+3)
3324 			     & RANGE_MASK];
3325 
3326     wsptr += 8;		/* advance pointer to next row */
3327   }
3328 }
3329 
3330 
3331 /*
3332  * Perform dequantization and inverse DCT on one block of coefficients,
3333  * producing a 12x6 output block.
3334  *
3335  * 6-point IDCT in pass 1 (columns), 12-point in pass 2 (rows).
      *
      * Arguments, as used by this routine:
      *   cinfo       only consulted to obtain the range-limit table.
      *   compptr     supplies dct_table, the per-coefficient multipliers
      *               applied by DEQUANTIZE.
      *   coef_block  input coefficients; rows 0..5 of all 8 columns are
      *               read, rows 6 and 7 are never referenced.
      *   output_buf  array of output row pointers; rows 0..5 are written.
      *   output_col  offset added to every row pointer; 12 samples are
      *               stored per row starting at that offset.
3336  */
3337 
3338 GLOBAL(void)
3339 jpeg_idct_12x6 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
3340 		JCOEFPTR coef_block,
3341 		JSAMPARRAY output_buf, JDIMENSION output_col)
3342 {
3343   INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
3344   INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25;
3345   INT32 z1, z2, z3, z4;
3346   JCOEFPTR inptr;
3347   ISLOW_MULT_TYPE * quantptr;
3348   int * wsptr;
3349   JSAMPROW outptr;
3350   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
3351   int ctr;
       /* Workspace layout: 6 rows of 8 values, row stride 8. */
3352   int workspace[8*6];	/* buffers data between passes */
3353   SHIFT_TEMPS
3354 
3355   /* Pass 1: process columns from input, store into work array.
3356    * 6-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/12).
3357    */
3358 
3359   inptr = coef_block;
3360   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
3361   wsptr = workspace;
       /* One iteration per input column: inptr, quantptr and wsptr advance
        * together by one element, DCTSIZE*k selects input row k and
        * 8*k selects workspace row k.
        */
3362   for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
3363     /* Even part */
3364 
3365     tmp10 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
3366     tmp10 <<= CONST_BITS;
3367     /* Add fudge factor here for final descale. */
3368     tmp10 += ONE << (CONST_BITS-PASS1_BITS-1);
3369     tmp12 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
3370     tmp20 = MULTIPLY(tmp12, FIX(0.707106781));   /* c4 */
3371     tmp11 = tmp10 + tmp20;
         /* tmp21 (even part of rows 1 and 4) is descaled right away because
          * its odd partner tmp11 is formed below without a multiply.
          */
3372     tmp21 = RIGHT_SHIFT(tmp10 - tmp20 - tmp20, CONST_BITS-PASS1_BITS);
3373     tmp20 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
3374     tmp10 = MULTIPLY(tmp20, FIX(1.224744871));   /* c2 */
3375     tmp20 = tmp11 + tmp10;
3376     tmp22 = tmp11 - tmp10;
3377 
3378     /* Odd part */
3379 
3380     z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
3381     z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
3382     z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
3383     tmp11 = MULTIPLY(z1 + z3, FIX(0.366025404)); /* c5 */
3384     tmp10 = tmp11 + ((z1 + z2) << CONST_BITS);
3385     tmp12 = tmp11 + ((z3 - z2) << CONST_BITS);
         /* Scaled by PASS1_BITS only, matching the already descaled tmp21. */
3386     tmp11 = (z1 - z2 - z3) << PASS1_BITS;
3387 
3388     /* Final output stage */
3389 
         /* Rows k and 5-k are even part tmp2k plus/minus odd part tmp1k.
          * Rows 1 and 4 need no shift here: both operands are already at
          * PASS1_BITS scale.
          */
3390     wsptr[8*0] = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
3391     wsptr[8*5] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
3392     wsptr[8*1] = (int) (tmp21 + tmp11);
3393     wsptr[8*4] = (int) (tmp21 - tmp11);
3394     wsptr[8*2] = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
3395     wsptr[8*3] = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
3396   }
3397 
3398   /* Pass 2: process 6 rows from work array, store into output array.
3399    * 12-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/24).
3400    */
3401 
3402   wsptr = workspace;
       /* One iteration per workspace row: 8 inputs wsptr[0..7] are expanded
        * into 12 output samples outptr[0..11].
        */
3403   for (ctr = 0; ctr < 6; ctr++) {
3404     outptr = output_buf[ctr] + output_col;
3405 
3406     /* Even part */
3407 
3408     /* Add range center and fudge factor for final descale and range-limit. */
3409     z3 = (INT32) wsptr[0] +
3410 	   ((((INT32) RANGE_CENTER) << (PASS1_BITS+3)) +
3411 	    (ONE << (PASS1_BITS+2)));
3412     z3 <<= CONST_BITS;
3413 
3414     z4 = (INT32) wsptr[4];
3415     z4 = MULTIPLY(z4, FIX(1.224744871)); /* c4 */
3416 
3417     tmp10 = z3 + z4;
3418     tmp11 = z3 - z4;
3419 
         /* z4 is reused for c2 * wsptr[2]; z1 and z2 then hold wsptr[2] and
          * wsptr[6] at CONST_BITS scale (no multiply needed for them).
          */
3420     z1 = (INT32) wsptr[2];
3421     z4 = MULTIPLY(z1, FIX(1.366025404)); /* c2 */
3422     z1 <<= CONST_BITS;
3423     z2 = (INT32) wsptr[6];
3424     z2 <<= CONST_BITS;
3425 
         /* tmp12 is a scratch value recomputed three times below. */
3426     tmp12 = z1 - z2;
3427 
3428     tmp21 = z3 + tmp12;
3429     tmp24 = z3 - tmp12;
3430 
3431     tmp12 = z4 + z2;
3432 
3433     tmp20 = tmp10 + tmp12;
3434     tmp25 = tmp10 - tmp12;
3435 
3436     tmp12 = z4 - z1 - z2;
3437 
3438     tmp22 = tmp11 + tmp12;
3439     tmp23 = tmp11 - tmp12;
3440 
3441     /* Odd part */
3442 
3443     z1 = (INT32) wsptr[1];
3444     z2 = (INT32) wsptr[3];
3445     z3 = (INT32) wsptr[5];
3446     z4 = (INT32) wsptr[7];
3447 
3448     tmp11 = MULTIPLY(z2, FIX(1.306562965));                  /* c3 */
3449     tmp14 = MULTIPLY(z2, - FIX_0_541196100);                 /* -c9 */
3450 
3451     tmp10 = z1 + z3;
3452     tmp15 = MULTIPLY(tmp10 + z4, FIX(0.860918669));          /* c7 */
3453     tmp12 = tmp15 + MULTIPLY(tmp10, FIX(0.261052384));       /* c5-c7 */
3454     tmp10 = tmp12 + tmp11 + MULTIPLY(z1, FIX(0.280143716));  /* c1-c5 */
3455     tmp13 = MULTIPLY(z3 + z4, - FIX(1.045510580));           /* -(c7+c11) */
3456     tmp12 += tmp13 + tmp14 - MULTIPLY(z3, FIX(1.478575242)); /* c1+c5-c7-c11 */
3457     tmp13 += tmp15 - tmp11 + MULTIPLY(z4, FIX(1.586706681)); /* c1+c11 */
3458     tmp15 += tmp14 - MULTIPLY(z1, FIX(0.676326758)) -        /* c7-c11 */
3459 	     MULTIPLY(z4, FIX(1.982889723));                 /* c5+c7 */
3460 
         /* tmp11 and tmp14 were only intermediates above; they are now
          * replaced by the odd terms for outputs 1/10 and 4/7, built from
          * the differences wsptr[1]-wsptr[7] and wsptr[3]-wsptr[5].
          */
3461     z1 -= z4;
3462     z2 -= z3;
3463     z3 = MULTIPLY(z1 + z2, FIX_0_541196100);                 /* c9 */
3464     tmp11 = z3 + MULTIPLY(z1, FIX_0_765366865);              /* c3-c9 */
3465     tmp14 = z3 - MULTIPLY(z2, FIX_1_847759065);              /* c3+c9 */
3466 
3467     /* Final output stage */
3468 
         /* outptr[k] and outptr[11-k] are even part tmp2k plus/minus odd
          * part tmp1k.  The shift removes the CONST_BITS, PASS1_BITS and
          * 3 further scale bits; the masked value indexes range_limit.
          */
3469     outptr[0]  = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
3470 					       CONST_BITS+PASS1_BITS+3)
3471 			     & RANGE_MASK];
3472     outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
3473 					       CONST_BITS+PASS1_BITS+3)
3474 			     & RANGE_MASK];
3475     outptr[1]  = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
3476 					       CONST_BITS+PASS1_BITS+3)
3477 			     & RANGE_MASK];
3478     outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
3479 					       CONST_BITS+PASS1_BITS+3)
3480 			     & RANGE_MASK];
3481     outptr[2]  = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
3482 					       CONST_BITS+PASS1_BITS+3)
3483 			     & RANGE_MASK];
3484     outptr[9]  = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
3485 					       CONST_BITS+PASS1_BITS+3)
3486 			     & RANGE_MASK];
3487     outptr[3]  = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
3488 					       CONST_BITS+PASS1_BITS+3)
3489 			     & RANGE_MASK];
3490     outptr[8]  = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
3491 					       CONST_BITS+PASS1_BITS+3)
3492 			     & RANGE_MASK];
3493     outptr[4]  = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
3494 					       CONST_BITS+PASS1_BITS+3)
3495 			     & RANGE_MASK];
3496     outptr[7]  = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
3497 					       CONST_BITS+PASS1_BITS+3)
3498 			     & RANGE_MASK];
3499     outptr[5]  = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp15,
3500 					       CONST_BITS+PASS1_BITS+3)
3501 			     & RANGE_MASK];
3502     outptr[6]  = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp15,
3503 					       CONST_BITS+PASS1_BITS+3)
3504 			     & RANGE_MASK];
3505 
3506     wsptr += 8;		/* advance pointer to next row */
3507   }
3508 }
3509 
3510 
3511 /*
3512  * Perform dequantization and inverse DCT on one block of coefficients,
3513  * producing a 10x5 output block.
3514  *
3515  * 5-point IDCT in pass 1 (columns), 10-point in pass 2 (rows).
      *
      * Arguments, as used by this routine:
      *   cinfo       only consulted to obtain the range-limit table.
      *   compptr     supplies dct_table, the per-coefficient multipliers
      *               applied by DEQUANTIZE.
      *   coef_block  input coefficients; rows 0..4 of all 8 columns are
      *               read, rows 5..7 are never referenced.
      *   output_buf  array of output row pointers; rows 0..4 are written.
      *   output_col  offset added to every row pointer; 10 samples are
      *               stored per row starting at that offset.
3516  */
3517 
3518 GLOBAL(void)
3519 jpeg_idct_10x5 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
3520 		JCOEFPTR coef_block,
3521 		JSAMPARRAY output_buf, JDIMENSION output_col)
3522 {
3523   INT32 tmp10, tmp11, tmp12, tmp13, tmp14;
3524   INT32 tmp20, tmp21, tmp22, tmp23, tmp24;
3525   INT32 z1, z2, z3, z4;
3526   JCOEFPTR inptr;
3527   ISLOW_MULT_TYPE * quantptr;
3528   int * wsptr;
3529   JSAMPROW outptr;
3530   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
3531   int ctr;
       /* Workspace layout: 5 rows of 8 values, row stride 8. */
3532   int workspace[8*5];	/* buffers data between passes */
3533   SHIFT_TEMPS
3534 
3535   /* Pass 1: process columns from input, store into work array.
3536    * 5-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/10).
3537    */
3538 
3539   inptr = coef_block;
3540   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
3541   wsptr = workspace;
       /* One iteration per input column: inptr, quantptr and wsptr advance
        * together by one element, DCTSIZE*k selects input row k and
        * 8*k selects workspace row k.
        */
3542   for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
3543     /* Even part */
3544 
3545     tmp12 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
3546     tmp12 <<= CONST_BITS;
3547     /* Add fudge factor here for final descale. */
3548     tmp12 += ONE << (CONST_BITS-PASS1_BITS-1);
3549     tmp13 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
3550     tmp14 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
3551     z1 = MULTIPLY(tmp13 + tmp14, FIX(0.790569415)); /* (c2+c4)/2 */
3552     z2 = MULTIPLY(tmp13 - tmp14, FIX(0.353553391)); /* (c2-c4)/2 */
3553     z3 = tmp12 + z2;
3554     tmp10 = z3 + z1;
3555     tmp11 = z3 - z1;
         /* Centre row 2: scaled DC term minus 4*z2; it has no odd part. */
3556     tmp12 -= z2 << 2;
3557 
3558     /* Odd part */
3559 
3560     z2 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
3561     z3 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
3562 
3563     z1 = MULTIPLY(z2 + z3, FIX(0.831253876));       /* c3 */
3564     tmp13 = z1 + MULTIPLY(z2, FIX(0.513743148));    /* c1-c3 */
3565     tmp14 = z1 - MULTIPLY(z3, FIX(2.176250899));    /* c1+c3 */
3566 
3567     /* Final output stage */
3568 
         /* Rows 0/4 and 1/3 are sum/difference pairs of an even and an odd
          * term; results keep PASS1_BITS extra fraction bits for pass 2.
          */
3569     wsptr[8*0] = (int) RIGHT_SHIFT(tmp10 + tmp13, CONST_BITS-PASS1_BITS);
3570     wsptr[8*4] = (int) RIGHT_SHIFT(tmp10 - tmp13, CONST_BITS-PASS1_BITS);
3571     wsptr[8*1] = (int) RIGHT_SHIFT(tmp11 + tmp14, CONST_BITS-PASS1_BITS);
3572     wsptr[8*3] = (int) RIGHT_SHIFT(tmp11 - tmp14, CONST_BITS-PASS1_BITS);
3573     wsptr[8*2] = (int) RIGHT_SHIFT(tmp12, CONST_BITS-PASS1_BITS);
3574   }
3575 
3576   /* Pass 2: process 5 rows from work array, store into output array.
3577    * 10-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/20).
3578    */
3579 
3580   wsptr = workspace;
       /* One iteration per workspace row: 8 inputs wsptr[0..7] are expanded
        * into 10 output samples outptr[0..9].
        */
3581   for (ctr = 0; ctr < 5; ctr++) {
3582     outptr = output_buf[ctr] + output_col;
3583 
3584     /* Even part */
3585 
3586     /* Add range center and fudge factor for final descale and range-limit. */
3587     z3 = (INT32) wsptr[0] +
3588 	   ((((INT32) RANGE_CENTER) << (PASS1_BITS+3)) +
3589 	    (ONE << (PASS1_BITS+2)));
3590     z3 <<= CONST_BITS;
3591     z4 = (INT32) wsptr[4];
3592     z1 = MULTIPLY(z4, FIX(1.144122806));         /* c4 */
3593     z2 = MULTIPLY(z4, FIX(0.437016024));         /* c8 */
3594     tmp10 = z3 + z1;
3595     tmp11 = z3 - z2;
3596 
3597     tmp22 = z3 - ((z1 - z2) << 1);               /* c0 = (c4-c8)*2 */
3598 
         /* z2 and z3 are reused for the remaining even inputs. */
3599     z2 = (INT32) wsptr[2];
3600     z3 = (INT32) wsptr[6];
3601 
3602     z1 = MULTIPLY(z2 + z3, FIX(0.831253876));    /* c6 */
3603     tmp12 = z1 + MULTIPLY(z2, FIX(0.513743148)); /* c2-c6 */
3604     tmp13 = z1 - MULTIPLY(z3, FIX(2.176250899)); /* c2+c6 */
3605 
3606     tmp20 = tmp10 + tmp12;
3607     tmp24 = tmp10 - tmp12;
3608     tmp21 = tmp11 + tmp13;
3609     tmp23 = tmp11 - tmp13;
3610 
3611     /* Odd part */
3612 
3613     z1 = (INT32) wsptr[1];
3614     z2 = (INT32) wsptr[3];
3615     z3 = (INT32) wsptr[5];
         /* z3 is used below without a multiply, so it is only brought to
          * CONST_BITS scale here.
          */
3616     z3 <<= CONST_BITS;
3617     z4 = (INT32) wsptr[7];
3618 
         /* tmp11 = wsptr[3] + wsptr[7], tmp13 = wsptr[3] - wsptr[7]; both
          * are overwritten with final odd terms further down.
          */
3619     tmp11 = z2 + z4;
3620     tmp13 = z2 - z4;
3621 
3622     tmp12 = MULTIPLY(tmp13, FIX(0.309016994));        /* (c3-c7)/2 */
3623 
3624     z2 = MULTIPLY(tmp11, FIX(0.951056516));           /* (c3+c7)/2 */
3625     z4 = z3 + tmp12;
3626 
3627     tmp10 = MULTIPLY(z1, FIX(1.396802247)) + z2 + z4; /* c1 */
3628     tmp14 = MULTIPLY(z1, FIX(0.221231742)) - z2 + z4; /* c9 */
3629 
3630     z2 = MULTIPLY(tmp11, FIX(0.587785252));           /* (c1-c9)/2 */
3631     z4 = z3 - tmp12 - (tmp13 << (CONST_BITS - 1));
3632 
         /* Must be computed before tmp13 is overwritten two lines below. */
3633     tmp12 = ((z1 - tmp13) << CONST_BITS) - z3;
3634 
3635     tmp11 = MULTIPLY(z1, FIX(1.260073511)) - z2 - z4; /* c3 */
3636     tmp13 = MULTIPLY(z1, FIX(0.642039522)) - z2 + z4; /* c7 */
3637 
3638     /* Final output stage */
3639 
         /* outptr[k] and outptr[9-k] are even part tmp2k plus/minus odd
          * part tmp1k.  The shift removes the CONST_BITS, PASS1_BITS and
          * 3 further scale bits; the masked value indexes range_limit.
          */
3640     outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
3641 					      CONST_BITS+PASS1_BITS+3)
3642 			    & RANGE_MASK];
3643     outptr[9] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
3644 					      CONST_BITS+PASS1_BITS+3)
3645 			    & RANGE_MASK];
3646     outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
3647 					      CONST_BITS+PASS1_BITS+3)
3648 			    & RANGE_MASK];
3649     outptr[8] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
3650 					      CONST_BITS+PASS1_BITS+3)
3651 			    & RANGE_MASK];
3652     outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
3653 					      CONST_BITS+PASS1_BITS+3)
3654 			    & RANGE_MASK];
3655     outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
3656 					      CONST_BITS+PASS1_BITS+3)
3657 			    & RANGE_MASK];
3658     outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
3659 					      CONST_BITS+PASS1_BITS+3)
3660 			    & RANGE_MASK];
3661     outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
3662 					      CONST_BITS+PASS1_BITS+3)
3663 			    & RANGE_MASK];
3664     outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
3665 					      CONST_BITS+PASS1_BITS+3)
3666 			    & RANGE_MASK];
3667     outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
3668 					      CONST_BITS+PASS1_BITS+3)
3669 			    & RANGE_MASK];
3670 
3671     wsptr += 8;		/* advance pointer to next row */
3672   }
3673 }
3674 
3675 
3676 /*
3677  * Perform dequantization and inverse DCT on one block of coefficients,
3678  * producing an 8x4 output block.
3679  *
3680  * 4-point IDCT in pass 1 (columns), 8-point in pass 2 (rows).
      *
      * Arguments, as used by this routine:
      *   cinfo       only consulted to obtain the range-limit table.
      *   compptr     supplies dct_table, the per-coefficient multipliers
      *               applied by DEQUANTIZE.
      *   coef_block  input coefficients; rows 0..3 of all 8 columns are
      *               read, rows 4..7 are never referenced.
      *   output_buf  array of output row pointers; rows 0..3 are written.
      *   output_col  offset added to every row pointer; 8 samples are
      *               stored per row starting at that offset.
3681  */
3682 
3683 GLOBAL(void)
3684 jpeg_idct_8x4 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
3685 	       JCOEFPTR coef_block,
3686 	       JSAMPARRAY output_buf, JDIMENSION output_col)
3687 {
3688   INT32 tmp0, tmp1, tmp2, tmp3;
3689   INT32 tmp10, tmp11, tmp12, tmp13;
3690   INT32 z1, z2, z3;
3691   JCOEFPTR inptr;
3692   ISLOW_MULT_TYPE * quantptr;
3693   int * wsptr;
3694   JSAMPROW outptr;
3695   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
3696   int ctr;
       /* Workspace layout: 4 rows of 8 values, row stride 8. */
3697   int workspace[8*4];	/* buffers data between passes */
3698   SHIFT_TEMPS
3699 
3700   /* Pass 1: process columns from input, store into work array.
3701    * 4-point IDCT kernel,
3702    * cK represents sqrt(2) * cos(K*pi/16) [refers to 8-point IDCT].
3703    */
3704 
3705   inptr = coef_block;
3706   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
3707   wsptr = workspace;
       /* One iteration per input column: inptr, quantptr and wsptr advance
        * together by one element, DCTSIZE*k selects input row k and
        * 8*k selects workspace row k.
        */
3708   for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
3709     /* Even part */
3710 
3711     tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
3712     tmp2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
3713 
         /* No multiplies in the even part, so it is scaled directly to
          * the PASS1_BITS precision of the workspace.
          */
3714     tmp10 = (tmp0 + tmp2) << PASS1_BITS;
3715     tmp12 = (tmp0 - tmp2) << PASS1_BITS;
3716 
3717     /* Odd part */
3718     /* Same rotation as in the even part of the 8x8 LL&M IDCT */
3719 
3720     z2 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
3721     z3 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
3722 
3723     z1 = MULTIPLY(z2 + z3, FIX_0_541196100);               /* c6 */
3724     /* Add fudge factor here for final descale. */
3725     z1 += ONE << (CONST_BITS-PASS1_BITS-1);
         /* tmp0 and tmp2 are reused for the odd terms, descaled here so
          * they match the scale of tmp10 and tmp12.
          */
3726     tmp0 = RIGHT_SHIFT(z1 + MULTIPLY(z2, FIX_0_765366865), /* c2-c6 */
3727 		       CONST_BITS-PASS1_BITS);
3728     tmp2 = RIGHT_SHIFT(z1 - MULTIPLY(z3, FIX_1_847759065), /* c2+c6 */
3729 		       CONST_BITS-PASS1_BITS);
3730 
3731     /* Final output stage */
3732 
3733     wsptr[8*0] = (int) (tmp10 + tmp0);
3734     wsptr[8*3] = (int) (tmp10 - tmp0);
3735     wsptr[8*1] = (int) (tmp12 + tmp2);
3736     wsptr[8*2] = (int) (tmp12 - tmp2);
3737   }
3738 
3739   /* Pass 2: process rows from work array, store into output array.
3740    * Note that we must descale the results by a factor of 8 == 2**3,
3741    * and also undo the PASS1_BITS scaling.
3742    * 8-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/16).
3743    */
3744 
3745   wsptr = workspace;
       /* One iteration per workspace row: 8 inputs wsptr[0..7] produce
        * 8 output samples outptr[0..7].
        */
3746   for (ctr = 0; ctr < 4; ctr++) {
3747     outptr = output_buf[ctr] + output_col;
3748 
3749     /* Even part: reverse the even part of the forward DCT.
3750      * The rotator is c(-6).
3751      */
3752 
3753     /* Add range center and fudge factor for final descale and range-limit. */
3754     z2 = (INT32) wsptr[0] +
3755 	   ((((INT32) RANGE_CENTER) << (PASS1_BITS+3)) +
3756 	    (ONE << (PASS1_BITS+2)));
3757     z3 = (INT32) wsptr[4];
3758 
3759     tmp0 = (z2 + z3) << CONST_BITS;
3760     tmp1 = (z2 - z3) << CONST_BITS;
3761 
3762     z2 = (INT32) wsptr[2];
3763     z3 = (INT32) wsptr[6];
3764 
3765     z1 = MULTIPLY(z2 + z3, FIX_0_541196100);       /* c6 */
3766     tmp2 = z1 + MULTIPLY(z2, FIX_0_765366865);     /* c2-c6 */
3767     tmp3 = z1 - MULTIPLY(z3, FIX_1_847759065);     /* c2+c6 */
3768 
3769     tmp10 = tmp0 + tmp2;
3770     tmp13 = tmp0 - tmp2;
3771     tmp11 = tmp1 + tmp3;
3772     tmp12 = tmp1 - tmp3;
3773 
3774     /* Odd part per figure 8; the matrix is unitary and hence its
3775      * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
3776      */
3777 
         /* tmp0..tmp3 are reused: from here on they carry the odd part. */
3778     tmp0 = (INT32) wsptr[7];
3779     tmp1 = (INT32) wsptr[5];
3780     tmp2 = (INT32) wsptr[3];
3781     tmp3 = (INT32) wsptr[1];
3782 
3783     z2 = tmp0 + tmp2;
3784     z3 = tmp1 + tmp3;
3785 
3786     z1 = MULTIPLY(z2 + z3, FIX_1_175875602);       /*  c3 */
3787     z2 = MULTIPLY(z2, - FIX_1_961570560);          /* -c3-c5 */
3788     z3 = MULTIPLY(z3, - FIX_0_390180644);          /* -c3+c5 */
3789     z2 += z1;
3790     z3 += z1;
3791 
3792     z1 = MULTIPLY(tmp0 + tmp3, - FIX_0_899976223); /* -c3+c7 */
3793     tmp0 = MULTIPLY(tmp0, FIX_0_298631336);        /* -c1+c3+c5-c7 */
3794     tmp3 = MULTIPLY(tmp3, FIX_1_501321110);        /*  c1+c3-c5-c7 */
3795     tmp0 += z1 + z2;
3796     tmp3 += z1 + z3;
3797 
3798     z1 = MULTIPLY(tmp1 + tmp2, - FIX_2_562915447); /* -c1-c3 */
3799     tmp1 = MULTIPLY(tmp1, FIX_2_053119869);        /*  c1+c3-c5+c7 */
3800     tmp2 = MULTIPLY(tmp2, FIX_3_072711026);        /*  c1+c3+c5-c7 */
3801     tmp1 += z1 + z3;
3802     tmp2 += z1 + z2;
3803 
3804     /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
3805 
         /* outptr[k] and outptr[7-k] are tmp1k plus/minus tmp(3-k); the
          * masked, descaled value indexes range_limit.
          */
3806     outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp3,
3807 					      CONST_BITS+PASS1_BITS+3)
3808 			    & RANGE_MASK];
3809     outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp3,
3810 					      CONST_BITS+PASS1_BITS+3)
3811 			    & RANGE_MASK];
3812     outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp2,
3813 					      CONST_BITS+PASS1_BITS+3)
3814 			    & RANGE_MASK];
3815     outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp2,
3816 					      CONST_BITS+PASS1_BITS+3)
3817 			    & RANGE_MASK];
3818     outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp1,
3819 					      CONST_BITS+PASS1_BITS+3)
3820 			    & RANGE_MASK];
3821     outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp1,
3822 					      CONST_BITS+PASS1_BITS+3)
3823 			    & RANGE_MASK];
3824     outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp13 + tmp0,
3825 					      CONST_BITS+PASS1_BITS+3)
3826 			    & RANGE_MASK];
3827     outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp13 - tmp0,
3828 					      CONST_BITS+PASS1_BITS+3)
3829 			    & RANGE_MASK];
3830 
3831     wsptr += DCTSIZE;		/* advance pointer to next row */
3832   }
3833 }
3834 
3835 
3836 /*
3837  * Perform dequantization and inverse DCT on one block of coefficients,
3838  * producing a reduced-size 6x3 output block.
3839  *
3840  * 3-point IDCT in pass 1 (columns), 6-point in pass 2 (rows).
      *
      * Arguments, as used by this routine:
      *   cinfo       only consulted to obtain the range-limit table.
      *   compptr     supplies dct_table, the per-coefficient multipliers
      *               applied by DEQUANTIZE.
      *   coef_block  input coefficients; only rows 0..2 of columns 0..5
      *               are read.
      *   output_buf  array of output row pointers; rows 0..2 are written.
      *   output_col  offset added to every row pointer; 6 samples are
      *               stored per row starting at that offset.
3841  */
3842 
3843 GLOBAL(void)
3844 jpeg_idct_6x3 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
3845 	       JCOEFPTR coef_block,
3846 	       JSAMPARRAY output_buf, JDIMENSION output_col)
3847 {
3848   INT32 tmp0, tmp1, tmp2, tmp10, tmp11, tmp12;
3849   INT32 z1, z2, z3;
3850   JCOEFPTR inptr;
3851   ISLOW_MULT_TYPE * quantptr;
3852   int * wsptr;
3853   JSAMPROW outptr;
3854   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
3855   int ctr;
       /* Workspace layout: 3 rows of 6 values, row stride 6 (not 8). */
3856   int workspace[6*3];	/* buffers data between passes */
3857   SHIFT_TEMPS
3858 
3859   /* Pass 1: process columns from input, store into work array.
3860    * 3-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/6).
3861    */
3862 
3863   inptr = coef_block;
3864   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
3865   wsptr = workspace;
       /* Only the first 6 input columns are processed.  The input row
        * stride is still DCTSIZE, while the workspace row stride is 6.
        */
3866   for (ctr = 0; ctr < 6; ctr++, inptr++, quantptr++, wsptr++) {
3867     /* Even part */
3868 
3869     tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
3870     tmp0 <<= CONST_BITS;
3871     /* Add fudge factor here for final descale. */
3872     tmp0 += ONE << (CONST_BITS-PASS1_BITS-1);
3873     tmp2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
3874     tmp12 = MULTIPLY(tmp2, FIX(0.707106781)); /* c2 */
3875     tmp10 = tmp0 + tmp12;
         /* Centre row 1: scaled DC term minus 2*tmp12; no odd part. */
3876     tmp2 = tmp0 - tmp12 - tmp12;
3877 
3878     /* Odd part */
3879 
         /* tmp12 and tmp0 are reused for the single odd input (row 1). */
3880     tmp12 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
3881     tmp0 = MULTIPLY(tmp12, FIX(1.224744871)); /* c1 */
3882 
3883     /* Final output stage */
3884 
3885     wsptr[6*0] = (int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS-PASS1_BITS);
3886     wsptr[6*2] = (int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS-PASS1_BITS);
3887     wsptr[6*1] = (int) RIGHT_SHIFT(tmp2, CONST_BITS-PASS1_BITS);
3888   }
3889 
3890   /* Pass 2: process 3 rows from work array, store into output array.
3891    * 6-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/12).
3892    */
3893 
3894   wsptr = workspace;
       /* One iteration per workspace row: 6 inputs wsptr[0..5] produce
        * 6 output samples outptr[0..5].
        */
3895   for (ctr = 0; ctr < 3; ctr++) {
3896     outptr = output_buf[ctr] + output_col;
3897 
3898     /* Even part */
3899 
3900     /* Add range center and fudge factor for final descale and range-limit. */
3901     tmp0 = (INT32) wsptr[0] +
3902 	     ((((INT32) RANGE_CENTER) << (PASS1_BITS+3)) +
3903 	      (ONE << (PASS1_BITS+2)));
3904     tmp0 <<= CONST_BITS;
3905     tmp2 = (INT32) wsptr[4];
3906     tmp10 = MULTIPLY(tmp2, FIX(0.707106781));   /* c4 */
3907     tmp1 = tmp0 + tmp10;
3908     tmp11 = tmp0 - tmp10 - tmp10;
         /* tmp10 and tmp0 are reused as scratch for the wsptr[2] term. */
3909     tmp10 = (INT32) wsptr[2];
3910     tmp0 = MULTIPLY(tmp10, FIX(1.224744871));   /* c2 */
3911     tmp10 = tmp1 + tmp0;
3912     tmp12 = tmp1 - tmp0;
3913 
3914     /* Odd part */
3915 
         /* tmp0..tmp2 are reused: from here on they carry the odd part. */
3916     z1 = (INT32) wsptr[1];
3917     z2 = (INT32) wsptr[3];
3918     z3 = (INT32) wsptr[5];
3919     tmp1 = MULTIPLY(z1 + z3, FIX(0.366025404)); /* c5 */
3920     tmp0 = tmp1 + ((z1 + z2) << CONST_BITS);
3921     tmp2 = tmp1 + ((z3 - z2) << CONST_BITS);
3922     tmp1 = (z1 - z2 - z3) << CONST_BITS;
3923 
3924     /* Final output stage */
3925 
         /* outptr[k] and outptr[5-k] are even part tmp1k plus/minus odd
          * part tmpk.  The shift removes the CONST_BITS, PASS1_BITS and
          * 3 further scale bits; the masked value indexes range_limit.
          */
3926     outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
3927 					      CONST_BITS+PASS1_BITS+3)
3928 			    & RANGE_MASK];
3929     outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
3930 					      CONST_BITS+PASS1_BITS+3)
3931 			    & RANGE_MASK];
3932     outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp1,
3933 					      CONST_BITS+PASS1_BITS+3)
3934 			    & RANGE_MASK];
3935     outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp1,
3936 					      CONST_BITS+PASS1_BITS+3)
3937 			    & RANGE_MASK];
3938     outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp2,
3939 					      CONST_BITS+PASS1_BITS+3)
3940 			    & RANGE_MASK];
3941     outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp2,
3942 					      CONST_BITS+PASS1_BITS+3)
3943 			    & RANGE_MASK];
3944 
3945     wsptr += 6;		/* advance pointer to next row */
3946   }
3947 }
3948 
3949 
3950 /*
3951  * Perform dequantization and inverse DCT on one block of coefficients,
3952  * producing a 4x2 output block.
3953  *
3954  * 2-point IDCT in pass 1 (columns), 4-point in pass 2 (rows).
3955  */
3956 
3957 GLOBAL(void)
3958 jpeg_idct_4x2 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
3959 	       JCOEFPTR coef_block,
3960 	       JSAMPARRAY output_buf, JDIMENSION output_col)
3961 {
3962   INT32 tmp0, tmp2, tmp10, tmp12;
3963   INT32 z1, z2, z3;
3964   JCOEFPTR inptr;
3965   ISLOW_MULT_TYPE * quantptr;
3966   INT32 * wsptr;
3967   JSAMPROW outptr;
3968   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
3969   int ctr;
3970   INT32 workspace[4*2];	/* buffers data between passes */
3971   SHIFT_TEMPS
3972 
3973   /* Pass 1: process columns from input, store into work array. */
3974 
3975   inptr = coef_block;
3976   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
3977   wsptr = workspace;
3978   for (ctr = 0; ctr < 4; ctr++, inptr++, quantptr++, wsptr++) {
3979     /* Even part */
3980 
3981     tmp10 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
3982 
3983     /* Odd part */
3984 
3985     tmp0 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
3986 
3987     /* Final output stage */
3988 
3989     wsptr[4*0] = tmp10 + tmp0;
3990     wsptr[4*1] = tmp10 - tmp0;
3991   }
3992 
3993   /* Pass 2: process 2 rows from work array, store into output array.
3994    * 4-point IDCT kernel,
3995    * cK represents sqrt(2) * cos(K*pi/16) [refers to 8-point IDCT].
3996    */
3997 
3998   wsptr = workspace;
3999   for (ctr = 0; ctr < 2; ctr++) {
4000     outptr = output_buf[ctr] + output_col;
4001 
4002     /* Even part */
4003 
4004     /* Add range center and fudge factor for final descale and range-limit. */
4005     tmp0 = wsptr[0] + ((((INT32) RANGE_CENTER) << 3) + (ONE << 2));
4006     tmp2 = wsptr[2];
4007 
4008     tmp10 = (tmp0 + tmp2) << CONST_BITS;
4009     tmp12 = (tmp0 - tmp2) << CONST_BITS;
4010 
4011     /* Odd part */
4012     /* Same rotation as in the even part of the 8x8 LL&M IDCT */
4013 
4014     z2 = wsptr[1];
4015     z3 = wsptr[3];
4016 
4017     z1 = MULTIPLY(z2 + z3, FIX_0_541196100);   /* c6 */
4018     tmp0 = z1 + MULTIPLY(z2, FIX_0_765366865); /* c2-c6 */
4019     tmp2 = z1 - MULTIPLY(z3, FIX_1_847759065); /* c2+c6 */
4020 
4021     /* Final output stage */
4022 
4023     outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
4024 					      CONST_BITS+3)
4025 			    & RANGE_MASK];
4026     outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
4027 					      CONST_BITS+3)
4028 			    & RANGE_MASK];
4029     outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp2,
4030 					      CONST_BITS+3)
4031 			    & RANGE_MASK];
4032     outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp2,
4033 					      CONST_BITS+3)
4034 			    & RANGE_MASK];
4035 
4036     wsptr += 4;		/* advance pointer to next row */
4037   }
4038 }
4039 
4040 
4041 /*
4042  * Perform dequantization and inverse DCT on one block of coefficients,
4043  * producing a 2x1 output block.
4044  *
4045  * 1-point IDCT in pass 1 (columns), 2-point in pass 2 (rows).
4046  */
4047 
4048 GLOBAL(void)
4049 jpeg_idct_2x1 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
4050 	       JCOEFPTR coef_block,
4051 	       JSAMPARRAY output_buf, JDIMENSION output_col)
4052 {
4053   DCTELEM tmp0, tmp1;
4054   ISLOW_MULT_TYPE * quantptr;
4055   JSAMPROW outptr;
4056   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
4057   ISHIFT_TEMPS
4058 
4059   /* Pass 1: empty. */
4060 
4061   /* Pass 2: process 1 row from input, store into output array. */
4062 
4063   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
4064   outptr = output_buf[0] + output_col;
4065 
4066   /* Even part */
4067 
4068   tmp0 = DEQUANTIZE(coef_block[0], quantptr[0]);
4069   /* Add range center and fudge factor for final descale and range-limit. */
4070   tmp0 += (((DCTELEM) RANGE_CENTER) << 3) + (1 << 2);
4071 
4072   /* Odd part */
4073 
4074   tmp1 = DEQUANTIZE(coef_block[1], quantptr[1]);
4075 
4076   /* Final output stage */
4077 
4078   outptr[0] = range_limit[(int) IRIGHT_SHIFT(tmp0 + tmp1, 3) & RANGE_MASK];
4079   outptr[1] = range_limit[(int) IRIGHT_SHIFT(tmp0 - tmp1, 3) & RANGE_MASK];
4080 }
4081 
4082 
4083 /*
4084  * Perform dequantization and inverse DCT on one block of coefficients,
4085  * producing an 8x16 output block (8 samples wide, 16 rows high).
4086  *
4087  * 16-point IDCT in pass 1 (columns), 8-point in pass 2 (rows).
      *
      * cinfo      - supplies the range-limit table via IDCT_range_limit().
      * compptr    - its dct_table holds the multipliers used to dequantize.
      * coef_block - input coefficients; rows 0..7 of each of the first
      *              8 columns are read.
      * output_buf - output sample rows; rows 0..15 are written.
      * output_col - offset of the first of the 8 samples written per row.
4088  */
4089 
4090 GLOBAL(void)
4091 jpeg_idct_8x16 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
4092 		JCOEFPTR coef_block,
4093 		JSAMPARRAY output_buf, JDIMENSION output_col)
4094 {
4095   INT32 tmp0, tmp1, tmp2, tmp3, tmp10, tmp11, tmp12, tmp13;
4096   INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27;
4097   INT32 z1, z2, z3, z4;
4098   JCOEFPTR inptr;
4099   ISLOW_MULT_TYPE * quantptr;
4100   int * wsptr;
4101   JSAMPROW outptr;
4102   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
4103   int ctr;
4104   int workspace[8*16];	/* buffers data between passes */
4105   SHIFT_TEMPS
4106 
4107   /* Pass 1: process columns from input, store into work array.
4108    * 16-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/32).
        * Each iteration handles one column: it reads 8 coefficients
        * (stride DCTSIZE) and writes 16 work-array values (stride 8).
4109    */
4110 
4111   inptr = coef_block;
4112   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
4113   wsptr = workspace;
4114   for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
4115     /* Even part */
4116 
          /* Row 0 is scaled up by CONST_BITS so it can be combined with
           * the MULTIPLY() products below.
           */
4117     tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
4118     tmp0 <<= CONST_BITS;
4119     /* Add fudge factor here for final descale. */
4120     tmp0 += ONE << (CONST_BITS-PASS1_BITS-1);
4121 
4122     z1 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
4123     tmp1 = MULTIPLY(z1, FIX(1.306562965));      /* c4[16] = c2[8] */
4124     tmp2 = MULTIPLY(z1, FIX_0_541196100);       /* c12[16] = c6[8] */
4125 
          /* tmp10..tmp13 combine row 0 (with rounding term) and row 4. */
4126     tmp10 = tmp0 + tmp1;
4127     tmp11 = tmp0 - tmp1;
4128     tmp12 = tmp0 + tmp2;
4129     tmp13 = tmp0 - tmp2;
4130 
          /* tmp0..tmp3 are now reused for the contribution of rows 2 and 6. */
4131     z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
4132     z2 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
4133     z3 = z1 - z2;
4134     z4 = MULTIPLY(z3, FIX(0.275899379));        /* c14[16] = c7[8] */
4135     z3 = MULTIPLY(z3, FIX(1.387039845));        /* c2[16] = c1[8] */
4136 
4137     tmp0 = z3 + MULTIPLY(z2, FIX_2_562915447);  /* (c6+c2)[16] = (c3+c1)[8] */
4138     tmp1 = z4 + MULTIPLY(z1, FIX_0_899976223);  /* (c6-c14)[16] = (c3-c7)[8] */
4139     tmp2 = z3 - MULTIPLY(z1, FIX(0.601344887)); /* (c2-c10)[16] = (c1-c5)[8] */
4140     tmp3 = z4 - MULTIPLY(z2, FIX(0.509795579)); /* (c10-c14)[16] = (c5-c7)[8] */
4141 
          /* tmp2k is the even-part term shared by output rows k and 15-k:
           * tmp20 -> rows 0/15, tmp21 -> 1/14, ... tmp27 -> 7/8.
           */
4142     tmp20 = tmp10 + tmp0;
4143     tmp27 = tmp10 - tmp0;
4144     tmp21 = tmp12 + tmp1;
4145     tmp26 = tmp12 - tmp1;
4146     tmp22 = tmp13 + tmp2;
4147     tmp25 = tmp13 - tmp2;
4148     tmp23 = tmp11 + tmp3;
4149     tmp24 = tmp11 - tmp3;
4150 
4151     /* Odd part */
4152 
          /* z1..z4 start out as dequantized rows 1, 3, 5 and 7.  The odd-part
           * terms for output rows 0..7 are accumulated in
           * tmp0, tmp1, tmp2, tmp3, tmp10, tmp11, tmp12, tmp13 respectively.
           */
4153     z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
4154     z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
4155     z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
4156     z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
4157 
4158     tmp11 = z1 + z3;
4159 
4160     tmp1  = MULTIPLY(z1 + z2, FIX(1.353318001));   /* c3 */
4161     tmp2  = MULTIPLY(tmp11,   FIX(1.247225013));   /* c5 */
4162     tmp3  = MULTIPLY(z1 + z4, FIX(1.093201867));   /* c7 */
4163     tmp10 = MULTIPLY(z1 - z4, FIX(0.897167586));   /* c9 */
4164     tmp11 = MULTIPLY(tmp11,   FIX(0.666655658));   /* c11 */
4165     tmp12 = MULTIPLY(z1 - z2, FIX(0.410524528));   /* c13 */
4166     tmp0  = tmp1 + tmp2 + tmp3 -
4167 	    MULTIPLY(z1, FIX(2.286341144));        /* c7+c5+c3-c1 */
4168     tmp13 = tmp10 + tmp11 + tmp12 -
4169 	    MULTIPLY(z1, FIX(1.835730603));        /* c9+c11+c13-c15 */
          /* Row 1 is not needed any more: z1 serves as scratch from here on. */
4170     z1    = MULTIPLY(z2 + z3, FIX(0.138617169));   /* c15 */
4171     tmp1  += z1 + MULTIPLY(z2, FIX(0.071888074));  /* c9+c11-c3-c15 */
4172     tmp2  += z1 - MULTIPLY(z3, FIX(1.125726048));  /* c5+c7+c15-c3 */
4173     z1    = MULTIPLY(z3 - z2, FIX(1.407403738));   /* c1 */
4174     tmp11 += z1 - MULTIPLY(z3, FIX(0.766367282));  /* c1+c11-c9-c13 */
4175     tmp12 += z1 + MULTIPLY(z2, FIX(1.971951411));  /* c1+c5+c13-c7 */
          /* z2 becomes row 3 + row 7; it is consumed by the next two
           * products and then turned into scratch as well.
           */
4176     z2    += z4;
4177     z1    = MULTIPLY(z2, - FIX(0.666655658));      /* -c11 */
4178     tmp1  += z1;
4179     tmp3  += z1 + MULTIPLY(z4, FIX(1.065388962));  /* c3+c11+c15-c7 */
4180     z2    = MULTIPLY(z2, - FIX(1.247225013));      /* -c5 */
4181     tmp10 += z2 + MULTIPLY(z4, FIX(3.141271809));  /* c1+c5+c9-c13 */
4182     tmp12 += z2;
4183     z2    = MULTIPLY(z3 + z4, - FIX(1.353318001)); /* -c3 */
4184     tmp2  += z2;
4185     tmp3  += z2;
4186     z2    = MULTIPLY(z4 - z3, FIX(0.410524528));   /* c13 */
4187     tmp10 += z2;
4188     tmp11 += z2;
4189 
4190     /* Final output stage */
4191 
          /* Even + odd gives work-array row k, even - odd gives row 15-k.
           * The shift by CONST_BITS-PASS1_BITS leaves the PASS1_BITS scaling
           * in place; pass 2 removes it.
           */
4192     wsptr[8*0]  = (int) RIGHT_SHIFT(tmp20 + tmp0,  CONST_BITS-PASS1_BITS);
4193     wsptr[8*15] = (int) RIGHT_SHIFT(tmp20 - tmp0,  CONST_BITS-PASS1_BITS);
4194     wsptr[8*1]  = (int) RIGHT_SHIFT(tmp21 + tmp1,  CONST_BITS-PASS1_BITS);
4195     wsptr[8*14] = (int) RIGHT_SHIFT(tmp21 - tmp1,  CONST_BITS-PASS1_BITS);
4196     wsptr[8*2]  = (int) RIGHT_SHIFT(tmp22 + tmp2,  CONST_BITS-PASS1_BITS);
4197     wsptr[8*13] = (int) RIGHT_SHIFT(tmp22 - tmp2,  CONST_BITS-PASS1_BITS);
4198     wsptr[8*3]  = (int) RIGHT_SHIFT(tmp23 + tmp3,  CONST_BITS-PASS1_BITS);
4199     wsptr[8*12] = (int) RIGHT_SHIFT(tmp23 - tmp3,  CONST_BITS-PASS1_BITS);
4200     wsptr[8*4]  = (int) RIGHT_SHIFT(tmp24 + tmp10, CONST_BITS-PASS1_BITS);
4201     wsptr[8*11] = (int) RIGHT_SHIFT(tmp24 - tmp10, CONST_BITS-PASS1_BITS);
4202     wsptr[8*5]  = (int) RIGHT_SHIFT(tmp25 + tmp11, CONST_BITS-PASS1_BITS);
4203     wsptr[8*10] = (int) RIGHT_SHIFT(tmp25 - tmp11, CONST_BITS-PASS1_BITS);
4204     wsptr[8*6]  = (int) RIGHT_SHIFT(tmp26 + tmp12, CONST_BITS-PASS1_BITS);
4205     wsptr[8*9]  = (int) RIGHT_SHIFT(tmp26 - tmp12, CONST_BITS-PASS1_BITS);
4206     wsptr[8*7]  = (int) RIGHT_SHIFT(tmp27 + tmp13, CONST_BITS-PASS1_BITS);
4207     wsptr[8*8]  = (int) RIGHT_SHIFT(tmp27 - tmp13, CONST_BITS-PASS1_BITS);
4208   }
4209 
4210   /* Pass 2: process rows from work array, store into output array.
4211    * Note that we must descale the results by a factor of 8 == 2**3,
4212    * and also undo the PASS1_BITS scaling.
4213    * 8-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/16).
4214    */
4215 
4216   wsptr = workspace;
4217   for (ctr = 0; ctr < 16; ctr++) {
4218     outptr = output_buf[ctr] + output_col;
4219 
4220     /* Even part: reverse the even part of the forward DCT.
4221      * The rotator is c(-6).
4222      */
4223 
4224     /* Add range center and fudge factor for final descale and range-limit. */
4225     z2 = (INT32) wsptr[0] +
4226 	   ((((INT32) RANGE_CENTER) << (PASS1_BITS+3)) +
4227 	    (ONE << (PASS1_BITS+2)));
4228     z3 = (INT32) wsptr[4];
4229 
4230     tmp0 = (z2 + z3) << CONST_BITS;
4231     tmp1 = (z2 - z3) << CONST_BITS;
4232 
4233     z2 = (INT32) wsptr[2];
4234     z3 = (INT32) wsptr[6];
4235 
4236     z1 = MULTIPLY(z2 + z3, FIX_0_541196100);       /* c6 */
4237     tmp2 = z1 + MULTIPLY(z2, FIX_0_765366865);     /* c2-c6 */
4238     tmp3 = z1 - MULTIPLY(z3, FIX_1_847759065);     /* c2+c6 */
4239 
4240     tmp10 = tmp0 + tmp2;
4241     tmp13 = tmp0 - tmp2;
4242     tmp11 = tmp1 + tmp3;
4243     tmp12 = tmp1 - tmp3;
4244 
4245     /* Odd part per figure 8; the matrix is unitary and hence its
4246      * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
4247      */
4248 
          /* tmp0..tmp3 are reused here for the odd inputs (elements 7,5,3,1). */
4249     tmp0 = (INT32) wsptr[7];
4250     tmp1 = (INT32) wsptr[5];
4251     tmp2 = (INT32) wsptr[3];
4252     tmp3 = (INT32) wsptr[1];
4253 
4254     z2 = tmp0 + tmp2;
4255     z3 = tmp1 + tmp3;
4256 
4257     z1 = MULTIPLY(z2 + z3, FIX_1_175875602);       /*  c3 */
4258     z2 = MULTIPLY(z2, - FIX_1_961570560);          /* -c3-c5 */
4259     z3 = MULTIPLY(z3, - FIX_0_390180644);          /* -c3+c5 */
4260     z2 += z1;
4261     z3 += z1;
4262 
4263     z1 = MULTIPLY(tmp0 + tmp3, - FIX_0_899976223); /* -c3+c7 */
4264     tmp0 = MULTIPLY(tmp0, FIX_0_298631336);        /* -c1+c3+c5-c7 */
4265     tmp3 = MULTIPLY(tmp3, FIX_1_501321110);        /*  c1+c3-c5-c7 */
4266     tmp0 += z1 + z2;
4267     tmp3 += z1 + z3;
4268 
4269     z1 = MULTIPLY(tmp1 + tmp2, - FIX_2_562915447); /* -c1-c3 */
4270     tmp1 = MULTIPLY(tmp1, FIX_2_053119869);        /*  c1+c3-c5+c7 */
4271     tmp2 = MULTIPLY(tmp2, FIX_3_072711026);        /*  c1+c3+c5-c7 */
4272     tmp1 += z1 + z3;
4273     tmp2 += z1 + z2;
4274 
4275     /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
4276 
          /* Each descaled value is masked with RANGE_MASK and used as an
           * index into the range-limit table to obtain the output sample.
           */
4277     outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp3,
4278 					      CONST_BITS+PASS1_BITS+3)
4279 			    & RANGE_MASK];
4280     outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp3,
4281 					      CONST_BITS+PASS1_BITS+3)
4282 			    & RANGE_MASK];
4283     outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp2,
4284 					      CONST_BITS+PASS1_BITS+3)
4285 			    & RANGE_MASK];
4286     outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp2,
4287 					      CONST_BITS+PASS1_BITS+3)
4288 			    & RANGE_MASK];
4289     outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp1,
4290 					      CONST_BITS+PASS1_BITS+3)
4291 			    & RANGE_MASK];
4292     outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp1,
4293 					      CONST_BITS+PASS1_BITS+3)
4294 			    & RANGE_MASK];
4295     outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp13 + tmp0,
4296 					      CONST_BITS+PASS1_BITS+3)
4297 			    & RANGE_MASK];
4298     outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp13 - tmp0,
4299 					      CONST_BITS+PASS1_BITS+3)
4300 			    & RANGE_MASK];
4301 
          /* NOTE(review): pass 1 stores with a row stride of 8; stepping by
           * DCTSIZE here assumes DCTSIZE == 8 -- confirm against its definition.
           */
4302     wsptr += DCTSIZE;		/* advance pointer to next row */
4303   }
4304 }
4305 
4306 
4307 /*
4308  * Perform dequantization and inverse DCT on one block of coefficients,
4309  * producing a 7x14 output block (7 samples wide, 14 rows high).
4310  *
4311  * 14-point IDCT in pass 1 (columns), 7-point in pass 2 (rows).
      *
      * cinfo      - supplies the range-limit table via IDCT_range_limit().
      * compptr    - its dct_table holds the multipliers used to dequantize.
      * coef_block - input coefficients; rows 0..7 of each of the first
      *              7 columns are read.
      * output_buf - output sample rows; rows 0..13 are written.
      * output_col - offset of the first of the 7 samples written per row.
4312  */
4313 
4314 GLOBAL(void)
4315 jpeg_idct_7x14 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
4316 		JCOEFPTR coef_block,
4317 		JSAMPARRAY output_buf, JDIMENSION output_col)
4318 {
4319   INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
4320   INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26;
4321   INT32 z1, z2, z3, z4;
4322   JCOEFPTR inptr;
4323   ISLOW_MULT_TYPE * quantptr;
4324   int * wsptr;
4325   JSAMPROW outptr;
4326   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
4327   int ctr;
4328   int workspace[7*14];	/* buffers data between passes */
4329   SHIFT_TEMPS
4330 
4331   /* Pass 1: process columns from input, store into work array.
4332    * 14-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/28).
        * Each iteration handles one column: it reads 8 coefficients
        * (stride DCTSIZE) and writes 14 work-array values (stride 7).
4333    */
4334 
4335   inptr = coef_block;
4336   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
4337   wsptr = workspace;
4338   for (ctr = 0; ctr < 7; ctr++, inptr++, quantptr++, wsptr++) {
4339     /* Even part */
4340 
4341     z1 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
4342     z1 <<= CONST_BITS;
4343     /* Add fudge factor here for final descale. */
4344     z1 += ONE << (CONST_BITS-PASS1_BITS-1);
          /* Row 4 is multiplied by three different constants; z4 itself is
           * overwritten by the last product.
           */
4345     z4 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
4346     z2 = MULTIPLY(z4, FIX(1.274162392));         /* c4 */
4347     z3 = MULTIPLY(z4, FIX(0.314692123));         /* c12 */
4348     z4 = MULTIPLY(z4, FIX(0.881747734));         /* c8 */
4349 
4350     tmp10 = z1 + z2;
4351     tmp11 = z1 + z3;
4352     tmp12 = z1 - z4;
4353 
          /* tmp23 (even term of work-array rows 3 and 10) is descaled right
           * here, unlike the other tmp2x values, which are descaled in the
           * final output stage.
           */
4354     tmp23 = RIGHT_SHIFT(z1 - ((z2 + z3 - z4) << 1), /* c0 = (c4+c12-c8)*2 */
4355 			CONST_BITS-PASS1_BITS);
4356 
4357     z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
4358     z2 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
4359 
4360     z3 = MULTIPLY(z1 + z2, FIX(1.105676686));    /* c6 */
4361 
4362     tmp13 = z3 + MULTIPLY(z1, FIX(0.273079590)); /* c2-c6 */
4363     tmp14 = z3 - MULTIPLY(z2, FIX(1.719280954)); /* c6+c10 */
4364     tmp15 = MULTIPLY(z1, FIX(0.613604268)) -     /* c10 */
4365 	    MULTIPLY(z2, FIX(1.378756276));      /* c2 */
4366 
          /* tmp2k is the even-part term shared by work-array rows k and 13-k. */
4367     tmp20 = tmp10 + tmp13;
4368     tmp26 = tmp10 - tmp13;
4369     tmp21 = tmp11 + tmp14;
4370     tmp25 = tmp11 - tmp14;
4371     tmp22 = tmp12 + tmp15;
4372     tmp24 = tmp12 - tmp15;
4373 
4374     /* Odd part */
4375 
          /* z1..z4 start out as dequantized rows 1, 3, 5 and 7.  tmp1k is
           * reused to accumulate the odd-part term of work-array rows k and
           * 13-k.  Until it is reassigned at the end of this section, tmp13
           * temporarily holds row 7 scaled up by CONST_BITS.
           */
4376     z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
4377     z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
4378     z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
4379     z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
4380     tmp13 = z4 << CONST_BITS;
4381 
4382     tmp14 = z1 + z3;
4383     tmp11 = MULTIPLY(z1 + z2, FIX(1.334852607));           /* c3 */
4384     tmp12 = MULTIPLY(tmp14, FIX(1.197448846));             /* c5 */
4385     tmp10 = tmp11 + tmp12 + tmp13 - MULTIPLY(z1, FIX(1.126980169)); /* c3+c5-c1 */
4386     tmp14 = MULTIPLY(tmp14, FIX(0.752406978));             /* c9 */
4387     tmp16 = tmp14 - MULTIPLY(z1, FIX(1.061150426));        /* c9+c11-c13 */
          /* z1 becomes row 1 - row 3 ... */
4388     z1    -= z2;
4389     tmp15 = MULTIPLY(z1, FIX(0.467085129)) - tmp13;        /* c11 */
4390     tmp16 += tmp15;
          /* ... and then row 1 - row 3 + row 7.  This must happen before z4
           * is turned into scratch on the next line.
           */
4391     z1    += z4;
4392     z4    = MULTIPLY(z2 + z3, - FIX(0.158341681)) - tmp13; /* -c13 */
4393     tmp11 += z4 - MULTIPLY(z2, FIX(0.424103948));          /* c3-c9-c13 */
4394     tmp12 += z4 - MULTIPLY(z3, FIX(2.373959773));          /* c3+c5-c13 */
4395     z4    = MULTIPLY(z3 - z2, FIX(1.405321284));           /* c1 */
4396     tmp14 += z4 + tmp13 - MULTIPLY(z3, FIX(1.6906431334)); /* c1+c9-c11 */
4397     tmp15 += z4 + MULTIPLY(z2, FIX(0.674957567));          /* c1+c11-c5 */
4398 
          /* Odd term of rows 3 and 10: (row1 - row3 - row5 + row7), scaled by
           * PASS1_BITS only so that it matches the already descaled tmp23.
           */
4399     tmp13 = (z1 - z3) << PASS1_BITS;
4400 
4401     /* Final output stage */
4402 
          /* Even + odd gives work-array row k, even - odd gives row 13-k.
           * Rows 3 and 10 need no shift because both of their terms are
           * already at PASS1_BITS scale.
           */
4403     wsptr[7*0]  = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
4404     wsptr[7*13] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
4405     wsptr[7*1]  = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
4406     wsptr[7*12] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
4407     wsptr[7*2]  = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
4408     wsptr[7*11] = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
4409     wsptr[7*3]  = (int) (tmp23 + tmp13);
4410     wsptr[7*10] = (int) (tmp23 - tmp13);
4411     wsptr[7*4]  = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
4412     wsptr[7*9]  = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
4413     wsptr[7*5]  = (int) RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS-PASS1_BITS);
4414     wsptr[7*8]  = (int) RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS-PASS1_BITS);
4415     wsptr[7*6]  = (int) RIGHT_SHIFT(tmp26 + tmp16, CONST_BITS-PASS1_BITS);
4416     wsptr[7*7]  = (int) RIGHT_SHIFT(tmp26 - tmp16, CONST_BITS-PASS1_BITS);
4417   }
4418 
4419   /* Pass 2: process 14 rows from work array, store into output array.
4420    * 7-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/14).
4421    */
4422 
4423   wsptr = workspace;
4424   for (ctr = 0; ctr < 14; ctr++) {
4425     outptr = output_buf[ctr] + output_col;
4426 
4427     /* Even part */
4428 
4429     /* Add range center and fudge factor for final descale and range-limit. */
4430     tmp23 = (INT32) wsptr[0] +
4431 	      ((((INT32) RANGE_CENTER) << (PASS1_BITS+3)) +
4432 	       (ONE << (PASS1_BITS+2)));
4433     tmp23 <<= CONST_BITS;
4434 
4435     z1 = (INT32) wsptr[2];
4436     z2 = (INT32) wsptr[4];
4437     z3 = (INT32) wsptr[6];
4438 
4439     tmp20 = MULTIPLY(z2 - z3, FIX(0.881747734));       /* c4 */
4440     tmp22 = MULTIPLY(z1 - z2, FIX(0.314692123));       /* c6 */
4441     tmp21 = tmp20 + tmp22 + tmp23 - MULTIPLY(z2, FIX(1.841218003)); /* c2+c4-c6 */
4442     tmp10 = z1 + z3;
          /* z2 becomes element 4 - element 2 - element 6 (used for tmp23). */
4443     z2 -= tmp10;
4444     tmp10 = MULTIPLY(tmp10, FIX(1.274162392)) + tmp23; /* c2 */
4445     tmp20 += tmp10 - MULTIPLY(z3, FIX(0.077722536));   /* c2-c4-c6 */
4446     tmp22 += tmp10 - MULTIPLY(z1, FIX(2.470602249));   /* c2+c4+c6 */
4447     tmp23 += MULTIPLY(z2, FIX(1.414213562));           /* c0 */
4448 
4449     /* Odd part */
4450 
4451     z1 = (INT32) wsptr[1];
4452     z2 = (INT32) wsptr[3];
4453     z3 = (INT32) wsptr[5];
4454 
4455     tmp11 = MULTIPLY(z1 + z2, FIX(0.935414347));       /* (c3+c1-c5)/2 */
4456     tmp12 = MULTIPLY(z1 - z2, FIX(0.170262339));       /* (c3+c5-c1)/2 */
4457     tmp10 = tmp11 - tmp12;
4458     tmp11 += tmp12;
4459     tmp12 = MULTIPLY(z2 + z3, - FIX(1.378756276));     /* -c1 */
4460     tmp11 += tmp12;
4461     z2 = MULTIPLY(z1 + z3, FIX(0.613604268));          /* c5 */
4462     tmp10 += z2;
4463     tmp12 += z2 + MULTIPLY(z3, FIX(1.870828693));      /* c3+c1-c5 */
4464 
4465     /* Final output stage */
4466 
          /* Samples k and 6-k share the even term tmp2k and differ in the
           * sign of the odd term tmp1k; the middle sample (3) is built from
           * the even term tmp23 alone.
           */
4467     outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
4468 					      CONST_BITS+PASS1_BITS+3)
4469 			    & RANGE_MASK];
4470     outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
4471 					      CONST_BITS+PASS1_BITS+3)
4472 			    & RANGE_MASK];
4473     outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
4474 					      CONST_BITS+PASS1_BITS+3)
4475 			    & RANGE_MASK];
4476     outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
4477 					      CONST_BITS+PASS1_BITS+3)
4478 			    & RANGE_MASK];
4479     outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
4480 					      CONST_BITS+PASS1_BITS+3)
4481 			    & RANGE_MASK];
4482     outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
4483 					      CONST_BITS+PASS1_BITS+3)
4484 			    & RANGE_MASK];
4485     outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp23,
4486 					      CONST_BITS+PASS1_BITS+3)
4487 			    & RANGE_MASK];
4488 
4489     wsptr += 7;		/* advance pointer to next row */
4490   }
4491 }
4492 
4493 
4494 /*
4495  * Perform dequantization and inverse DCT on one block of coefficients,
4496  * producing a 6x12 output block (6 samples wide, 12 rows high).
4497  *
4498  * 12-point IDCT in pass 1 (columns), 6-point in pass 2 (rows).
      *
      * cinfo      - supplies the range-limit table via IDCT_range_limit().
      * compptr    - its dct_table holds the multipliers used to dequantize.
      * coef_block - input coefficients; rows 0..7 of each of the first
      *              6 columns are read.
      * output_buf - output sample rows; rows 0..11 are written.
      * output_col - offset of the first of the 6 samples written per row.
4499  */
4500 
4501 GLOBAL(void)
4502 jpeg_idct_6x12 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
4503 		JCOEFPTR coef_block,
4504 		JSAMPARRAY output_buf, JDIMENSION output_col)
4505 {
4506   INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
4507   INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25;
4508   INT32 z1, z2, z3, z4;
4509   JCOEFPTR inptr;
4510   ISLOW_MULT_TYPE * quantptr;
4511   int * wsptr;
4512   JSAMPROW outptr;
4513   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
4514   int ctr;
4515   int workspace[6*12];	/* buffers data between passes */
4516   SHIFT_TEMPS
4517 
4518   /* Pass 1: process columns from input, store into work array.
4519    * 12-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/24).
        * Each iteration handles one column: it reads 8 coefficients
        * (stride DCTSIZE) and writes 12 work-array values (stride 6).
4520    */
4521 
4522   inptr = coef_block;
4523   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
4524   wsptr = workspace;
4525   for (ctr = 0; ctr < 6; ctr++, inptr++, quantptr++, wsptr++) {
4526     /* Even part */
4527 
4528     z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
4529     z3 <<= CONST_BITS;
4530     /* Add fudge factor here for final descale. */
4531     z3 += ONE << (CONST_BITS-PASS1_BITS-1);
4532 
4533     z4 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
4534     z4 = MULTIPLY(z4, FIX(1.224744871)); /* c4 */
4535 
4536     tmp10 = z3 + z4;
4537     tmp11 = z3 - z4;
4538 
          /* The product with row 2 must be taken before z1 is scaled up:
           * afterwards z1 and z2 hold rows 2 and 6 shifted left by
           * CONST_BITS, matching the scale of the MULTIPLY() products.
           */
4539     z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
4540     z4 = MULTIPLY(z1, FIX(1.366025404)); /* c2 */
4541     z1 <<= CONST_BITS;
4542     z2 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
4543     z2 <<= CONST_BITS;
4544 
          /* tmp12 is a scratch value that is recomputed three times below.
           * tmp2k is the even-part term shared by work-array rows k and 11-k.
           */
4545     tmp12 = z1 - z2;
4546 
4547     tmp21 = z3 + tmp12;
4548     tmp24 = z3 - tmp12;
4549 
4550     tmp12 = z4 + z2;
4551 
4552     tmp20 = tmp10 + tmp12;
4553     tmp25 = tmp10 - tmp12;
4554 
4555     tmp12 = z4 - z1 - z2;
4556 
4557     tmp22 = tmp11 + tmp12;
4558     tmp23 = tmp11 - tmp12;
4559 
4560     /* Odd part */
4561 
          /* z1..z4 start out as dequantized rows 1, 3, 5 and 7.  tmp1k is
           * reused for the odd-part term of work-array rows k and 11-k.
           */
4562     z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
4563     z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
4564     z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
4565     z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
4566 
          /* tmp11 and tmp14 first carry two products of row 3 that feed
           * tmp10, tmp12, tmp13 and tmp15; they receive their final values
           * at the end of this section.
           */
4567     tmp11 = MULTIPLY(z2, FIX(1.306562965));                  /* c3 */
4568     tmp14 = MULTIPLY(z2, - FIX_0_541196100);                 /* -c9 */
4569 
4570     tmp10 = z1 + z3;
4571     tmp15 = MULTIPLY(tmp10 + z4, FIX(0.860918669));          /* c7 */
4572     tmp12 = tmp15 + MULTIPLY(tmp10, FIX(0.261052384));       /* c5-c7 */
4573     tmp10 = tmp12 + tmp11 + MULTIPLY(z1, FIX(0.280143716));  /* c1-c5 */
4574     tmp13 = MULTIPLY(z3 + z4, - FIX(1.045510580));           /* -(c7+c11) */
4575     tmp12 += tmp13 + tmp14 - MULTIPLY(z3, FIX(1.478575242)); /* c1+c5-c7-c11 */
4576     tmp13 += tmp15 - tmp11 + MULTIPLY(z4, FIX(1.586706681)); /* c1+c11 */
4577     tmp15 += tmp14 - MULTIPLY(z1, FIX(0.676326758)) -        /* c7-c11 */
4578 	     MULTIPLY(z4, FIX(1.982889723));                 /* c5+c7 */
4579 
          /* z1 becomes row 1 - row 7 and z2 becomes row 3 - row 5; z3 is
           * then reused as scratch.
           */
4580     z1 -= z4;
4581     z2 -= z3;
4582     z3 = MULTIPLY(z1 + z2, FIX_0_541196100);                 /* c9 */
4583     tmp11 = z3 + MULTIPLY(z1, FIX_0_765366865);              /* c3-c9 */
4584     tmp14 = z3 - MULTIPLY(z2, FIX_1_847759065);              /* c3+c9 */
4585 
4586     /* Final output stage */
4587 
          /* Even + odd gives work-array row k, even - odd gives row 11-k.
           * The shift by CONST_BITS-PASS1_BITS leaves the PASS1_BITS scaling
           * in place; pass 2 removes it in its own final shift.
           */
4588     wsptr[6*0]  = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
4589     wsptr[6*11] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
4590     wsptr[6*1]  = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
4591     wsptr[6*10] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
4592     wsptr[6*2]  = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
4593     wsptr[6*9]  = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
4594     wsptr[6*3]  = (int) RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS-PASS1_BITS);
4595     wsptr[6*8]  = (int) RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS-PASS1_BITS);
4596     wsptr[6*4]  = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
4597     wsptr[6*7]  = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
4598     wsptr[6*5]  = (int) RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS-PASS1_BITS);
4599     wsptr[6*6]  = (int) RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS-PASS1_BITS);
4600   }
4601 
4602   /* Pass 2: process 12 rows from work array, store into output array.
4603    * 6-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/12).
4604    */
4605 
4606   wsptr = workspace;
4607   for (ctr = 0; ctr < 12; ctr++) {
4608     outptr = output_buf[ctr] + output_col;
4609 
4610     /* Even part */
4611 
4612     /* Add range center and fudge factor for final descale and range-limit. */
4613     tmp10 = (INT32) wsptr[0] +
4614 	      ((((INT32) RANGE_CENTER) << (PASS1_BITS+3)) +
4615 	       (ONE << (PASS1_BITS+2)));
4616     tmp10 <<= CONST_BITS;
4617     tmp12 = (INT32) wsptr[4];
4618     tmp20 = MULTIPLY(tmp12, FIX(0.707106781));   /* c4 */
4619     tmp11 = tmp10 + tmp20;
4620     tmp21 = tmp10 - tmp20 - tmp20;
          /* tmp20 and tmp10 are reused: first as the raw element 2 and its
           * product, then tmp20 as the final even term for samples 0 and 5.
           */
4621     tmp20 = (INT32) wsptr[2];
4622     tmp10 = MULTIPLY(tmp20, FIX(1.224744871));   /* c2 */
4623     tmp20 = tmp11 + tmp10;
4624     tmp22 = tmp11 - tmp10;
4625 
4626     /* Odd part */
4627 
4628     z1 = (INT32) wsptr[1];
4629     z2 = (INT32) wsptr[3];
4630     z3 = (INT32) wsptr[5];
4631     tmp11 = MULTIPLY(z1 + z3, FIX(0.366025404)); /* c5 */
4632     tmp10 = tmp11 + ((z1 + z2) << CONST_BITS);
4633     tmp12 = tmp11 + ((z3 - z2) << CONST_BITS);
4634     tmp11 = (z1 - z2 - z3) << CONST_BITS;
4635 
4636     /* Final output stage */
4637 
          /* Samples k and 5-k share the even term tmp2k and differ in the
           * sign of the odd term tmp1k.
           */
4638     outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
4639 					      CONST_BITS+PASS1_BITS+3)
4640 			    & RANGE_MASK];
4641     outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
4642 					      CONST_BITS+PASS1_BITS+3)
4643 			    & RANGE_MASK];
4644     outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
4645 					      CONST_BITS+PASS1_BITS+3)
4646 			    & RANGE_MASK];
4647     outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
4648 					      CONST_BITS+PASS1_BITS+3)
4649 			    & RANGE_MASK];
4650     outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
4651 					      CONST_BITS+PASS1_BITS+3)
4652 			    & RANGE_MASK];
4653     outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
4654 					      CONST_BITS+PASS1_BITS+3)
4655 			    & RANGE_MASK];
4656 
4657     wsptr += 6;		/* advance pointer to next row */
4658   }
4659 }
4660 
4661 
4662 /*
4663  * Perform dequantization and inverse DCT on one block of coefficients,
4664  * producing a 5x10 output block.
4665  *
4666  * 10-point IDCT in pass 1 (columns), 5-point in pass 2 (rows).
4667  */
4668 
4669 GLOBAL(void)
4670 jpeg_idct_5x10 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
4671 		JCOEFPTR coef_block,
4672 		JSAMPARRAY output_buf, JDIMENSION output_col)
4673 {
4674   INT32 tmp10, tmp11, tmp12, tmp13, tmp14;
4675   INT32 tmp20, tmp21, tmp22, tmp23, tmp24;
4676   INT32 z1, z2, z3, z4, z5;
4677   JCOEFPTR inptr;
4678   ISLOW_MULT_TYPE * quantptr;
4679   int * wsptr;
4680   JSAMPROW outptr;
4681   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
4682   int ctr;
4683   int workspace[5*10];	/* buffers data between passes */
4684   SHIFT_TEMPS
4685 
4686   /* Pass 1: process columns from input, store into work array.
4687    * 10-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/20).
4688    */
4689 
4690   inptr = coef_block;
4691   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
4692   wsptr = workspace;
4693   for (ctr = 0; ctr < 5; ctr++, inptr++, quantptr++, wsptr++) {
4694     /* Even part */
4695 
4696     z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
4697     z3 <<= CONST_BITS;
4698     /* Add fudge factor here for final descale. */
4699     z3 += ONE << (CONST_BITS-PASS1_BITS-1);
4700     z4 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
4701     z1 = MULTIPLY(z4, FIX(1.144122806));         /* c4 */
4702     z2 = MULTIPLY(z4, FIX(0.437016024));         /* c8 */
4703     tmp10 = z3 + z1;
4704     tmp11 = z3 - z2;
4705 
4706     tmp22 = RIGHT_SHIFT(z3 - ((z1 - z2) << 1),   /* c0 = (c4-c8)*2 */
4707 			CONST_BITS-PASS1_BITS);
4708 
4709     z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
4710     z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
4711 
4712     z1 = MULTIPLY(z2 + z3, FIX(0.831253876));    /* c6 */
4713     tmp12 = z1 + MULTIPLY(z2, FIX(0.513743148)); /* c2-c6 */
4714     tmp13 = z1 - MULTIPLY(z3, FIX(2.176250899)); /* c2+c6 */
4715 
4716     tmp20 = tmp10 + tmp12;
4717     tmp24 = tmp10 - tmp12;
4718     tmp21 = tmp11 + tmp13;
4719     tmp23 = tmp11 - tmp13;
4720 
4721     /* Odd part */
4722 
4723     z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
4724     z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
4725     z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
4726     z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
4727 
4728     tmp11 = z2 + z4;
4729     tmp13 = z2 - z4;
4730 
4731     tmp12 = MULTIPLY(tmp13, FIX(0.309016994));        /* (c3-c7)/2 */
4732     z5 = z3 << CONST_BITS;
4733 
4734     z2 = MULTIPLY(tmp11, FIX(0.951056516));           /* (c3+c7)/2 */
4735     z4 = z5 + tmp12;
4736 
4737     tmp10 = MULTIPLY(z1, FIX(1.396802247)) + z2 + z4; /* c1 */
4738     tmp14 = MULTIPLY(z1, FIX(0.221231742)) - z2 + z4; /* c9 */
4739 
4740     z2 = MULTIPLY(tmp11, FIX(0.587785252));           /* (c1-c9)/2 */
4741     z4 = z5 - tmp12 - (tmp13 << (CONST_BITS - 1));
4742 
4743     tmp12 = (z1 - tmp13 - z3) << PASS1_BITS;
4744 
4745     tmp11 = MULTIPLY(z1, FIX(1.260073511)) - z2 - z4; /* c3 */
4746     tmp13 = MULTIPLY(z1, FIX(0.642039522)) - z2 + z4; /* c7 */
4747 
4748     /* Final output stage */
4749 
4750     wsptr[5*0] = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
4751     wsptr[5*9] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
4752     wsptr[5*1] = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
4753     wsptr[5*8] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
4754     wsptr[5*2] = (int) (tmp22 + tmp12);
4755     wsptr[5*7] = (int) (tmp22 - tmp12);
4756     wsptr[5*3] = (int) RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS-PASS1_BITS);
4757     wsptr[5*6] = (int) RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS-PASS1_BITS);
4758     wsptr[5*4] = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
4759     wsptr[5*5] = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
4760   }
4761 
4762   /* Pass 2: process 10 rows from work array, store into output array.
4763    * 5-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/10).
4764    */
4765 
4766   wsptr = workspace;
4767   for (ctr = 0; ctr < 10; ctr++) {
4768     outptr = output_buf[ctr] + output_col;
4769 
4770     /* Even part */
4771 
4772     /* Add range center and fudge factor for final descale and range-limit. */
4773     tmp12 = (INT32) wsptr[0] +
4774 	      ((((INT32) RANGE_CENTER) << (PASS1_BITS+3)) +
4775 	       (ONE << (PASS1_BITS+2)));
4776     tmp12 <<= CONST_BITS;
4777     tmp13 = (INT32) wsptr[2];
4778     tmp14 = (INT32) wsptr[4];
4779     z1 = MULTIPLY(tmp13 + tmp14, FIX(0.790569415)); /* (c2+c4)/2 */
4780     z2 = MULTIPLY(tmp13 - tmp14, FIX(0.353553391)); /* (c2-c4)/2 */
4781     z3 = tmp12 + z2;
4782     tmp10 = z3 + z1;
4783     tmp11 = z3 - z1;
4784     tmp12 -= z2 << 2;
4785 
4786     /* Odd part */
4787 
4788     z2 = (INT32) wsptr[1];
4789     z3 = (INT32) wsptr[3];
4790 
4791     z1 = MULTIPLY(z2 + z3, FIX(0.831253876));       /* c3 */
4792     tmp13 = z1 + MULTIPLY(z2, FIX(0.513743148));    /* c1-c3 */
4793     tmp14 = z1 - MULTIPLY(z3, FIX(2.176250899));    /* c1+c3 */
4794 
4795     /* Final output stage */
4796 
4797     outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp13,
4798 					      CONST_BITS+PASS1_BITS+3)
4799 			    & RANGE_MASK];
4800     outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp13,
4801 					      CONST_BITS+PASS1_BITS+3)
4802 			    & RANGE_MASK];
4803     outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp14,
4804 					      CONST_BITS+PASS1_BITS+3)
4805 			    & RANGE_MASK];
4806     outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp14,
4807 					      CONST_BITS+PASS1_BITS+3)
4808 			    & RANGE_MASK];
4809     outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12,
4810 					      CONST_BITS+PASS1_BITS+3)
4811 			    & RANGE_MASK];
4812 
4813     wsptr += 5;		/* advance pointer to next row */
4814   }
4815 }
4816 
4817 
4818 /*
4819  * Perform dequantization and inverse DCT on one block of coefficients,
4820  * producing a 4x8 output block.
4821  *
4822  * 8-point IDCT in pass 1 (columns), 4-point in pass 2 (rows).
4823  */
4824 
4825 GLOBAL(void)
4826 jpeg_idct_4x8 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
4827 	       JCOEFPTR coef_block,
4828 	       JSAMPARRAY output_buf, JDIMENSION output_col)
4829 {
4830   INT32 tmp0, tmp1, tmp2, tmp3;
4831   INT32 tmp10, tmp11, tmp12, tmp13;
4832   INT32 z1, z2, z3;
4833   JCOEFPTR inptr;
4834   ISLOW_MULT_TYPE * quantptr;
4835   int * wsptr;
4836   JSAMPROW outptr;
4837   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
4838   int ctr;
4839   int workspace[4*8];	/* buffers data between passes */
4840   SHIFT_TEMPS
4841 
4842   /* Pass 1: process columns from input, store into work array.
4843    * Note results are scaled up by sqrt(8) compared to a true IDCT;
4844    * furthermore, we scale the results by 2**PASS1_BITS.
4845    * 8-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/16).
4846    */
4847 
4848   inptr = coef_block;
4849   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
4850   wsptr = workspace;
4851   for (ctr = 4; ctr > 0; ctr--) {
4852     /* Due to quantization, we will usually find that many of the input
4853      * coefficients are zero, especially the AC terms.  We can exploit this
4854      * by short-circuiting the IDCT calculation for any column in which all
4855      * the AC terms are zero.  In that case each output is equal to the
4856      * DC coefficient (with scale factor as needed).
4857      * With typical images and quantization tables, half or more of the
4858      * column DCT calculations can be simplified this way.
4859      */
4860 
4861     if (inptr[DCTSIZE*1] == 0 && inptr[DCTSIZE*2] == 0 &&
4862 	inptr[DCTSIZE*3] == 0 && inptr[DCTSIZE*4] == 0 &&
4863 	inptr[DCTSIZE*5] == 0 && inptr[DCTSIZE*6] == 0 &&
4864 	inptr[DCTSIZE*7] == 0) {
4865       /* AC terms all zero */
4866       int dcval = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) << PASS1_BITS;
4867 
4868       wsptr[4*0] = dcval;
4869       wsptr[4*1] = dcval;
4870       wsptr[4*2] = dcval;
4871       wsptr[4*3] = dcval;
4872       wsptr[4*4] = dcval;
4873       wsptr[4*5] = dcval;
4874       wsptr[4*6] = dcval;
4875       wsptr[4*7] = dcval;
4876 
4877       inptr++;			/* advance pointers to next column */
4878       quantptr++;
4879       wsptr++;
4880       continue;
4881     }
4882 
4883     /* Even part: reverse the even part of the forward DCT.
4884      * The rotator is c(-6).
4885      */
4886 
4887     z2 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
4888     z3 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
4889     z2 <<= CONST_BITS;
4890     z3 <<= CONST_BITS;
4891     /* Add fudge factor here for final descale. */
4892     z2 += ONE << (CONST_BITS-PASS1_BITS-1);
4893 
4894     tmp0 = z2 + z3;
4895     tmp1 = z2 - z3;
4896 
4897     z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
4898     z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
4899 
4900     z1 = MULTIPLY(z2 + z3, FIX_0_541196100);       /* c6 */
4901     tmp2 = z1 + MULTIPLY(z2, FIX_0_765366865);     /* c2-c6 */
4902     tmp3 = z1 - MULTIPLY(z3, FIX_1_847759065);     /* c2+c6 */
4903 
4904     tmp10 = tmp0 + tmp2;
4905     tmp13 = tmp0 - tmp2;
4906     tmp11 = tmp1 + tmp3;
4907     tmp12 = tmp1 - tmp3;
4908 
4909     /* Odd part per figure 8; the matrix is unitary and hence its
4910      * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
4911      */
4912 
4913     tmp0 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
4914     tmp1 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
4915     tmp2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
4916     tmp3 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
4917 
4918     z2 = tmp0 + tmp2;
4919     z3 = tmp1 + tmp3;
4920 
4921     z1 = MULTIPLY(z2 + z3, FIX_1_175875602);       /*  c3 */
4922     z2 = MULTIPLY(z2, - FIX_1_961570560);          /* -c3-c5 */
4923     z3 = MULTIPLY(z3, - FIX_0_390180644);          /* -c3+c5 */
4924     z2 += z1;
4925     z3 += z1;
4926 
4927     z1 = MULTIPLY(tmp0 + tmp3, - FIX_0_899976223); /* -c3+c7 */
4928     tmp0 = MULTIPLY(tmp0, FIX_0_298631336);        /* -c1+c3+c5-c7 */
4929     tmp3 = MULTIPLY(tmp3, FIX_1_501321110);        /*  c1+c3-c5-c7 */
4930     tmp0 += z1 + z2;
4931     tmp3 += z1 + z3;
4932 
4933     z1 = MULTIPLY(tmp1 + tmp2, - FIX_2_562915447); /* -c1-c3 */
4934     tmp1 = MULTIPLY(tmp1, FIX_2_053119869);        /*  c1+c3-c5+c7 */
4935     tmp2 = MULTIPLY(tmp2, FIX_3_072711026);        /*  c1+c3+c5-c7 */
4936     tmp1 += z1 + z3;
4937     tmp2 += z1 + z2;
4938 
4939     /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
4940 
4941     wsptr[4*0] = (int) RIGHT_SHIFT(tmp10 + tmp3, CONST_BITS-PASS1_BITS);
4942     wsptr[4*7] = (int) RIGHT_SHIFT(tmp10 - tmp3, CONST_BITS-PASS1_BITS);
4943     wsptr[4*1] = (int) RIGHT_SHIFT(tmp11 + tmp2, CONST_BITS-PASS1_BITS);
4944     wsptr[4*6] = (int) RIGHT_SHIFT(tmp11 - tmp2, CONST_BITS-PASS1_BITS);
4945     wsptr[4*2] = (int) RIGHT_SHIFT(tmp12 + tmp1, CONST_BITS-PASS1_BITS);
4946     wsptr[4*5] = (int) RIGHT_SHIFT(tmp12 - tmp1, CONST_BITS-PASS1_BITS);
4947     wsptr[4*3] = (int) RIGHT_SHIFT(tmp13 + tmp0, CONST_BITS-PASS1_BITS);
4948     wsptr[4*4] = (int) RIGHT_SHIFT(tmp13 - tmp0, CONST_BITS-PASS1_BITS);
4949 
4950     inptr++;			/* advance pointers to next column */
4951     quantptr++;
4952     wsptr++;
4953   }
4954 
4955   /* Pass 2: process 8 rows from work array, store into output array.
4956    * 4-point IDCT kernel,
4957    * cK represents sqrt(2) * cos(K*pi/16) [refers to 8-point IDCT].
4958    */
4959 
4960   wsptr = workspace;
4961   for (ctr = 0; ctr < 8; ctr++) {
4962     outptr = output_buf[ctr] + output_col;
4963 
4964     /* Even part */
4965 
4966     /* Add range center and fudge factor for final descale and range-limit. */
4967     tmp0 = (INT32) wsptr[0] +
4968 	     ((((INT32) RANGE_CENTER) << (PASS1_BITS+3)) +
4969 	      (ONE << (PASS1_BITS+2)));
4970     tmp2 = (INT32) wsptr[2];
4971 
4972     tmp10 = (tmp0 + tmp2) << CONST_BITS;
4973     tmp12 = (tmp0 - tmp2) << CONST_BITS;
4974 
4975     /* Odd part */
4976     /* Same rotation as in the even part of the 8x8 LL&M IDCT */
4977 
4978     z2 = (INT32) wsptr[1];
4979     z3 = (INT32) wsptr[3];
4980 
4981     z1 = MULTIPLY(z2 + z3, FIX_0_541196100);   /* c6 */
4982     tmp0 = z1 + MULTIPLY(z2, FIX_0_765366865); /* c2-c6 */
4983     tmp2 = z1 - MULTIPLY(z3, FIX_1_847759065); /* c2+c6 */
4984 
4985     /* Final output stage */
4986 
4987     outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
4988 					      CONST_BITS+PASS1_BITS+3)
4989 			    & RANGE_MASK];
4990     outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
4991 					      CONST_BITS+PASS1_BITS+3)
4992 			    & RANGE_MASK];
4993     outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp2,
4994 					      CONST_BITS+PASS1_BITS+3)
4995 			    & RANGE_MASK];
4996     outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp2,
4997 					      CONST_BITS+PASS1_BITS+3)
4998 			    & RANGE_MASK];
4999 
5000     wsptr += 4;		/* advance pointer to next row */
5001   }
5002 }
5003 
5004 
5005 /*
5006  * Perform dequantization and inverse DCT on one block of coefficients,
5007  * producing a reduced-size 3x6 output block.
5008  *
5009  * 6-point IDCT in pass 1 (columns), 3-point in pass 2 (rows).
5010  */
5011 
5012 GLOBAL(void)
5013 jpeg_idct_3x6 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
5014 	       JCOEFPTR coef_block,
5015 	       JSAMPARRAY output_buf, JDIMENSION output_col)
5016 {
5017   INT32 tmp0, tmp1, tmp2, tmp10, tmp11, tmp12;
5018   INT32 z1, z2, z3;
5019   JCOEFPTR inptr;
5020   ISLOW_MULT_TYPE * quantptr;
5021   int * wsptr;
5022   JSAMPROW outptr;
5023   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
5024   int ctr;
5025   int workspace[3*6];	/* buffers data between passes */
5026   SHIFT_TEMPS
5027 
5028   /* Pass 1: process columns from input, store into work array.
5029    * 6-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/12).
5030    */
5031 
5032   inptr = coef_block;
5033   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
5034   wsptr = workspace;
5035   for (ctr = 0; ctr < 3; ctr++, inptr++, quantptr++, wsptr++) {
5036     /* Even part */
5037 
5038     tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
5039     tmp0 <<= CONST_BITS;
5040     /* Add fudge factor here for final descale. */
5041     tmp0 += ONE << (CONST_BITS-PASS1_BITS-1);
5042     tmp2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
5043     tmp10 = MULTIPLY(tmp2, FIX(0.707106781));   /* c4 */
5044     tmp1 = tmp0 + tmp10;
5045     tmp11 = RIGHT_SHIFT(tmp0 - tmp10 - tmp10, CONST_BITS-PASS1_BITS);
5046     tmp10 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
5047     tmp0 = MULTIPLY(tmp10, FIX(1.224744871));   /* c2 */
5048     tmp10 = tmp1 + tmp0;
5049     tmp12 = tmp1 - tmp0;
5050 
5051     /* Odd part */
5052 
5053     z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
5054     z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
5055     z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
5056     tmp1 = MULTIPLY(z1 + z3, FIX(0.366025404)); /* c5 */
5057     tmp0 = tmp1 + ((z1 + z2) << CONST_BITS);
5058     tmp2 = tmp1 + ((z3 - z2) << CONST_BITS);
5059     tmp1 = (z1 - z2 - z3) << PASS1_BITS;
5060 
5061     /* Final output stage */
5062 
5063     wsptr[3*0] = (int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS-PASS1_BITS);
5064     wsptr[3*5] = (int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS-PASS1_BITS);
5065     wsptr[3*1] = (int) (tmp11 + tmp1);
5066     wsptr[3*4] = (int) (tmp11 - tmp1);
5067     wsptr[3*2] = (int) RIGHT_SHIFT(tmp12 + tmp2, CONST_BITS-PASS1_BITS);
5068     wsptr[3*3] = (int) RIGHT_SHIFT(tmp12 - tmp2, CONST_BITS-PASS1_BITS);
5069   }
5070 
5071   /* Pass 2: process 6 rows from work array, store into output array.
5072    * 3-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/6).
5073    */
5074 
5075   wsptr = workspace;
5076   for (ctr = 0; ctr < 6; ctr++) {
5077     outptr = output_buf[ctr] + output_col;
5078 
5079     /* Even part */
5080 
5081     /* Add range center and fudge factor for final descale and range-limit. */
5082     tmp0 = (INT32) wsptr[0] +
5083 	     ((((INT32) RANGE_CENTER) << (PASS1_BITS+3)) +
5084 	      (ONE << (PASS1_BITS+2)));
5085     tmp0 <<= CONST_BITS;
5086     tmp2 = (INT32) wsptr[2];
5087     tmp12 = MULTIPLY(tmp2, FIX(0.707106781)); /* c2 */
5088     tmp10 = tmp0 + tmp12;
5089     tmp2 = tmp0 - tmp12 - tmp12;
5090 
5091     /* Odd part */
5092 
5093     tmp12 = (INT32) wsptr[1];
5094     tmp0 = MULTIPLY(tmp12, FIX(1.224744871)); /* c1 */
5095 
5096     /* Final output stage */
5097 
5098     outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
5099 					      CONST_BITS+PASS1_BITS+3)
5100 			    & RANGE_MASK];
5101     outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
5102 					      CONST_BITS+PASS1_BITS+3)
5103 			    & RANGE_MASK];
5104     outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp2,
5105 					      CONST_BITS+PASS1_BITS+3)
5106 			    & RANGE_MASK];
5107 
5108     wsptr += 3;		/* advance pointer to next row */
5109   }
5110 }
5111 
5112 
5113 /*
5114  * Perform dequantization and inverse DCT on one block of coefficients,
5115  * producing a 2x4 output block.
5116  *
5117  * 4-point IDCT in pass 1 (columns), 2-point in pass 2 (rows).
5118  */
5119 
5120 GLOBAL(void)
5121 jpeg_idct_2x4 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
5122 	       JCOEFPTR coef_block,
5123 	       JSAMPARRAY output_buf, JDIMENSION output_col)
5124 {
5125   INT32 tmp0, tmp2, tmp10, tmp12;
5126   INT32 z1, z2, z3;
5127   JCOEFPTR inptr;
5128   ISLOW_MULT_TYPE * quantptr;
5129   INT32 * wsptr;
5130   JSAMPROW outptr;
5131   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
5132   int ctr;
5133   INT32 workspace[2*4];	/* buffers data between passes */
5134   SHIFT_TEMPS
5135 
5136   /* Pass 1: process columns from input, store into work array.
5137    * 4-point IDCT kernel,
5138    * cK represents sqrt(2) * cos(K*pi/16) [refers to 8-point IDCT].
5139    */
5140 
5141   inptr = coef_block;
5142   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
5143   wsptr = workspace;
5144   for (ctr = 0; ctr < 2; ctr++, inptr++, quantptr++, wsptr++) {
5145     /* Even part */
5146 
5147     tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
5148     tmp2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
5149 
5150     tmp10 = (tmp0 + tmp2) << CONST_BITS;
5151     tmp12 = (tmp0 - tmp2) << CONST_BITS;
5152 
5153     /* Odd part */
5154     /* Same rotation as in the even part of the 8x8 LL&M IDCT */
5155 
5156     z2 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
5157     z3 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
5158 
5159     z1 = MULTIPLY(z2 + z3, FIX_0_541196100);   /* c6 */
5160     tmp0 = z1 + MULTIPLY(z2, FIX_0_765366865); /* c2-c6 */
5161     tmp2 = z1 - MULTIPLY(z3, FIX_1_847759065); /* c2+c6 */
5162 
5163     /* Final output stage */
5164 
5165     wsptr[2*0] = tmp10 + tmp0;
5166     wsptr[2*3] = tmp10 - tmp0;
5167     wsptr[2*1] = tmp12 + tmp2;
5168     wsptr[2*2] = tmp12 - tmp2;
5169   }
5170 
5171   /* Pass 2: process 4 rows from work array, store into output array. */
5172 
5173   wsptr = workspace;
5174   for (ctr = 0; ctr < 4; ctr++) {
5175     outptr = output_buf[ctr] + output_col;
5176 
5177     /* Even part */
5178 
5179     /* Add range center and fudge factor for final descale and range-limit. */
5180     tmp10 = wsptr[0] +
5181 	      ((((INT32) RANGE_CENTER) << (CONST_BITS+3)) +
5182 	       (ONE << (CONST_BITS+2)));
5183 
5184     /* Odd part */
5185 
5186     tmp0 = wsptr[1];
5187 
5188     /* Final output stage */
5189 
5190     outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS+3)
5191 			    & RANGE_MASK];
5192     outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS+3)
5193 			    & RANGE_MASK];
5194 
5195     wsptr += 2;		/* advance pointer to next row */
5196   }
5197 }
5198 
5199 
5200 /*
5201  * Perform dequantization and inverse DCT on one block of coefficients,
5202  * producing a 1x2 output block.
5203  *
5204  * 2-point IDCT in pass 1 (columns), 1-point in pass 2 (rows).
5205  */
5206 
5207 GLOBAL(void)
5208 jpeg_idct_1x2 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
5209 	       JCOEFPTR coef_block,
5210 	       JSAMPARRAY output_buf, JDIMENSION output_col)
5211 {
5212   DCTELEM tmp0, tmp1;
5213   ISLOW_MULT_TYPE * quantptr;
5214   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
5215   ISHIFT_TEMPS
5216 
5217   /* Process 1 column from input, store into output array. */
5218 
5219   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
5220 
5221   /* Even part */
5222 
5223   tmp0 = DEQUANTIZE(coef_block[DCTSIZE*0], quantptr[DCTSIZE*0]);
5224   /* Add range center and fudge factor for final descale and range-limit. */
5225   tmp0 += (((DCTELEM) RANGE_CENTER) << 3) + (1 << 2);
5226 
5227   /* Odd part */
5228 
5229   tmp1 = DEQUANTIZE(coef_block[DCTSIZE*1], quantptr[DCTSIZE*1]);
5230 
5231   /* Final output stage */
5232 
5233   output_buf[0][output_col] =
5234     range_limit[(int) IRIGHT_SHIFT(tmp0 + tmp1, 3) & RANGE_MASK];
5235   output_buf[1][output_col] =
5236     range_limit[(int) IRIGHT_SHIFT(tmp0 - tmp1, 3) & RANGE_MASK];
5237 }
5238 
5239 #endif /* IDCT_SCALING_SUPPORTED */
5240 #endif /* DCT_ISLOW_SUPPORTED */
5241