1 /*
2  * jidctint.c
3  *
4  * Copyright (C) 1991-1998, Thomas G. Lane.
5  * Modification developed 2002-2009 by Guido Vollbeding.
6  * This file is part of the Independent JPEG Group's software.
7  * For conditions of distribution and use, see the accompanying README file.
8  *
9  * This file contains a slow-but-accurate integer implementation of the
10  * inverse DCT (Discrete Cosine Transform).  In the IJG code, this routine
11  * must also perform dequantization of the input coefficients.
12  *
13  * A 2-D IDCT can be done by 1-D IDCT on each column followed by 1-D IDCT
14  * on each row (or vice versa, but it's more convenient to emit a row at
15  * a time).  Direct algorithms are also available, but they are much more
16  * complex and seem not to be any faster when reduced to code.
17  *
18  * This implementation is based on an algorithm described in
19  *   C. Loeffler, A. Ligtenberg and G. Moschytz, "Practical Fast 1-D DCT
20  *   Algorithms with 11 Multiplications", Proc. Int'l. Conf. on Acoustics,
21  *   Speech, and Signal Processing 1989 (ICASSP '89), pp. 988-991.
22  * The primary algorithm described there uses 11 multiplies and 29 adds.
23  * We use their alternate method with 12 multiplies and 32 adds.
24  * The advantage of this method is that no data path contains more than one
25  * multiplication; this allows a very simple and accurate implementation in
26  * scaled fixed-point arithmetic, with a minimal number of shifts.
27  *
28  * We also provide IDCT routines with various output sample block sizes for
29  * direct resolution reduction or enlargement and for direct resolving the
30  * common 2x1 and 1x2 subsampling cases without additional resampling: NxN
31  * (N=1...16), 2NxN, and Nx2N (N=1...8) pixels for one 8x8 input DCT block.
32  *
33  * For N<8 we simply take the corresponding low-frequency coefficients of
34  * the 8x8 input DCT block and apply an NxN point IDCT on the sub-block
35  * to yield the downscaled outputs.
36  * This can be seen as direct low-pass downsampling from the DCT domain
37  * point of view rather than the usual spatial domain point of view,
38  * yielding significant computational savings and results at least
39  * as good as common bilinear (averaging) spatial downsampling.
40  *
 * For N>8 we apply a partial NxN IDCT, treating the 8 input coefficients
 * as the lowest frequencies and assuming all higher frequencies are zero.
 * It turns out that the computational effort, relative to the output size,
 * is similar to that of the 8x8 IDCT.
45  * Furthermore, the scaling and descaling is the same for all IDCT sizes.
46  *
47  * CAUTION: We rely on the FIX() macro except for the N=1,2,4,8 cases
48  * since there would be too many additional constants to pre-calculate.
49  */
50 
51 #define JPEG_INTERNALS
52 #include "jinclude.h"
53 #include "jpeglib.h"
54 #include "jdct.h"		/* Private declarations for DCT subsystem */
55 
56 #ifdef DCT_ISLOW_SUPPORTED
57 
58 
59 /*
60  * This module is specialized to the case DCTSIZE = 8.
61  */
62 
63 #if DCTSIZE != 8
64   Sorry, this code only copes with 8x8 DCT blocks. /* deliberate syntax err */
65 #endif
66 
67 
68 /*
69  * The poop on this scaling stuff is as follows:
70  *
71  * Each 1-D IDCT step produces outputs which are a factor of sqrt(N)
72  * larger than the true IDCT outputs.  The final outputs are therefore
73  * a factor of N larger than desired; since N=8 this can be cured by
74  * a simple right shift at the end of the algorithm.  The advantage of
75  * this arrangement is that we save two multiplications per 1-D IDCT,
76  * because the y0 and y4 inputs need not be divided by sqrt(N).
77  *
78  * We have to do addition and subtraction of the integer inputs, which
79  * is no problem, and multiplication by fractional constants, which is
80  * a problem to do in integer arithmetic.  We multiply all the constants
81  * by CONST_SCALE and convert them to integer constants (thus retaining
82  * CONST_BITS bits of precision in the constants).  After doing a
83  * multiplication we have to divide the product by CONST_SCALE, with proper
84  * rounding, to produce the correct output.  This division can be done
85  * cheaply as a right shift of CONST_BITS bits.  We postpone shifting
86  * as long as possible so that partial sums can be added together with
87  * full fractional precision.
88  *
89  * The outputs of the first pass are scaled up by PASS1_BITS bits so that
90  * they are represented to better-than-integral precision.  These outputs
91  * require BITS_IN_JSAMPLE + PASS1_BITS + 3 bits; this fits in a 16-bit word
92  * with the recommended scaling.  (To scale up 12-bit sample data further, an
93  * intermediate INT32 array would be needed.)
94  *
95  * To avoid overflow of the 32-bit intermediate results in pass 2, we must
96  * have BITS_IN_JSAMPLE + CONST_BITS + PASS1_BITS <= 26.  Error analysis
97  * shows that the values given below are the most effective.
98  */
99 
/* CONST_BITS is the same for every sample width; only PASS1_BITS varies
 * with BITS_IN_JSAMPLE.
 */
#define CONST_BITS  13
#if BITS_IN_JSAMPLE == 8
#define PASS1_BITS  2
#else
#define PASS1_BITS  1		/* lose a little precision to avoid overflow */
#endif
107 
/* Fixed-point forms of the 8-point kernel's multipliers.
 *
 * Not every C compiler folds "FIX(constant)" at compile time; those that
 * do not would execute floating-point code at run time for each use.
 * For CONST_BITS == 13 the values are therefore spelled out as literals.
 * For any other CONST_BITS we fall back on the FIX() macro; add a table
 * of pre-calculated values for it here if your compiler needs one.
 */

#if CONST_BITS != 13
#define FIX_0_298631336  FIX(0.298631336)
#define FIX_0_390180644  FIX(0.390180644)
#define FIX_0_541196100  FIX(0.541196100)
#define FIX_0_765366865  FIX(0.765366865)
#define FIX_0_899976223  FIX(0.899976223)
#define FIX_1_175875602  FIX(1.175875602)
#define FIX_1_501321110  FIX(1.501321110)
#define FIX_1_847759065  FIX(1.847759065)
#define FIX_1_961570560  FIX(1.961570560)
#define FIX_2_053119869  FIX(2.053119869)
#define FIX_2_562915447  FIX(2.562915447)
#define FIX_3_072711026  FIX(3.072711026)
#else
#define FIX_0_298631336  ((INT32)  2446)	/* FIX(0.298631336) */
#define FIX_0_390180644  ((INT32)  3196)	/* FIX(0.390180644) */
#define FIX_0_541196100  ((INT32)  4433)	/* FIX(0.541196100) */
#define FIX_0_765366865  ((INT32)  6270)	/* FIX(0.765366865) */
#define FIX_0_899976223  ((INT32)  7373)	/* FIX(0.899976223) */
#define FIX_1_175875602  ((INT32)  9633)	/* FIX(1.175875602) */
#define FIX_1_501321110  ((INT32)  12299)	/* FIX(1.501321110) */
#define FIX_1_847759065  ((INT32)  15137)	/* FIX(1.847759065) */
#define FIX_1_961570560  ((INT32)  16069)	/* FIX(1.961570560) */
#define FIX_2_053119869  ((INT32)  16819)	/* FIX(2.053119869) */
#define FIX_2_562915447  ((INT32)  20995)	/* FIX(2.562915447) */
#define FIX_3_072711026  ((INT32)  25172)	/* FIX(3.072711026) */
#endif
142 
143 
/* Clamp DC value to acceptable range for bug 697186.
 *
 * dcval must be a modifiable integer lvalue; it is limited in place to
 * the range [-1024, 1023].  The body is wrapped in do { } while (0) so
 * that "CLAMP_DC(x);" is a single statement and can be used safely as
 * the unbraced body of an if that has an else.
 */
#define CLAMP_DC(dcval)        \
  do {                         \
    if ((dcval) < -1024)       \
      (dcval) = -1024;         \
    else if ((dcval) > 1023)   \
      (dcval) = 1023;          \
  } while (0)
152 
/* MULTIPLY(v, c): product of an INT32 variable v and an INT32 constant c,
 * yielding an INT32.
 * With 8-bit samples and the recommended scaling, neither operand exceeds
 * 16 bits, so the 16x16->32 bit multiply MULTIPLY16C16 is sufficient.
 * With 12-bit samples a full 32-bit multiplication is required.
 */

#if BITS_IN_JSAMPLE != 8
#define MULTIPLY(v,c)  ((v) * (c))
#else
#define MULTIPLY(v,c)  MULTIPLY16C16(v,c)
#endif
165 
166 
/* DEQUANTIZE(c, q): scale coefficient c by its multiplier-table entry q.
 * The coefficient is first converted to ISLOW_MULT_TYPE and then
 * multiplied.  In this module both operands and the product fit in
 * 16 bits or less, so an int or a short multiply are equally suitable.
 */

#define DEQUANTIZE(c,q)  (((ISLOW_MULT_TYPE) (c)) * (q))
173 
174 
175 /*
176  * Perform dequantization and inverse DCT on one block of coefficients.
177  */
178 
179 GLOBAL(void)
180 jpeg_idct_islow (j_decompress_ptr cinfo, jpeg_component_info * compptr,
181 		 JCOEFPTR coef_block,
182 		 JSAMPARRAY output_buf, JDIMENSION output_col)
183 {
184   INT32 tmp0, tmp1, tmp2, tmp3;
185   INT32 tmp10, tmp11, tmp12, tmp13;
186   INT32 z1, z2, z3;
187   JCOEFPTR inptr;
188   ISLOW_MULT_TYPE * quantptr;
189   int * wsptr;
190   JSAMPROW outptr;
191   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
192   int ctr;
193   int workspace[DCTSIZE2];	/* buffers data between passes */
194   SHIFT_TEMPS
195 
196   /* Pass 1: process columns from input, store into work array. */
197   /* Note results are scaled up by sqrt(8) compared to a true IDCT; */
198   /* furthermore, we scale the results by 2**PASS1_BITS. */
199 
200   inptr = coef_block;
201   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
202   wsptr = workspace;
203   for (ctr = DCTSIZE; ctr > 0; ctr--) {
204     /* Due to quantization, we will usually find that many of the input
205      * coefficients are zero, especially the AC terms.  We can exploit this
206      * by short-circuiting the IDCT calculation for any column in which all
207      * the AC terms are zero.  In that case each output is equal to the
208      * DC coefficient (with scale factor as needed).
209      * With typical images and quantization tables, half or more of the
210      * column DCT calculations can be simplified this way.
211      */
212 
213     if (inptr[DCTSIZE*1] == 0 && inptr[DCTSIZE*2] == 0 &&
214 	inptr[DCTSIZE*3] == 0 && inptr[DCTSIZE*4] == 0 &&
215 	inptr[DCTSIZE*5] == 0 && inptr[DCTSIZE*6] == 0 &&
216 	inptr[DCTSIZE*7] == 0) {
217       /* AC terms all zero */
218       int dcval = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
219       if (ctr == DCTSIZE)
220           CLAMP_DC(dcval);
221       dcval <<= PASS1_BITS;
222       wsptr[DCTSIZE*0] = dcval;
223       wsptr[DCTSIZE*1] = dcval;
224       wsptr[DCTSIZE*2] = dcval;
225       wsptr[DCTSIZE*3] = dcval;
226       wsptr[DCTSIZE*4] = dcval;
227       wsptr[DCTSIZE*5] = dcval;
228       wsptr[DCTSIZE*6] = dcval;
229       wsptr[DCTSIZE*7] = dcval;
230 
231       inptr++;			/* advance pointers to next column */
232       quantptr++;
233       wsptr++;
234       continue;
235     }
236 
237     /* Even part: reverse the even part of the forward DCT. */
238     /* The rotator is sqrt(2)*c(-6). */
239 
240     z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
241     z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
242 
243     z1 = MULTIPLY(z2 + z3, FIX_0_541196100);
244     tmp2 = z1 + MULTIPLY(z2, FIX_0_765366865);
245     tmp3 = z1 - MULTIPLY(z3, FIX_1_847759065);
246 
247     z2 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
248     if (ctr == DCTSIZE)
249       CLAMP_DC(z2);
250     z3 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
251     z2 <<= CONST_BITS;
252     z3 <<= CONST_BITS;
253     /* Add fudge factor here for final descale. */
254     z2 += ONE << (CONST_BITS-PASS1_BITS-1);
255 
256     tmp0 = z2 + z3;
257     tmp1 = z2 - z3;
258 
259     tmp10 = tmp0 + tmp2;
260     tmp13 = tmp0 - tmp2;
261     tmp11 = tmp1 + tmp3;
262     tmp12 = tmp1 - tmp3;
263 
264     /* Odd part per figure 8; the matrix is unitary and hence its
265      * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
266      */
267 
268     tmp0 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
269     tmp1 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
270     tmp2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
271     tmp3 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
272 
273     z2 = tmp0 + tmp2;
274     z3 = tmp1 + tmp3;
275 
276     z1 = MULTIPLY(z2 + z3, FIX_1_175875602); /* sqrt(2) * c3 */
277     z2 = MULTIPLY(z2, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
278     z3 = MULTIPLY(z3, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */
279     z2 += z1;
280     z3 += z1;
281 
282     z1 = MULTIPLY(tmp0 + tmp3, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
283     tmp0 = MULTIPLY(tmp0, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */
284     tmp3 = MULTIPLY(tmp3, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
285     tmp0 += z1 + z2;
286     tmp3 += z1 + z3;
287 
288     z1 = MULTIPLY(tmp1 + tmp2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
289     tmp1 = MULTIPLY(tmp1, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
290     tmp2 = MULTIPLY(tmp2, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
291     tmp1 += z1 + z3;
292     tmp2 += z1 + z2;
293 
294     /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
295 
296     wsptr[DCTSIZE*0] = (int) RIGHT_SHIFT(tmp10 + tmp3, CONST_BITS-PASS1_BITS);
297     wsptr[DCTSIZE*7] = (int) RIGHT_SHIFT(tmp10 - tmp3, CONST_BITS-PASS1_BITS);
298     wsptr[DCTSIZE*1] = (int) RIGHT_SHIFT(tmp11 + tmp2, CONST_BITS-PASS1_BITS);
299     wsptr[DCTSIZE*6] = (int) RIGHT_SHIFT(tmp11 - tmp2, CONST_BITS-PASS1_BITS);
300     wsptr[DCTSIZE*2] = (int) RIGHT_SHIFT(tmp12 + tmp1, CONST_BITS-PASS1_BITS);
301     wsptr[DCTSIZE*5] = (int) RIGHT_SHIFT(tmp12 - tmp1, CONST_BITS-PASS1_BITS);
302     wsptr[DCTSIZE*3] = (int) RIGHT_SHIFT(tmp13 + tmp0, CONST_BITS-PASS1_BITS);
303     wsptr[DCTSIZE*4] = (int) RIGHT_SHIFT(tmp13 - tmp0, CONST_BITS-PASS1_BITS);
304 
305     inptr++;			/* advance pointers to next column */
306     quantptr++;
307     wsptr++;
308   }
309 
310   /* Pass 2: process rows from work array, store into output array. */
311   /* Note that we must descale the results by a factor of 8 == 2**3, */
312   /* and also undo the PASS1_BITS scaling. */
313 
314   wsptr = workspace;
315   for (ctr = 0; ctr < DCTSIZE; ctr++) {
316     outptr = output_buf[ctr] + output_col;
317     /* Rows of zeroes can be exploited in the same way as we did with columns.
318      * However, the column calculation has created many nonzero AC terms, so
319      * the simplification applies less often (typically 5% to 10% of the time).
320      * On machines with very fast multiplication, it's possible that the
321      * test takes more time than it's worth.  In that case this section
322      * may be commented out.
323      */
324 
325 #ifndef NO_ZERO_ROW_TEST
326     if (wsptr[1] == 0 && wsptr[2] == 0 && wsptr[3] == 0 && wsptr[4] == 0 &&
327 	wsptr[5] == 0 && wsptr[6] == 0 && wsptr[7] == 0) {
328       /* AC terms all zero */
329       JSAMPLE dcval = range_limit[(int) DESCALE((INT32) wsptr[0], PASS1_BITS+3)
330 				  & RANGE_MASK];
331 
332       outptr[0] = dcval;
333       outptr[1] = dcval;
334       outptr[2] = dcval;
335       outptr[3] = dcval;
336       outptr[4] = dcval;
337       outptr[5] = dcval;
338       outptr[6] = dcval;
339       outptr[7] = dcval;
340 
341       wsptr += DCTSIZE;		/* advance pointer to next row */
342       continue;
343     }
344 #endif
345 
346     /* Even part: reverse the even part of the forward DCT. */
347     /* The rotator is sqrt(2)*c(-6). */
348 
349     z2 = (INT32) wsptr[2];
350     z3 = (INT32) wsptr[6];
351 
352     z1 = MULTIPLY(z2 + z3, FIX_0_541196100);
353     tmp2 = z1 + MULTIPLY(z2, FIX_0_765366865);
354     tmp3 = z1 - MULTIPLY(z3, FIX_1_847759065);
355 
356     /* Add fudge factor here for final descale. */
357     z2 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
358     z3 = (INT32) wsptr[4];
359 
360     tmp0 = (z2 + z3) << CONST_BITS;
361     tmp1 = (z2 - z3) << CONST_BITS;
362 
363     tmp10 = tmp0 + tmp2;
364     tmp13 = tmp0 - tmp2;
365     tmp11 = tmp1 + tmp3;
366     tmp12 = tmp1 - tmp3;
367 
368     /* Odd part per figure 8; the matrix is unitary and hence its
369      * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
370      */
371 
372     tmp0 = (INT32) wsptr[7];
373     tmp1 = (INT32) wsptr[5];
374     tmp2 = (INT32) wsptr[3];
375     tmp3 = (INT32) wsptr[1];
376 
377     z2 = tmp0 + tmp2;
378     z3 = tmp1 + tmp3;
379 
380     z1 = MULTIPLY(z2 + z3, FIX_1_175875602); /* sqrt(2) * c3 */
381     z2 = MULTIPLY(z2, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
382     z3 = MULTIPLY(z3, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */
383     z2 += z1;
384     z3 += z1;
385 
386     z1 = MULTIPLY(tmp0 + tmp3, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
387     tmp0 = MULTIPLY(tmp0, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */
388     tmp3 = MULTIPLY(tmp3, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
389     tmp0 += z1 + z2;
390     tmp3 += z1 + z3;
391 
392     z1 = MULTIPLY(tmp1 + tmp2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
393     tmp1 = MULTIPLY(tmp1, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
394     tmp2 = MULTIPLY(tmp2, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
395     tmp1 += z1 + z3;
396     tmp2 += z1 + z2;
397 
398     /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
399 
400     outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp3,
401 					      CONST_BITS+PASS1_BITS+3)
402 			    & RANGE_MASK];
403     outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp3,
404 					      CONST_BITS+PASS1_BITS+3)
405 			    & RANGE_MASK];
406     outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp2,
407 					      CONST_BITS+PASS1_BITS+3)
408 			    & RANGE_MASK];
409     outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp2,
410 					      CONST_BITS+PASS1_BITS+3)
411 			    & RANGE_MASK];
412     outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp1,
413 					      CONST_BITS+PASS1_BITS+3)
414 			    & RANGE_MASK];
415     outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp1,
416 					      CONST_BITS+PASS1_BITS+3)
417 			    & RANGE_MASK];
418     outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp13 + tmp0,
419 					      CONST_BITS+PASS1_BITS+3)
420 			    & RANGE_MASK];
421     outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp13 - tmp0,
422 					      CONST_BITS+PASS1_BITS+3)
423 			    & RANGE_MASK];
424 
425     wsptr += DCTSIZE;		/* advance pointer to next row */
426   }
427 }
428 
429 #ifdef IDCT_SCALING_SUPPORTED
430 
431 
432 /*
433  * Perform dequantization and inverse DCT on one block of coefficients,
434  * producing a 7x7 output block.
435  *
436  * Optimized algorithm with 12 multiplications in the 1-D kernel.
437  * cK represents sqrt(2) * cos(K*pi/14).
438  */
439 
440 GLOBAL(void)
jpeg_idct_7x7(j_decompress_ptr cinfo,jpeg_component_info * compptr,JCOEFPTR coef_block,JSAMPARRAY output_buf,JDIMENSION output_col)441 jpeg_idct_7x7 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
442 	       JCOEFPTR coef_block,
443 	       JSAMPARRAY output_buf, JDIMENSION output_col)
444 {
445   INT32 tmp0, tmp1, tmp2, tmp10, tmp11, tmp12, tmp13;
446   INT32 z1, z2, z3;
447   JCOEFPTR inptr;
448   ISLOW_MULT_TYPE * quantptr;
449   int * wsptr;
450   JSAMPROW outptr;
451   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
452   int ctr;
453   int workspace[7*7];	/* buffers data between passes */
454   SHIFT_TEMPS
455 
456   /* Pass 1: process columns from input, store into work array. */
457 
458   inptr = coef_block;
459   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
460   wsptr = workspace;
461   for (ctr = 0; ctr < 7; ctr++, inptr++, quantptr++, wsptr++) {
462     /* Even part */
463 
464     tmp13 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
465     if (ctr == 0)
466       CLAMP_DC(tmp13);
467     tmp13 <<= CONST_BITS;
468     /* Add fudge factor here for final descale. */
469     tmp13 += ONE << (CONST_BITS-PASS1_BITS-1);
470 
471     z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
472     z2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
473     z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
474 
475     tmp10 = MULTIPLY(z2 - z3, FIX(0.881747734));     /* c4 */
476     tmp12 = MULTIPLY(z1 - z2, FIX(0.314692123));     /* c6 */
477     tmp11 = tmp10 + tmp12 + tmp13 - MULTIPLY(z2, FIX(1.841218003)); /* c2+c4-c6 */
478     tmp0 = z1 + z3;
479     z2 -= tmp0;
480     tmp0 = MULTIPLY(tmp0, FIX(1.274162392)) + tmp13; /* c2 */
481     tmp10 += tmp0 - MULTIPLY(z3, FIX(0.077722536));  /* c2-c4-c6 */
482     tmp12 += tmp0 - MULTIPLY(z1, FIX(2.470602249));  /* c2+c4+c6 */
483     tmp13 += MULTIPLY(z2, FIX(1.414213562));         /* c0 */
484 
485     /* Odd part */
486 
487     z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
488     z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
489     z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
490 
491     tmp1 = MULTIPLY(z1 + z2, FIX(0.935414347));      /* (c3+c1-c5)/2 */
492     tmp2 = MULTIPLY(z1 - z2, FIX(0.170262339));      /* (c3+c5-c1)/2 */
493     tmp0 = tmp1 - tmp2;
494     tmp1 += tmp2;
495     tmp2 = MULTIPLY(z2 + z3, - FIX(1.378756276));    /* -c1 */
496     tmp1 += tmp2;
497     z2 = MULTIPLY(z1 + z3, FIX(0.613604268));        /* c5 */
498     tmp0 += z2;
499     tmp2 += z2 + MULTIPLY(z3, FIX(1.870828693));     /* c3+c1-c5 */
500 
501     /* Final output stage */
502 
503     wsptr[7*0] = (int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS-PASS1_BITS);
504     wsptr[7*6] = (int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS-PASS1_BITS);
505     wsptr[7*1] = (int) RIGHT_SHIFT(tmp11 + tmp1, CONST_BITS-PASS1_BITS);
506     wsptr[7*5] = (int) RIGHT_SHIFT(tmp11 - tmp1, CONST_BITS-PASS1_BITS);
507     wsptr[7*2] = (int) RIGHT_SHIFT(tmp12 + tmp2, CONST_BITS-PASS1_BITS);
508     wsptr[7*4] = (int) RIGHT_SHIFT(tmp12 - tmp2, CONST_BITS-PASS1_BITS);
509     wsptr[7*3] = (int) RIGHT_SHIFT(tmp13, CONST_BITS-PASS1_BITS);
510   }
511 
512   /* Pass 2: process 7 rows from work array, store into output array. */
513 
514   wsptr = workspace;
515   for (ctr = 0; ctr < 7; ctr++) {
516     outptr = output_buf[ctr] + output_col;
517 
518     /* Even part */
519 
520     /* Add fudge factor here for final descale. */
521     tmp13 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
522     tmp13 <<= CONST_BITS;
523 
524     z1 = (INT32) wsptr[2];
525     z2 = (INT32) wsptr[4];
526     z3 = (INT32) wsptr[6];
527 
528     tmp10 = MULTIPLY(z2 - z3, FIX(0.881747734));     /* c4 */
529     tmp12 = MULTIPLY(z1 - z2, FIX(0.314692123));     /* c6 */
530     tmp11 = tmp10 + tmp12 + tmp13 - MULTIPLY(z2, FIX(1.841218003)); /* c2+c4-c6 */
531     tmp0 = z1 + z3;
532     z2 -= tmp0;
533     tmp0 = MULTIPLY(tmp0, FIX(1.274162392)) + tmp13; /* c2 */
534     tmp10 += tmp0 - MULTIPLY(z3, FIX(0.077722536));  /* c2-c4-c6 */
535     tmp12 += tmp0 - MULTIPLY(z1, FIX(2.470602249));  /* c2+c4+c6 */
536     tmp13 += MULTIPLY(z2, FIX(1.414213562));         /* c0 */
537 
538     /* Odd part */
539 
540     z1 = (INT32) wsptr[1];
541     z2 = (INT32) wsptr[3];
542     z3 = (INT32) wsptr[5];
543 
544     tmp1 = MULTIPLY(z1 + z2, FIX(0.935414347));      /* (c3+c1-c5)/2 */
545     tmp2 = MULTIPLY(z1 - z2, FIX(0.170262339));      /* (c3+c5-c1)/2 */
546     tmp0 = tmp1 - tmp2;
547     tmp1 += tmp2;
548     tmp2 = MULTIPLY(z2 + z3, - FIX(1.378756276));    /* -c1 */
549     tmp1 += tmp2;
550     z2 = MULTIPLY(z1 + z3, FIX(0.613604268));        /* c5 */
551     tmp0 += z2;
552     tmp2 += z2 + MULTIPLY(z3, FIX(1.870828693));     /* c3+c1-c5 */
553 
554     /* Final output stage */
555 
556     outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
557 					      CONST_BITS+PASS1_BITS+3)
558 			    & RANGE_MASK];
559     outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
560 					      CONST_BITS+PASS1_BITS+3)
561 			    & RANGE_MASK];
562     outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp1,
563 					      CONST_BITS+PASS1_BITS+3)
564 			    & RANGE_MASK];
565     outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp1,
566 					      CONST_BITS+PASS1_BITS+3)
567 			    & RANGE_MASK];
568     outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp2,
569 					      CONST_BITS+PASS1_BITS+3)
570 			    & RANGE_MASK];
571     outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp2,
572 					      CONST_BITS+PASS1_BITS+3)
573 			    & RANGE_MASK];
574     outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp13,
575 					      CONST_BITS+PASS1_BITS+3)
576 			    & RANGE_MASK];
577 
578     wsptr += 7;		/* advance pointer to next row */
579   }
580 }
581 
582 
583 /*
584  * Perform dequantization and inverse DCT on one block of coefficients,
585  * producing a reduced-size 6x6 output block.
586  *
587  * Optimized algorithm with 3 multiplications in the 1-D kernel.
588  * cK represents sqrt(2) * cos(K*pi/12).
589  */
590 
591 GLOBAL(void)
jpeg_idct_6x6(j_decompress_ptr cinfo,jpeg_component_info * compptr,JCOEFPTR coef_block,JSAMPARRAY output_buf,JDIMENSION output_col)592 jpeg_idct_6x6 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
593 	       JCOEFPTR coef_block,
594 	       JSAMPARRAY output_buf, JDIMENSION output_col)
595 {
596   INT32 tmp0, tmp1, tmp2, tmp10, tmp11, tmp12;
597   INT32 z1, z2, z3;
598   JCOEFPTR inptr;
599   ISLOW_MULT_TYPE * quantptr;
600   int * wsptr;
601   JSAMPROW outptr;
602   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
603   int ctr;
604   int workspace[6*6];	/* buffers data between passes */
605   SHIFT_TEMPS
606 
607   /* Pass 1: process columns from input, store into work array. */
608 
609   inptr = coef_block;
610   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
611   wsptr = workspace;
612   for (ctr = 0; ctr < 6; ctr++, inptr++, quantptr++, wsptr++) {
613     /* Even part */
614 
615     tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
616     if (ctr == 0)
617       CLAMP_DC(tmp0);
618     tmp0 <<= CONST_BITS;
619     /* Add fudge factor here for final descale. */
620     tmp0 += ONE << (CONST_BITS-PASS1_BITS-1);
621     tmp2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
622     tmp10 = MULTIPLY(tmp2, FIX(0.707106781));   /* c4 */
623     tmp1 = tmp0 + tmp10;
624     tmp11 = RIGHT_SHIFT(tmp0 - tmp10 - tmp10, CONST_BITS-PASS1_BITS);
625     tmp10 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
626     tmp0 = MULTIPLY(tmp10, FIX(1.224744871));   /* c2 */
627     tmp10 = tmp1 + tmp0;
628     tmp12 = tmp1 - tmp0;
629 
630     /* Odd part */
631 
632     z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
633     z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
634     z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
635     tmp1 = MULTIPLY(z1 + z3, FIX(0.366025404)); /* c5 */
636     tmp0 = tmp1 + ((z1 + z2) << CONST_BITS);
637     tmp2 = tmp1 + ((z3 - z2) << CONST_BITS);
638     tmp1 = (z1 - z2 - z3) << PASS1_BITS;
639 
640     /* Final output stage */
641 
642     wsptr[6*0] = (int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS-PASS1_BITS);
643     wsptr[6*5] = (int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS-PASS1_BITS);
644     wsptr[6*1] = (int) (tmp11 + tmp1);
645     wsptr[6*4] = (int) (tmp11 - tmp1);
646     wsptr[6*2] = (int) RIGHT_SHIFT(tmp12 + tmp2, CONST_BITS-PASS1_BITS);
647     wsptr[6*3] = (int) RIGHT_SHIFT(tmp12 - tmp2, CONST_BITS-PASS1_BITS);
648   }
649 
650   /* Pass 2: process 6 rows from work array, store into output array. */
651 
652   wsptr = workspace;
653   for (ctr = 0; ctr < 6; ctr++) {
654     outptr = output_buf[ctr] + output_col;
655 
656     /* Even part */
657 
658     /* Add fudge factor here for final descale. */
659     tmp0 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
660     tmp0 <<= CONST_BITS;
661     tmp2 = (INT32) wsptr[4];
662     tmp10 = MULTIPLY(tmp2, FIX(0.707106781));   /* c4 */
663     tmp1 = tmp0 + tmp10;
664     tmp11 = tmp0 - tmp10 - tmp10;
665     tmp10 = (INT32) wsptr[2];
666     tmp0 = MULTIPLY(tmp10, FIX(1.224744871));   /* c2 */
667     tmp10 = tmp1 + tmp0;
668     tmp12 = tmp1 - tmp0;
669 
670     /* Odd part */
671 
672     z1 = (INT32) wsptr[1];
673     z2 = (INT32) wsptr[3];
674     z3 = (INT32) wsptr[5];
675     tmp1 = MULTIPLY(z1 + z3, FIX(0.366025404)); /* c5 */
676     tmp0 = tmp1 + ((z1 + z2) << CONST_BITS);
677     tmp2 = tmp1 + ((z3 - z2) << CONST_BITS);
678     tmp1 = (z1 - z2 - z3) << CONST_BITS;
679 
680     /* Final output stage */
681 
682     outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
683 					      CONST_BITS+PASS1_BITS+3)
684 			    & RANGE_MASK];
685     outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
686 					      CONST_BITS+PASS1_BITS+3)
687 			    & RANGE_MASK];
688     outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp1,
689 					      CONST_BITS+PASS1_BITS+3)
690 			    & RANGE_MASK];
691     outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp1,
692 					      CONST_BITS+PASS1_BITS+3)
693 			    & RANGE_MASK];
694     outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp2,
695 					      CONST_BITS+PASS1_BITS+3)
696 			    & RANGE_MASK];
697     outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp2,
698 					      CONST_BITS+PASS1_BITS+3)
699 			    & RANGE_MASK];
700 
701     wsptr += 6;		/* advance pointer to next row */
702   }
703 }
704 
705 
706 /*
707  * Perform dequantization and inverse DCT on one block of coefficients,
708  * producing a reduced-size 5x5 output block.
709  *
710  * Optimized algorithm with 5 multiplications in the 1-D kernel.
711  * cK represents sqrt(2) * cos(K*pi/10).
712  */
713 
714 GLOBAL(void)
jpeg_idct_5x5(j_decompress_ptr cinfo,jpeg_component_info * compptr,JCOEFPTR coef_block,JSAMPARRAY output_buf,JDIMENSION output_col)715 jpeg_idct_5x5 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
716 	       JCOEFPTR coef_block,
717 	       JSAMPARRAY output_buf, JDIMENSION output_col)
718 {
719   INT32 tmp0, tmp1, tmp10, tmp11, tmp12;
720   INT32 z1, z2, z3;
721   JCOEFPTR inptr;
722   ISLOW_MULT_TYPE * quantptr;
723   int * wsptr;
724   JSAMPROW outptr;
725   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
726   int ctr;
727   int workspace[5*5];	/* buffers data between passes */
728   SHIFT_TEMPS
729 
730   /* Pass 1: process columns from input, store into work array. */
731 
732   inptr = coef_block;
733   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
734   wsptr = workspace;
735   for (ctr = 0; ctr < 5; ctr++, inptr++, quantptr++, wsptr++) {
736     /* Even part */
737 
738     tmp12 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
739     if (ctr == 0)
740       CLAMP_DC(tmp12);
741     tmp12 <<= CONST_BITS;
742     /* Add fudge factor here for final descale. */
743     tmp12 += ONE << (CONST_BITS-PASS1_BITS-1);
744     tmp0 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
745     tmp1 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
746     z1 = MULTIPLY(tmp0 + tmp1, FIX(0.790569415)); /* (c2+c4)/2 */
747     z2 = MULTIPLY(tmp0 - tmp1, FIX(0.353553391)); /* (c2-c4)/2 */
748     z3 = tmp12 + z2;
749     tmp10 = z3 + z1;
750     tmp11 = z3 - z1;
751     tmp12 -= z2 << 2;
752 
753     /* Odd part */
754 
755     z2 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
756     z3 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
757 
758     z1 = MULTIPLY(z2 + z3, FIX(0.831253876));     /* c3 */
759     tmp0 = z1 + MULTIPLY(z2, FIX(0.513743148));   /* c1-c3 */
760     tmp1 = z1 - MULTIPLY(z3, FIX(2.176250899));   /* c1+c3 */
761 
762     /* Final output stage */
763 
764     wsptr[5*0] = (int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS-PASS1_BITS);
765     wsptr[5*4] = (int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS-PASS1_BITS);
766     wsptr[5*1] = (int) RIGHT_SHIFT(tmp11 + tmp1, CONST_BITS-PASS1_BITS);
767     wsptr[5*3] = (int) RIGHT_SHIFT(tmp11 - tmp1, CONST_BITS-PASS1_BITS);
768     wsptr[5*2] = (int) RIGHT_SHIFT(tmp12, CONST_BITS-PASS1_BITS);
769   }
770 
771   /* Pass 2: process 5 rows from work array, store into output array. */
772 
773   wsptr = workspace;
774   for (ctr = 0; ctr < 5; ctr++) {
775     outptr = output_buf[ctr] + output_col;
776 
777     /* Even part */
778 
779     /* Add fudge factor here for final descale. */
780     tmp12 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
781     tmp12 <<= CONST_BITS;
782     tmp0 = (INT32) wsptr[2];
783     tmp1 = (INT32) wsptr[4];
784     z1 = MULTIPLY(tmp0 + tmp1, FIX(0.790569415)); /* (c2+c4)/2 */
785     z2 = MULTIPLY(tmp0 - tmp1, FIX(0.353553391)); /* (c2-c4)/2 */
786     z3 = tmp12 + z2;
787     tmp10 = z3 + z1;
788     tmp11 = z3 - z1;
789     tmp12 -= z2 << 2;
790 
791     /* Odd part */
792 
793     z2 = (INT32) wsptr[1];
794     z3 = (INT32) wsptr[3];
795 
796     z1 = MULTIPLY(z2 + z3, FIX(0.831253876));     /* c3 */
797     tmp0 = z1 + MULTIPLY(z2, FIX(0.513743148));   /* c1-c3 */
798     tmp1 = z1 - MULTIPLY(z3, FIX(2.176250899));   /* c1+c3 */
799 
800     /* Final output stage */
801 
802     outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
803 					      CONST_BITS+PASS1_BITS+3)
804 			    & RANGE_MASK];
805     outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
806 					      CONST_BITS+PASS1_BITS+3)
807 			    & RANGE_MASK];
808     outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp1,
809 					      CONST_BITS+PASS1_BITS+3)
810 			    & RANGE_MASK];
811     outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp1,
812 					      CONST_BITS+PASS1_BITS+3)
813 			    & RANGE_MASK];
814     outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12,
815 					      CONST_BITS+PASS1_BITS+3)
816 			    & RANGE_MASK];
817 
818     wsptr += 5;		/* advance pointer to next row */
819   }
820 }
821 
822 
823 /*
824  * Perform dequantization and inverse DCT on one block of coefficients,
825  * producing a reduced-size 4x4 output block.
826  *
827  * Optimized algorithm with 3 multiplications in the 1-D kernel.
828  * cK represents sqrt(2) * cos(K*pi/16) [refers to 8-point IDCT].
829  */
830 
831 GLOBAL(void)
jpeg_idct_4x4(j_decompress_ptr cinfo,jpeg_component_info * compptr,JCOEFPTR coef_block,JSAMPARRAY output_buf,JDIMENSION output_col)832 jpeg_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
833 	       JCOEFPTR coef_block,
834 	       JSAMPARRAY output_buf, JDIMENSION output_col)
835 {
836   INT32 tmp0, tmp2, tmp10, tmp12;
837   INT32 z1, z2, z3;
838   JCOEFPTR inptr;
839   ISLOW_MULT_TYPE * quantptr;
840   int * wsptr;
841   JSAMPROW outptr;
842   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
843   int ctr;
844   int workspace[4*4];	/* buffers data between passes */
845   SHIFT_TEMPS
846 
847   /* Pass 1: process columns from input, store into work array. */
848 
849   inptr = coef_block;
850   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
851   wsptr = workspace;
852   for (ctr = 0; ctr < 4; ctr++, inptr++, quantptr++, wsptr++) {
853     /* Even part */
854 
855     tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
856     if (ctr == 0)
857       CLAMP_DC(tmp0);
858     tmp2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
859 
860     tmp10 = (tmp0 + tmp2) << PASS1_BITS;
861     tmp12 = (tmp0 - tmp2) << PASS1_BITS;
862 
863     /* Odd part */
864     /* Same rotation as in the even part of the 8x8 LL&M IDCT */
865 
866     z2 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
867     z3 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
868 
869     z1 = MULTIPLY(z2 + z3, FIX_0_541196100);               /* c6 */
870     /* Add fudge factor here for final descale. */
871     z1 += ONE << (CONST_BITS-PASS1_BITS-1);
872     tmp0 = RIGHT_SHIFT(z1 + MULTIPLY(z2, FIX_0_765366865), /* c2-c6 */
873 		       CONST_BITS-PASS1_BITS);
874     tmp2 = RIGHT_SHIFT(z1 - MULTIPLY(z3, FIX_1_847759065), /* c2+c6 */
875 		       CONST_BITS-PASS1_BITS);
876 
877     /* Final output stage */
878 
879     wsptr[4*0] = (int) (tmp10 + tmp0);
880     wsptr[4*3] = (int) (tmp10 - tmp0);
881     wsptr[4*1] = (int) (tmp12 + tmp2);
882     wsptr[4*2] = (int) (tmp12 - tmp2);
883   }
884 
885   /* Pass 2: process 4 rows from work array, store into output array. */
886 
887   wsptr = workspace;
888   for (ctr = 0; ctr < 4; ctr++) {
889     outptr = output_buf[ctr] + output_col;
890 
891     /* Even part */
892 
893     /* Add fudge factor here for final descale. */
894     tmp0 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
895     tmp2 = (INT32) wsptr[2];
896 
897     tmp10 = (tmp0 + tmp2) << CONST_BITS;
898     tmp12 = (tmp0 - tmp2) << CONST_BITS;
899 
900     /* Odd part */
901     /* Same rotation as in the even part of the 8x8 LL&M IDCT */
902 
903     z2 = (INT32) wsptr[1];
904     z3 = (INT32) wsptr[3];
905 
906     z1 = MULTIPLY(z2 + z3, FIX_0_541196100);   /* c6 */
907     tmp0 = z1 + MULTIPLY(z2, FIX_0_765366865); /* c2-c6 */
908     tmp2 = z1 - MULTIPLY(z3, FIX_1_847759065); /* c2+c6 */
909 
910     /* Final output stage */
911 
912     outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
913 					      CONST_BITS+PASS1_BITS+3)
914 			    & RANGE_MASK];
915     outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
916 					      CONST_BITS+PASS1_BITS+3)
917 			    & RANGE_MASK];
918     outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp2,
919 					      CONST_BITS+PASS1_BITS+3)
920 			    & RANGE_MASK];
921     outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp2,
922 					      CONST_BITS+PASS1_BITS+3)
923 			    & RANGE_MASK];
924 
925     wsptr += 4;		/* advance pointer to next row */
926   }
927 }
928 
929 
930 /*
931  * Perform dequantization and inverse DCT on one block of coefficients,
932  * producing a reduced-size 3x3 output block.
933  *
934  * Optimized algorithm with 2 multiplications in the 1-D kernel.
935  * cK represents sqrt(2) * cos(K*pi/6).
936  */
937 
938 GLOBAL(void)
jpeg_idct_3x3(j_decompress_ptr cinfo,jpeg_component_info * compptr,JCOEFPTR coef_block,JSAMPARRAY output_buf,JDIMENSION output_col)939 jpeg_idct_3x3 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
940 	       JCOEFPTR coef_block,
941 	       JSAMPARRAY output_buf, JDIMENSION output_col)
942 {
943   INT32 tmp0, tmp2, tmp10, tmp12;
944   JCOEFPTR inptr;
945   ISLOW_MULT_TYPE * quantptr;
946   int * wsptr;
947   JSAMPROW outptr;
948   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
949   int ctr;
950   int workspace[3*3];	/* buffers data between passes */
951   SHIFT_TEMPS
952 
953   /* Pass 1: process columns from input, store into work array. */
954 
955   inptr = coef_block;
956   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
957   wsptr = workspace;
958   for (ctr = 0; ctr < 3; ctr++, inptr++, quantptr++, wsptr++) {
959     /* Even part */
960 
961     tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
962     if (ctr == 0)
963       CLAMP_DC(tmp0);
964     tmp0 <<= CONST_BITS;
965     /* Add fudge factor here for final descale. */
966     tmp0 += ONE << (CONST_BITS-PASS1_BITS-1);
967     tmp2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
968     tmp12 = MULTIPLY(tmp2, FIX(0.707106781)); /* c2 */
969     tmp10 = tmp0 + tmp12;
970     tmp2 = tmp0 - tmp12 - tmp12;
971 
972     /* Odd part */
973 
974     tmp12 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
975     tmp0 = MULTIPLY(tmp12, FIX(1.224744871)); /* c1 */
976 
977     /* Final output stage */
978 
979     wsptr[3*0] = (int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS-PASS1_BITS);
980     wsptr[3*2] = (int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS-PASS1_BITS);
981     wsptr[3*1] = (int) RIGHT_SHIFT(tmp2, CONST_BITS-PASS1_BITS);
982   }
983 
984   /* Pass 2: process 3 rows from work array, store into output array. */
985 
986   wsptr = workspace;
987   for (ctr = 0; ctr < 3; ctr++) {
988     outptr = output_buf[ctr] + output_col;
989 
990     /* Even part */
991 
992     /* Add fudge factor here for final descale. */
993     tmp0 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
994     tmp0 <<= CONST_BITS;
995     tmp2 = (INT32) wsptr[2];
996     tmp12 = MULTIPLY(tmp2, FIX(0.707106781)); /* c2 */
997     tmp10 = tmp0 + tmp12;
998     tmp2 = tmp0 - tmp12 - tmp12;
999 
1000     /* Odd part */
1001 
1002     tmp12 = (INT32) wsptr[1];
1003     tmp0 = MULTIPLY(tmp12, FIX(1.224744871)); /* c1 */
1004 
1005     /* Final output stage */
1006 
1007     outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
1008 					      CONST_BITS+PASS1_BITS+3)
1009 			    & RANGE_MASK];
1010     outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
1011 					      CONST_BITS+PASS1_BITS+3)
1012 			    & RANGE_MASK];
1013     outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp2,
1014 					      CONST_BITS+PASS1_BITS+3)
1015 			    & RANGE_MASK];
1016 
1017     wsptr += 3;		/* advance pointer to next row */
1018   }
1019 }
1020 
1021 
1022 /*
1023  * Perform dequantization and inverse DCT on one block of coefficients,
1024  * producing a reduced-size 2x2 output block.
1025  *
1026  * Multiplication-less algorithm.
1027  */
1028 
1029 GLOBAL(void)
jpeg_idct_2x2(j_decompress_ptr cinfo,jpeg_component_info * compptr,JCOEFPTR coef_block,JSAMPARRAY output_buf,JDIMENSION output_col)1030 jpeg_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
1031 	       JCOEFPTR coef_block,
1032 	       JSAMPARRAY output_buf, JDIMENSION output_col)
1033 {
1034   INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
1035   ISLOW_MULT_TYPE * quantptr;
1036   JSAMPROW outptr;
1037   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
1038   SHIFT_TEMPS
1039 
1040   /* Pass 1: process columns from input. */
1041 
1042   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
1043 
1044   /* Column 0 */
1045   tmp4 = DEQUANTIZE(coef_block[DCTSIZE*0], quantptr[DCTSIZE*0]);
1046   CLAMP_DC(tmp4);
1047   tmp5 = DEQUANTIZE(coef_block[DCTSIZE*1], quantptr[DCTSIZE*1]);
1048   /* Add fudge factor here for final descale. */
1049   tmp4 += ONE << 2;
1050 
1051   tmp0 = tmp4 + tmp5;
1052   tmp2 = tmp4 - tmp5;
1053 
1054   /* Column 1 */
1055   tmp4 = DEQUANTIZE(coef_block[DCTSIZE*0+1], quantptr[DCTSIZE*0+1]);
1056   tmp5 = DEQUANTIZE(coef_block[DCTSIZE*1+1], quantptr[DCTSIZE*1+1]);
1057 
1058   tmp1 = tmp4 + tmp5;
1059   tmp3 = tmp4 - tmp5;
1060 
1061   /* Pass 2: process 2 rows, store into output array. */
1062 
1063   /* Row 0 */
1064   outptr = output_buf[0] + output_col;
1065 
1066   outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp0 + tmp1, 3) & RANGE_MASK];
1067   outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp0 - tmp1, 3) & RANGE_MASK];
1068 
1069   /* Row 1 */
1070   outptr = output_buf[1] + output_col;
1071 
1072   outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp2 + tmp3, 3) & RANGE_MASK];
1073   outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp2 - tmp3, 3) & RANGE_MASK];
1074 }
1075 
1076 
1077 /*
1078  * Perform dequantization and inverse DCT on one block of coefficients,
1079  * producing a reduced-size 1x1 output block.
1080  *
1081  * We hardly need an inverse DCT routine for this: just take the
1082  * average pixel value, which is one-eighth of the DC coefficient.
1083  */
1084 
1085 GLOBAL(void)
jpeg_idct_1x1(j_decompress_ptr cinfo,jpeg_component_info * compptr,JCOEFPTR coef_block,JSAMPARRAY output_buf,JDIMENSION output_col)1086 jpeg_idct_1x1 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
1087 	       JCOEFPTR coef_block,
1088 	       JSAMPARRAY output_buf, JDIMENSION output_col)
1089 {
1090   int dcval;
1091   ISLOW_MULT_TYPE * quantptr;
1092   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
1093   SHIFT_TEMPS
1094 
1095   /* 1x1 is trivial: just take the DC coefficient divided by 8. */
1096   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
1097   dcval = DEQUANTIZE(coef_block[0], quantptr[0]);
1098   CLAMP_DC(dcval);
1099   dcval = (int) DESCALE((INT32) dcval, 3);
1100 
1101   output_buf[0][output_col] = range_limit[dcval & RANGE_MASK];
1102 }
1103 
1104 
1105 /*
1106  * Perform dequantization and inverse DCT on one block of coefficients,
1107  * producing a 9x9 output block.
1108  *
1109  * Optimized algorithm with 10 multiplications in the 1-D kernel.
1110  * cK represents sqrt(2) * cos(K*pi/18).
1111  */
1112 
1113 GLOBAL(void)
jpeg_idct_9x9(j_decompress_ptr cinfo,jpeg_component_info * compptr,JCOEFPTR coef_block,JSAMPARRAY output_buf,JDIMENSION output_col)1114 jpeg_idct_9x9 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
1115 	       JCOEFPTR coef_block,
1116 	       JSAMPARRAY output_buf, JDIMENSION output_col)
1117 {
1118   INT32 tmp0, tmp1, tmp2, tmp3, tmp10, tmp11, tmp12, tmp13, tmp14;
1119   INT32 z1, z2, z3, z4;
1120   JCOEFPTR inptr;
1121   ISLOW_MULT_TYPE * quantptr;
1122   int * wsptr;
1123   JSAMPROW outptr;
1124   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
1125   int ctr;
1126   int workspace[8*9];	/* buffers data between passes */
1127   SHIFT_TEMPS
1128 
1129   /* Pass 1: process columns from input, store into work array. */
1130 
1131   inptr = coef_block;
1132   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
1133   wsptr = workspace;
1134   for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
1135     /* Even part */
1136 
1137     tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
1138     if (ctr == 0)
1139       CLAMP_DC(tmp0);
1140     tmp0 <<= CONST_BITS;
1141     /* Add fudge factor here for final descale. */
1142     tmp0 += ONE << (CONST_BITS-PASS1_BITS-1);
1143 
1144     z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
1145     z2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
1146     z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
1147 
1148     tmp3 = MULTIPLY(z3, FIX(0.707106781));      /* c6 */
1149     tmp1 = tmp0 + tmp3;
1150     tmp2 = tmp0 - tmp3 - tmp3;
1151 
1152     tmp0 = MULTIPLY(z1 - z2, FIX(0.707106781)); /* c6 */
1153     tmp11 = tmp2 + tmp0;
1154     tmp14 = tmp2 - tmp0 - tmp0;
1155 
1156     tmp0 = MULTIPLY(z1 + z2, FIX(1.328926049)); /* c2 */
1157     tmp2 = MULTIPLY(z1, FIX(1.083350441));      /* c4 */
1158     tmp3 = MULTIPLY(z2, FIX(0.245575608));      /* c8 */
1159 
1160     tmp10 = tmp1 + tmp0 - tmp3;
1161     tmp12 = tmp1 - tmp0 + tmp2;
1162     tmp13 = tmp1 - tmp2 + tmp3;
1163 
1164     /* Odd part */
1165 
1166     z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
1167     z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
1168     z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
1169     z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
1170 
1171     z2 = MULTIPLY(z2, - FIX(1.224744871));           /* -c3 */
1172 
1173     tmp2 = MULTIPLY(z1 + z3, FIX(0.909038955));      /* c5 */
1174     tmp3 = MULTIPLY(z1 + z4, FIX(0.483689525));      /* c7 */
1175     tmp0 = tmp2 + tmp3 - z2;
1176     tmp1 = MULTIPLY(z3 - z4, FIX(1.392728481));      /* c1 */
1177     tmp2 += z2 - tmp1;
1178     tmp3 += z2 + tmp1;
1179     tmp1 = MULTIPLY(z1 - z3 - z4, FIX(1.224744871)); /* c3 */
1180 
1181     /* Final output stage */
1182 
1183     wsptr[8*0] = (int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS-PASS1_BITS);
1184     wsptr[8*8] = (int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS-PASS1_BITS);
1185     wsptr[8*1] = (int) RIGHT_SHIFT(tmp11 + tmp1, CONST_BITS-PASS1_BITS);
1186     wsptr[8*7] = (int) RIGHT_SHIFT(tmp11 - tmp1, CONST_BITS-PASS1_BITS);
1187     wsptr[8*2] = (int) RIGHT_SHIFT(tmp12 + tmp2, CONST_BITS-PASS1_BITS);
1188     wsptr[8*6] = (int) RIGHT_SHIFT(tmp12 - tmp2, CONST_BITS-PASS1_BITS);
1189     wsptr[8*3] = (int) RIGHT_SHIFT(tmp13 + tmp3, CONST_BITS-PASS1_BITS);
1190     wsptr[8*5] = (int) RIGHT_SHIFT(tmp13 - tmp3, CONST_BITS-PASS1_BITS);
1191     wsptr[8*4] = (int) RIGHT_SHIFT(tmp14, CONST_BITS-PASS1_BITS);
1192   }
1193 
1194   /* Pass 2: process 9 rows from work array, store into output array. */
1195 
1196   wsptr = workspace;
1197   for (ctr = 0; ctr < 9; ctr++) {
1198     outptr = output_buf[ctr] + output_col;
1199 
1200     /* Even part */
1201 
1202     /* Add fudge factor here for final descale. */
1203     tmp0 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
1204     tmp0 <<= CONST_BITS;
1205 
1206     z1 = (INT32) wsptr[2];
1207     z2 = (INT32) wsptr[4];
1208     z3 = (INT32) wsptr[6];
1209 
1210     tmp3 = MULTIPLY(z3, FIX(0.707106781));      /* c6 */
1211     tmp1 = tmp0 + tmp3;
1212     tmp2 = tmp0 - tmp3 - tmp3;
1213 
1214     tmp0 = MULTIPLY(z1 - z2, FIX(0.707106781)); /* c6 */
1215     tmp11 = tmp2 + tmp0;
1216     tmp14 = tmp2 - tmp0 - tmp0;
1217 
1218     tmp0 = MULTIPLY(z1 + z2, FIX(1.328926049)); /* c2 */
1219     tmp2 = MULTIPLY(z1, FIX(1.083350441));      /* c4 */
1220     tmp3 = MULTIPLY(z2, FIX(0.245575608));      /* c8 */
1221 
1222     tmp10 = tmp1 + tmp0 - tmp3;
1223     tmp12 = tmp1 - tmp0 + tmp2;
1224     tmp13 = tmp1 - tmp2 + tmp3;
1225 
1226     /* Odd part */
1227 
1228     z1 = (INT32) wsptr[1];
1229     z2 = (INT32) wsptr[3];
1230     z3 = (INT32) wsptr[5];
1231     z4 = (INT32) wsptr[7];
1232 
1233     z2 = MULTIPLY(z2, - FIX(1.224744871));           /* -c3 */
1234 
1235     tmp2 = MULTIPLY(z1 + z3, FIX(0.909038955));      /* c5 */
1236     tmp3 = MULTIPLY(z1 + z4, FIX(0.483689525));      /* c7 */
1237     tmp0 = tmp2 + tmp3 - z2;
1238     tmp1 = MULTIPLY(z3 - z4, FIX(1.392728481));      /* c1 */
1239     tmp2 += z2 - tmp1;
1240     tmp3 += z2 + tmp1;
1241     tmp1 = MULTIPLY(z1 - z3 - z4, FIX(1.224744871)); /* c3 */
1242 
1243     /* Final output stage */
1244 
1245     outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
1246 					      CONST_BITS+PASS1_BITS+3)
1247 			    & RANGE_MASK];
1248     outptr[8] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
1249 					      CONST_BITS+PASS1_BITS+3)
1250 			    & RANGE_MASK];
1251     outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp1,
1252 					      CONST_BITS+PASS1_BITS+3)
1253 			    & RANGE_MASK];
1254     outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp1,
1255 					      CONST_BITS+PASS1_BITS+3)
1256 			    & RANGE_MASK];
1257     outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp2,
1258 					      CONST_BITS+PASS1_BITS+3)
1259 			    & RANGE_MASK];
1260     outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp2,
1261 					      CONST_BITS+PASS1_BITS+3)
1262 			    & RANGE_MASK];
1263     outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp13 + tmp3,
1264 					      CONST_BITS+PASS1_BITS+3)
1265 			    & RANGE_MASK];
1266     outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp13 - tmp3,
1267 					      CONST_BITS+PASS1_BITS+3)
1268 			    & RANGE_MASK];
1269     outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp14,
1270 					      CONST_BITS+PASS1_BITS+3)
1271 			    & RANGE_MASK];
1272 
1273     wsptr += 8;		/* advance pointer to next row */
1274   }
1275 }
1276 
1277 
1278 /*
1279  * Perform dequantization and inverse DCT on one block of coefficients,
1280  * producing a 10x10 output block.
1281  *
1282  * Optimized algorithm with 12 multiplications in the 1-D kernel.
1283  * cK represents sqrt(2) * cos(K*pi/20).
1284  */
1285 
1286 GLOBAL(void)
jpeg_idct_10x10(j_decompress_ptr cinfo,jpeg_component_info * compptr,JCOEFPTR coef_block,JSAMPARRAY output_buf,JDIMENSION output_col)1287 jpeg_idct_10x10 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
1288 		 JCOEFPTR coef_block,
1289 		 JSAMPARRAY output_buf, JDIMENSION output_col)
1290 {
1291   INT32 tmp10, tmp11, tmp12, tmp13, tmp14;
1292   INT32 tmp20, tmp21, tmp22, tmp23, tmp24;
1293   INT32 z1, z2, z3, z4, z5;
1294   JCOEFPTR inptr;
1295   ISLOW_MULT_TYPE * quantptr;
1296   int * wsptr;
1297   JSAMPROW outptr;
1298   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
1299   int ctr;
1300   int workspace[8*10];	/* buffers data between passes */
1301   SHIFT_TEMPS
1302 
1303   /* Pass 1: process columns from input, store into work array. */
1304 
1305   inptr = coef_block;
1306   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
1307   wsptr = workspace;
1308   for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
1309     /* Even part */
1310 
1311     z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
1312     if (ctr == 0)
1313       CLAMP_DC(z3);
1314     z3 <<= CONST_BITS;
1315     /* Add fudge factor here for final descale. */
1316     z3 += ONE << (CONST_BITS-PASS1_BITS-1);
1317     z4 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
1318     z1 = MULTIPLY(z4, FIX(1.144122806));         /* c4 */
1319     z2 = MULTIPLY(z4, FIX(0.437016024));         /* c8 */
1320     tmp10 = z3 + z1;
1321     tmp11 = z3 - z2;
1322 
1323     tmp22 = RIGHT_SHIFT(z3 - ((z1 - z2) << 1),   /* c0 = (c4-c8)*2 */
1324 			CONST_BITS-PASS1_BITS);
1325 
1326     z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
1327     z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
1328 
1329     z1 = MULTIPLY(z2 + z3, FIX(0.831253876));    /* c6 */
1330     tmp12 = z1 + MULTIPLY(z2, FIX(0.513743148)); /* c2-c6 */
1331     tmp13 = z1 - MULTIPLY(z3, FIX(2.176250899)); /* c2+c6 */
1332 
1333     tmp20 = tmp10 + tmp12;
1334     tmp24 = tmp10 - tmp12;
1335     tmp21 = tmp11 + tmp13;
1336     tmp23 = tmp11 - tmp13;
1337 
1338     /* Odd part */
1339 
1340     z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
1341     z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
1342     z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
1343     z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
1344 
1345     tmp11 = z2 + z4;
1346     tmp13 = z2 - z4;
1347 
1348     tmp12 = MULTIPLY(tmp13, FIX(0.309016994));        /* (c3-c7)/2 */
1349     z5 = z3 << CONST_BITS;
1350 
1351     z2 = MULTIPLY(tmp11, FIX(0.951056516));           /* (c3+c7)/2 */
1352     z4 = z5 + tmp12;
1353 
1354     tmp10 = MULTIPLY(z1, FIX(1.396802247)) + z2 + z4; /* c1 */
1355     tmp14 = MULTIPLY(z1, FIX(0.221231742)) - z2 + z4; /* c9 */
1356 
1357     z2 = MULTIPLY(tmp11, FIX(0.587785252));           /* (c1-c9)/2 */
1358     z4 = z5 - tmp12 - (tmp13 << (CONST_BITS - 1));
1359 
1360     tmp12 = (z1 - tmp13 - z3) << PASS1_BITS;
1361 
1362     tmp11 = MULTIPLY(z1, FIX(1.260073511)) - z2 - z4; /* c3 */
1363     tmp13 = MULTIPLY(z1, FIX(0.642039522)) - z2 + z4; /* c7 */
1364 
1365     /* Final output stage */
1366 
1367     wsptr[8*0] = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
1368     wsptr[8*9] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
1369     wsptr[8*1] = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
1370     wsptr[8*8] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
1371     wsptr[8*2] = (int) (tmp22 + tmp12);
1372     wsptr[8*7] = (int) (tmp22 - tmp12);
1373     wsptr[8*3] = (int) RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS-PASS1_BITS);
1374     wsptr[8*6] = (int) RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS-PASS1_BITS);
1375     wsptr[8*4] = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
1376     wsptr[8*5] = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
1377   }
1378 
1379   /* Pass 2: process 10 rows from work array, store into output array. */
1380 
1381   wsptr = workspace;
1382   for (ctr = 0; ctr < 10; ctr++) {
1383     outptr = output_buf[ctr] + output_col;
1384 
1385     /* Even part */
1386 
1387     /* Add fudge factor here for final descale. */
1388     z3 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
1389     z3 <<= CONST_BITS;
1390     z4 = (INT32) wsptr[4];
1391     z1 = MULTIPLY(z4, FIX(1.144122806));         /* c4 */
1392     z2 = MULTIPLY(z4, FIX(0.437016024));         /* c8 */
1393     tmp10 = z3 + z1;
1394     tmp11 = z3 - z2;
1395 
1396     tmp22 = z3 - ((z1 - z2) << 1);               /* c0 = (c4-c8)*2 */
1397 
1398     z2 = (INT32) wsptr[2];
1399     z3 = (INT32) wsptr[6];
1400 
1401     z1 = MULTIPLY(z2 + z3, FIX(0.831253876));    /* c6 */
1402     tmp12 = z1 + MULTIPLY(z2, FIX(0.513743148)); /* c2-c6 */
1403     tmp13 = z1 - MULTIPLY(z3, FIX(2.176250899)); /* c2+c6 */
1404 
1405     tmp20 = tmp10 + tmp12;
1406     tmp24 = tmp10 - tmp12;
1407     tmp21 = tmp11 + tmp13;
1408     tmp23 = tmp11 - tmp13;
1409 
1410     /* Odd part */
1411 
1412     z1 = (INT32) wsptr[1];
1413     z2 = (INT32) wsptr[3];
1414     z3 = (INT32) wsptr[5];
1415     z3 <<= CONST_BITS;
1416     z4 = (INT32) wsptr[7];
1417 
1418     tmp11 = z2 + z4;
1419     tmp13 = z2 - z4;
1420 
1421     tmp12 = MULTIPLY(tmp13, FIX(0.309016994));        /* (c3-c7)/2 */
1422 
1423     z2 = MULTIPLY(tmp11, FIX(0.951056516));           /* (c3+c7)/2 */
1424     z4 = z3 + tmp12;
1425 
1426     tmp10 = MULTIPLY(z1, FIX(1.396802247)) + z2 + z4; /* c1 */
1427     tmp14 = MULTIPLY(z1, FIX(0.221231742)) - z2 + z4; /* c9 */
1428 
1429     z2 = MULTIPLY(tmp11, FIX(0.587785252));           /* (c1-c9)/2 */
1430     z4 = z3 - tmp12 - (tmp13 << (CONST_BITS - 1));
1431 
1432     tmp12 = ((z1 - tmp13) << CONST_BITS) - z3;
1433 
1434     tmp11 = MULTIPLY(z1, FIX(1.260073511)) - z2 - z4; /* c3 */
1435     tmp13 = MULTIPLY(z1, FIX(0.642039522)) - z2 + z4; /* c7 */
1436 
1437     /* Final output stage */
1438 
1439     outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
1440 					      CONST_BITS+PASS1_BITS+3)
1441 			    & RANGE_MASK];
1442     outptr[9] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
1443 					      CONST_BITS+PASS1_BITS+3)
1444 			    & RANGE_MASK];
1445     outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
1446 					      CONST_BITS+PASS1_BITS+3)
1447 			    & RANGE_MASK];
1448     outptr[8] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
1449 					      CONST_BITS+PASS1_BITS+3)
1450 			    & RANGE_MASK];
1451     outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
1452 					      CONST_BITS+PASS1_BITS+3)
1453 			    & RANGE_MASK];
1454     outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
1455 					      CONST_BITS+PASS1_BITS+3)
1456 			    & RANGE_MASK];
1457     outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
1458 					      CONST_BITS+PASS1_BITS+3)
1459 			    & RANGE_MASK];
1460     outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
1461 					      CONST_BITS+PASS1_BITS+3)
1462 			    & RANGE_MASK];
1463     outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
1464 					      CONST_BITS+PASS1_BITS+3)
1465 			    & RANGE_MASK];
1466     outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
1467 					      CONST_BITS+PASS1_BITS+3)
1468 			    & RANGE_MASK];
1469 
1470     wsptr += 8;		/* advance pointer to next row */
1471   }
1472 }
1473 
1474 
1475 /*
1476  * Perform dequantization and inverse DCT on one block of coefficients,
1477  * producing an 11x11 output block.
1478  *
1479  * Optimized algorithm with 24 multiplications in the 1-D kernel.
1480  * cK represents sqrt(2) * cos(K*pi/22).
1481  */
1482 
1483 GLOBAL(void)
jpeg_idct_11x11(j_decompress_ptr cinfo,jpeg_component_info * compptr,JCOEFPTR coef_block,JSAMPARRAY output_buf,JDIMENSION output_col)1484 jpeg_idct_11x11 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
1485 		 JCOEFPTR coef_block,
1486 		 JSAMPARRAY output_buf, JDIMENSION output_col)
1487 {
1488   INT32 tmp10, tmp11, tmp12, tmp13, tmp14;
1489   INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25;
1490   INT32 z1, z2, z3, z4;
1491   JCOEFPTR inptr;
1492   ISLOW_MULT_TYPE * quantptr;
1493   int * wsptr;
1494   JSAMPROW outptr;
1495   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
1496   int ctr;
1497   int workspace[8*11];	/* buffers data between passes */
1498   SHIFT_TEMPS
1499 
1500   /* Pass 1: process columns from input, store into work array. */
1501 
1502   inptr = coef_block;
1503   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
1504   wsptr = workspace;
1505   for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
1506     /* Even part */
1507 
1508     tmp10 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
1509     if (ctr == 0)
1510       CLAMP_DC(tmp10);
1511     tmp10 <<= CONST_BITS;
1512     /* Add fudge factor here for final descale. */
1513     tmp10 += ONE << (CONST_BITS-PASS1_BITS-1);
1514 
1515     z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
1516     z2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
1517     z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
1518 
1519     tmp20 = MULTIPLY(z2 - z3, FIX(2.546640132));     /* c2+c4 */
1520     tmp23 = MULTIPLY(z2 - z1, FIX(0.430815045));     /* c2-c6 */
1521     z4 = z1 + z3;
1522     tmp24 = MULTIPLY(z4, - FIX(1.155664402));        /* -(c2-c10) */
1523     z4 -= z2;
1524     tmp25 = tmp10 + MULTIPLY(z4, FIX(1.356927976));  /* c2 */
1525     tmp21 = tmp20 + tmp23 + tmp25 -
1526 	    MULTIPLY(z2, FIX(1.821790775));          /* c2+c4+c10-c6 */
1527     tmp20 += tmp25 + MULTIPLY(z3, FIX(2.115825087)); /* c4+c6 */
1528     tmp23 += tmp25 - MULTIPLY(z1, FIX(1.513598477)); /* c6+c8 */
1529     tmp24 += tmp25;
1530     tmp22 = tmp24 - MULTIPLY(z3, FIX(0.788749120));  /* c8+c10 */
1531     tmp24 += MULTIPLY(z2, FIX(1.944413522)) -        /* c2+c8 */
1532 	     MULTIPLY(z1, FIX(1.390975730));         /* c4+c10 */
1533     tmp25 = tmp10 - MULTIPLY(z4, FIX(1.414213562));  /* c0 */
1534 
1535     /* Odd part */
1536 
1537     z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
1538     z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
1539     z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
1540     z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
1541 
1542     tmp11 = z1 + z2;
1543     tmp14 = MULTIPLY(tmp11 + z3 + z4, FIX(0.398430003)); /* c9 */
1544     tmp11 = MULTIPLY(tmp11, FIX(0.887983902));           /* c3-c9 */
1545     tmp12 = MULTIPLY(z1 + z3, FIX(0.670361295));         /* c5-c9 */
1546     tmp13 = tmp14 + MULTIPLY(z1 + z4, FIX(0.366151574)); /* c7-c9 */
1547     tmp10 = tmp11 + tmp12 + tmp13 -
1548 	    MULTIPLY(z1, FIX(0.923107866));              /* c7+c5+c3-c1-2*c9 */
1549     z1    = tmp14 - MULTIPLY(z2 + z3, FIX(1.163011579)); /* c7+c9 */
1550     tmp11 += z1 + MULTIPLY(z2, FIX(2.073276588));        /* c1+c7+3*c9-c3 */
1551     tmp12 += z1 - MULTIPLY(z3, FIX(1.192193623));        /* c3+c5-c7-c9 */
1552     z1    = MULTIPLY(z2 + z4, - FIX(1.798248910));       /* -(c1+c9) */
1553     tmp11 += z1;
1554     tmp13 += z1 + MULTIPLY(z4, FIX(2.102458632));        /* c1+c5+c9-c7 */
1555     tmp14 += MULTIPLY(z2, - FIX(1.467221301)) +          /* -(c5+c9) */
1556 	     MULTIPLY(z3, FIX(1.001388905)) -            /* c1-c9 */
1557 	     MULTIPLY(z4, FIX(1.684843907));             /* c3+c9 */
1558 
1559     /* Final output stage */
1560 
1561     wsptr[8*0]  = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
1562     wsptr[8*10] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
1563     wsptr[8*1]  = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
1564     wsptr[8*9]  = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
1565     wsptr[8*2]  = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
1566     wsptr[8*8]  = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
1567     wsptr[8*3]  = (int) RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS-PASS1_BITS);
1568     wsptr[8*7]  = (int) RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS-PASS1_BITS);
1569     wsptr[8*4]  = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
1570     wsptr[8*6]  = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
1571     wsptr[8*5]  = (int) RIGHT_SHIFT(tmp25, CONST_BITS-PASS1_BITS);
1572   }
1573 
1574   /* Pass 2: process 11 rows from work array, store into output array. */
1575 
1576   wsptr = workspace;
1577   for (ctr = 0; ctr < 11; ctr++) {
1578     outptr = output_buf[ctr] + output_col;
1579 
1580     /* Even part */
1581 
1582     /* Add fudge factor here for final descale. */
1583     tmp10 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
1584     tmp10 <<= CONST_BITS;
1585 
1586     z1 = (INT32) wsptr[2];
1587     z2 = (INT32) wsptr[4];
1588     z3 = (INT32) wsptr[6];
1589 
1590     tmp20 = MULTIPLY(z2 - z3, FIX(2.546640132));     /* c2+c4 */
1591     tmp23 = MULTIPLY(z2 - z1, FIX(0.430815045));     /* c2-c6 */
1592     z4 = z1 + z3;
1593     tmp24 = MULTIPLY(z4, - FIX(1.155664402));        /* -(c2-c10) */
1594     z4 -= z2;
1595     tmp25 = tmp10 + MULTIPLY(z4, FIX(1.356927976));  /* c2 */
1596     tmp21 = tmp20 + tmp23 + tmp25 -
1597 	    MULTIPLY(z2, FIX(1.821790775));          /* c2+c4+c10-c6 */
1598     tmp20 += tmp25 + MULTIPLY(z3, FIX(2.115825087)); /* c4+c6 */
1599     tmp23 += tmp25 - MULTIPLY(z1, FIX(1.513598477)); /* c6+c8 */
1600     tmp24 += tmp25;
1601     tmp22 = tmp24 - MULTIPLY(z3, FIX(0.788749120));  /* c8+c10 */
1602     tmp24 += MULTIPLY(z2, FIX(1.944413522)) -        /* c2+c8 */
1603 	     MULTIPLY(z1, FIX(1.390975730));         /* c4+c10 */
1604     tmp25 = tmp10 - MULTIPLY(z4, FIX(1.414213562));  /* c0 */
1605 
1606     /* Odd part */
1607 
1608     z1 = (INT32) wsptr[1];
1609     z2 = (INT32) wsptr[3];
1610     z3 = (INT32) wsptr[5];
1611     z4 = (INT32) wsptr[7];
1612 
1613     tmp11 = z1 + z2;
1614     tmp14 = MULTIPLY(tmp11 + z3 + z4, FIX(0.398430003)); /* c9 */
1615     tmp11 = MULTIPLY(tmp11, FIX(0.887983902));           /* c3-c9 */
1616     tmp12 = MULTIPLY(z1 + z3, FIX(0.670361295));         /* c5-c9 */
1617     tmp13 = tmp14 + MULTIPLY(z1 + z4, FIX(0.366151574)); /* c7-c9 */
1618     tmp10 = tmp11 + tmp12 + tmp13 -
1619 	    MULTIPLY(z1, FIX(0.923107866));              /* c7+c5+c3-c1-2*c9 */
1620     z1    = tmp14 - MULTIPLY(z2 + z3, FIX(1.163011579)); /* c7+c9 */
1621     tmp11 += z1 + MULTIPLY(z2, FIX(2.073276588));        /* c1+c7+3*c9-c3 */
1622     tmp12 += z1 - MULTIPLY(z3, FIX(1.192193623));        /* c3+c5-c7-c9 */
1623     z1    = MULTIPLY(z2 + z4, - FIX(1.798248910));       /* -(c1+c9) */
1624     tmp11 += z1;
1625     tmp13 += z1 + MULTIPLY(z4, FIX(2.102458632));        /* c1+c5+c9-c7 */
1626     tmp14 += MULTIPLY(z2, - FIX(1.467221301)) +          /* -(c5+c9) */
1627 	     MULTIPLY(z3, FIX(1.001388905)) -            /* c1-c9 */
1628 	     MULTIPLY(z4, FIX(1.684843907));             /* c3+c9 */
1629 
1630     /* Final output stage */
1631 
1632     outptr[0]  = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
1633 					       CONST_BITS+PASS1_BITS+3)
1634 			     & RANGE_MASK];
1635     outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
1636 					       CONST_BITS+PASS1_BITS+3)
1637 			     & RANGE_MASK];
1638     outptr[1]  = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
1639 					       CONST_BITS+PASS1_BITS+3)
1640 			     & RANGE_MASK];
1641     outptr[9]  = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
1642 					       CONST_BITS+PASS1_BITS+3)
1643 			     & RANGE_MASK];
1644     outptr[2]  = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
1645 					       CONST_BITS+PASS1_BITS+3)
1646 			     & RANGE_MASK];
1647     outptr[8]  = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
1648 					       CONST_BITS+PASS1_BITS+3)
1649 			     & RANGE_MASK];
1650     outptr[3]  = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
1651 					       CONST_BITS+PASS1_BITS+3)
1652 			     & RANGE_MASK];
1653     outptr[7]  = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
1654 					       CONST_BITS+PASS1_BITS+3)
1655 			     & RANGE_MASK];
1656     outptr[4]  = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
1657 					       CONST_BITS+PASS1_BITS+3)
1658 			     & RANGE_MASK];
1659     outptr[6]  = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
1660 					       CONST_BITS+PASS1_BITS+3)
1661 			     & RANGE_MASK];
1662     outptr[5]  = range_limit[(int) RIGHT_SHIFT(tmp25,
1663 					       CONST_BITS+PASS1_BITS+3)
1664 			     & RANGE_MASK];
1665 
1666     wsptr += 8;		/* advance pointer to next row */
1667   }
1668 }
1669 
1670 
1671 /*
1672  * Perform dequantization and inverse DCT on one block of coefficients,
1673  * producing a 12x12 output block.
1674  *
1675  * Optimized algorithm with 15 multiplications in the 1-D kernel.
1676  * cK represents sqrt(2) * cos(K*pi/24).
1677  */
1678 
1679 GLOBAL(void)
jpeg_idct_12x12(j_decompress_ptr cinfo,jpeg_component_info * compptr,JCOEFPTR coef_block,JSAMPARRAY output_buf,JDIMENSION output_col)1680 jpeg_idct_12x12 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
1681 		 JCOEFPTR coef_block,
1682 		 JSAMPARRAY output_buf, JDIMENSION output_col)
1683 {
1684   INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
1685   INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25;
1686   INT32 z1, z2, z3, z4;
1687   JCOEFPTR inptr;
1688   ISLOW_MULT_TYPE * quantptr;
1689   int * wsptr;
1690   JSAMPROW outptr;
1691   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
1692   int ctr;
1693   int workspace[8*12];	/* buffers data between passes */
1694   SHIFT_TEMPS
1695 
1696   /* Pass 1: process columns from input, store into work array. */
1697 
1698   inptr = coef_block;
1699   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
1700   wsptr = workspace;
1701   for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
1702     /* Even part */
1703 
1704     z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
1705     if (ctr == 0)
1706       CLAMP_DC(z3);
1707     z3 <<= CONST_BITS;
1708     /* Add fudge factor here for final descale. */
1709     z3 += ONE << (CONST_BITS-PASS1_BITS-1);
1710 
1711     z4 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
1712     z4 = MULTIPLY(z4, FIX(1.224744871)); /* c4 */
1713 
1714     tmp10 = z3 + z4;
1715     tmp11 = z3 - z4;
1716 
1717     z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
1718     z4 = MULTIPLY(z1, FIX(1.366025404)); /* c2 */
1719     z1 <<= CONST_BITS;
1720     z2 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
1721     z2 <<= CONST_BITS;
1722 
1723     tmp12 = z1 - z2;
1724 
1725     tmp21 = z3 + tmp12;
1726     tmp24 = z3 - tmp12;
1727 
1728     tmp12 = z4 + z2;
1729 
1730     tmp20 = tmp10 + tmp12;
1731     tmp25 = tmp10 - tmp12;
1732 
1733     tmp12 = z4 - z1 - z2;
1734 
1735     tmp22 = tmp11 + tmp12;
1736     tmp23 = tmp11 - tmp12;
1737 
1738     /* Odd part */
1739 
1740     z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
1741     z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
1742     z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
1743     z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
1744 
1745     tmp11 = MULTIPLY(z2, FIX(1.306562965));                  /* c3 */
1746     tmp14 = MULTIPLY(z2, - FIX_0_541196100);                 /* -c9 */
1747 
1748     tmp10 = z1 + z3;
1749     tmp15 = MULTIPLY(tmp10 + z4, FIX(0.860918669));          /* c7 */
1750     tmp12 = tmp15 + MULTIPLY(tmp10, FIX(0.261052384));       /* c5-c7 */
1751     tmp10 = tmp12 + tmp11 + MULTIPLY(z1, FIX(0.280143716));  /* c1-c5 */
1752     tmp13 = MULTIPLY(z3 + z4, - FIX(1.045510580));           /* -(c7+c11) */
1753     tmp12 += tmp13 + tmp14 - MULTIPLY(z3, FIX(1.478575242)); /* c1+c5-c7-c11 */
1754     tmp13 += tmp15 - tmp11 + MULTIPLY(z4, FIX(1.586706681)); /* c1+c11 */
1755     tmp15 += tmp14 - MULTIPLY(z1, FIX(0.676326758)) -        /* c7-c11 */
1756 	     MULTIPLY(z4, FIX(1.982889723));                 /* c5+c7 */
1757 
1758     z1 -= z4;
1759     z2 -= z3;
1760     z3 = MULTIPLY(z1 + z2, FIX_0_541196100);                 /* c9 */
1761     tmp11 = z3 + MULTIPLY(z1, FIX_0_765366865);              /* c3-c9 */
1762     tmp14 = z3 - MULTIPLY(z2, FIX_1_847759065);              /* c3+c9 */
1763 
1764     /* Final output stage */
1765 
1766     wsptr[8*0]  = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
1767     wsptr[8*11] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
1768     wsptr[8*1]  = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
1769     wsptr[8*10] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
1770     wsptr[8*2]  = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
1771     wsptr[8*9]  = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
1772     wsptr[8*3]  = (int) RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS-PASS1_BITS);
1773     wsptr[8*8]  = (int) RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS-PASS1_BITS);
1774     wsptr[8*4]  = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
1775     wsptr[8*7]  = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
1776     wsptr[8*5]  = (int) RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS-PASS1_BITS);
1777     wsptr[8*6]  = (int) RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS-PASS1_BITS);
1778   }
1779 
1780   /* Pass 2: process 12 rows from work array, store into output array. */
1781 
1782   wsptr = workspace;
1783   for (ctr = 0; ctr < 12; ctr++) {
1784     outptr = output_buf[ctr] + output_col;
1785 
1786     /* Even part */
1787 
1788     /* Add fudge factor here for final descale. */
1789     z3 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
1790     z3 <<= CONST_BITS;
1791 
1792     z4 = (INT32) wsptr[4];
1793     z4 = MULTIPLY(z4, FIX(1.224744871)); /* c4 */
1794 
1795     tmp10 = z3 + z4;
1796     tmp11 = z3 - z4;
1797 
1798     z1 = (INT32) wsptr[2];
1799     z4 = MULTIPLY(z1, FIX(1.366025404)); /* c2 */
1800     z1 <<= CONST_BITS;
1801     z2 = (INT32) wsptr[6];
1802     z2 <<= CONST_BITS;
1803 
1804     tmp12 = z1 - z2;
1805 
1806     tmp21 = z3 + tmp12;
1807     tmp24 = z3 - tmp12;
1808 
1809     tmp12 = z4 + z2;
1810 
1811     tmp20 = tmp10 + tmp12;
1812     tmp25 = tmp10 - tmp12;
1813 
1814     tmp12 = z4 - z1 - z2;
1815 
1816     tmp22 = tmp11 + tmp12;
1817     tmp23 = tmp11 - tmp12;
1818 
1819     /* Odd part */
1820 
1821     z1 = (INT32) wsptr[1];
1822     z2 = (INT32) wsptr[3];
1823     z3 = (INT32) wsptr[5];
1824     z4 = (INT32) wsptr[7];
1825 
1826     tmp11 = MULTIPLY(z2, FIX(1.306562965));                  /* c3 */
1827     tmp14 = MULTIPLY(z2, - FIX_0_541196100);                 /* -c9 */
1828 
1829     tmp10 = z1 + z3;
1830     tmp15 = MULTIPLY(tmp10 + z4, FIX(0.860918669));          /* c7 */
1831     tmp12 = tmp15 + MULTIPLY(tmp10, FIX(0.261052384));       /* c5-c7 */
1832     tmp10 = tmp12 + tmp11 + MULTIPLY(z1, FIX(0.280143716));  /* c1-c5 */
1833     tmp13 = MULTIPLY(z3 + z4, - FIX(1.045510580));           /* -(c7+c11) */
1834     tmp12 += tmp13 + tmp14 - MULTIPLY(z3, FIX(1.478575242)); /* c1+c5-c7-c11 */
1835     tmp13 += tmp15 - tmp11 + MULTIPLY(z4, FIX(1.586706681)); /* c1+c11 */
1836     tmp15 += tmp14 - MULTIPLY(z1, FIX(0.676326758)) -        /* c7-c11 */
1837 	     MULTIPLY(z4, FIX(1.982889723));                 /* c5+c7 */
1838 
1839     z1 -= z4;
1840     z2 -= z3;
1841     z3 = MULTIPLY(z1 + z2, FIX_0_541196100);                 /* c9 */
1842     tmp11 = z3 + MULTIPLY(z1, FIX_0_765366865);              /* c3-c9 */
1843     tmp14 = z3 - MULTIPLY(z2, FIX_1_847759065);              /* c3+c9 */
1844 
1845     /* Final output stage */
1846 
1847     outptr[0]  = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
1848 					       CONST_BITS+PASS1_BITS+3)
1849 			     & RANGE_MASK];
1850     outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
1851 					       CONST_BITS+PASS1_BITS+3)
1852 			     & RANGE_MASK];
1853     outptr[1]  = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
1854 					       CONST_BITS+PASS1_BITS+3)
1855 			     & RANGE_MASK];
1856     outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
1857 					       CONST_BITS+PASS1_BITS+3)
1858 			     & RANGE_MASK];
1859     outptr[2]  = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
1860 					       CONST_BITS+PASS1_BITS+3)
1861 			     & RANGE_MASK];
1862     outptr[9]  = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
1863 					       CONST_BITS+PASS1_BITS+3)
1864 			     & RANGE_MASK];
1865     outptr[3]  = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
1866 					       CONST_BITS+PASS1_BITS+3)
1867 			     & RANGE_MASK];
1868     outptr[8]  = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
1869 					       CONST_BITS+PASS1_BITS+3)
1870 			     & RANGE_MASK];
1871     outptr[4]  = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
1872 					       CONST_BITS+PASS1_BITS+3)
1873 			     & RANGE_MASK];
1874     outptr[7]  = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
1875 					       CONST_BITS+PASS1_BITS+3)
1876 			     & RANGE_MASK];
1877     outptr[5]  = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp15,
1878 					       CONST_BITS+PASS1_BITS+3)
1879 			     & RANGE_MASK];
1880     outptr[6]  = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp15,
1881 					       CONST_BITS+PASS1_BITS+3)
1882 			     & RANGE_MASK];
1883 
1884     wsptr += 8;		/* advance pointer to next row */
1885   }
1886 }
1887 
1888 
1889 /*
1890  * Perform dequantization and inverse DCT on one block of coefficients,
1891  * producing a 13x13 output block.
1892  *
1893  * Optimized algorithm with 29 multiplications in the 1-D kernel.
1894  * cK represents sqrt(2) * cos(K*pi/26).
1895  */
1896 
1897 GLOBAL(void)
jpeg_idct_13x13(j_decompress_ptr cinfo,jpeg_component_info * compptr,JCOEFPTR coef_block,JSAMPARRAY output_buf,JDIMENSION output_col)1898 jpeg_idct_13x13 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
1899 		 JCOEFPTR coef_block,
1900 		 JSAMPARRAY output_buf, JDIMENSION output_col)
1901 {
1902   INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
1903   INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26;
1904   INT32 z1, z2, z3, z4;
1905   JCOEFPTR inptr;
1906   ISLOW_MULT_TYPE * quantptr;
1907   int * wsptr;
1908   JSAMPROW outptr;
1909   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
1910   int ctr;
1911   int workspace[8*13];	/* buffers data between passes */
1912   SHIFT_TEMPS
1913 
1914   /* Pass 1: process columns from input, store into work array. */
1915 
1916   inptr = coef_block;
1917   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
1918   wsptr = workspace;
1919   for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
1920     /* Even part */
1921 
1922     z1 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
1923     if (ctr == 0)
1924       CLAMP_DC(z1);
1925     z1 <<= CONST_BITS;
1926     /* Add fudge factor here for final descale. */
1927     z1 += ONE << (CONST_BITS-PASS1_BITS-1);
1928 
1929     z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
1930     z3 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
1931     z4 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
1932 
1933     tmp10 = z3 + z4;
1934     tmp11 = z3 - z4;
1935 
1936     tmp12 = MULTIPLY(tmp10, FIX(1.155388986));                /* (c4+c6)/2 */
1937     tmp13 = MULTIPLY(tmp11, FIX(0.096834934)) + z1;           /* (c4-c6)/2 */
1938 
1939     tmp20 = MULTIPLY(z2, FIX(1.373119086)) + tmp12 + tmp13;   /* c2 */
1940     tmp22 = MULTIPLY(z2, FIX(0.501487041)) - tmp12 + tmp13;   /* c10 */
1941 
1942     tmp12 = MULTIPLY(tmp10, FIX(0.316450131));                /* (c8-c12)/2 */
1943     tmp13 = MULTIPLY(tmp11, FIX(0.486914739)) + z1;           /* (c8+c12)/2 */
1944 
1945     tmp21 = MULTIPLY(z2, FIX(1.058554052)) - tmp12 + tmp13;   /* c6 */
1946     tmp25 = MULTIPLY(z2, - FIX(1.252223920)) + tmp12 + tmp13; /* c4 */
1947 
1948     tmp12 = MULTIPLY(tmp10, FIX(0.435816023));                /* (c2-c10)/2 */
1949     tmp13 = MULTIPLY(tmp11, FIX(0.937303064)) - z1;           /* (c2+c10)/2 */
1950 
1951     tmp23 = MULTIPLY(z2, - FIX(0.170464608)) - tmp12 - tmp13; /* c12 */
1952     tmp24 = MULTIPLY(z2, - FIX(0.803364869)) + tmp12 - tmp13; /* c8 */
1953 
1954     tmp26 = MULTIPLY(tmp11 - z2, FIX(1.414213562)) + z1;      /* c0 */
1955 
1956     /* Odd part */
1957 
1958     z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
1959     z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
1960     z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
1961     z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
1962 
1963     tmp11 = MULTIPLY(z1 + z2, FIX(1.322312651));     /* c3 */
1964     tmp12 = MULTIPLY(z1 + z3, FIX(1.163874945));     /* c5 */
1965     tmp15 = z1 + z4;
1966     tmp13 = MULTIPLY(tmp15, FIX(0.937797057));       /* c7 */
1967     tmp10 = tmp11 + tmp12 + tmp13 -
1968 	    MULTIPLY(z1, FIX(2.020082300));          /* c7+c5+c3-c1 */
1969     tmp14 = MULTIPLY(z2 + z3, - FIX(0.338443458));   /* -c11 */
1970     tmp11 += tmp14 + MULTIPLY(z2, FIX(0.837223564)); /* c5+c9+c11-c3 */
1971     tmp12 += tmp14 - MULTIPLY(z3, FIX(1.572116027)); /* c1+c5-c9-c11 */
1972     tmp14 = MULTIPLY(z2 + z4, - FIX(1.163874945));   /* -c5 */
1973     tmp11 += tmp14;
1974     tmp13 += tmp14 + MULTIPLY(z4, FIX(2.205608352)); /* c3+c5+c9-c7 */
1975     tmp14 = MULTIPLY(z3 + z4, - FIX(0.657217813));   /* -c9 */
1976     tmp12 += tmp14;
1977     tmp13 += tmp14;
1978     tmp15 = MULTIPLY(tmp15, FIX(0.338443458));       /* c11 */
1979     tmp14 = tmp15 + MULTIPLY(z1, FIX(0.318774355)) - /* c9-c11 */
1980 	    MULTIPLY(z2, FIX(0.466105296));          /* c1-c7 */
1981     z1    = MULTIPLY(z3 - z2, FIX(0.937797057));     /* c7 */
1982     tmp14 += z1;
1983     tmp15 += z1 + MULTIPLY(z3, FIX(0.384515595)) -   /* c3-c7 */
1984 	     MULTIPLY(z4, FIX(1.742345811));         /* c1+c11 */
1985 
1986     /* Final output stage */
1987 
1988     wsptr[8*0]  = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
1989     wsptr[8*12] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
1990     wsptr[8*1]  = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
1991     wsptr[8*11] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
1992     wsptr[8*2]  = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
1993     wsptr[8*10] = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
1994     wsptr[8*3]  = (int) RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS-PASS1_BITS);
1995     wsptr[8*9]  = (int) RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS-PASS1_BITS);
1996     wsptr[8*4]  = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
1997     wsptr[8*8]  = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
1998     wsptr[8*5]  = (int) RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS-PASS1_BITS);
1999     wsptr[8*7]  = (int) RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS-PASS1_BITS);
2000     wsptr[8*6]  = (int) RIGHT_SHIFT(tmp26, CONST_BITS-PASS1_BITS);
2001   }
2002 
2003   /* Pass 2: process 13 rows from work array, store into output array. */
2004 
2005   wsptr = workspace;
2006   for (ctr = 0; ctr < 13; ctr++) {
2007     outptr = output_buf[ctr] + output_col;
2008 
2009     /* Even part */
2010 
2011     /* Add fudge factor here for final descale. */
2012     z1 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
2013     z1 <<= CONST_BITS;
2014 
2015     z2 = (INT32) wsptr[2];
2016     z3 = (INT32) wsptr[4];
2017     z4 = (INT32) wsptr[6];
2018 
2019     tmp10 = z3 + z4;
2020     tmp11 = z3 - z4;
2021 
2022     tmp12 = MULTIPLY(tmp10, FIX(1.155388986));                /* (c4+c6)/2 */
2023     tmp13 = MULTIPLY(tmp11, FIX(0.096834934)) + z1;           /* (c4-c6)/2 */
2024 
2025     tmp20 = MULTIPLY(z2, FIX(1.373119086)) + tmp12 + tmp13;   /* c2 */
2026     tmp22 = MULTIPLY(z2, FIX(0.501487041)) - tmp12 + tmp13;   /* c10 */
2027 
2028     tmp12 = MULTIPLY(tmp10, FIX(0.316450131));                /* (c8-c12)/2 */
2029     tmp13 = MULTIPLY(tmp11, FIX(0.486914739)) + z1;           /* (c8+c12)/2 */
2030 
2031     tmp21 = MULTIPLY(z2, FIX(1.058554052)) - tmp12 + tmp13;   /* c6 */
2032     tmp25 = MULTIPLY(z2, - FIX(1.252223920)) + tmp12 + tmp13; /* c4 */
2033 
2034     tmp12 = MULTIPLY(tmp10, FIX(0.435816023));                /* (c2-c10)/2 */
2035     tmp13 = MULTIPLY(tmp11, FIX(0.937303064)) - z1;           /* (c2+c10)/2 */
2036 
2037     tmp23 = MULTIPLY(z2, - FIX(0.170464608)) - tmp12 - tmp13; /* c12 */
2038     tmp24 = MULTIPLY(z2, - FIX(0.803364869)) + tmp12 - tmp13; /* c8 */
2039 
2040     tmp26 = MULTIPLY(tmp11 - z2, FIX(1.414213562)) + z1;      /* c0 */
2041 
2042     /* Odd part */
2043 
2044     z1 = (INT32) wsptr[1];
2045     z2 = (INT32) wsptr[3];
2046     z3 = (INT32) wsptr[5];
2047     z4 = (INT32) wsptr[7];
2048 
2049     tmp11 = MULTIPLY(z1 + z2, FIX(1.322312651));     /* c3 */
2050     tmp12 = MULTIPLY(z1 + z3, FIX(1.163874945));     /* c5 */
2051     tmp15 = z1 + z4;
2052     tmp13 = MULTIPLY(tmp15, FIX(0.937797057));       /* c7 */
2053     tmp10 = tmp11 + tmp12 + tmp13 -
2054 	    MULTIPLY(z1, FIX(2.020082300));          /* c7+c5+c3-c1 */
2055     tmp14 = MULTIPLY(z2 + z3, - FIX(0.338443458));   /* -c11 */
2056     tmp11 += tmp14 + MULTIPLY(z2, FIX(0.837223564)); /* c5+c9+c11-c3 */
2057     tmp12 += tmp14 - MULTIPLY(z3, FIX(1.572116027)); /* c1+c5-c9-c11 */
2058     tmp14 = MULTIPLY(z2 + z4, - FIX(1.163874945));   /* -c5 */
2059     tmp11 += tmp14;
2060     tmp13 += tmp14 + MULTIPLY(z4, FIX(2.205608352)); /* c3+c5+c9-c7 */
2061     tmp14 = MULTIPLY(z3 + z4, - FIX(0.657217813));   /* -c9 */
2062     tmp12 += tmp14;
2063     tmp13 += tmp14;
2064     tmp15 = MULTIPLY(tmp15, FIX(0.338443458));       /* c11 */
2065     tmp14 = tmp15 + MULTIPLY(z1, FIX(0.318774355)) - /* c9-c11 */
2066 	    MULTIPLY(z2, FIX(0.466105296));          /* c1-c7 */
2067     z1    = MULTIPLY(z3 - z2, FIX(0.937797057));     /* c7 */
2068     tmp14 += z1;
2069     tmp15 += z1 + MULTIPLY(z3, FIX(0.384515595)) -   /* c3-c7 */
2070 	     MULTIPLY(z4, FIX(1.742345811));         /* c1+c11 */
2071 
2072     /* Final output stage */
2073 
2074     outptr[0]  = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
2075 					       CONST_BITS+PASS1_BITS+3)
2076 			     & RANGE_MASK];
2077     outptr[12] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
2078 					       CONST_BITS+PASS1_BITS+3)
2079 			     & RANGE_MASK];
2080     outptr[1]  = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
2081 					       CONST_BITS+PASS1_BITS+3)
2082 			     & RANGE_MASK];
2083     outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
2084 					       CONST_BITS+PASS1_BITS+3)
2085 			     & RANGE_MASK];
2086     outptr[2]  = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
2087 					       CONST_BITS+PASS1_BITS+3)
2088 			     & RANGE_MASK];
2089     outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
2090 					       CONST_BITS+PASS1_BITS+3)
2091 			     & RANGE_MASK];
2092     outptr[3]  = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
2093 					       CONST_BITS+PASS1_BITS+3)
2094 			     & RANGE_MASK];
2095     outptr[9]  = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
2096 					       CONST_BITS+PASS1_BITS+3)
2097 			     & RANGE_MASK];
2098     outptr[4]  = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
2099 					       CONST_BITS+PASS1_BITS+3)
2100 			     & RANGE_MASK];
2101     outptr[8]  = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
2102 					       CONST_BITS+PASS1_BITS+3)
2103 			     & RANGE_MASK];
2104     outptr[5]  = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp15,
2105 					       CONST_BITS+PASS1_BITS+3)
2106 			     & RANGE_MASK];
2107     outptr[7]  = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp15,
2108 					       CONST_BITS+PASS1_BITS+3)
2109 			     & RANGE_MASK];
2110     outptr[6]  = range_limit[(int) RIGHT_SHIFT(tmp26,
2111 					       CONST_BITS+PASS1_BITS+3)
2112 			     & RANGE_MASK];
2113 
2114     wsptr += 8;		/* advance pointer to next row */
2115   }
2116 }
2117 
2118 
2119 /*
2120  * Perform dequantization and inverse DCT on one block of coefficients,
2121  * producing a 14x14 output block.
2122  *
2123  * Optimized algorithm with 20 multiplications in the 1-D kernel.
2124  * cK represents sqrt(2) * cos(K*pi/28).
2125  */
2126 
2127 GLOBAL(void)
jpeg_idct_14x14(j_decompress_ptr cinfo,jpeg_component_info * compptr,JCOEFPTR coef_block,JSAMPARRAY output_buf,JDIMENSION output_col)2128 jpeg_idct_14x14 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
2129 		 JCOEFPTR coef_block,
2130 		 JSAMPARRAY output_buf, JDIMENSION output_col)
2131 {
2132   INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
2133   INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26;
2134   INT32 z1, z2, z3, z4;
2135   JCOEFPTR inptr;
2136   ISLOW_MULT_TYPE * quantptr;
2137   int * wsptr;
2138   JSAMPROW outptr;
2139   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
2140   int ctr;
2141   int workspace[8*14];	/* buffers data between passes */
2142   SHIFT_TEMPS
2143 
2144   /* Pass 1: process columns from input, store into work array. */
2145 
2146   inptr = coef_block;
2147   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
2148   wsptr = workspace;
2149   for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
2150     /* Even part */
2151 
2152     z1 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
2153     if (ctr == 0)
2154       CLAMP_DC(z1);
2155     z1 <<= CONST_BITS;
2156     /* Add fudge factor here for final descale. */
2157     z1 += ONE << (CONST_BITS-PASS1_BITS-1);
2158     z4 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
2159     z2 = MULTIPLY(z4, FIX(1.274162392));         /* c4 */
2160     z3 = MULTIPLY(z4, FIX(0.314692123));         /* c12 */
2161     z4 = MULTIPLY(z4, FIX(0.881747734));         /* c8 */
2162 
2163     tmp10 = z1 + z2;
2164     tmp11 = z1 + z3;
2165     tmp12 = z1 - z4;
2166 
2167     tmp23 = RIGHT_SHIFT(z1 - ((z2 + z3 - z4) << 1), /* c0 = (c4+c12-c8)*2 */
2168 			CONST_BITS-PASS1_BITS);
2169 
2170     z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
2171     z2 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
2172 
2173     z3 = MULTIPLY(z1 + z2, FIX(1.105676686));    /* c6 */
2174 
2175     tmp13 = z3 + MULTIPLY(z1, FIX(0.273079590)); /* c2-c6 */
2176     tmp14 = z3 - MULTIPLY(z2, FIX(1.719280954)); /* c6+c10 */
2177     tmp15 = MULTIPLY(z1, FIX(0.613604268)) -     /* c10 */
2178 	    MULTIPLY(z2, FIX(1.378756276));      /* c2 */
2179 
2180     tmp20 = tmp10 + tmp13;
2181     tmp26 = tmp10 - tmp13;
2182     tmp21 = tmp11 + tmp14;
2183     tmp25 = tmp11 - tmp14;
2184     tmp22 = tmp12 + tmp15;
2185     tmp24 = tmp12 - tmp15;
2186 
2187     /* Odd part */
2188 
2189     z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
2190     z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
2191     z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
2192     z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
2193     tmp13 = z4 << CONST_BITS;
2194 
2195     tmp14 = z1 + z3;
2196     tmp11 = MULTIPLY(z1 + z2, FIX(1.334852607));           /* c3 */
2197     tmp12 = MULTIPLY(tmp14, FIX(1.197448846));             /* c5 */
2198     tmp10 = tmp11 + tmp12 + tmp13 - MULTIPLY(z1, FIX(1.126980169)); /* c3+c5-c1 */
2199     tmp14 = MULTIPLY(tmp14, FIX(0.752406978));             /* c9 */
2200     tmp16 = tmp14 - MULTIPLY(z1, FIX(1.061150426));        /* c9+c11-c13 */
2201     z1    -= z2;
2202     tmp15 = MULTIPLY(z1, FIX(0.467085129)) - tmp13;        /* c11 */
2203     tmp16 += tmp15;
2204     z1    += z4;
2205     z4    = MULTIPLY(z2 + z3, - FIX(0.158341681)) - tmp13; /* -c13 */
2206     tmp11 += z4 - MULTIPLY(z2, FIX(0.424103948));          /* c3-c9-c13 */
2207     tmp12 += z4 - MULTIPLY(z3, FIX(2.373959773));          /* c3+c5-c13 */
2208     z4    = MULTIPLY(z3 - z2, FIX(1.405321284));           /* c1 */
2209     tmp14 += z4 + tmp13 - MULTIPLY(z3, FIX(1.6906431334)); /* c1+c9-c11 */
2210     tmp15 += z4 + MULTIPLY(z2, FIX(0.674957567));          /* c1+c11-c5 */
2211 
2212     tmp13 = (z1 - z3) << PASS1_BITS;
2213 
2214     /* Final output stage */
2215 
2216     wsptr[8*0]  = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
2217     wsptr[8*13] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
2218     wsptr[8*1]  = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
2219     wsptr[8*12] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
2220     wsptr[8*2]  = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
2221     wsptr[8*11] = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
2222     wsptr[8*3]  = (int) (tmp23 + tmp13);
2223     wsptr[8*10] = (int) (tmp23 - tmp13);
2224     wsptr[8*4]  = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
2225     wsptr[8*9]  = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
2226     wsptr[8*5]  = (int) RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS-PASS1_BITS);
2227     wsptr[8*8]  = (int) RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS-PASS1_BITS);
2228     wsptr[8*6]  = (int) RIGHT_SHIFT(tmp26 + tmp16, CONST_BITS-PASS1_BITS);
2229     wsptr[8*7]  = (int) RIGHT_SHIFT(tmp26 - tmp16, CONST_BITS-PASS1_BITS);
2230   }
2231 
2232   /* Pass 2: process 14 rows from work array, store into output array. */
2233 
2234   wsptr = workspace;
2235   for (ctr = 0; ctr < 14; ctr++) {
2236     outptr = output_buf[ctr] + output_col;
2237 
2238     /* Even part */
2239 
2240     /* Add fudge factor here for final descale. */
2241     z1 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
2242     z1 <<= CONST_BITS;
2243     z4 = (INT32) wsptr[4];
2244     z2 = MULTIPLY(z4, FIX(1.274162392));         /* c4 */
2245     z3 = MULTIPLY(z4, FIX(0.314692123));         /* c12 */
2246     z4 = MULTIPLY(z4, FIX(0.881747734));         /* c8 */
2247 
2248     tmp10 = z1 + z2;
2249     tmp11 = z1 + z3;
2250     tmp12 = z1 - z4;
2251 
2252     tmp23 = z1 - ((z2 + z3 - z4) << 1);          /* c0 = (c4+c12-c8)*2 */
2253 
2254     z1 = (INT32) wsptr[2];
2255     z2 = (INT32) wsptr[6];
2256 
2257     z3 = MULTIPLY(z1 + z2, FIX(1.105676686));    /* c6 */
2258 
2259     tmp13 = z3 + MULTIPLY(z1, FIX(0.273079590)); /* c2-c6 */
2260     tmp14 = z3 - MULTIPLY(z2, FIX(1.719280954)); /* c6+c10 */
2261     tmp15 = MULTIPLY(z1, FIX(0.613604268)) -     /* c10 */
2262 	    MULTIPLY(z2, FIX(1.378756276));      /* c2 */
2263 
2264     tmp20 = tmp10 + tmp13;
2265     tmp26 = tmp10 - tmp13;
2266     tmp21 = tmp11 + tmp14;
2267     tmp25 = tmp11 - tmp14;
2268     tmp22 = tmp12 + tmp15;
2269     tmp24 = tmp12 - tmp15;
2270 
2271     /* Odd part */
2272 
2273     z1 = (INT32) wsptr[1];
2274     z2 = (INT32) wsptr[3];
2275     z3 = (INT32) wsptr[5];
2276     z4 = (INT32) wsptr[7];
2277     z4 <<= CONST_BITS;
2278 
2279     tmp14 = z1 + z3;
2280     tmp11 = MULTIPLY(z1 + z2, FIX(1.334852607));           /* c3 */
2281     tmp12 = MULTIPLY(tmp14, FIX(1.197448846));             /* c5 */
2282     tmp10 = tmp11 + tmp12 + z4 - MULTIPLY(z1, FIX(1.126980169)); /* c3+c5-c1 */
2283     tmp14 = MULTIPLY(tmp14, FIX(0.752406978));             /* c9 */
2284     tmp16 = tmp14 - MULTIPLY(z1, FIX(1.061150426));        /* c9+c11-c13 */
2285     z1    -= z2;
2286     tmp15 = MULTIPLY(z1, FIX(0.467085129)) - z4;           /* c11 */
2287     tmp16 += tmp15;
2288     tmp13 = MULTIPLY(z2 + z3, - FIX(0.158341681)) - z4;    /* -c13 */
2289     tmp11 += tmp13 - MULTIPLY(z2, FIX(0.424103948));       /* c3-c9-c13 */
2290     tmp12 += tmp13 - MULTIPLY(z3, FIX(2.373959773));       /* c3+c5-c13 */
2291     tmp13 = MULTIPLY(z3 - z2, FIX(1.405321284));           /* c1 */
2292     tmp14 += tmp13 + z4 - MULTIPLY(z3, FIX(1.6906431334)); /* c1+c9-c11 */
2293     tmp15 += tmp13 + MULTIPLY(z2, FIX(0.674957567));       /* c1+c11-c5 */
2294 
2295     tmp13 = ((z1 - z3) << CONST_BITS) + z4;
2296 
2297     /* Final output stage */
2298 
2299     outptr[0]  = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
2300 					       CONST_BITS+PASS1_BITS+3)
2301 			     & RANGE_MASK];
2302     outptr[13] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
2303 					       CONST_BITS+PASS1_BITS+3)
2304 			     & RANGE_MASK];
2305     outptr[1]  = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
2306 					       CONST_BITS+PASS1_BITS+3)
2307 			     & RANGE_MASK];
2308     outptr[12] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
2309 					       CONST_BITS+PASS1_BITS+3)
2310 			     & RANGE_MASK];
2311     outptr[2]  = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
2312 					       CONST_BITS+PASS1_BITS+3)
2313 			     & RANGE_MASK];
2314     outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
2315 					       CONST_BITS+PASS1_BITS+3)
2316 			     & RANGE_MASK];
2317     outptr[3]  = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
2318 					       CONST_BITS+PASS1_BITS+3)
2319 			     & RANGE_MASK];
2320     outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
2321 					       CONST_BITS+PASS1_BITS+3)
2322 			     & RANGE_MASK];
2323     outptr[4]  = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
2324 					       CONST_BITS+PASS1_BITS+3)
2325 			     & RANGE_MASK];
2326     outptr[9]  = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
2327 					       CONST_BITS+PASS1_BITS+3)
2328 			     & RANGE_MASK];
2329     outptr[5]  = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp15,
2330 					       CONST_BITS+PASS1_BITS+3)
2331 			     & RANGE_MASK];
2332     outptr[8]  = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp15,
2333 					       CONST_BITS+PASS1_BITS+3)
2334 			     & RANGE_MASK];
2335     outptr[6]  = range_limit[(int) RIGHT_SHIFT(tmp26 + tmp16,
2336 					       CONST_BITS+PASS1_BITS+3)
2337 			     & RANGE_MASK];
2338     outptr[7]  = range_limit[(int) RIGHT_SHIFT(tmp26 - tmp16,
2339 					       CONST_BITS+PASS1_BITS+3)
2340 			     & RANGE_MASK];
2341 
2342     wsptr += 8;		/* advance pointer to next row */
2343   }
2344 }
2345 
2346 
2347 /*
2348  * Perform dequantization and inverse DCT on one block of coefficients,
2349  * producing a 15x15 output block.
2350  *
2351  * Optimized algorithm with 22 multiplications in the 1-D kernel.
2352  * cK represents sqrt(2) * cos(K*pi/30).
2353  */
2354 
2355 GLOBAL(void)
jpeg_idct_15x15(j_decompress_ptr cinfo,jpeg_component_info * compptr,JCOEFPTR coef_block,JSAMPARRAY output_buf,JDIMENSION output_col)2356 jpeg_idct_15x15 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
2357 		 JCOEFPTR coef_block,
2358 		 JSAMPARRAY output_buf, JDIMENSION output_col)
2359 {
2360   INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
2361   INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27;
2362   INT32 z1, z2, z3, z4;
2363   JCOEFPTR inptr;
2364   ISLOW_MULT_TYPE * quantptr;
2365   int * wsptr;
2366   JSAMPROW outptr;
2367   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
2368   int ctr;
2369   int workspace[8*15];	/* buffers data between passes */
2370   SHIFT_TEMPS
2371 
2372   /* Pass 1: process columns from input, store into work array. */
2373 
2374   inptr = coef_block;
2375   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
2376   wsptr = workspace;
2377   for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
2378     /* Even part */
2379 
2380     z1 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
2381     if (ctr == 0)
2382       CLAMP_DC(z1);
2383     z1 <<= CONST_BITS;
2384     /* Add fudge factor here for final descale. */
2385     z1 += ONE << (CONST_BITS-PASS1_BITS-1);
2386 
2387     z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
2388     z3 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
2389     z4 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
2390 
2391     tmp10 = MULTIPLY(z4, FIX(0.437016024)); /* c12 */
2392     tmp11 = MULTIPLY(z4, FIX(1.144122806)); /* c6 */
2393 
2394     tmp12 = z1 - tmp10;
2395     tmp13 = z1 + tmp11;
2396     z1 -= (tmp11 - tmp10) << 1;             /* c0 = (c6-c12)*2 */
2397 
2398     z4 = z2 - z3;
2399     z3 += z2;
2400     tmp10 = MULTIPLY(z3, FIX(1.337628990)); /* (c2+c4)/2 */
2401     tmp11 = MULTIPLY(z4, FIX(0.045680613)); /* (c2-c4)/2 */
2402     z2 = MULTIPLY(z2, FIX(1.439773946));    /* c4+c14 */
2403 
2404     tmp20 = tmp13 + tmp10 + tmp11;
2405     tmp23 = tmp12 - tmp10 + tmp11 + z2;
2406 
2407     tmp10 = MULTIPLY(z3, FIX(0.547059574)); /* (c8+c14)/2 */
2408     tmp11 = MULTIPLY(z4, FIX(0.399234004)); /* (c8-c14)/2 */
2409 
2410     tmp25 = tmp13 - tmp10 - tmp11;
2411     tmp26 = tmp12 + tmp10 - tmp11 - z2;
2412 
2413     tmp10 = MULTIPLY(z3, FIX(0.790569415)); /* (c6+c12)/2 */
2414     tmp11 = MULTIPLY(z4, FIX(0.353553391)); /* (c6-c12)/2 */
2415 
2416     tmp21 = tmp12 + tmp10 + tmp11;
2417     tmp24 = tmp13 - tmp10 + tmp11;
2418     tmp11 += tmp11;
2419     tmp22 = z1 + tmp11;                     /* c10 = c6-c12 */
2420     tmp27 = z1 - tmp11 - tmp11;             /* c0 = (c6-c12)*2 */
2421 
2422     /* Odd part */
2423 
2424     z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
2425     z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
2426     z4 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
2427     z3 = MULTIPLY(z4, FIX(1.224744871));                    /* c5 */
2428     z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
2429 
2430     tmp13 = z2 - z4;
2431     tmp15 = MULTIPLY(z1 + tmp13, FIX(0.831253876));         /* c9 */
2432     tmp11 = tmp15 + MULTIPLY(z1, FIX(0.513743148));         /* c3-c9 */
2433     tmp14 = tmp15 - MULTIPLY(tmp13, FIX(2.176250899));      /* c3+c9 */
2434 
2435     tmp13 = MULTIPLY(z2, - FIX(0.831253876));               /* -c9 */
2436     tmp15 = MULTIPLY(z2, - FIX(1.344997024));               /* -c3 */
2437     z2 = z1 - z4;
2438     tmp12 = z3 + MULTIPLY(z2, FIX(1.406466353));            /* c1 */
2439 
2440     tmp10 = tmp12 + MULTIPLY(z4, FIX(2.457431844)) - tmp15; /* c1+c7 */
2441     tmp16 = tmp12 - MULTIPLY(z1, FIX(1.112434820)) + tmp13; /* c1-c13 */
2442     tmp12 = MULTIPLY(z2, FIX(1.224744871)) - z3;            /* c5 */
2443     z2 = MULTIPLY(z1 + z4, FIX(0.575212477));               /* c11 */
2444     tmp13 += z2 + MULTIPLY(z1, FIX(0.475753014)) - z3;      /* c7-c11 */
2445     tmp15 += z2 - MULTIPLY(z4, FIX(0.869244010)) + z3;      /* c11+c13 */
2446 
2447     /* Final output stage */
2448 
2449     wsptr[8*0]  = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
2450     wsptr[8*14] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
2451     wsptr[8*1]  = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
2452     wsptr[8*13] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
2453     wsptr[8*2]  = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
2454     wsptr[8*12] = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
2455     wsptr[8*3]  = (int) RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS-PASS1_BITS);
2456     wsptr[8*11] = (int) RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS-PASS1_BITS);
2457     wsptr[8*4]  = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
2458     wsptr[8*10] = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
2459     wsptr[8*5]  = (int) RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS-PASS1_BITS);
2460     wsptr[8*9]  = (int) RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS-PASS1_BITS);
2461     wsptr[8*6]  = (int) RIGHT_SHIFT(tmp26 + tmp16, CONST_BITS-PASS1_BITS);
2462     wsptr[8*8]  = (int) RIGHT_SHIFT(tmp26 - tmp16, CONST_BITS-PASS1_BITS);
2463     wsptr[8*7]  = (int) RIGHT_SHIFT(tmp27, CONST_BITS-PASS1_BITS);
2464   }
2465 
2466   /* Pass 2: process 15 rows from work array, store into output array. */
2467 
2468   wsptr = workspace;
2469   for (ctr = 0; ctr < 15; ctr++) {
2470     outptr = output_buf[ctr] + output_col;
2471 
2472     /* Even part */
2473 
2474     /* Add fudge factor here for final descale. */
2475     z1 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
2476     z1 <<= CONST_BITS;
2477 
2478     z2 = (INT32) wsptr[2];
2479     z3 = (INT32) wsptr[4];
2480     z4 = (INT32) wsptr[6];
2481 
2482     tmp10 = MULTIPLY(z4, FIX(0.437016024)); /* c12 */
2483     tmp11 = MULTIPLY(z4, FIX(1.144122806)); /* c6 */
2484 
2485     tmp12 = z1 - tmp10;
2486     tmp13 = z1 + tmp11;
2487     z1 -= (tmp11 - tmp10) << 1;             /* c0 = (c6-c12)*2 */
2488 
2489     z4 = z2 - z3;
2490     z3 += z2;
2491     tmp10 = MULTIPLY(z3, FIX(1.337628990)); /* (c2+c4)/2 */
2492     tmp11 = MULTIPLY(z4, FIX(0.045680613)); /* (c2-c4)/2 */
2493     z2 = MULTIPLY(z2, FIX(1.439773946));    /* c4+c14 */
2494 
2495     tmp20 = tmp13 + tmp10 + tmp11;
2496     tmp23 = tmp12 - tmp10 + tmp11 + z2;
2497 
2498     tmp10 = MULTIPLY(z3, FIX(0.547059574)); /* (c8+c14)/2 */
2499     tmp11 = MULTIPLY(z4, FIX(0.399234004)); /* (c8-c14)/2 */
2500 
2501     tmp25 = tmp13 - tmp10 - tmp11;
2502     tmp26 = tmp12 + tmp10 - tmp11 - z2;
2503 
2504     tmp10 = MULTIPLY(z3, FIX(0.790569415)); /* (c6+c12)/2 */
2505     tmp11 = MULTIPLY(z4, FIX(0.353553391)); /* (c6-c12)/2 */
2506 
2507     tmp21 = tmp12 + tmp10 + tmp11;
2508     tmp24 = tmp13 - tmp10 + tmp11;
2509     tmp11 += tmp11;
2510     tmp22 = z1 + tmp11;                     /* c10 = c6-c12 */
2511     tmp27 = z1 - tmp11 - tmp11;             /* c0 = (c6-c12)*2 */
2512 
2513     /* Odd part */
2514 
2515     z1 = (INT32) wsptr[1];
2516     z2 = (INT32) wsptr[3];
2517     z4 = (INT32) wsptr[5];
2518     z3 = MULTIPLY(z4, FIX(1.224744871));                    /* c5 */
2519     z4 = (INT32) wsptr[7];
2520 
2521     tmp13 = z2 - z4;
2522     tmp15 = MULTIPLY(z1 + tmp13, FIX(0.831253876));         /* c9 */
2523     tmp11 = tmp15 + MULTIPLY(z1, FIX(0.513743148));         /* c3-c9 */
2524     tmp14 = tmp15 - MULTIPLY(tmp13, FIX(2.176250899));      /* c3+c9 */
2525 
2526     tmp13 = MULTIPLY(z2, - FIX(0.831253876));               /* -c9 */
2527     tmp15 = MULTIPLY(z2, - FIX(1.344997024));               /* -c3 */
2528     z2 = z1 - z4;
2529     tmp12 = z3 + MULTIPLY(z2, FIX(1.406466353));            /* c1 */
2530 
2531     tmp10 = tmp12 + MULTIPLY(z4, FIX(2.457431844)) - tmp15; /* c1+c7 */
2532     tmp16 = tmp12 - MULTIPLY(z1, FIX(1.112434820)) + tmp13; /* c1-c13 */
2533     tmp12 = MULTIPLY(z2, FIX(1.224744871)) - z3;            /* c5 */
2534     z2 = MULTIPLY(z1 + z4, FIX(0.575212477));               /* c11 */
2535     tmp13 += z2 + MULTIPLY(z1, FIX(0.475753014)) - z3;      /* c7-c11 */
2536     tmp15 += z2 - MULTIPLY(z4, FIX(0.869244010)) + z3;      /* c11+c13 */
2537 
2538     /* Final output stage */
2539 
2540     outptr[0]  = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
2541 					       CONST_BITS+PASS1_BITS+3)
2542 			     & RANGE_MASK];
2543     outptr[14] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
2544 					       CONST_BITS+PASS1_BITS+3)
2545 			     & RANGE_MASK];
2546     outptr[1]  = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
2547 					       CONST_BITS+PASS1_BITS+3)
2548 			     & RANGE_MASK];
2549     outptr[13] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
2550 					       CONST_BITS+PASS1_BITS+3)
2551 			     & RANGE_MASK];
2552     outptr[2]  = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
2553 					       CONST_BITS+PASS1_BITS+3)
2554 			     & RANGE_MASK];
2555     outptr[12] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
2556 					       CONST_BITS+PASS1_BITS+3)
2557 			     & RANGE_MASK];
2558     outptr[3]  = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
2559 					       CONST_BITS+PASS1_BITS+3)
2560 			     & RANGE_MASK];
2561     outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
2562 					       CONST_BITS+PASS1_BITS+3)
2563 			     & RANGE_MASK];
2564     outptr[4]  = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
2565 					       CONST_BITS+PASS1_BITS+3)
2566 			     & RANGE_MASK];
2567     outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
2568 					       CONST_BITS+PASS1_BITS+3)
2569 			     & RANGE_MASK];
2570     outptr[5]  = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp15,
2571 					       CONST_BITS+PASS1_BITS+3)
2572 			     & RANGE_MASK];
2573     outptr[9]  = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp15,
2574 					       CONST_BITS+PASS1_BITS+3)
2575 			     & RANGE_MASK];
2576     outptr[6]  = range_limit[(int) RIGHT_SHIFT(tmp26 + tmp16,
2577 					       CONST_BITS+PASS1_BITS+3)
2578 			     & RANGE_MASK];
2579     outptr[8]  = range_limit[(int) RIGHT_SHIFT(tmp26 - tmp16,
2580 					       CONST_BITS+PASS1_BITS+3)
2581 			     & RANGE_MASK];
2582     outptr[7]  = range_limit[(int) RIGHT_SHIFT(tmp27,
2583 					       CONST_BITS+PASS1_BITS+3)
2584 			     & RANGE_MASK];
2585 
2586     wsptr += 8;		/* advance pointer to next row */
2587   }
2588 }
2589 
2590 
2591 /*
2592  * Perform dequantization and inverse DCT on one block of coefficients,
2593  * producing a 16x16 output block.
2594  *
2595  * Optimized algorithm with 28 multiplications in the 1-D kernel.
2596  * cK represents sqrt(2) * cos(K*pi/32).
2597  */
2598 
2599 GLOBAL(void)
jpeg_idct_16x16(j_decompress_ptr cinfo,jpeg_component_info * compptr,JCOEFPTR coef_block,JSAMPARRAY output_buf,JDIMENSION output_col)2600 jpeg_idct_16x16 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
2601 		 JCOEFPTR coef_block,
2602 		 JSAMPARRAY output_buf, JDIMENSION output_col)
2603 {
2604   INT32 tmp0, tmp1, tmp2, tmp3, tmp10, tmp11, tmp12, tmp13;
2605   INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27;
2606   INT32 z1, z2, z3, z4;
2607   JCOEFPTR inptr;
2608   ISLOW_MULT_TYPE * quantptr;
2609   int * wsptr;
2610   JSAMPROW outptr;
2611   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
2612   int ctr;
2613   int workspace[8*16];	/* buffers data between passes */
2614   SHIFT_TEMPS
2615 
2616   /* Pass 1: process columns from input, store into work array. */
2617 
2618   inptr = coef_block;
2619   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
2620   wsptr = workspace;
2621   for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
2622     /* Even part */
2623 
2624     tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
2625     if (ctr == 0)
2626       CLAMP_DC(tmp0);
2627     tmp0 <<= CONST_BITS;
2628     /* Add fudge factor here for final descale. */
2629     tmp0 += 1 << (CONST_BITS-PASS1_BITS-1);
2630 
2631     z1 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
2632     tmp1 = MULTIPLY(z1, FIX(1.306562965));      /* c4[16] = c2[8] */
2633     tmp2 = MULTIPLY(z1, FIX_0_541196100);       /* c12[16] = c6[8] */
2634 
2635     tmp10 = tmp0 + tmp1;
2636     tmp11 = tmp0 - tmp1;
2637     tmp12 = tmp0 + tmp2;
2638     tmp13 = tmp0 - tmp2;
2639 
2640     z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
2641     z2 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
2642     z3 = z1 - z2;
2643     z4 = MULTIPLY(z3, FIX(0.275899379));        /* c14[16] = c7[8] */
2644     z3 = MULTIPLY(z3, FIX(1.387039845));        /* c2[16] = c1[8] */
2645 
2646     tmp0 = z3 + MULTIPLY(z2, FIX_2_562915447);  /* (c6+c2)[16] = (c3+c1)[8] */
2647     tmp1 = z4 + MULTIPLY(z1, FIX_0_899976223);  /* (c6-c14)[16] = (c3-c7)[8] */
2648     tmp2 = z3 - MULTIPLY(z1, FIX(0.601344887)); /* (c2-c10)[16] = (c1-c5)[8] */
2649     tmp3 = z4 - MULTIPLY(z2, FIX(0.509795579)); /* (c10-c14)[16] = (c5-c7)[8] */
2650 
2651     tmp20 = tmp10 + tmp0;
2652     tmp27 = tmp10 - tmp0;
2653     tmp21 = tmp12 + tmp1;
2654     tmp26 = tmp12 - tmp1;
2655     tmp22 = tmp13 + tmp2;
2656     tmp25 = tmp13 - tmp2;
2657     tmp23 = tmp11 + tmp3;
2658     tmp24 = tmp11 - tmp3;
2659 
2660     /* Odd part */
2661 
2662     z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
2663     z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
2664     z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
2665     z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
2666 
2667     tmp11 = z1 + z3;
2668 
2669     tmp1  = MULTIPLY(z1 + z2, FIX(1.353318001));   /* c3 */
2670     tmp2  = MULTIPLY(tmp11,   FIX(1.247225013));   /* c5 */
2671     tmp3  = MULTIPLY(z1 + z4, FIX(1.093201867));   /* c7 */
2672     tmp10 = MULTIPLY(z1 - z4, FIX(0.897167586));   /* c9 */
2673     tmp11 = MULTIPLY(tmp11,   FIX(0.666655658));   /* c11 */
2674     tmp12 = MULTIPLY(z1 - z2, FIX(0.410524528));   /* c13 */
2675     tmp0  = tmp1 + tmp2 + tmp3 -
2676 	    MULTIPLY(z1, FIX(2.286341144));        /* c7+c5+c3-c1 */
2677     tmp13 = tmp10 + tmp11 + tmp12 -
2678 	    MULTIPLY(z1, FIX(1.835730603));        /* c9+c11+c13-c15 */
2679     z1    = MULTIPLY(z2 + z3, FIX(0.138617169));   /* c15 */
2680     tmp1  += z1 + MULTIPLY(z2, FIX(0.071888074));  /* c9+c11-c3-c15 */
2681     tmp2  += z1 - MULTIPLY(z3, FIX(1.125726048));  /* c5+c7+c15-c3 */
2682     z1    = MULTIPLY(z3 - z2, FIX(1.407403738));   /* c1 */
2683     tmp11 += z1 - MULTIPLY(z3, FIX(0.766367282));  /* c1+c11-c9-c13 */
2684     tmp12 += z1 + MULTIPLY(z2, FIX(1.971951411));  /* c1+c5+c13-c7 */
2685     z2    += z4;
2686     z1    = MULTIPLY(z2, - FIX(0.666655658));      /* -c11 */
2687     tmp1  += z1;
2688     tmp3  += z1 + MULTIPLY(z4, FIX(1.065388962));  /* c3+c11+c15-c7 */
2689     z2    = MULTIPLY(z2, - FIX(1.247225013));      /* -c5 */
2690     tmp10 += z2 + MULTIPLY(z4, FIX(3.141271809));  /* c1+c5+c9-c13 */
2691     tmp12 += z2;
2692     z2    = MULTIPLY(z3 + z4, - FIX(1.353318001)); /* -c3 */
2693     tmp2  += z2;
2694     tmp3  += z2;
2695     z2    = MULTIPLY(z4 - z3, FIX(0.410524528));   /* c13 */
2696     tmp10 += z2;
2697     tmp11 += z2;
2698 
2699     /* Final output stage */
2700 
2701     wsptr[8*0]  = (int) RIGHT_SHIFT(tmp20 + tmp0,  CONST_BITS-PASS1_BITS);
2702     wsptr[8*15] = (int) RIGHT_SHIFT(tmp20 - tmp0,  CONST_BITS-PASS1_BITS);
2703     wsptr[8*1]  = (int) RIGHT_SHIFT(tmp21 + tmp1,  CONST_BITS-PASS1_BITS);
2704     wsptr[8*14] = (int) RIGHT_SHIFT(tmp21 - tmp1,  CONST_BITS-PASS1_BITS);
2705     wsptr[8*2]  = (int) RIGHT_SHIFT(tmp22 + tmp2,  CONST_BITS-PASS1_BITS);
2706     wsptr[8*13] = (int) RIGHT_SHIFT(tmp22 - tmp2,  CONST_BITS-PASS1_BITS);
2707     wsptr[8*3]  = (int) RIGHT_SHIFT(tmp23 + tmp3,  CONST_BITS-PASS1_BITS);
2708     wsptr[8*12] = (int) RIGHT_SHIFT(tmp23 - tmp3,  CONST_BITS-PASS1_BITS);
2709     wsptr[8*4]  = (int) RIGHT_SHIFT(tmp24 + tmp10, CONST_BITS-PASS1_BITS);
2710     wsptr[8*11] = (int) RIGHT_SHIFT(tmp24 - tmp10, CONST_BITS-PASS1_BITS);
2711     wsptr[8*5]  = (int) RIGHT_SHIFT(tmp25 + tmp11, CONST_BITS-PASS1_BITS);
2712     wsptr[8*10] = (int) RIGHT_SHIFT(tmp25 - tmp11, CONST_BITS-PASS1_BITS);
2713     wsptr[8*6]  = (int) RIGHT_SHIFT(tmp26 + tmp12, CONST_BITS-PASS1_BITS);
2714     wsptr[8*9]  = (int) RIGHT_SHIFT(tmp26 - tmp12, CONST_BITS-PASS1_BITS);
2715     wsptr[8*7]  = (int) RIGHT_SHIFT(tmp27 + tmp13, CONST_BITS-PASS1_BITS);
2716     wsptr[8*8]  = (int) RIGHT_SHIFT(tmp27 - tmp13, CONST_BITS-PASS1_BITS);
2717   }
2718 
2719   /* Pass 2: process 16 rows from work array, store into output array. */
2720 
2721   wsptr = workspace;
2722   for (ctr = 0; ctr < 16; ctr++) {
2723     outptr = output_buf[ctr] + output_col;
2724 
2725     /* Even part */
2726 
2727     /* Add fudge factor here for final descale. */
2728     tmp0 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
2729     tmp0 <<= CONST_BITS;
2730 
2731     z1 = (INT32) wsptr[4];
2732     tmp1 = MULTIPLY(z1, FIX(1.306562965));      /* c4[16] = c2[8] */
2733     tmp2 = MULTIPLY(z1, FIX_0_541196100);       /* c12[16] = c6[8] */
2734 
2735     tmp10 = tmp0 + tmp1;
2736     tmp11 = tmp0 - tmp1;
2737     tmp12 = tmp0 + tmp2;
2738     tmp13 = tmp0 - tmp2;
2739 
2740     z1 = (INT32) wsptr[2];
2741     z2 = (INT32) wsptr[6];
2742     z3 = z1 - z2;
2743     z4 = MULTIPLY(z3, FIX(0.275899379));        /* c14[16] = c7[8] */
2744     z3 = MULTIPLY(z3, FIX(1.387039845));        /* c2[16] = c1[8] */
2745 
2746     tmp0 = z3 + MULTIPLY(z2, FIX_2_562915447);  /* (c6+c2)[16] = (c3+c1)[8] */
2747     tmp1 = z4 + MULTIPLY(z1, FIX_0_899976223);  /* (c6-c14)[16] = (c3-c7)[8] */
2748     tmp2 = z3 - MULTIPLY(z1, FIX(0.601344887)); /* (c2-c10)[16] = (c1-c5)[8] */
2749     tmp3 = z4 - MULTIPLY(z2, FIX(0.509795579)); /* (c10-c14)[16] = (c5-c7)[8] */
2750 
2751     tmp20 = tmp10 + tmp0;
2752     tmp27 = tmp10 - tmp0;
2753     tmp21 = tmp12 + tmp1;
2754     tmp26 = tmp12 - tmp1;
2755     tmp22 = tmp13 + tmp2;
2756     tmp25 = tmp13 - tmp2;
2757     tmp23 = tmp11 + tmp3;
2758     tmp24 = tmp11 - tmp3;
2759 
2760     /* Odd part */
2761 
2762     z1 = (INT32) wsptr[1];
2763     z2 = (INT32) wsptr[3];
2764     z3 = (INT32) wsptr[5];
2765     z4 = (INT32) wsptr[7];
2766 
2767     tmp11 = z1 + z3;
2768 
2769     tmp1  = MULTIPLY(z1 + z2, FIX(1.353318001));   /* c3 */
2770     tmp2  = MULTIPLY(tmp11,   FIX(1.247225013));   /* c5 */
2771     tmp3  = MULTIPLY(z1 + z4, FIX(1.093201867));   /* c7 */
2772     tmp10 = MULTIPLY(z1 - z4, FIX(0.897167586));   /* c9 */
2773     tmp11 = MULTIPLY(tmp11,   FIX(0.666655658));   /* c11 */
2774     tmp12 = MULTIPLY(z1 - z2, FIX(0.410524528));   /* c13 */
2775     tmp0  = tmp1 + tmp2 + tmp3 -
2776 	    MULTIPLY(z1, FIX(2.286341144));        /* c7+c5+c3-c1 */
2777     tmp13 = tmp10 + tmp11 + tmp12 -
2778 	    MULTIPLY(z1, FIX(1.835730603));        /* c9+c11+c13-c15 */
2779     z1    = MULTIPLY(z2 + z3, FIX(0.138617169));   /* c15 */
2780     tmp1  += z1 + MULTIPLY(z2, FIX(0.071888074));  /* c9+c11-c3-c15 */
2781     tmp2  += z1 - MULTIPLY(z3, FIX(1.125726048));  /* c5+c7+c15-c3 */
2782     z1    = MULTIPLY(z3 - z2, FIX(1.407403738));   /* c1 */
2783     tmp11 += z1 - MULTIPLY(z3, FIX(0.766367282));  /* c1+c11-c9-c13 */
2784     tmp12 += z1 + MULTIPLY(z2, FIX(1.971951411));  /* c1+c5+c13-c7 */
2785     z2    += z4;
2786     z1    = MULTIPLY(z2, - FIX(0.666655658));      /* -c11 */
2787     tmp1  += z1;
2788     tmp3  += z1 + MULTIPLY(z4, FIX(1.065388962));  /* c3+c11+c15-c7 */
2789     z2    = MULTIPLY(z2, - FIX(1.247225013));      /* -c5 */
2790     tmp10 += z2 + MULTIPLY(z4, FIX(3.141271809));  /* c1+c5+c9-c13 */
2791     tmp12 += z2;
2792     z2    = MULTIPLY(z3 + z4, - FIX(1.353318001)); /* -c3 */
2793     tmp2  += z2;
2794     tmp3  += z2;
2795     z2    = MULTIPLY(z4 - z3, FIX(0.410524528));   /* c13 */
2796     tmp10 += z2;
2797     tmp11 += z2;
2798 
2799     /* Final output stage */
2800 
2801     outptr[0]  = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp0,
2802 					       CONST_BITS+PASS1_BITS+3)
2803 			     & RANGE_MASK];
2804     outptr[15] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp0,
2805 					       CONST_BITS+PASS1_BITS+3)
2806 			     & RANGE_MASK];
2807     outptr[1]  = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp1,
2808 					       CONST_BITS+PASS1_BITS+3)
2809 			     & RANGE_MASK];
2810     outptr[14] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp1,
2811 					       CONST_BITS+PASS1_BITS+3)
2812 			     & RANGE_MASK];
2813     outptr[2]  = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp2,
2814 					       CONST_BITS+PASS1_BITS+3)
2815 			     & RANGE_MASK];
2816     outptr[13] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp2,
2817 					       CONST_BITS+PASS1_BITS+3)
2818 			     & RANGE_MASK];
2819     outptr[3]  = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp3,
2820 					       CONST_BITS+PASS1_BITS+3)
2821 			     & RANGE_MASK];
2822     outptr[12] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp3,
2823 					       CONST_BITS+PASS1_BITS+3)
2824 			     & RANGE_MASK];
2825     outptr[4]  = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp10,
2826 					       CONST_BITS+PASS1_BITS+3)
2827 			     & RANGE_MASK];
2828     outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp10,
2829 					       CONST_BITS+PASS1_BITS+3)
2830 			     & RANGE_MASK];
2831     outptr[5]  = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp11,
2832 					       CONST_BITS+PASS1_BITS+3)
2833 			     & RANGE_MASK];
2834     outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp11,
2835 					       CONST_BITS+PASS1_BITS+3)
2836 			     & RANGE_MASK];
2837     outptr[6]  = range_limit[(int) RIGHT_SHIFT(tmp26 + tmp12,
2838 					       CONST_BITS+PASS1_BITS+3)
2839 			     & RANGE_MASK];
2840     outptr[9]  = range_limit[(int) RIGHT_SHIFT(tmp26 - tmp12,
2841 					       CONST_BITS+PASS1_BITS+3)
2842 			     & RANGE_MASK];
2843     outptr[7]  = range_limit[(int) RIGHT_SHIFT(tmp27 + tmp13,
2844 					       CONST_BITS+PASS1_BITS+3)
2845 			     & RANGE_MASK];
2846     outptr[8]  = range_limit[(int) RIGHT_SHIFT(tmp27 - tmp13,
2847 					       CONST_BITS+PASS1_BITS+3)
2848 			     & RANGE_MASK];
2849 
2850     wsptr += 8;		/* advance pointer to next row */
2851   }
2852 }
2853 
2854 
2855 /*
2856  * Perform dequantization and inverse DCT on one block of coefficients,
2857  * producing a 16x8 output block.
2858  *
2859  * 8-point IDCT in pass 1 (columns), 16-point in pass 2 (rows).
2860  */
2861 
2862 GLOBAL(void)
jpeg_idct_16x8(j_decompress_ptr cinfo,jpeg_component_info * compptr,JCOEFPTR coef_block,JSAMPARRAY output_buf,JDIMENSION output_col)2863 jpeg_idct_16x8 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
2864 		JCOEFPTR coef_block,
2865 		JSAMPARRAY output_buf, JDIMENSION output_col)
2866 {
2867   INT32 tmp0, tmp1, tmp2, tmp3, tmp10, tmp11, tmp12, tmp13;
2868   INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27;
2869   INT32 z1, z2, z3, z4;
2870   JCOEFPTR inptr;
2871   ISLOW_MULT_TYPE * quantptr;
2872   int * wsptr;
2873   JSAMPROW outptr;
2874   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
2875   int ctr;
2876   int workspace[8*8];	/* buffers data between passes */
2877   SHIFT_TEMPS
2878 
2879   /* Pass 1: process columns from input, store into work array. */
2880   /* Note results are scaled up by sqrt(8) compared to a true IDCT; */
2881   /* furthermore, we scale the results by 2**PASS1_BITS. */
2882 
2883   inptr = coef_block;
2884   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
2885   wsptr = workspace;
2886   for (ctr = DCTSIZE; ctr > 0; ctr--) {
2887     /* Due to quantization, we will usually find that many of the input
2888      * coefficients are zero, especially the AC terms.  We can exploit this
2889      * by short-circuiting the IDCT calculation for any column in which all
2890      * the AC terms are zero.  In that case each output is equal to the
2891      * DC coefficient (with scale factor as needed).
2892      * With typical images and quantization tables, half or more of the
2893      * column DCT calculations can be simplified this way.
2894      */
2895 
2896     if (inptr[DCTSIZE*1] == 0 && inptr[DCTSIZE*2] == 0 &&
2897 	inptr[DCTSIZE*3] == 0 && inptr[DCTSIZE*4] == 0 &&
2898 	inptr[DCTSIZE*5] == 0 && inptr[DCTSIZE*6] == 0 &&
2899 	inptr[DCTSIZE*7] == 0) {
2900       /* AC terms all zero */
2901       int dcval = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
2902       if (ctr == DCTSIZE)
2903         CLAMP_DC(dcval);
2904       dcval <<= PASS1_BITS;
2905       wsptr[DCTSIZE*0] = dcval;
2906       wsptr[DCTSIZE*1] = dcval;
2907       wsptr[DCTSIZE*2] = dcval;
2908       wsptr[DCTSIZE*3] = dcval;
2909       wsptr[DCTSIZE*4] = dcval;
2910       wsptr[DCTSIZE*5] = dcval;
2911       wsptr[DCTSIZE*6] = dcval;
2912       wsptr[DCTSIZE*7] = dcval;
2913 
2914       inptr++;			/* advance pointers to next column */
2915       quantptr++;
2916       wsptr++;
2917       continue;
2918     }
2919 
2920     /* Even part: reverse the even part of the forward DCT. */
2921     /* The rotator is sqrt(2)*c(-6). */
2922 
2923     z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
2924     z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
2925 
2926     z1 = MULTIPLY(z2 + z3, FIX_0_541196100);
2927     tmp2 = z1 + MULTIPLY(z2, FIX_0_765366865);
2928     tmp3 = z1 - MULTIPLY(z3, FIX_1_847759065);
2929 
2930     z2 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
2931     if (ctr == DCTSIZE)
2932       CLAMP_DC(z2);
2933     z3 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
2934     z2 <<= CONST_BITS;
2935     z3 <<= CONST_BITS;
2936     /* Add fudge factor here for final descale. */
2937     z2 += ONE << (CONST_BITS-PASS1_BITS-1);
2938 
2939     tmp0 = z2 + z3;
2940     tmp1 = z2 - z3;
2941 
2942     tmp10 = tmp0 + tmp2;
2943     tmp13 = tmp0 - tmp2;
2944     tmp11 = tmp1 + tmp3;
2945     tmp12 = tmp1 - tmp3;
2946 
2947     /* Odd part per figure 8; the matrix is unitary and hence its
2948      * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
2949      */
2950 
2951     tmp0 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
2952     tmp1 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
2953     tmp2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
2954     tmp3 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
2955 
2956     z2 = tmp0 + tmp2;
2957     z3 = tmp1 + tmp3;
2958 
2959     z1 = MULTIPLY(z2 + z3, FIX_1_175875602); /* sqrt(2) * c3 */
2960     z2 = MULTIPLY(z2, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
2961     z3 = MULTIPLY(z3, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */
2962     z2 += z1;
2963     z3 += z1;
2964 
2965     z1 = MULTIPLY(tmp0 + tmp3, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
2966     tmp0 = MULTIPLY(tmp0, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */
2967     tmp3 = MULTIPLY(tmp3, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
2968     tmp0 += z1 + z2;
2969     tmp3 += z1 + z3;
2970 
2971     z1 = MULTIPLY(tmp1 + tmp2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
2972     tmp1 = MULTIPLY(tmp1, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
2973     tmp2 = MULTIPLY(tmp2, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
2974     tmp1 += z1 + z3;
2975     tmp2 += z1 + z2;
2976 
2977     /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
2978 
2979     wsptr[DCTSIZE*0] = (int) RIGHT_SHIFT(tmp10 + tmp3, CONST_BITS-PASS1_BITS);
2980     wsptr[DCTSIZE*7] = (int) RIGHT_SHIFT(tmp10 - tmp3, CONST_BITS-PASS1_BITS);
2981     wsptr[DCTSIZE*1] = (int) RIGHT_SHIFT(tmp11 + tmp2, CONST_BITS-PASS1_BITS);
2982     wsptr[DCTSIZE*6] = (int) RIGHT_SHIFT(tmp11 - tmp2, CONST_BITS-PASS1_BITS);
2983     wsptr[DCTSIZE*2] = (int) RIGHT_SHIFT(tmp12 + tmp1, CONST_BITS-PASS1_BITS);
2984     wsptr[DCTSIZE*5] = (int) RIGHT_SHIFT(tmp12 - tmp1, CONST_BITS-PASS1_BITS);
2985     wsptr[DCTSIZE*3] = (int) RIGHT_SHIFT(tmp13 + tmp0, CONST_BITS-PASS1_BITS);
2986     wsptr[DCTSIZE*4] = (int) RIGHT_SHIFT(tmp13 - tmp0, CONST_BITS-PASS1_BITS);
2987 
2988     inptr++;			/* advance pointers to next column */
2989     quantptr++;
2990     wsptr++;
2991   }
2992 
2993   /* Pass 2: process 8 rows from work array, store into output array.
2994    * 16-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/32).
2995    */
2996   wsptr = workspace;
2997   for (ctr = 0; ctr < 8; ctr++) {
2998     outptr = output_buf[ctr] + output_col;
2999 
3000     /* Even part */
3001 
3002     /* Add fudge factor here for final descale. */
3003     tmp0 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
3004     tmp0 <<= CONST_BITS;
3005 
3006     z1 = (INT32) wsptr[4];
3007     tmp1 = MULTIPLY(z1, FIX(1.306562965));      /* c4[16] = c2[8] */
3008     tmp2 = MULTIPLY(z1, FIX_0_541196100);       /* c12[16] = c6[8] */
3009 
3010     tmp10 = tmp0 + tmp1;
3011     tmp11 = tmp0 - tmp1;
3012     tmp12 = tmp0 + tmp2;
3013     tmp13 = tmp0 - tmp2;
3014 
3015     z1 = (INT32) wsptr[2];
3016     z2 = (INT32) wsptr[6];
3017     z3 = z1 - z2;
3018     z4 = MULTIPLY(z3, FIX(0.275899379));        /* c14[16] = c7[8] */
3019     z3 = MULTIPLY(z3, FIX(1.387039845));        /* c2[16] = c1[8] */
3020 
3021     tmp0 = z3 + MULTIPLY(z2, FIX_2_562915447);  /* (c6+c2)[16] = (c3+c1)[8] */
3022     tmp1 = z4 + MULTIPLY(z1, FIX_0_899976223);  /* (c6-c14)[16] = (c3-c7)[8] */
3023     tmp2 = z3 - MULTIPLY(z1, FIX(0.601344887)); /* (c2-c10)[16] = (c1-c5)[8] */
3024     tmp3 = z4 - MULTIPLY(z2, FIX(0.509795579)); /* (c10-c14)[16] = (c5-c7)[8] */
3025 
3026     tmp20 = tmp10 + tmp0;
3027     tmp27 = tmp10 - tmp0;
3028     tmp21 = tmp12 + tmp1;
3029     tmp26 = tmp12 - tmp1;
3030     tmp22 = tmp13 + tmp2;
3031     tmp25 = tmp13 - tmp2;
3032     tmp23 = tmp11 + tmp3;
3033     tmp24 = tmp11 - tmp3;
3034 
3035     /* Odd part */
3036 
3037     z1 = (INT32) wsptr[1];
3038     z2 = (INT32) wsptr[3];
3039     z3 = (INT32) wsptr[5];
3040     z4 = (INT32) wsptr[7];
3041 
3042     tmp11 = z1 + z3;
3043 
3044     tmp1  = MULTIPLY(z1 + z2, FIX(1.353318001));   /* c3 */
3045     tmp2  = MULTIPLY(tmp11,   FIX(1.247225013));   /* c5 */
3046     tmp3  = MULTIPLY(z1 + z4, FIX(1.093201867));   /* c7 */
3047     tmp10 = MULTIPLY(z1 - z4, FIX(0.897167586));   /* c9 */
3048     tmp11 = MULTIPLY(tmp11,   FIX(0.666655658));   /* c11 */
3049     tmp12 = MULTIPLY(z1 - z2, FIX(0.410524528));   /* c13 */
3050     tmp0  = tmp1 + tmp2 + tmp3 -
3051 	    MULTIPLY(z1, FIX(2.286341144));        /* c7+c5+c3-c1 */
3052     tmp13 = tmp10 + tmp11 + tmp12 -
3053 	    MULTIPLY(z1, FIX(1.835730603));        /* c9+c11+c13-c15 */
3054     z1    = MULTIPLY(z2 + z3, FIX(0.138617169));   /* c15 */
3055     tmp1  += z1 + MULTIPLY(z2, FIX(0.071888074));  /* c9+c11-c3-c15 */
3056     tmp2  += z1 - MULTIPLY(z3, FIX(1.125726048));  /* c5+c7+c15-c3 */
3057     z1    = MULTIPLY(z3 - z2, FIX(1.407403738));   /* c1 */
3058     tmp11 += z1 - MULTIPLY(z3, FIX(0.766367282));  /* c1+c11-c9-c13 */
3059     tmp12 += z1 + MULTIPLY(z2, FIX(1.971951411));  /* c1+c5+c13-c7 */
3060     z2    += z4;
3061     z1    = MULTIPLY(z2, - FIX(0.666655658));      /* -c11 */
3062     tmp1  += z1;
3063     tmp3  += z1 + MULTIPLY(z4, FIX(1.065388962));  /* c3+c11+c15-c7 */
3064     z2    = MULTIPLY(z2, - FIX(1.247225013));      /* -c5 */
3065     tmp10 += z2 + MULTIPLY(z4, FIX(3.141271809));  /* c1+c5+c9-c13 */
3066     tmp12 += z2;
3067     z2    = MULTIPLY(z3 + z4, - FIX(1.353318001)); /* -c3 */
3068     tmp2  += z2;
3069     tmp3  += z2;
3070     z2    = MULTIPLY(z4 - z3, FIX(0.410524528));   /* c13 */
3071     tmp10 += z2;
3072     tmp11 += z2;
3073 
3074     /* Final output stage */
3075 
3076     outptr[0]  = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp0,
3077 					       CONST_BITS+PASS1_BITS+3)
3078 			     & RANGE_MASK];
3079     outptr[15] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp0,
3080 					       CONST_BITS+PASS1_BITS+3)
3081 			     & RANGE_MASK];
3082     outptr[1]  = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp1,
3083 					       CONST_BITS+PASS1_BITS+3)
3084 			     & RANGE_MASK];
3085     outptr[14] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp1,
3086 					       CONST_BITS+PASS1_BITS+3)
3087 			     & RANGE_MASK];
3088     outptr[2]  = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp2,
3089 					       CONST_BITS+PASS1_BITS+3)
3090 			     & RANGE_MASK];
3091     outptr[13] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp2,
3092 					       CONST_BITS+PASS1_BITS+3)
3093 			     & RANGE_MASK];
3094     outptr[3]  = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp3,
3095 					       CONST_BITS+PASS1_BITS+3)
3096 			     & RANGE_MASK];
3097     outptr[12] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp3,
3098 					       CONST_BITS+PASS1_BITS+3)
3099 			     & RANGE_MASK];
3100     outptr[4]  = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp10,
3101 					       CONST_BITS+PASS1_BITS+3)
3102 			     & RANGE_MASK];
3103     outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp10,
3104 					       CONST_BITS+PASS1_BITS+3)
3105 			     & RANGE_MASK];
3106     outptr[5]  = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp11,
3107 					       CONST_BITS+PASS1_BITS+3)
3108 			     & RANGE_MASK];
3109     outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp11,
3110 					       CONST_BITS+PASS1_BITS+3)
3111 			     & RANGE_MASK];
3112     outptr[6]  = range_limit[(int) RIGHT_SHIFT(tmp26 + tmp12,
3113 					       CONST_BITS+PASS1_BITS+3)
3114 			     & RANGE_MASK];
3115     outptr[9]  = range_limit[(int) RIGHT_SHIFT(tmp26 - tmp12,
3116 					       CONST_BITS+PASS1_BITS+3)
3117 			     & RANGE_MASK];
3118     outptr[7]  = range_limit[(int) RIGHT_SHIFT(tmp27 + tmp13,
3119 					       CONST_BITS+PASS1_BITS+3)
3120 			     & RANGE_MASK];
3121     outptr[8]  = range_limit[(int) RIGHT_SHIFT(tmp27 - tmp13,
3122 					       CONST_BITS+PASS1_BITS+3)
3123 			     & RANGE_MASK];
3124 
3125     wsptr += 8;		/* advance pointer to next row */
3126   }
3127 }
3128 
3129 
3130 /*
3131  * Perform dequantization and inverse DCT on one block of coefficients,
3132  * producing a 14x7 output block.
3133  *
3134  * 7-point IDCT in pass 1 (columns), 14-point in pass 2 (rows).
      *
      * Parameters:
      *   cinfo      - decompression object; used here only to obtain the
      *                range_limit table through IDCT_range_limit().
      *   compptr    - component info; compptr->dct_table supplies the values
      *                passed as second argument to DEQUANTIZE().
      *   coef_block - input coefficients, addressed with a row stride of
      *                DCTSIZE.
      *   output_buf - array of output rows; rows 0..6 are written.
      *   output_col - offset added to every row pointer; 14 samples are
      *                stored from there.
      *
      * Pass 1 reads coefficient rows 0..6 of each of the 8 columns and stores
      * 7 intermediate rows of 8 ints in workspace[].  Pass 2 expands each
      * workspace row into 14 output samples.
3135  */
3136 
3137 GLOBAL(void)
jpeg_idct_14x7(j_decompress_ptr cinfo,jpeg_component_info * compptr,JCOEFPTR coef_block,JSAMPARRAY output_buf,JDIMENSION output_col)3138 jpeg_idct_14x7 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
3139 		JCOEFPTR coef_block,
3140 		JSAMPARRAY output_buf, JDIMENSION output_col)
3141 {
3142   INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
3143   INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26;
3144   INT32 z1, z2, z3, z4;
3145   JCOEFPTR inptr;
3146   ISLOW_MULT_TYPE * quantptr;
3147   int * wsptr;
3148   JSAMPROW outptr;
3149   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
3150   int ctr;
3151   int workspace[8*7];	/* buffers data between passes */
3152   SHIFT_TEMPS
3153 
3154   /* Pass 1: process columns from input, store into work array.
3155    * 7-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/14).
3156    */
3157   inptr = coef_block;
3158   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
3159   wsptr = workspace;
3160   for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
3161     /* Even part */
3162 
3163     tmp23 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
         /* Column 0, row 0 is the block's DC coefficient; only that value
          * goes through CLAMP_DC (macro defined outside this block --
          * NOTE(review): confirm the range it enforces).
          */
3164     if (ctr == 0)
3165       CLAMP_DC(tmp23);
3166     tmp23 <<= CONST_BITS;
3167     /* Add fudge factor here for final descale. */
3168     tmp23 += ONE << (CONST_BITS-PASS1_BITS-1);
         /* The bias above is half of 2^(CONST_BITS-PASS1_BITS), the amount
          * the output-stage RIGHT_SHIFTs divide by.  tmp23 is added once into
          * each of tmp20..tmp23, so every output of this column picks it up.
          */
3169 
3170     z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
3171     z2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
3172     z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
3173 
3174     tmp20 = MULTIPLY(z2 - z3, FIX(0.881747734));       /* c4 */
3175     tmp22 = MULTIPLY(z1 - z2, FIX(0.314692123));       /* c6 */
3176     tmp21 = tmp20 + tmp22 + tmp23 - MULTIPLY(z2, FIX(1.841218003)); /* c2+c4-c6 */
3177     tmp10 = z1 + z3;
         /* z2 becomes z2 - (z1 + z3); after this it is only used for the
          * centre term tmp23 below.
          */
3178     z2 -= tmp10;
3179     tmp10 = MULTIPLY(tmp10, FIX(1.274162392)) + tmp23; /* c2 */
3180     tmp20 += tmp10 - MULTIPLY(z3, FIX(0.077722536));   /* c2-c4-c6 */
3181     tmp22 += tmp10 - MULTIPLY(z1, FIX(2.470602249));   /* c2+c4+c6 */
3182     tmp23 += MULTIPLY(z2, FIX(1.414213562));           /* c0 */
3183 
3184     /* Odd part */
3185 
3186     z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
3187     z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
3188     z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
3189 
3190     tmp11 = MULTIPLY(z1 + z2, FIX(0.935414347));       /* (c3+c1-c5)/2 */
3191     tmp12 = MULTIPLY(z1 - z2, FIX(0.170262339));       /* (c3+c5-c1)/2 */
3192     tmp10 = tmp11 - tmp12;
3193     tmp11 += tmp12;
3194     tmp12 = MULTIPLY(z2 + z3, - FIX(1.378756276));     /* -c1 */
3195     tmp11 += tmp12;
3196     z2 = MULTIPLY(z1 + z3, FIX(0.613604268));          /* c5 */
3197     tmp10 += z2;
3198     tmp12 += z2 + MULTIPLY(z3, FIX(1.870828693));      /* c3+c1-c5 */
3199 
3200     /* Final output stage */
3201 
         /* Workspace rows k and 6-k are (even + odd) and (even - odd); the
          * centre row 3 is the even term tmp23 alone.
          */
3202     wsptr[8*0] = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
3203     wsptr[8*6] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
3204     wsptr[8*1] = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
3205     wsptr[8*5] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
3206     wsptr[8*2] = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
3207     wsptr[8*4] = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
3208     wsptr[8*3] = (int) RIGHT_SHIFT(tmp23, CONST_BITS-PASS1_BITS);
3209   }
3210 
3211   /* Pass 2: process 7 rows from work array, store into output array.
3212    * 14-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/28).
3213    */
3214   wsptr = workspace;
3215   for (ctr = 0; ctr < 7; ctr++) {
3216     outptr = output_buf[ctr] + output_col;
3217 
3218     /* Even part */
3219 
3220     /* Add fudge factor here for final descale. */
         /* (ONE << (PASS1_BITS+2)) << CONST_BITS is half of
          * 2^(CONST_BITS+PASS1_BITS+3), the final descale applied below.
          */
3221     z1 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
3222     z1 <<= CONST_BITS;
3223     z4 = (INT32) wsptr[4];
3224     z2 = MULTIPLY(z4, FIX(1.274162392));         /* c4 */
3225     z3 = MULTIPLY(z4, FIX(0.314692123));         /* c12 */
3226     z4 = MULTIPLY(z4, FIX(0.881747734));         /* c8 */
3227 
3228     tmp10 = z1 + z2;
3229     tmp11 = z1 + z3;
3230     tmp12 = z1 - z4;
3231 
3232     tmp23 = z1 - ((z2 + z3 - z4) << 1);          /* c0 = (c4+c12-c8)*2 */
3233 
3234     z1 = (INT32) wsptr[2];
3235     z2 = (INT32) wsptr[6];
3236 
3237     z3 = MULTIPLY(z1 + z2, FIX(1.105676686));    /* c6 */
3238 
3239     tmp13 = z3 + MULTIPLY(z1, FIX(0.273079590)); /* c2-c6 */
3240     tmp14 = z3 - MULTIPLY(z2, FIX(1.719280954)); /* c6+c10 */
3241     tmp15 = MULTIPLY(z1, FIX(0.613604268)) -     /* c10 */
3242 	    MULTIPLY(z2, FIX(1.378756276));      /* c2 */
3243 
3244     tmp20 = tmp10 + tmp13;
3245     tmp26 = tmp10 - tmp13;
3246     tmp21 = tmp11 + tmp14;
3247     tmp25 = tmp11 - tmp14;
3248     tmp22 = tmp12 + tmp15;
3249     tmp24 = tmp12 - tmp15;
3250 
3251     /* Odd part */
3252 
3253     z1 = (INT32) wsptr[1];
3254     z2 = (INT32) wsptr[3];
3255     z3 = (INT32) wsptr[5];
3256     z4 = (INT32) wsptr[7];
         /* z4 is never multiplied by a FIX() constant below, only added or
          * subtracted, so it is scaled up by CONST_BITS here instead
          * (presumably to match the scale of the MULTIPLY products).
          */
3257     z4 <<= CONST_BITS;
3258 
3259     tmp14 = z1 + z3;
3260     tmp11 = MULTIPLY(z1 + z2, FIX(1.334852607));           /* c3 */
3261     tmp12 = MULTIPLY(tmp14, FIX(1.197448846));             /* c5 */
3262     tmp10 = tmp11 + tmp12 + z4 - MULTIPLY(z1, FIX(1.126980169)); /* c3+c5-c1 */
3263     tmp14 = MULTIPLY(tmp14, FIX(0.752406978));             /* c9 */
3264     tmp16 = tmp14 - MULTIPLY(z1, FIX(1.061150426));        /* c9+c11-c13 */
         /* From here on z1 holds wsptr[1] - wsptr[3]; the tmp15 product and
          * the final tmp13 below both rely on that.
          */
3265     z1    -= z2;
3266     tmp15 = MULTIPLY(z1, FIX(0.467085129)) - z4;           /* c11 */
3267     tmp16 += tmp15;
3268     tmp13 = MULTIPLY(z2 + z3, - FIX(0.158341681)) - z4;    /* -c13 */
3269     tmp11 += tmp13 - MULTIPLY(z2, FIX(0.424103948));       /* c3-c9-c13 */
3270     tmp12 += tmp13 - MULTIPLY(z3, FIX(2.373959773));       /* c3+c5-c13 */
3271     tmp13 = MULTIPLY(z3 - z2, FIX(1.405321284));           /* c1 */
3272     tmp14 += tmp13 + z4 - MULTIPLY(z3, FIX(1.6906431334)); /* c1+c9-c11 */
3273     tmp15 += tmp13 + MULTIPLY(z2, FIX(0.674957567));       /* c1+c11-c5 */
3274 
         /* tmp13 is recycled once more: its earlier values are already folded
          * into tmp11, tmp12, tmp14 and tmp15.
          */
3275     tmp13 = ((z1 - z3) << CONST_BITS) + z4;
3276 
3277     /* Final output stage */
3278 
         /* outptr[k] and outptr[13-k] are (even + odd) and (even - odd); each
          * value is descaled, masked with RANGE_MASK and mapped through
          * range_limit[].
          */
3279     outptr[0]  = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
3280 					       CONST_BITS+PASS1_BITS+3)
3281 			     & RANGE_MASK];
3282     outptr[13] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
3283 					       CONST_BITS+PASS1_BITS+3)
3284 			     & RANGE_MASK];
3285     outptr[1]  = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
3286 					       CONST_BITS+PASS1_BITS+3)
3287 			     & RANGE_MASK];
3288     outptr[12] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
3289 					       CONST_BITS+PASS1_BITS+3)
3290 			     & RANGE_MASK];
3291     outptr[2]  = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
3292 					       CONST_BITS+PASS1_BITS+3)
3293 			     & RANGE_MASK];
3294     outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
3295 					       CONST_BITS+PASS1_BITS+3)
3296 			     & RANGE_MASK];
3297     outptr[3]  = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
3298 					       CONST_BITS+PASS1_BITS+3)
3299 			     & RANGE_MASK];
3300     outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
3301 					       CONST_BITS+PASS1_BITS+3)
3302 			     & RANGE_MASK];
3303     outptr[4]  = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
3304 					       CONST_BITS+PASS1_BITS+3)
3305 			     & RANGE_MASK];
3306     outptr[9]  = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
3307 					       CONST_BITS+PASS1_BITS+3)
3308 			     & RANGE_MASK];
3309     outptr[5]  = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp15,
3310 					       CONST_BITS+PASS1_BITS+3)
3311 			     & RANGE_MASK];
3312     outptr[8]  = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp15,
3313 					       CONST_BITS+PASS1_BITS+3)
3314 			     & RANGE_MASK];
3315     outptr[6]  = range_limit[(int) RIGHT_SHIFT(tmp26 + tmp16,
3316 					       CONST_BITS+PASS1_BITS+3)
3317 			     & RANGE_MASK];
3318     outptr[7]  = range_limit[(int) RIGHT_SHIFT(tmp26 - tmp16,
3319 					       CONST_BITS+PASS1_BITS+3)
3320 			     & RANGE_MASK];
3321 
3322     wsptr += 8;		/* advance pointer to next row */
3323   }
3324 }
3325 
3326 
3327 /*
3328  * Perform dequantization and inverse DCT on one block of coefficients,
3329  * producing a 12x6 output block.
3330  *
3331  * 6-point IDCT in pass 1 (columns), 12-point in pass 2 (rows).
      *
      * Parameters:
      *   cinfo      - decompression object; used here only to obtain the
      *                range_limit table through IDCT_range_limit().
      *   compptr    - component info; compptr->dct_table supplies the values
      *                passed as second argument to DEQUANTIZE().
      *   coef_block - input coefficients, addressed with a row stride of
      *                DCTSIZE.
      *   output_buf - array of output rows; rows 0..5 are written.
      *   output_col - offset added to every row pointer; 12 samples are
      *                stored from there.
      *
      * Pass 1 reads coefficient rows 0..5 of each of the 8 columns and stores
      * 6 intermediate rows of 8 ints in workspace[].  Pass 2 expands each
      * workspace row into 12 output samples.
3332  */
3333 
3334 GLOBAL(void)
jpeg_idct_12x6(j_decompress_ptr cinfo,jpeg_component_info * compptr,JCOEFPTR coef_block,JSAMPARRAY output_buf,JDIMENSION output_col)3335 jpeg_idct_12x6 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
3336 		JCOEFPTR coef_block,
3337 		JSAMPARRAY output_buf, JDIMENSION output_col)
3338 {
3339   INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
3340   INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25;
3341   INT32 z1, z2, z3, z4;
3342   JCOEFPTR inptr;
3343   ISLOW_MULT_TYPE * quantptr;
3344   int * wsptr;
3345   JSAMPROW outptr;
3346   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
3347   int ctr;
3348   int workspace[8*6];	/* buffers data between passes */
3349   SHIFT_TEMPS
3350 
3351   /* Pass 1: process columns from input, store into work array.
3352    * 6-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/12).
3353    */
3354   inptr = coef_block;
3355   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
3356   wsptr = workspace;
3357   for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
3358     /* Even part */
3359 
3360     tmp10 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
         /* Column 0, row 0 is the block's DC coefficient; only that value
          * goes through CLAMP_DC (macro defined outside this block --
          * NOTE(review): confirm the range it enforces).
          */
3361     if (ctr == 0)
3362       CLAMP_DC(tmp10);
3363     tmp10 <<= CONST_BITS;
3364     /* Add fudge factor here for final descale. */
3365     tmp10 += ONE << (CONST_BITS-PASS1_BITS-1);
3366     tmp12 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
3367     tmp20 = MULTIPLY(tmp12, FIX(0.707106781));   /* c4 */
3368     tmp11 = tmp10 + tmp20;
         /* tmp21 is descaled right away: its odd-part partner tmp11 (set at
          * the end of the odd part) is a plain sum scaled by PASS1_BITS, not a
          * CONST_BITS-scaled product.
          */
3369     tmp21 = RIGHT_SHIFT(tmp10 - tmp20 - tmp20, CONST_BITS-PASS1_BITS);
3370     tmp20 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
3371     tmp10 = MULTIPLY(tmp20, FIX(1.224744871));   /* c2 */
3372     tmp20 = tmp11 + tmp10;
3373     tmp22 = tmp11 - tmp10;
3374 
3375     /* Odd part */
3376 
3377     z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
3378     z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
3379     z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
3380     tmp11 = MULTIPLY(z1 + z3, FIX(0.366025404)); /* c5 */
3381     tmp10 = tmp11 + ((z1 + z2) << CONST_BITS);
3382     tmp12 = tmp11 + ((z3 - z2) << CONST_BITS);
3383     tmp11 = (z1 - z2 - z3) << PASS1_BITS;
3384 
3385     /* Final output stage */
3386 
         /* Rows 1 and 4 combine two values that are already at PASS1_BITS
          * scale, so they are stored without a further RIGHT_SHIFT.
          */
3387     wsptr[8*0] = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
3388     wsptr[8*5] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
3389     wsptr[8*1] = (int) (tmp21 + tmp11);
3390     wsptr[8*4] = (int) (tmp21 - tmp11);
3391     wsptr[8*2] = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
3392     wsptr[8*3] = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
3393   }
3394 
3395   /* Pass 2: process 6 rows from work array, store into output array.
3396    * 12-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/24).
3397    */
3398   wsptr = workspace;
3399   for (ctr = 0; ctr < 6; ctr++) {
3400     outptr = output_buf[ctr] + output_col;
3401 
3402     /* Even part */
3403 
3404     /* Add fudge factor here for final descale. */
         /* (ONE << (PASS1_BITS+2)) << CONST_BITS is half of
          * 2^(CONST_BITS+PASS1_BITS+3), the final descale applied below.
          */
3405     z3 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
3406     z3 <<= CONST_BITS;
3407 
3408     z4 = (INT32) wsptr[4];
3409     z4 = MULTIPLY(z4, FIX(1.224744871)); /* c4 */
3410 
3411     tmp10 = z3 + z4;
3412     tmp11 = z3 - z4;
3413 
3414     z1 = (INT32) wsptr[2];
         /* z4 (reused) takes the c2 product of wsptr[2] before z1 itself is
          * scaled up; z1 and z2 then enter only as plain CONST_BITS-scaled
          * terms.
          */
3415     z4 = MULTIPLY(z1, FIX(1.366025404)); /* c2 */
3416     z1 <<= CONST_BITS;
3417     z2 = (INT32) wsptr[6];
3418     z2 <<= CONST_BITS;
3419 
3420     tmp12 = z1 - z2;
3421 
3422     tmp21 = z3 + tmp12;
3423     tmp24 = z3 - tmp12;
3424 
3425     tmp12 = z4 + z2;
3426 
3427     tmp20 = tmp10 + tmp12;
3428     tmp25 = tmp10 - tmp12;
3429 
3430     tmp12 = z4 - z1 - z2;
3431 
3432     tmp22 = tmp11 + tmp12;
3433     tmp23 = tmp11 - tmp12;
3434 
3435     /* Odd part */
3436 
3437     z1 = (INT32) wsptr[1];
3438     z2 = (INT32) wsptr[3];
3439     z3 = (INT32) wsptr[5];
3440     z4 = (INT32) wsptr[7];
3441 
         /* tmp11 and tmp14 hold intermediate products here; both are
          * reassigned with their final odd-part values after the z1/z2
          * update further down.
          */
3442     tmp11 = MULTIPLY(z2, FIX(1.306562965));                  /* c3 */
3443     tmp14 = MULTIPLY(z2, - FIX_0_541196100);                 /* -c9 */
3444 
3445     tmp10 = z1 + z3;
3446     tmp15 = MULTIPLY(tmp10 + z4, FIX(0.860918669));          /* c7 */
3447     tmp12 = tmp15 + MULTIPLY(tmp10, FIX(0.261052384));       /* c5-c7 */
3448     tmp10 = tmp12 + tmp11 + MULTIPLY(z1, FIX(0.280143716));  /* c1-c5 */
3449     tmp13 = MULTIPLY(z3 + z4, - FIX(1.045510580));           /* -(c7+c11) */
3450     tmp12 += tmp13 + tmp14 - MULTIPLY(z3, FIX(1.478575242)); /* c1+c5-c7-c11 */
3451     tmp13 += tmp15 - tmp11 + MULTIPLY(z4, FIX(1.586706681)); /* c1+c11 */
3452     tmp15 += tmp14 - MULTIPLY(z1, FIX(0.676326758)) -        /* c7-c11 */
3453 	     MULTIPLY(z4, FIX(1.982889723));                 /* c5+c7 */
3454 
         /* z1 and z2 become wsptr[1] - wsptr[7] and wsptr[3] - wsptr[5]; z3
          * is then reused for the product shared by the final tmp11/tmp14.
          */
3455     z1 -= z4;
3456     z2 -= z3;
3457     z3 = MULTIPLY(z1 + z2, FIX_0_541196100);                 /* c9 */
3458     tmp11 = z3 + MULTIPLY(z1, FIX_0_765366865);              /* c3-c9 */
3459     tmp14 = z3 - MULTIPLY(z2, FIX_1_847759065);              /* c3+c9 */
3460 
3461     /* Final output stage */
3462 
         /* outptr[k] and outptr[11-k] are (even + odd) and (even - odd); each
          * value is descaled, masked with RANGE_MASK and mapped through
          * range_limit[].
          */
3463     outptr[0]  = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
3464 					       CONST_BITS+PASS1_BITS+3)
3465 			     & RANGE_MASK];
3466     outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
3467 					       CONST_BITS+PASS1_BITS+3)
3468 			     & RANGE_MASK];
3469     outptr[1]  = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
3470 					       CONST_BITS+PASS1_BITS+3)
3471 			     & RANGE_MASK];
3472     outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
3473 					       CONST_BITS+PASS1_BITS+3)
3474 			     & RANGE_MASK];
3475     outptr[2]  = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
3476 					       CONST_BITS+PASS1_BITS+3)
3477 			     & RANGE_MASK];
3478     outptr[9]  = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
3479 					       CONST_BITS+PASS1_BITS+3)
3480 			     & RANGE_MASK];
3481     outptr[3]  = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
3482 					       CONST_BITS+PASS1_BITS+3)
3483 			     & RANGE_MASK];
3484     outptr[8]  = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
3485 					       CONST_BITS+PASS1_BITS+3)
3486 			     & RANGE_MASK];
3487     outptr[4]  = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
3488 					       CONST_BITS+PASS1_BITS+3)
3489 			     & RANGE_MASK];
3490     outptr[7]  = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
3491 					       CONST_BITS+PASS1_BITS+3)
3492 			     & RANGE_MASK];
3493     outptr[5]  = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp15,
3494 					       CONST_BITS+PASS1_BITS+3)
3495 			     & RANGE_MASK];
3496     outptr[6]  = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp15,
3497 					       CONST_BITS+PASS1_BITS+3)
3498 			     & RANGE_MASK];
3499 
3500     wsptr += 8;		/* advance pointer to next row */
3501   }
3502 }
3503 
3504 
3505 /*
3506  * Perform dequantization and inverse DCT on one block of coefficients,
3507  * producing a 10x5 output block.
3508  *
3509  * 5-point IDCT in pass 1 (columns), 10-point in pass 2 (rows).
      *
      * Parameters:
      *   cinfo      - decompression object; used here only to obtain the
      *                range_limit table through IDCT_range_limit().
      *   compptr    - component info; compptr->dct_table supplies the values
      *                passed as second argument to DEQUANTIZE().
      *   coef_block - input coefficients, addressed with a row stride of
      *                DCTSIZE.
      *   output_buf - array of output rows; rows 0..4 are written.
      *   output_col - offset added to every row pointer; 10 samples are
      *                stored from there.
      *
      * Pass 1 reads coefficient rows 0..4 of each of the 8 columns and stores
      * 5 intermediate rows of 8 ints in workspace[].  Pass 2 expands each
      * workspace row into 10 output samples.
3510  */
3511 
3512 GLOBAL(void)
jpeg_idct_10x5(j_decompress_ptr cinfo,jpeg_component_info * compptr,JCOEFPTR coef_block,JSAMPARRAY output_buf,JDIMENSION output_col)3513 jpeg_idct_10x5 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
3514 		JCOEFPTR coef_block,
3515 		JSAMPARRAY output_buf, JDIMENSION output_col)
3516 {
3517   INT32 tmp10, tmp11, tmp12, tmp13, tmp14;
3518   INT32 tmp20, tmp21, tmp22, tmp23, tmp24;
3519   INT32 z1, z2, z3, z4;
3520   JCOEFPTR inptr;
3521   ISLOW_MULT_TYPE * quantptr;
3522   int * wsptr;
3523   JSAMPROW outptr;
3524   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
3525   int ctr;
3526   int workspace[8*5];	/* buffers data between passes */
3527   SHIFT_TEMPS
3528 
3529   /* Pass 1: process columns from input, store into work array.
3530    * 5-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/10).
3531    */
3532   inptr = coef_block;
3533   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
3534   wsptr = workspace;
3535   for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
3536     /* Even part */
3537 
3538     tmp12 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
         /* Column 0, row 0 is the block's DC coefficient; only that value
          * goes through CLAMP_DC (macro defined outside this block --
          * NOTE(review): confirm the range it enforces).
          */
3539     if (ctr == 0)
3540       CLAMP_DC(tmp12);
3541     tmp12 <<= CONST_BITS;
3542     /* Add fudge factor here for final descale. */
3543     tmp12 += ONE << (CONST_BITS-PASS1_BITS-1);
3544     tmp13 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
3545     tmp14 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
3546     z1 = MULTIPLY(tmp13 + tmp14, FIX(0.790569415)); /* (c2+c4)/2 */
3547     z2 = MULTIPLY(tmp13 - tmp14, FIX(0.353553391)); /* (c2-c4)/2 */
3548     z3 = tmp12 + z2;
3549     tmp10 = z3 + z1;
3550     tmp11 = z3 - z1;
         /* tmp12 becomes the centre term (DC part minus 4 * z2); it is
          * stored to workspace row 2 without an odd-part partner.
          */
3551     tmp12 -= z2 << 2;
3552 
3553     /* Odd part */
3554 
3555     z2 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
3556     z3 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
3557 
3558     z1 = MULTIPLY(z2 + z3, FIX(0.831253876));       /* c3 */
3559     tmp13 = z1 + MULTIPLY(z2, FIX(0.513743148));    /* c1-c3 */
3560     tmp14 = z1 - MULTIPLY(z3, FIX(2.176250899));    /* c1+c3 */
3561 
3562     /* Final output stage */
3563 
         /* Workspace rows k and 4-k are (even + odd) and (even - odd); the
          * centre row 2 is the even term tmp12 alone.
          */
3564     wsptr[8*0] = (int) RIGHT_SHIFT(tmp10 + tmp13, CONST_BITS-PASS1_BITS);
3565     wsptr[8*4] = (int) RIGHT_SHIFT(tmp10 - tmp13, CONST_BITS-PASS1_BITS);
3566     wsptr[8*1] = (int) RIGHT_SHIFT(tmp11 + tmp14, CONST_BITS-PASS1_BITS);
3567     wsptr[8*3] = (int) RIGHT_SHIFT(tmp11 - tmp14, CONST_BITS-PASS1_BITS);
3568     wsptr[8*2] = (int) RIGHT_SHIFT(tmp12, CONST_BITS-PASS1_BITS);
3569   }
3570 
3571   /* Pass 2: process 5 rows from work array, store into output array.
3572    * 10-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/20).
3573    */
3574   wsptr = workspace;
3575   for (ctr = 0; ctr < 5; ctr++) {
3576     outptr = output_buf[ctr] + output_col;
3577 
3578     /* Even part */
3579 
3580     /* Add fudge factor here for final descale. */
         /* (ONE << (PASS1_BITS+2)) << CONST_BITS is half of
          * 2^(CONST_BITS+PASS1_BITS+3), the final descale applied below.
          */
3581     z3 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
3582     z3 <<= CONST_BITS;
3583     z4 = (INT32) wsptr[4];
3584     z1 = MULTIPLY(z4, FIX(1.144122806));         /* c4 */
3585     z2 = MULTIPLY(z4, FIX(0.437016024));         /* c8 */
3586     tmp10 = z3 + z1;
3587     tmp11 = z3 - z2;
3588 
3589     tmp22 = z3 - ((z1 - z2) << 1);               /* c0 = (c4-c8)*2 */
3590 
3591     z2 = (INT32) wsptr[2];
3592     z3 = (INT32) wsptr[6];
3593 
3594     z1 = MULTIPLY(z2 + z3, FIX(0.831253876));    /* c6 */
3595     tmp12 = z1 + MULTIPLY(z2, FIX(0.513743148)); /* c2-c6 */
3596     tmp13 = z1 - MULTIPLY(z3, FIX(2.176250899)); /* c2+c6 */
3597 
3598     tmp20 = tmp10 + tmp12;
3599     tmp24 = tmp10 - tmp12;
3600     tmp21 = tmp11 + tmp13;
3601     tmp23 = tmp11 - tmp13;
3602 
3603     /* Odd part */
3604 
3605     z1 = (INT32) wsptr[1];
3606     z2 = (INT32) wsptr[3];
3607     z3 = (INT32) wsptr[5];
         /* z3 is never multiplied by a FIX() constant below, only added or
          * subtracted, so it is scaled up by CONST_BITS here instead
          * (presumably to match the scale of the MULTIPLY products).
          */
3608     z3 <<= CONST_BITS;
3609     z4 = (INT32) wsptr[7];
3610 
         /* tmp11 and tmp13 first hold wsptr[3] + wsptr[7] and
          * wsptr[3] - wsptr[7]; both are overwritten with the final odd-part
          * values at the end of this section.
          */
3611     tmp11 = z2 + z4;
3612     tmp13 = z2 - z4;
3613 
3614     tmp12 = MULTIPLY(tmp13, FIX(0.309016994));        /* (c3-c7)/2 */
3615 
3616     z2 = MULTIPLY(tmp11, FIX(0.951056516));           /* (c3+c7)/2 */
3617     z4 = z3 + tmp12;
3618 
3619     tmp10 = MULTIPLY(z1, FIX(1.396802247)) + z2 + z4; /* c1 */
3620     tmp14 = MULTIPLY(z1, FIX(0.221231742)) - z2 + z4; /* c9 */
3621 
3622     z2 = MULTIPLY(tmp11, FIX(0.587785252));           /* (c1-c9)/2 */
3623     z4 = z3 - tmp12 - (tmp13 << (CONST_BITS - 1));
3624 
         /* The final tmp12 must be formed here, while tmp13 still holds
          * wsptr[3] - wsptr[7] and after z4 has consumed the old tmp12.
          */
3625     tmp12 = ((z1 - tmp13) << CONST_BITS) - z3;
3626 
3627     tmp11 = MULTIPLY(z1, FIX(1.260073511)) - z2 - z4; /* c3 */
3628     tmp13 = MULTIPLY(z1, FIX(0.642039522)) - z2 + z4; /* c7 */
3629 
3630     /* Final output stage */
3631 
         /* outptr[k] and outptr[9-k] are (even + odd) and (even - odd); each
          * value is descaled, masked with RANGE_MASK and mapped through
          * range_limit[].
          */
3632     outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
3633 					      CONST_BITS+PASS1_BITS+3)
3634 			    & RANGE_MASK];
3635     outptr[9] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
3636 					      CONST_BITS+PASS1_BITS+3)
3637 			    & RANGE_MASK];
3638     outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
3639 					      CONST_BITS+PASS1_BITS+3)
3640 			    & RANGE_MASK];
3641     outptr[8] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
3642 					      CONST_BITS+PASS1_BITS+3)
3643 			    & RANGE_MASK];
3644     outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
3645 					      CONST_BITS+PASS1_BITS+3)
3646 			    & RANGE_MASK];
3647     outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
3648 					      CONST_BITS+PASS1_BITS+3)
3649 			    & RANGE_MASK];
3650     outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
3651 					      CONST_BITS+PASS1_BITS+3)
3652 			    & RANGE_MASK];
3653     outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
3654 					      CONST_BITS+PASS1_BITS+3)
3655 			    & RANGE_MASK];
3656     outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
3657 					      CONST_BITS+PASS1_BITS+3)
3658 			    & RANGE_MASK];
3659     outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
3660 					      CONST_BITS+PASS1_BITS+3)
3661 			    & RANGE_MASK];
3662 
3663     wsptr += 8;		/* advance pointer to next row */
3664   }
3665 }
3666 
3667 
3668 /*
3669  * Perform dequantization and inverse DCT on one block of coefficients,
3670  * producing an 8x4 output block.
3671  *
3672  * 4-point IDCT in pass 1 (columns), 8-point in pass 2 (rows).
      *
      * Parameters:
      *   cinfo      - decompression object; used here only to obtain the
      *                range_limit table through IDCT_range_limit().
      *   compptr    - component info; compptr->dct_table supplies the values
      *                passed as second argument to DEQUANTIZE().
      *   coef_block - input coefficients, addressed with a row stride of
      *                DCTSIZE.
      *   output_buf - array of output rows; rows 0..3 are written.
      *   output_col - offset added to every row pointer; 8 samples are
      *                stored from there.
      *
      * Pass 1 reads coefficient rows 0..3 of each of the 8 columns and stores
      * 4 intermediate rows of 8 ints in workspace[].  Pass 2 turns each
      * workspace row into 8 output samples.
3673  */
3674 
3675 GLOBAL(void)
jpeg_idct_8x4(j_decompress_ptr cinfo,jpeg_component_info * compptr,JCOEFPTR coef_block,JSAMPARRAY output_buf,JDIMENSION output_col)3676 jpeg_idct_8x4 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
3677 	       JCOEFPTR coef_block,
3678 	       JSAMPARRAY output_buf, JDIMENSION output_col)
3679 {
3680   INT32 tmp0, tmp1, tmp2, tmp3;
3681   INT32 tmp10, tmp11, tmp12, tmp13;
3682   INT32 z1, z2, z3;
3683   JCOEFPTR inptr;
3684   ISLOW_MULT_TYPE * quantptr;
3685   int * wsptr;
3686   JSAMPROW outptr;
3687   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
3688   int ctr;
3689   int workspace[8*4];	/* buffers data between passes */
3690   SHIFT_TEMPS
3691 
3692   /* Pass 1: process columns from input, store into work array.
3693    * 4-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/16).
3694    */
3695   inptr = coef_block;
3696   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
3697   wsptr = workspace;
3698   for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
3699     /* Even part */
3700 
3701     tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
         /* Column 0, row 0 is the block's DC coefficient; only that value
          * goes through CLAMP_DC (macro defined outside this block --
          * NOTE(review): confirm the range it enforces).
          */
3702     if (ctr == 0)
3703       CLAMP_DC(tmp0);
3704     tmp2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
3705 
         /* The even part needs no multiplication, so it is brought straight
          * to PASS1_BITS scale.
          */
3706     tmp10 = (tmp0 + tmp2) << PASS1_BITS;
3707     tmp12 = (tmp0 - tmp2) << PASS1_BITS;
3708 
3709     /* Odd part */
3710     /* Same rotation as in the even part of the 8x8 LL&M IDCT */
3711 
3712     z2 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
3713     z3 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
3714 
3715     z1 = MULTIPLY(z2 + z3, FIX_0_541196100);               /* c6 */
3716     /* Add fudge factor here for final descale. */
3717     z1 += ONE << (CONST_BITS-PASS1_BITS-1);
         /* The odd terms are descaled immediately (z1 carries the rounding
          * bias into both), so the output stage below is plain sums and
          * differences at PASS1_BITS scale.
          */
3718     tmp0 = RIGHT_SHIFT(z1 + MULTIPLY(z2, FIX_0_765366865), /* c2-c6 */
3719 		       CONST_BITS-PASS1_BITS);
3720     tmp2 = RIGHT_SHIFT(z1 - MULTIPLY(z3, FIX_1_847759065), /* c2+c6 */
3721 		       CONST_BITS-PASS1_BITS);
3722 
3723     /* Final output stage */
3724 
3725     wsptr[8*0] = (int) (tmp10 + tmp0);
3726     wsptr[8*3] = (int) (tmp10 - tmp0);
3727     wsptr[8*1] = (int) (tmp12 + tmp2);
3728     wsptr[8*2] = (int) (tmp12 - tmp2);
3729   }
3730 
3731   /* Pass 2: process rows from work array, store into output array. */
3732   /* Note that we must descale the results by a factor of 8 == 2**3, */
3733   /* and also undo the PASS1_BITS scaling. */
3734 
3735   wsptr = workspace;
3736   for (ctr = 0; ctr < 4; ctr++) {
3737     outptr = output_buf[ctr] + output_col;
3738 
3739     /* Even part: reverse the even part of the forward DCT. */
3740     /* The rotator is sqrt(2)*c(-6). */
3741 
3742     z2 = (INT32) wsptr[2];
3743     z3 = (INT32) wsptr[6];
3744 
3745     z1 = MULTIPLY(z2 + z3, FIX_0_541196100);
3746     tmp2 = z1 + MULTIPLY(z2, FIX_0_765366865);
3747     tmp3 = z1 - MULTIPLY(z3, FIX_1_847759065);
3748 
3749     /* Add fudge factor here for final descale. */
         /* The bias is added before the << CONST_BITS below, giving half of
          * 2^(CONST_BITS+PASS1_BITS+3); it reaches all of tmp10..tmp13
          * through tmp0 and tmp1.
          */
3750     z2 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
3751     z3 = (INT32) wsptr[4];
3752 
3753     tmp0 = (z2 + z3) << CONST_BITS;
3754     tmp1 = (z2 - z3) << CONST_BITS;
3755 
3756     tmp10 = tmp0 + tmp2;
3757     tmp13 = tmp0 - tmp2;
3758     tmp11 = tmp1 + tmp3;
3759     tmp12 = tmp1 - tmp3;
3760 
3761     /* Odd part per figure 8; the matrix is unitary and hence its
3762      * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
3763      */
3764 
         /* tmp0..tmp3 are reused here for the odd inputs and are updated in
          * place into the four odd-part results.
          */
3765     tmp0 = (INT32) wsptr[7];
3766     tmp1 = (INT32) wsptr[5];
3767     tmp2 = (INT32) wsptr[3];
3768     tmp3 = (INT32) wsptr[1];
3769 
3770     z2 = tmp0 + tmp2;
3771     z3 = tmp1 + tmp3;
3772 
3773     z1 = MULTIPLY(z2 + z3, FIX_1_175875602); /* sqrt(2) * c3 */
3774     z2 = MULTIPLY(z2, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
3775     z3 = MULTIPLY(z3, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */
3776     z2 += z1;
3777     z3 += z1;
3778 
3779     z1 = MULTIPLY(tmp0 + tmp3, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
3780     tmp0 = MULTIPLY(tmp0, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */
3781     tmp3 = MULTIPLY(tmp3, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
3782     tmp0 += z1 + z2;
3783     tmp3 += z1 + z3;
3784 
3785     z1 = MULTIPLY(tmp1 + tmp2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
3786     tmp1 = MULTIPLY(tmp1, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
3787     tmp2 = MULTIPLY(tmp2, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
3788     tmp1 += z1 + z3;
3789     tmp2 += z1 + z2;
3790 
3791     /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
3792 
         /* outptr[k] and outptr[7-k] are (even + odd) and (even - odd); each
          * value is descaled, masked with RANGE_MASK and mapped through
          * range_limit[].
          */
3793     outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp3,
3794 					      CONST_BITS+PASS1_BITS+3)
3795 			    & RANGE_MASK];
3796     outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp3,
3797 					      CONST_BITS+PASS1_BITS+3)
3798 			    & RANGE_MASK];
3799     outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp2,
3800 					      CONST_BITS+PASS1_BITS+3)
3801 			    & RANGE_MASK];
3802     outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp2,
3803 					      CONST_BITS+PASS1_BITS+3)
3804 			    & RANGE_MASK];
3805     outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp1,
3806 					      CONST_BITS+PASS1_BITS+3)
3807 			    & RANGE_MASK];
3808     outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp1,
3809 					      CONST_BITS+PASS1_BITS+3)
3810 			    & RANGE_MASK];
3811     outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp13 + tmp0,
3812 					      CONST_BITS+PASS1_BITS+3)
3813 			    & RANGE_MASK];
3814     outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp13 - tmp0,
3815 					      CONST_BITS+PASS1_BITS+3)
3816 			    & RANGE_MASK];
3817 
         /* NOTE(review): pass 1 stored rows with a literal stride of 8, so
          * stepping by DCTSIZE here assumes DCTSIZE == 8 -- confirm.
          */
3818     wsptr += DCTSIZE;		/* advance pointer to next row */
3819   }
3820 }
3821 
3822 
3823 /*
3824  * Perform dequantization and inverse DCT on one block of coefficients,
3825  * producing a reduced-size 6x3 output block.
3826  *
3827  * 3-point IDCT in pass 1 (columns), 6-point in pass 2 (rows).
      *
      * Parameters:
      *   cinfo      - decompression object; used here only to obtain the
      *                range_limit table through IDCT_range_limit().
      *   compptr    - component info; compptr->dct_table supplies the values
      *                passed as second argument to DEQUANTIZE().
      *   coef_block - input coefficients, addressed with a row stride of
      *                DCTSIZE.
      *   output_buf - array of output rows; rows 0..2 are written.
      *   output_col - offset added to every row pointer; 6 samples are
      *                stored from there.
      *
      * Only coefficient rows 0..2 of the first 6 columns are read.  Pass 1
      * stores 3 intermediate rows of 6 ints in workspace[] (row stride 6);
      * pass 2 turns each workspace row into 6 output samples.
3828  */
3829 
3830 GLOBAL(void)
jpeg_idct_6x3(j_decompress_ptr cinfo,jpeg_component_info * compptr,JCOEFPTR coef_block,JSAMPARRAY output_buf,JDIMENSION output_col)3831 jpeg_idct_6x3 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
3832 	       JCOEFPTR coef_block,
3833 	       JSAMPARRAY output_buf, JDIMENSION output_col)
3834 {
3835   INT32 tmp0, tmp1, tmp2, tmp10, tmp11, tmp12;
3836   INT32 z1, z2, z3;
3837   JCOEFPTR inptr;
3838   ISLOW_MULT_TYPE * quantptr;
3839   int * wsptr;
3840   JSAMPROW outptr;
3841   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
3842   int ctr;
3843   int workspace[6*3];	/* buffers data between passes */
3844   SHIFT_TEMPS
3845 
3846   /* Pass 1: process columns from input, store into work array.
3847    * 3-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/6).
3848    */
3849   inptr = coef_block;
3850   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
3851   wsptr = workspace;
3852   for (ctr = 0; ctr < 6; ctr++, inptr++, quantptr++, wsptr++) {
3853     /* Even part */
3854 
3855     tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
         /* Column 0, row 0 is the block's DC coefficient; only that value
          * goes through CLAMP_DC (macro defined outside this block --
          * NOTE(review): confirm the range it enforces).
          */
3856     if (ctr == 0)
3857       CLAMP_DC(tmp0);
3858     tmp0 <<= CONST_BITS;
3859     /* Add fudge factor here for final descale. */
3860     tmp0 += ONE << (CONST_BITS-PASS1_BITS-1);
3861     tmp2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
3862     tmp12 = MULTIPLY(tmp2, FIX(0.707106781)); /* c2 */
3863     tmp10 = tmp0 + tmp12;
         /* tmp2 is reused for the centre term; it is stored to workspace
          * row 1 without an odd-part partner.
          */
3864     tmp2 = tmp0 - tmp12 - tmp12;
3865 
3866     /* Odd part */
3867 
         /* tmp12 and tmp0 are reused: tmp0 now holds the single odd-part
          * product (the DC term already lives in tmp10 and tmp2).
          */
3868     tmp12 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
3869     tmp0 = MULTIPLY(tmp12, FIX(1.224744871)); /* c1 */
3870 
3871     /* Final output stage */
3872 
3873     wsptr[6*0] = (int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS-PASS1_BITS);
3874     wsptr[6*2] = (int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS-PASS1_BITS);
3875     wsptr[6*1] = (int) RIGHT_SHIFT(tmp2, CONST_BITS-PASS1_BITS);
3876   }
3877 
3878   /* Pass 2: process 3 rows from work array, store into output array.
3879    * 6-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/12).
3880    */
3881   wsptr = workspace;
3882   for (ctr = 0; ctr < 3; ctr++) {
3883     outptr = output_buf[ctr] + output_col;
3884 
3885     /* Even part */
3886 
3887     /* Add fudge factor here for final descale. */
         /* (ONE << (PASS1_BITS+2)) << CONST_BITS is half of
          * 2^(CONST_BITS+PASS1_BITS+3), the final descale applied below.
          */
3888     tmp0 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
3889     tmp0 <<= CONST_BITS;
3890     tmp2 = (INT32) wsptr[4];
3891     tmp10 = MULTIPLY(tmp2, FIX(0.707106781));   /* c4 */
3892     tmp1 = tmp0 + tmp10;
3893     tmp11 = tmp0 - tmp10 - tmp10;
3894     tmp10 = (INT32) wsptr[2];
3895     tmp0 = MULTIPLY(tmp10, FIX(1.224744871));   /* c2 */
3896     tmp10 = tmp1 + tmp0;
3897     tmp12 = tmp1 - tmp0;
3898 
3899     /* Odd part */
3900 
         /* tmp0..tmp2 are reused for the odd part; tmp1 first holds the
          * shared c5 product and is overwritten last, after tmp0 and tmp2
          * have consumed it.
          */
3901     z1 = (INT32) wsptr[1];
3902     z2 = (INT32) wsptr[3];
3903     z3 = (INT32) wsptr[5];
3904     tmp1 = MULTIPLY(z1 + z3, FIX(0.366025404)); /* c5 */
3905     tmp0 = tmp1 + ((z1 + z2) << CONST_BITS);
3906     tmp2 = tmp1 + ((z3 - z2) << CONST_BITS);
3907     tmp1 = (z1 - z2 - z3) << CONST_BITS;
3908 
3909     /* Final output stage */
3910 
         /* outptr[k] and outptr[5-k] are (even + odd) and (even - odd); each
          * value is descaled, masked with RANGE_MASK and mapped through
          * range_limit[].
          */
3911     outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
3912 					      CONST_BITS+PASS1_BITS+3)
3913 			    & RANGE_MASK];
3914     outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
3915 					      CONST_BITS+PASS1_BITS+3)
3916 			    & RANGE_MASK];
3917     outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp1,
3918 					      CONST_BITS+PASS1_BITS+3)
3919 			    & RANGE_MASK];
3920     outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp1,
3921 					      CONST_BITS+PASS1_BITS+3)
3922 			    & RANGE_MASK];
3923     outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp2,
3924 					      CONST_BITS+PASS1_BITS+3)
3925 			    & RANGE_MASK];
3926     outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp2,
3927 					      CONST_BITS+PASS1_BITS+3)
3928 			    & RANGE_MASK];
3929 
3930     wsptr += 6;		/* advance pointer to next row */
3931   }
3932 }
3933 
3934 
3935 /*
3936  * Perform dequantization and inverse DCT on one block of coefficients,
3937  * producing a 4x2 output block.
3938  *
3939  * 2-point IDCT in pass 1 (columns), 4-point in pass 2 (rows).
3940  */
3941 
3942 GLOBAL(void)
jpeg_idct_4x2(j_decompress_ptr cinfo,jpeg_component_info * compptr,JCOEFPTR coef_block,JSAMPARRAY output_buf,JDIMENSION output_col)3943 jpeg_idct_4x2 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
3944 	       JCOEFPTR coef_block,
3945 	       JSAMPARRAY output_buf, JDIMENSION output_col)
3946 {
3947   INT32 tmp0, tmp2, tmp10, tmp12;
3948   INT32 z1, z2, z3;
3949   JCOEFPTR inptr;
3950   ISLOW_MULT_TYPE * quantptr;
3951   INT32 * wsptr;
3952   JSAMPROW outptr;
3953   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
3954   int ctr;
3955   INT32 workspace[4*2];	/* buffers data between passes */
3956   SHIFT_TEMPS
3957 
3958   /* Pass 1: process columns from input, store into work array. */
3959 
3960   inptr = coef_block;
3961   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
3962   wsptr = workspace;
3963   for (ctr = 0; ctr < 4; ctr++, inptr++, quantptr++, wsptr++) {
3964     /* Even part */
3965 
3966     tmp10 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
3967     if (ctr == 0)
3968       CLAMP_DC(tmp10);
3969 
3970     /* Odd part */
3971 
3972     tmp0 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
3973 
3974     /* Final output stage */
3975 
3976     wsptr[4*0] = tmp10 + tmp0;
3977     wsptr[4*1] = tmp10 - tmp0;
3978   }
3979 
3980   /* Pass 2: process 2 rows from work array, store into output array.
3981    * 4-point IDCT kernel,
3982    * cK represents sqrt(2) * cos(K*pi/16) [refers to 8-point IDCT].
3983    */
3984   wsptr = workspace;
3985   for (ctr = 0; ctr < 2; ctr++) {
3986     outptr = output_buf[ctr] + output_col;
3987 
3988     /* Even part */
3989 
3990     /* Add fudge factor here for final descale. */
3991     tmp0 = wsptr[0] + (ONE << 2);
3992     tmp2 = wsptr[2];
3993 
3994     tmp10 = (tmp0 + tmp2) << CONST_BITS;
3995     tmp12 = (tmp0 - tmp2) << CONST_BITS;
3996 
3997     /* Odd part */
3998     /* Same rotation as in the even part of the 8x8 LL&M IDCT */
3999 
4000     z2 = wsptr[1];
4001     z3 = wsptr[3];
4002 
4003     z1 = MULTIPLY(z2 + z3, FIX_0_541196100);   /* c6 */
4004     tmp0 = z1 + MULTIPLY(z2, FIX_0_765366865); /* c2-c6 */
4005     tmp2 = z1 - MULTIPLY(z3, FIX_1_847759065); /* c2+c6 */
4006 
4007     /* Final output stage */
4008 
4009     outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
4010 					      CONST_BITS+3)
4011 			    & RANGE_MASK];
4012     outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
4013 					      CONST_BITS+3)
4014 			    & RANGE_MASK];
4015     outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp2,
4016 					      CONST_BITS+3)
4017 			    & RANGE_MASK];
4018     outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp2,
4019 					      CONST_BITS+3)
4020 			    & RANGE_MASK];
4021 
4022     wsptr += 4;		/* advance pointer to next row */
4023   }
4024 }
4025 
4026 
4027 /*
4028  * Perform dequantization and inverse DCT on one block of coefficients,
4029  * producing a 2x1 output block.
4030  *
4031  * 1-point IDCT in pass 1 (columns), 2-point in pass 2 (rows).
4032  */
4033 
4034 GLOBAL(void)
jpeg_idct_2x1(j_decompress_ptr cinfo,jpeg_component_info * compptr,JCOEFPTR coef_block,JSAMPARRAY output_buf,JDIMENSION output_col)4035 jpeg_idct_2x1 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
4036 	       JCOEFPTR coef_block,
4037 	       JSAMPARRAY output_buf, JDIMENSION output_col)
4038 {
4039   INT32 tmp0, tmp10;
4040   ISLOW_MULT_TYPE * quantptr;
4041   JSAMPROW outptr;
4042   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
4043   SHIFT_TEMPS
4044 
4045   /* Pass 1: empty. */
4046 
4047   /* Pass 2: process 1 row from input, store into output array. */
4048 
4049   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
4050   outptr = output_buf[0] + output_col;
4051 
4052   /* Even part */
4053 
4054   tmp10 = DEQUANTIZE(coef_block[0], quantptr[0]);
4055   CLAMP_DC(tmp10);
4056   /* Add fudge factor here for final descale. */
4057   tmp10 += ONE << 2;
4058 
4059   /* Odd part */
4060 
4061   tmp0 = DEQUANTIZE(coef_block[1], quantptr[1]);
4062 
4063   /* Final output stage */
4064 
4065   outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0, 3) & RANGE_MASK];
4066   outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0, 3) & RANGE_MASK];
4067 }
4068 
4069 
4070 /*
4071  * Perform dequantization and inverse DCT on one block of coefficients,
4072  * producing a 8x16 output block.
4073  *
4074  * 16-point IDCT in pass 1 (columns), 8-point in pass 2 (rows).
4075  */
4076 
4077 GLOBAL(void)
jpeg_idct_8x16(j_decompress_ptr cinfo,jpeg_component_info * compptr,JCOEFPTR coef_block,JSAMPARRAY output_buf,JDIMENSION output_col)4078 jpeg_idct_8x16 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
4079 		JCOEFPTR coef_block,
4080 		JSAMPARRAY output_buf, JDIMENSION output_col)
4081 {
4082   INT32 tmp0, tmp1, tmp2, tmp3, tmp10, tmp11, tmp12, tmp13;
4083   INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27;
4084   INT32 z1, z2, z3, z4;
4085   JCOEFPTR inptr;
4086   ISLOW_MULT_TYPE * quantptr;
4087   int * wsptr;
4088   JSAMPROW outptr;
4089   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
4090   int ctr;
4091   int workspace[8*16];	/* buffers data between passes */
4092   SHIFT_TEMPS
4093 
4094   /* Pass 1: process columns from input, store into work array.
4095    * 16-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/32).
4096    */
4097   inptr = coef_block;
4098   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
4099   wsptr = workspace;
4100   for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
4101     /* Even part */
4102 
4103     tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
4104     if (ctr == 0)
4105       CLAMP_DC(tmp0);
4106     tmp0 <<= CONST_BITS;
4107     /* Add fudge factor here for final descale. */
4108     tmp0 += ONE << (CONST_BITS-PASS1_BITS-1);
4109 
4110     z1 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
4111     tmp1 = MULTIPLY(z1, FIX(1.306562965));      /* c4[16] = c2[8] */
4112     tmp2 = MULTIPLY(z1, FIX_0_541196100);       /* c12[16] = c6[8] */
4113 
4114     tmp10 = tmp0 + tmp1;
4115     tmp11 = tmp0 - tmp1;
4116     tmp12 = tmp0 + tmp2;
4117     tmp13 = tmp0 - tmp2;
4118 
4119     z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
4120     z2 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
4121     z3 = z1 - z2;
4122     z4 = MULTIPLY(z3, FIX(0.275899379));        /* c14[16] = c7[8] */
4123     z3 = MULTIPLY(z3, FIX(1.387039845));        /* c2[16] = c1[8] */
4124 
4125     tmp0 = z3 + MULTIPLY(z2, FIX_2_562915447);  /* (c6+c2)[16] = (c3+c1)[8] */
4126     tmp1 = z4 + MULTIPLY(z1, FIX_0_899976223);  /* (c6-c14)[16] = (c3-c7)[8] */
4127     tmp2 = z3 - MULTIPLY(z1, FIX(0.601344887)); /* (c2-c10)[16] = (c1-c5)[8] */
4128     tmp3 = z4 - MULTIPLY(z2, FIX(0.509795579)); /* (c10-c14)[16] = (c5-c7)[8] */
4129 
4130     tmp20 = tmp10 + tmp0;
4131     tmp27 = tmp10 - tmp0;
4132     tmp21 = tmp12 + tmp1;
4133     tmp26 = tmp12 - tmp1;
4134     tmp22 = tmp13 + tmp2;
4135     tmp25 = tmp13 - tmp2;
4136     tmp23 = tmp11 + tmp3;
4137     tmp24 = tmp11 - tmp3;
4138 
4139     /* Odd part */
4140 
4141     z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
4142     z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
4143     z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
4144     z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
4145 
4146     tmp11 = z1 + z3;
4147 
4148     tmp1  = MULTIPLY(z1 + z2, FIX(1.353318001));   /* c3 */
4149     tmp2  = MULTIPLY(tmp11,   FIX(1.247225013));   /* c5 */
4150     tmp3  = MULTIPLY(z1 + z4, FIX(1.093201867));   /* c7 */
4151     tmp10 = MULTIPLY(z1 - z4, FIX(0.897167586));   /* c9 */
4152     tmp11 = MULTIPLY(tmp11,   FIX(0.666655658));   /* c11 */
4153     tmp12 = MULTIPLY(z1 - z2, FIX(0.410524528));   /* c13 */
4154     tmp0  = tmp1 + tmp2 + tmp3 -
4155 	    MULTIPLY(z1, FIX(2.286341144));        /* c7+c5+c3-c1 */
4156     tmp13 = tmp10 + tmp11 + tmp12 -
4157 	    MULTIPLY(z1, FIX(1.835730603));        /* c9+c11+c13-c15 */
4158     z1    = MULTIPLY(z2 + z3, FIX(0.138617169));   /* c15 */
4159     tmp1  += z1 + MULTIPLY(z2, FIX(0.071888074));  /* c9+c11-c3-c15 */
4160     tmp2  += z1 - MULTIPLY(z3, FIX(1.125726048));  /* c5+c7+c15-c3 */
4161     z1    = MULTIPLY(z3 - z2, FIX(1.407403738));   /* c1 */
4162     tmp11 += z1 - MULTIPLY(z3, FIX(0.766367282));  /* c1+c11-c9-c13 */
4163     tmp12 += z1 + MULTIPLY(z2, FIX(1.971951411));  /* c1+c5+c13-c7 */
4164     z2    += z4;
4165     z1    = MULTIPLY(z2, - FIX(0.666655658));      /* -c11 */
4166     tmp1  += z1;
4167     tmp3  += z1 + MULTIPLY(z4, FIX(1.065388962));  /* c3+c11+c15-c7 */
4168     z2    = MULTIPLY(z2, - FIX(1.247225013));      /* -c5 */
4169     tmp10 += z2 + MULTIPLY(z4, FIX(3.141271809));  /* c1+c5+c9-c13 */
4170     tmp12 += z2;
4171     z2    = MULTIPLY(z3 + z4, - FIX(1.353318001)); /* -c3 */
4172     tmp2  += z2;
4173     tmp3  += z2;
4174     z2    = MULTIPLY(z4 - z3, FIX(0.410524528));   /* c13 */
4175     tmp10 += z2;
4176     tmp11 += z2;
4177 
4178     /* Final output stage */
4179 
4180     wsptr[8*0]  = (int) RIGHT_SHIFT(tmp20 + tmp0,  CONST_BITS-PASS1_BITS);
4181     wsptr[8*15] = (int) RIGHT_SHIFT(tmp20 - tmp0,  CONST_BITS-PASS1_BITS);
4182     wsptr[8*1]  = (int) RIGHT_SHIFT(tmp21 + tmp1,  CONST_BITS-PASS1_BITS);
4183     wsptr[8*14] = (int) RIGHT_SHIFT(tmp21 - tmp1,  CONST_BITS-PASS1_BITS);
4184     wsptr[8*2]  = (int) RIGHT_SHIFT(tmp22 + tmp2,  CONST_BITS-PASS1_BITS);
4185     wsptr[8*13] = (int) RIGHT_SHIFT(tmp22 - tmp2,  CONST_BITS-PASS1_BITS);
4186     wsptr[8*3]  = (int) RIGHT_SHIFT(tmp23 + tmp3,  CONST_BITS-PASS1_BITS);
4187     wsptr[8*12] = (int) RIGHT_SHIFT(tmp23 - tmp3,  CONST_BITS-PASS1_BITS);
4188     wsptr[8*4]  = (int) RIGHT_SHIFT(tmp24 + tmp10, CONST_BITS-PASS1_BITS);
4189     wsptr[8*11] = (int) RIGHT_SHIFT(tmp24 - tmp10, CONST_BITS-PASS1_BITS);
4190     wsptr[8*5]  = (int) RIGHT_SHIFT(tmp25 + tmp11, CONST_BITS-PASS1_BITS);
4191     wsptr[8*10] = (int) RIGHT_SHIFT(tmp25 - tmp11, CONST_BITS-PASS1_BITS);
4192     wsptr[8*6]  = (int) RIGHT_SHIFT(tmp26 + tmp12, CONST_BITS-PASS1_BITS);
4193     wsptr[8*9]  = (int) RIGHT_SHIFT(tmp26 - tmp12, CONST_BITS-PASS1_BITS);
4194     wsptr[8*7]  = (int) RIGHT_SHIFT(tmp27 + tmp13, CONST_BITS-PASS1_BITS);
4195     wsptr[8*8]  = (int) RIGHT_SHIFT(tmp27 - tmp13, CONST_BITS-PASS1_BITS);
4196   }
4197 
4198   /* Pass 2: process rows from work array, store into output array. */
4199   /* Note that we must descale the results by a factor of 8 == 2**3, */
4200   /* and also undo the PASS1_BITS scaling. */
4201 
4202   wsptr = workspace;
4203   for (ctr = 0; ctr < 16; ctr++) {
4204     outptr = output_buf[ctr] + output_col;
4205 
4206     /* Even part: reverse the even part of the forward DCT. */
4207     /* The rotator is sqrt(2)*c(-6). */
4208 
4209     z2 = (INT32) wsptr[2];
4210     z3 = (INT32) wsptr[6];
4211 
4212     z1 = MULTIPLY(z2 + z3, FIX_0_541196100);
4213     tmp2 = z1 + MULTIPLY(z2, FIX_0_765366865);
4214     tmp3 = z1 - MULTIPLY(z3, FIX_1_847759065);
4215 
4216     /* Add fudge factor here for final descale. */
4217     z2 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
4218     z3 = (INT32) wsptr[4];
4219 
4220     tmp0 = (z2 + z3) << CONST_BITS;
4221     tmp1 = (z2 - z3) << CONST_BITS;
4222 
4223     tmp10 = tmp0 + tmp2;
4224     tmp13 = tmp0 - tmp2;
4225     tmp11 = tmp1 + tmp3;
4226     tmp12 = tmp1 - tmp3;
4227 
4228     /* Odd part per figure 8; the matrix is unitary and hence its
4229      * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
4230      */
4231 
4232     tmp0 = (INT32) wsptr[7];
4233     tmp1 = (INT32) wsptr[5];
4234     tmp2 = (INT32) wsptr[3];
4235     tmp3 = (INT32) wsptr[1];
4236 
4237     z2 = tmp0 + tmp2;
4238     z3 = tmp1 + tmp3;
4239 
4240     z1 = MULTIPLY(z2 + z3, FIX_1_175875602); /* sqrt(2) * c3 */
4241     z2 = MULTIPLY(z2, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
4242     z3 = MULTIPLY(z3, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */
4243     z2 += z1;
4244     z3 += z1;
4245 
4246     z1 = MULTIPLY(tmp0 + tmp3, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
4247     tmp0 = MULTIPLY(tmp0, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */
4248     tmp3 = MULTIPLY(tmp3, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
4249     tmp0 += z1 + z2;
4250     tmp3 += z1 + z3;
4251 
4252     z1 = MULTIPLY(tmp1 + tmp2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
4253     tmp1 = MULTIPLY(tmp1, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
4254     tmp2 = MULTIPLY(tmp2, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
4255     tmp1 += z1 + z3;
4256     tmp2 += z1 + z2;
4257 
4258     /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
4259 
4260     outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp3,
4261 					      CONST_BITS+PASS1_BITS+3)
4262 			    & RANGE_MASK];
4263     outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp3,
4264 					      CONST_BITS+PASS1_BITS+3)
4265 			    & RANGE_MASK];
4266     outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp2,
4267 					      CONST_BITS+PASS1_BITS+3)
4268 			    & RANGE_MASK];
4269     outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp2,
4270 					      CONST_BITS+PASS1_BITS+3)
4271 			    & RANGE_MASK];
4272     outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp1,
4273 					      CONST_BITS+PASS1_BITS+3)
4274 			    & RANGE_MASK];
4275     outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp1,
4276 					      CONST_BITS+PASS1_BITS+3)
4277 			    & RANGE_MASK];
4278     outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp13 + tmp0,
4279 					      CONST_BITS+PASS1_BITS+3)
4280 			    & RANGE_MASK];
4281     outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp13 - tmp0,
4282 					      CONST_BITS+PASS1_BITS+3)
4283 			    & RANGE_MASK];
4284 
4285     wsptr += DCTSIZE;		/* advance pointer to next row */
4286   }
4287 }
4288 
4289 
4290 /*
4291  * Perform dequantization and inverse DCT on one block of coefficients,
4292  * producing a 7x14 output block.
4293  *
4294  * 14-point IDCT in pass 1 (columns), 7-point in pass 2 (rows).
4295  */
4296 
4297 GLOBAL(void)
jpeg_idct_7x14(j_decompress_ptr cinfo,jpeg_component_info * compptr,JCOEFPTR coef_block,JSAMPARRAY output_buf,JDIMENSION output_col)4298 jpeg_idct_7x14 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
4299 		JCOEFPTR coef_block,
4300 		JSAMPARRAY output_buf, JDIMENSION output_col)
4301 {
4302   INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
4303   INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26;
4304   INT32 z1, z2, z3, z4;
4305   JCOEFPTR inptr;
4306   ISLOW_MULT_TYPE * quantptr;
4307   int * wsptr;
4308   JSAMPROW outptr;
4309   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
4310   int ctr;
4311   int workspace[7*14];	/* buffers data between passes */
4312   SHIFT_TEMPS
4313 
4314   /* Pass 1: process columns from input, store into work array.
4315    * 14-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/28).
4316    */
4317   inptr = coef_block;
4318   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
4319   wsptr = workspace;
4320   for (ctr = 0; ctr < 7; ctr++, inptr++, quantptr++, wsptr++) {
4321     /* Even part */
4322 
4323     z1 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
4324     if (ctr == 0)
4325       CLAMP_DC(z1);
4326     z1 <<= CONST_BITS;
4327     /* Add fudge factor here for final descale. */
4328     z1 += ONE << (CONST_BITS-PASS1_BITS-1);
4329     z4 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
4330     z2 = MULTIPLY(z4, FIX(1.274162392));         /* c4 */
4331     z3 = MULTIPLY(z4, FIX(0.314692123));         /* c12 */
4332     z4 = MULTIPLY(z4, FIX(0.881747734));         /* c8 */
4333 
4334     tmp10 = z1 + z2;
4335     tmp11 = z1 + z3;
4336     tmp12 = z1 - z4;
4337 
4338     tmp23 = RIGHT_SHIFT(z1 - ((z2 + z3 - z4) << 1), /* c0 = (c4+c12-c8)*2 */
4339 			CONST_BITS-PASS1_BITS);
4340 
4341     z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
4342     z2 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
4343 
4344     z3 = MULTIPLY(z1 + z2, FIX(1.105676686));    /* c6 */
4345 
4346     tmp13 = z3 + MULTIPLY(z1, FIX(0.273079590)); /* c2-c6 */
4347     tmp14 = z3 - MULTIPLY(z2, FIX(1.719280954)); /* c6+c10 */
4348     tmp15 = MULTIPLY(z1, FIX(0.613604268)) -     /* c10 */
4349 	    MULTIPLY(z2, FIX(1.378756276));      /* c2 */
4350 
4351     tmp20 = tmp10 + tmp13;
4352     tmp26 = tmp10 - tmp13;
4353     tmp21 = tmp11 + tmp14;
4354     tmp25 = tmp11 - tmp14;
4355     tmp22 = tmp12 + tmp15;
4356     tmp24 = tmp12 - tmp15;
4357 
4358     /* Odd part */
4359 
4360     z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
4361     z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
4362     z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
4363     z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
4364     tmp13 = z4 << CONST_BITS;
4365 
4366     tmp14 = z1 + z3;
4367     tmp11 = MULTIPLY(z1 + z2, FIX(1.334852607));           /* c3 */
4368     tmp12 = MULTIPLY(tmp14, FIX(1.197448846));             /* c5 */
4369     tmp10 = tmp11 + tmp12 + tmp13 - MULTIPLY(z1, FIX(1.126980169)); /* c3+c5-c1 */
4370     tmp14 = MULTIPLY(tmp14, FIX(0.752406978));             /* c9 */
4371     tmp16 = tmp14 - MULTIPLY(z1, FIX(1.061150426));        /* c9+c11-c13 */
4372     z1    -= z2;
4373     tmp15 = MULTIPLY(z1, FIX(0.467085129)) - tmp13;        /* c11 */
4374     tmp16 += tmp15;
4375     z1    += z4;
4376     z4    = MULTIPLY(z2 + z3, - FIX(0.158341681)) - tmp13; /* -c13 */
4377     tmp11 += z4 - MULTIPLY(z2, FIX(0.424103948));          /* c3-c9-c13 */
4378     tmp12 += z4 - MULTIPLY(z3, FIX(2.373959773));          /* c3+c5-c13 */
4379     z4    = MULTIPLY(z3 - z2, FIX(1.405321284));           /* c1 */
4380     tmp14 += z4 + tmp13 - MULTIPLY(z3, FIX(1.6906431334)); /* c1+c9-c11 */
4381     tmp15 += z4 + MULTIPLY(z2, FIX(0.674957567));          /* c1+c11-c5 */
4382 
4383     tmp13 = (z1 - z3) << PASS1_BITS;
4384 
4385     /* Final output stage */
4386 
4387     wsptr[7*0]  = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
4388     wsptr[7*13] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
4389     wsptr[7*1]  = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
4390     wsptr[7*12] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
4391     wsptr[7*2]  = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
4392     wsptr[7*11] = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
4393     wsptr[7*3]  = (int) (tmp23 + tmp13);
4394     wsptr[7*10] = (int) (tmp23 - tmp13);
4395     wsptr[7*4]  = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
4396     wsptr[7*9]  = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
4397     wsptr[7*5]  = (int) RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS-PASS1_BITS);
4398     wsptr[7*8]  = (int) RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS-PASS1_BITS);
4399     wsptr[7*6]  = (int) RIGHT_SHIFT(tmp26 + tmp16, CONST_BITS-PASS1_BITS);
4400     wsptr[7*7]  = (int) RIGHT_SHIFT(tmp26 - tmp16, CONST_BITS-PASS1_BITS);
4401   }
4402 
4403   /* Pass 2: process 14 rows from work array, store into output array.
4404    * 7-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/14).
4405    */
4406   wsptr = workspace;
4407   for (ctr = 0; ctr < 14; ctr++) {
4408     outptr = output_buf[ctr] + output_col;
4409 
4410     /* Even part */
4411 
4412     /* Add fudge factor here for final descale. */
4413     tmp23 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
4414     tmp23 <<= CONST_BITS;
4415 
4416     z1 = (INT32) wsptr[2];
4417     z2 = (INT32) wsptr[4];
4418     z3 = (INT32) wsptr[6];
4419 
4420     tmp20 = MULTIPLY(z2 - z3, FIX(0.881747734));       /* c4 */
4421     tmp22 = MULTIPLY(z1 - z2, FIX(0.314692123));       /* c6 */
4422     tmp21 = tmp20 + tmp22 + tmp23 - MULTIPLY(z2, FIX(1.841218003)); /* c2+c4-c6 */
4423     tmp10 = z1 + z3;
4424     z2 -= tmp10;
4425     tmp10 = MULTIPLY(tmp10, FIX(1.274162392)) + tmp23; /* c2 */
4426     tmp20 += tmp10 - MULTIPLY(z3, FIX(0.077722536));   /* c2-c4-c6 */
4427     tmp22 += tmp10 - MULTIPLY(z1, FIX(2.470602249));   /* c2+c4+c6 */
4428     tmp23 += MULTIPLY(z2, FIX(1.414213562));           /* c0 */
4429 
4430     /* Odd part */
4431 
4432     z1 = (INT32) wsptr[1];
4433     z2 = (INT32) wsptr[3];
4434     z3 = (INT32) wsptr[5];
4435 
4436     tmp11 = MULTIPLY(z1 + z2, FIX(0.935414347));       /* (c3+c1-c5)/2 */
4437     tmp12 = MULTIPLY(z1 - z2, FIX(0.170262339));       /* (c3+c5-c1)/2 */
4438     tmp10 = tmp11 - tmp12;
4439     tmp11 += tmp12;
4440     tmp12 = MULTIPLY(z2 + z3, - FIX(1.378756276));     /* -c1 */
4441     tmp11 += tmp12;
4442     z2 = MULTIPLY(z1 + z3, FIX(0.613604268));          /* c5 */
4443     tmp10 += z2;
4444     tmp12 += z2 + MULTIPLY(z3, FIX(1.870828693));      /* c3+c1-c5 */
4445 
4446     /* Final output stage */
4447 
4448     outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
4449 					      CONST_BITS+PASS1_BITS+3)
4450 			    & RANGE_MASK];
4451     outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
4452 					      CONST_BITS+PASS1_BITS+3)
4453 			    & RANGE_MASK];
4454     outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
4455 					      CONST_BITS+PASS1_BITS+3)
4456 			    & RANGE_MASK];
4457     outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
4458 					      CONST_BITS+PASS1_BITS+3)
4459 			    & RANGE_MASK];
4460     outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
4461 					      CONST_BITS+PASS1_BITS+3)
4462 			    & RANGE_MASK];
4463     outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
4464 					      CONST_BITS+PASS1_BITS+3)
4465 			    & RANGE_MASK];
4466     outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp23,
4467 					      CONST_BITS+PASS1_BITS+3)
4468 			    & RANGE_MASK];
4469 
4470     wsptr += 7;		/* advance pointer to next row */
4471   }
4472 }
4473 
4474 
4475 /*
4476  * Perform dequantization and inverse DCT on one block of coefficients,
4477  * producing a 6x12 output block.
4478  *
4479  * 12-point IDCT in pass 1 (columns), 6-point in pass 2 (rows).
4480  */
4481 
4482 GLOBAL(void)
jpeg_idct_6x12(j_decompress_ptr cinfo,jpeg_component_info * compptr,JCOEFPTR coef_block,JSAMPARRAY output_buf,JDIMENSION output_col)4483 jpeg_idct_6x12 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
4484 		JCOEFPTR coef_block,
4485 		JSAMPARRAY output_buf, JDIMENSION output_col)
4486 {
4487   INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
4488   INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25;
4489   INT32 z1, z2, z3, z4;
4490   JCOEFPTR inptr;
4491   ISLOW_MULT_TYPE * quantptr;
4492   int * wsptr;
4493   JSAMPROW outptr;
4494   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
4495   int ctr;
4496   int workspace[6*12];	/* buffers data between passes */
4497   SHIFT_TEMPS
4498 
4499   /* Pass 1: process columns from input, store into work array.
4500    * 12-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/24).
4501    */
4502   inptr = coef_block;
4503   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
4504   wsptr = workspace;
4505   for (ctr = 0; ctr < 6; ctr++, inptr++, quantptr++, wsptr++) {
4506     /* Even part */
4507 
4508     z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
4509     if (ctr == 0)
4510       CLAMP_DC(z3);
4511     z3 <<= CONST_BITS;
4512     /* Add fudge factor here for final descale. */
4513     z3 += ONE << (CONST_BITS-PASS1_BITS-1);
4514 
4515     z4 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
4516     z4 = MULTIPLY(z4, FIX(1.224744871)); /* c4 */
4517 
4518     tmp10 = z3 + z4;
4519     tmp11 = z3 - z4;
4520 
4521     z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
4522     z4 = MULTIPLY(z1, FIX(1.366025404)); /* c2 */
4523     z1 <<= CONST_BITS;
4524     z2 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
4525     z2 <<= CONST_BITS;
4526 
4527     tmp12 = z1 - z2;
4528 
4529     tmp21 = z3 + tmp12;
4530     tmp24 = z3 - tmp12;
4531 
4532     tmp12 = z4 + z2;
4533 
4534     tmp20 = tmp10 + tmp12;
4535     tmp25 = tmp10 - tmp12;
4536 
4537     tmp12 = z4 - z1 - z2;
4538 
4539     tmp22 = tmp11 + tmp12;
4540     tmp23 = tmp11 - tmp12;
4541 
4542     /* Odd part */
4543 
4544     z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
4545     z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
4546     z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
4547     z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
4548 
4549     tmp11 = MULTIPLY(z2, FIX(1.306562965));                  /* c3 */
4550     tmp14 = MULTIPLY(z2, - FIX_0_541196100);                 /* -c9 */
4551 
4552     tmp10 = z1 + z3;
4553     tmp15 = MULTIPLY(tmp10 + z4, FIX(0.860918669));          /* c7 */
4554     tmp12 = tmp15 + MULTIPLY(tmp10, FIX(0.261052384));       /* c5-c7 */
4555     tmp10 = tmp12 + tmp11 + MULTIPLY(z1, FIX(0.280143716));  /* c1-c5 */
4556     tmp13 = MULTIPLY(z3 + z4, - FIX(1.045510580));           /* -(c7+c11) */
4557     tmp12 += tmp13 + tmp14 - MULTIPLY(z3, FIX(1.478575242)); /* c1+c5-c7-c11 */
4558     tmp13 += tmp15 - tmp11 + MULTIPLY(z4, FIX(1.586706681)); /* c1+c11 */
4559     tmp15 += tmp14 - MULTIPLY(z1, FIX(0.676326758)) -        /* c7-c11 */
4560 	     MULTIPLY(z4, FIX(1.982889723));                 /* c5+c7 */
4561 
4562     z1 -= z4;
4563     z2 -= z3;
4564     z3 = MULTIPLY(z1 + z2, FIX_0_541196100);                 /* c9 */
4565     tmp11 = z3 + MULTIPLY(z1, FIX_0_765366865);              /* c3-c9 */
4566     tmp14 = z3 - MULTIPLY(z2, FIX_1_847759065);              /* c3+c9 */
4567 
4568     /* Final output stage */
4569 
4570     wsptr[6*0]  = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
4571     wsptr[6*11] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
4572     wsptr[6*1]  = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
4573     wsptr[6*10] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
4574     wsptr[6*2]  = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
4575     wsptr[6*9]  = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
4576     wsptr[6*3]  = (int) RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS-PASS1_BITS);
4577     wsptr[6*8]  = (int) RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS-PASS1_BITS);
4578     wsptr[6*4]  = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
4579     wsptr[6*7]  = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
4580     wsptr[6*5]  = (int) RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS-PASS1_BITS);
4581     wsptr[6*6]  = (int) RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS-PASS1_BITS);
4582   }
4583 
4584   /* Pass 2: process 12 rows from work array, store into output array.
4585    * 6-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/12).
4586    */
4587   wsptr = workspace;
4588   for (ctr = 0; ctr < 12; ctr++) {
4589     outptr = output_buf[ctr] + output_col;
4590 
4591     /* Even part */
4592 
4593     /* Add fudge factor here for final descale. */
4594     tmp10 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
4595     tmp10 <<= CONST_BITS;
4596     tmp12 = (INT32) wsptr[4];
4597     tmp20 = MULTIPLY(tmp12, FIX(0.707106781));   /* c4 */
4598     tmp11 = tmp10 + tmp20;
4599     tmp21 = tmp10 - tmp20 - tmp20;
4600     tmp20 = (INT32) wsptr[2];
4601     tmp10 = MULTIPLY(tmp20, FIX(1.224744871));   /* c2 */
4602     tmp20 = tmp11 + tmp10;
4603     tmp22 = tmp11 - tmp10;
4604 
4605     /* Odd part */
4606 
4607     z1 = (INT32) wsptr[1];
4608     z2 = (INT32) wsptr[3];
4609     z3 = (INT32) wsptr[5];
4610     tmp11 = MULTIPLY(z1 + z3, FIX(0.366025404)); /* c5 */
4611     tmp10 = tmp11 + ((z1 + z2) << CONST_BITS);
4612     tmp12 = tmp11 + ((z3 - z2) << CONST_BITS);
4613     tmp11 = (z1 - z2 - z3) << CONST_BITS;
4614 
4615     /* Final output stage */
4616 
4617     outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
4618 					      CONST_BITS+PASS1_BITS+3)
4619 			    & RANGE_MASK];
4620     outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
4621 					      CONST_BITS+PASS1_BITS+3)
4622 			    & RANGE_MASK];
4623     outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
4624 					      CONST_BITS+PASS1_BITS+3)
4625 			    & RANGE_MASK];
4626     outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
4627 					      CONST_BITS+PASS1_BITS+3)
4628 			    & RANGE_MASK];
4629     outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
4630 					      CONST_BITS+PASS1_BITS+3)
4631 			    & RANGE_MASK];
4632     outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
4633 					      CONST_BITS+PASS1_BITS+3)
4634 			    & RANGE_MASK];
4635 
4636     wsptr += 6;		/* advance pointer to next row */
4637   }
4638 }
4639 
4640 
4641 /*
4642  * Perform dequantization and inverse DCT on one block of coefficients,
4643  * producing a 5x10 output block.
4644  *
4645  * 10-point IDCT in pass 1 (columns), 5-point in pass 2 (rows).
4646  */
4647 
4648 GLOBAL(void)
jpeg_idct_5x10(j_decompress_ptr cinfo,jpeg_component_info * compptr,JCOEFPTR coef_block,JSAMPARRAY output_buf,JDIMENSION output_col)4649 jpeg_idct_5x10 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
4650 		JCOEFPTR coef_block,
4651 		JSAMPARRAY output_buf, JDIMENSION output_col)
4652 {
4653   INT32 tmp10, tmp11, tmp12, tmp13, tmp14;
4654   INT32 tmp20, tmp21, tmp22, tmp23, tmp24;
4655   INT32 z1, z2, z3, z4, z5;
4656   JCOEFPTR inptr;
4657   ISLOW_MULT_TYPE * quantptr;
4658   int * wsptr;
4659   JSAMPROW outptr;
4660   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
4661   int ctr;
4662   int workspace[5*10];	/* buffers data between passes */
4663   SHIFT_TEMPS
4664 
4665   /* Pass 1: process columns from input, store into work array.
4666    * 10-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/20).
4667    */
4668   inptr = coef_block;
4669   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
4670   wsptr = workspace;
4671   for (ctr = 0; ctr < 5; ctr++, inptr++, quantptr++, wsptr++) {
4672     /* Even part */
4673 
4674     z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
4675     if (ctr == 0)
4676       CLAMP_DC(z3);
4677     z3 <<= CONST_BITS;
4678     /* Add fudge factor here for final descale. */
4679     z3 += ONE << (CONST_BITS-PASS1_BITS-1);
4680     z4 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
4681     z1 = MULTIPLY(z4, FIX(1.144122806));         /* c4 */
4682     z2 = MULTIPLY(z4, FIX(0.437016024));         /* c8 */
4683     tmp10 = z3 + z1;
4684     tmp11 = z3 - z2;
4685 
4686     tmp22 = RIGHT_SHIFT(z3 - ((z1 - z2) << 1),   /* c0 = (c4-c8)*2 */
4687 			CONST_BITS-PASS1_BITS);
4688 
4689     z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
4690     z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
4691 
4692     z1 = MULTIPLY(z2 + z3, FIX(0.831253876));    /* c6 */
4693     tmp12 = z1 + MULTIPLY(z2, FIX(0.513743148)); /* c2-c6 */
4694     tmp13 = z1 - MULTIPLY(z3, FIX(2.176250899)); /* c2+c6 */
4695 
4696     tmp20 = tmp10 + tmp12;
4697     tmp24 = tmp10 - tmp12;
4698     tmp21 = tmp11 + tmp13;
4699     tmp23 = tmp11 - tmp13;
4700 
4701     /* Odd part */
4702 
4703     z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
4704     z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
4705     z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
4706     z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
4707 
4708     tmp11 = z2 + z4;
4709     tmp13 = z2 - z4;
4710 
4711     tmp12 = MULTIPLY(tmp13, FIX(0.309016994));        /* (c3-c7)/2 */
4712     z5 = z3 << CONST_BITS;
4713 
4714     z2 = MULTIPLY(tmp11, FIX(0.951056516));           /* (c3+c7)/2 */
4715     z4 = z5 + tmp12;
4716 
4717     tmp10 = MULTIPLY(z1, FIX(1.396802247)) + z2 + z4; /* c1 */
4718     tmp14 = MULTIPLY(z1, FIX(0.221231742)) - z2 + z4; /* c9 */
4719 
4720     z2 = MULTIPLY(tmp11, FIX(0.587785252));           /* (c1-c9)/2 */
4721     z4 = z5 - tmp12 - (tmp13 << (CONST_BITS - 1));
4722 
4723     tmp12 = (z1 - tmp13 - z3) << PASS1_BITS;
4724 
4725     tmp11 = MULTIPLY(z1, FIX(1.260073511)) - z2 - z4; /* c3 */
4726     tmp13 = MULTIPLY(z1, FIX(0.642039522)) - z2 + z4; /* c7 */
4727 
4728     /* Final output stage */
4729 
4730     wsptr[5*0] = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
4731     wsptr[5*9] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
4732     wsptr[5*1] = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
4733     wsptr[5*8] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
4734     wsptr[5*2] = (int) (tmp22 + tmp12);
4735     wsptr[5*7] = (int) (tmp22 - tmp12);
4736     wsptr[5*3] = (int) RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS-PASS1_BITS);
4737     wsptr[5*6] = (int) RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS-PASS1_BITS);
4738     wsptr[5*4] = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
4739     wsptr[5*5] = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
4740   }
4741 
4742   /* Pass 2: process 10 rows from work array, store into output array.
4743    * 5-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/10).
4744    */
4745   wsptr = workspace;
4746   for (ctr = 0; ctr < 10; ctr++) {
4747     outptr = output_buf[ctr] + output_col;
4748 
4749     /* Even part */
4750 
4751     /* Add fudge factor here for final descale. */
4752     tmp12 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
4753     tmp12 <<= CONST_BITS;
4754     tmp13 = (INT32) wsptr[2];
4755     tmp14 = (INT32) wsptr[4];
4756     z1 = MULTIPLY(tmp13 + tmp14, FIX(0.790569415)); /* (c2+c4)/2 */
4757     z2 = MULTIPLY(tmp13 - tmp14, FIX(0.353553391)); /* (c2-c4)/2 */
4758     z3 = tmp12 + z2;
4759     tmp10 = z3 + z1;
4760     tmp11 = z3 - z1;
4761     tmp12 -= z2 << 2;
4762 
4763     /* Odd part */
4764 
4765     z2 = (INT32) wsptr[1];
4766     z3 = (INT32) wsptr[3];
4767 
4768     z1 = MULTIPLY(z2 + z3, FIX(0.831253876));       /* c3 */
4769     tmp13 = z1 + MULTIPLY(z2, FIX(0.513743148));    /* c1-c3 */
4770     tmp14 = z1 - MULTIPLY(z3, FIX(2.176250899));    /* c1+c3 */
4771 
4772     /* Final output stage */
4773 
4774     outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp13,
4775 					      CONST_BITS+PASS1_BITS+3)
4776 			    & RANGE_MASK];
4777     outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp13,
4778 					      CONST_BITS+PASS1_BITS+3)
4779 			    & RANGE_MASK];
4780     outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp14,
4781 					      CONST_BITS+PASS1_BITS+3)
4782 			    & RANGE_MASK];
4783     outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp14,
4784 					      CONST_BITS+PASS1_BITS+3)
4785 			    & RANGE_MASK];
4786     outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12,
4787 					      CONST_BITS+PASS1_BITS+3)
4788 			    & RANGE_MASK];
4789 
4790     wsptr += 5;		/* advance pointer to next row */
4791   }
4792 }
4793 
4794 
4795 /*
4796  * Perform dequantization and inverse DCT on one block of coefficients,
4797  * producing a 4x8 output block.
4798  *
4799  * 8-point IDCT in pass 1 (columns), 4-point in pass 2 (rows).
4800  */
4801 
4802 GLOBAL(void)
jpeg_idct_4x8(j_decompress_ptr cinfo,jpeg_component_info * compptr,JCOEFPTR coef_block,JSAMPARRAY output_buf,JDIMENSION output_col)4803 jpeg_idct_4x8 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
4804 	       JCOEFPTR coef_block,
4805 	       JSAMPARRAY output_buf, JDIMENSION output_col)
4806 {
4807   INT32 tmp0, tmp1, tmp2, tmp3;
4808   INT32 tmp10, tmp11, tmp12, tmp13;
4809   INT32 z1, z2, z3;
4810   JCOEFPTR inptr;
4811   ISLOW_MULT_TYPE * quantptr;
4812   int * wsptr;
4813   JSAMPROW outptr;
4814   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
4815   int ctr;
4816   int workspace[4*8];	/* buffers data between passes */
4817   SHIFT_TEMPS
4818 
4819   /* Pass 1: process columns from input, store into work array. */
4820   /* Note results are scaled up by sqrt(8) compared to a true IDCT; */
4821   /* furthermore, we scale the results by 2**PASS1_BITS. */
4822 
4823   inptr = coef_block;
4824   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
4825   wsptr = workspace;
4826   for (ctr = 4; ctr > 0; ctr--) {
4827     /* Due to quantization, we will usually find that many of the input
4828      * coefficients are zero, especially the AC terms.  We can exploit this
4829      * by short-circuiting the IDCT calculation for any column in which all
4830      * the AC terms are zero.  In that case each output is equal to the
4831      * DC coefficient (with scale factor as needed).
4832      * With typical images and quantization tables, half or more of the
4833      * column DCT calculations can be simplified this way.
4834      */
4835 
4836     if (inptr[DCTSIZE*1] == 0 && inptr[DCTSIZE*2] == 0 &&
4837 	inptr[DCTSIZE*3] == 0 && inptr[DCTSIZE*4] == 0 &&
4838 	inptr[DCTSIZE*5] == 0 && inptr[DCTSIZE*6] == 0 &&
4839 	inptr[DCTSIZE*7] == 0) {
4840       /* AC terms all zero */
4841       int dcval = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
4842       if (ctr == 4)
4843         CLAMP_DC(dcval);
4844       dcval <<= PASS1_BITS;
4845       wsptr[4*0] = dcval;
4846       wsptr[4*1] = dcval;
4847       wsptr[4*2] = dcval;
4848       wsptr[4*3] = dcval;
4849       wsptr[4*4] = dcval;
4850       wsptr[4*5] = dcval;
4851       wsptr[4*6] = dcval;
4852       wsptr[4*7] = dcval;
4853 
4854       inptr++;			/* advance pointers to next column */
4855       quantptr++;
4856       wsptr++;
4857       continue;
4858     }
4859 
4860     /* Even part: reverse the even part of the forward DCT. */
4861     /* The rotator is sqrt(2)*c(-6). */
4862 
4863     z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
4864     z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
4865 
4866     z1 = MULTIPLY(z2 + z3, FIX_0_541196100);
4867     tmp2 = z1 + MULTIPLY(z2, FIX_0_765366865);
4868     tmp3 = z1 - MULTIPLY(z3, FIX_1_847759065);
4869 
4870     z2 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
4871     if (ctr == 4)
4872       CLAMP_DC(z2);
4873     z3 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
4874     z2 <<= CONST_BITS;
4875     z3 <<= CONST_BITS;
4876     /* Add fudge factor here for final descale. */
4877     z2 += ONE << (CONST_BITS-PASS1_BITS-1);
4878 
4879     tmp0 = z2 + z3;
4880     tmp1 = z2 - z3;
4881 
4882     tmp10 = tmp0 + tmp2;
4883     tmp13 = tmp0 - tmp2;
4884     tmp11 = tmp1 + tmp3;
4885     tmp12 = tmp1 - tmp3;
4886 
4887     /* Odd part per figure 8; the matrix is unitary and hence its
4888      * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
4889      */
4890 
4891     tmp0 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
4892     tmp1 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
4893     tmp2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
4894     tmp3 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
4895 
4896     z2 = tmp0 + tmp2;
4897     z3 = tmp1 + tmp3;
4898 
4899     z1 = MULTIPLY(z2 + z3, FIX_1_175875602); /* sqrt(2) * c3 */
4900     z2 = MULTIPLY(z2, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
4901     z3 = MULTIPLY(z3, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */
4902     z2 += z1;
4903     z3 += z1;
4904 
4905     z1 = MULTIPLY(tmp0 + tmp3, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
4906     tmp0 = MULTIPLY(tmp0, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */
4907     tmp3 = MULTIPLY(tmp3, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
4908     tmp0 += z1 + z2;
4909     tmp3 += z1 + z3;
4910 
4911     z1 = MULTIPLY(tmp1 + tmp2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
4912     tmp1 = MULTIPLY(tmp1, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
4913     tmp2 = MULTIPLY(tmp2, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
4914     tmp1 += z1 + z3;
4915     tmp2 += z1 + z2;
4916 
4917     /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
4918 
4919     wsptr[4*0] = (int) RIGHT_SHIFT(tmp10 + tmp3, CONST_BITS-PASS1_BITS);
4920     wsptr[4*7] = (int) RIGHT_SHIFT(tmp10 - tmp3, CONST_BITS-PASS1_BITS);
4921     wsptr[4*1] = (int) RIGHT_SHIFT(tmp11 + tmp2, CONST_BITS-PASS1_BITS);
4922     wsptr[4*6] = (int) RIGHT_SHIFT(tmp11 - tmp2, CONST_BITS-PASS1_BITS);
4923     wsptr[4*2] = (int) RIGHT_SHIFT(tmp12 + tmp1, CONST_BITS-PASS1_BITS);
4924     wsptr[4*5] = (int) RIGHT_SHIFT(tmp12 - tmp1, CONST_BITS-PASS1_BITS);
4925     wsptr[4*3] = (int) RIGHT_SHIFT(tmp13 + tmp0, CONST_BITS-PASS1_BITS);
4926     wsptr[4*4] = (int) RIGHT_SHIFT(tmp13 - tmp0, CONST_BITS-PASS1_BITS);
4927 
4928     inptr++;			/* advance pointers to next column */
4929     quantptr++;
4930     wsptr++;
4931   }
4932 
4933   /* Pass 2: process 8 rows from work array, store into output array.
4934    * 4-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/16).
4935    */
4936   wsptr = workspace;
4937   for (ctr = 0; ctr < 8; ctr++) {
4938     outptr = output_buf[ctr] + output_col;
4939 
4940     /* Even part */
4941 
4942     /* Add fudge factor here for final descale. */
4943     tmp0 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
4944     tmp2 = (INT32) wsptr[2];
4945 
4946     tmp10 = (tmp0 + tmp2) << CONST_BITS;
4947     tmp12 = (tmp0 - tmp2) << CONST_BITS;
4948 
4949     /* Odd part */
4950     /* Same rotation as in the even part of the 8x8 LL&M IDCT */
4951 
4952     z2 = (INT32) wsptr[1];
4953     z3 = (INT32) wsptr[3];
4954 
4955     z1 = MULTIPLY(z2 + z3, FIX_0_541196100);   /* c6 */
4956     tmp0 = z1 + MULTIPLY(z2, FIX_0_765366865); /* c2-c6 */
4957     tmp2 = z1 - MULTIPLY(z3, FIX_1_847759065); /* c2+c6 */
4958 
4959     /* Final output stage */
4960 
4961     outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
4962 					      CONST_BITS+PASS1_BITS+3)
4963 			    & RANGE_MASK];
4964     outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
4965 					      CONST_BITS+PASS1_BITS+3)
4966 			    & RANGE_MASK];
4967     outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp2,
4968 					      CONST_BITS+PASS1_BITS+3)
4969 			    & RANGE_MASK];
4970     outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp2,
4971 					      CONST_BITS+PASS1_BITS+3)
4972 			    & RANGE_MASK];
4973 
4974     wsptr += 4;		/* advance pointer to next row */
4975   }
4976 }
4977 
4978 
4979 /*
4980  * Perform dequantization and inverse DCT on one block of coefficients,
4981  * producing a reduced-size 3x6 output block.
4982  *
4983  * 6-point IDCT in pass 1 (columns), 3-point in pass 2 (rows).
4984  */
4985 
4986 GLOBAL(void)
jpeg_idct_3x6(j_decompress_ptr cinfo,jpeg_component_info * compptr,JCOEFPTR coef_block,JSAMPARRAY output_buf,JDIMENSION output_col)4987 jpeg_idct_3x6 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
4988 	       JCOEFPTR coef_block,
4989 	       JSAMPARRAY output_buf, JDIMENSION output_col)
4990 {
4991   INT32 tmp0, tmp1, tmp2, tmp10, tmp11, tmp12;
4992   INT32 z1, z2, z3;
4993   JCOEFPTR inptr;
4994   ISLOW_MULT_TYPE * quantptr;
4995   int * wsptr;
4996   JSAMPROW outptr;
4997   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
4998   int ctr;
4999   int workspace[3*6];	/* buffers data between passes */
5000   SHIFT_TEMPS
5001 
5002   /* Pass 1: process columns from input, store into work array.
5003    * 6-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/12).
5004    */
5005   inptr = coef_block;
5006   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
5007   wsptr = workspace;
5008   for (ctr = 0; ctr < 3; ctr++, inptr++, quantptr++, wsptr++) {
5009     /* Even part */
5010 
5011     tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
5012     if (ctr == 0)
5013       CLAMP_DC(tmp0);
5014     tmp0 <<= CONST_BITS;
5015     /* Add fudge factor here for final descale. */
5016     tmp0 += ONE << (CONST_BITS-PASS1_BITS-1);
5017     tmp2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
5018     tmp10 = MULTIPLY(tmp2, FIX(0.707106781));   /* c4 */
5019     tmp1 = tmp0 + tmp10;
5020     tmp11 = RIGHT_SHIFT(tmp0 - tmp10 - tmp10, CONST_BITS-PASS1_BITS);
5021     tmp10 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
5022     tmp0 = MULTIPLY(tmp10, FIX(1.224744871));   /* c2 */
5023     tmp10 = tmp1 + tmp0;
5024     tmp12 = tmp1 - tmp0;
5025 
5026     /* Odd part */
5027 
5028     z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
5029     z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
5030     z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
5031     tmp1 = MULTIPLY(z1 + z3, FIX(0.366025404)); /* c5 */
5032     tmp0 = tmp1 + ((z1 + z2) << CONST_BITS);
5033     tmp2 = tmp1 + ((z3 - z2) << CONST_BITS);
5034     tmp1 = (z1 - z2 - z3) << PASS1_BITS;
5035 
5036     /* Final output stage */
5037 
5038     wsptr[3*0] = (int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS-PASS1_BITS);
5039     wsptr[3*5] = (int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS-PASS1_BITS);
5040     wsptr[3*1] = (int) (tmp11 + tmp1);
5041     wsptr[3*4] = (int) (tmp11 - tmp1);
5042     wsptr[3*2] = (int) RIGHT_SHIFT(tmp12 + tmp2, CONST_BITS-PASS1_BITS);
5043     wsptr[3*3] = (int) RIGHT_SHIFT(tmp12 - tmp2, CONST_BITS-PASS1_BITS);
5044   }
5045 
5046   /* Pass 2: process 6 rows from work array, store into output array.
5047    * 3-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/6).
5048    */
5049   wsptr = workspace;
5050   for (ctr = 0; ctr < 6; ctr++) {
5051     outptr = output_buf[ctr] + output_col;
5052 
5053     /* Even part */
5054 
5055     /* Add fudge factor here for final descale. */
5056     tmp0 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
5057     tmp0 <<= CONST_BITS;
5058     tmp2 = (INT32) wsptr[2];
5059     tmp12 = MULTIPLY(tmp2, FIX(0.707106781)); /* c2 */
5060     tmp10 = tmp0 + tmp12;
5061     tmp2 = tmp0 - tmp12 - tmp12;
5062 
5063     /* Odd part */
5064 
5065     tmp12 = (INT32) wsptr[1];
5066     tmp0 = MULTIPLY(tmp12, FIX(1.224744871)); /* c1 */
5067 
5068     /* Final output stage */
5069 
5070     outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
5071 					      CONST_BITS+PASS1_BITS+3)
5072 			    & RANGE_MASK];
5073     outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
5074 					      CONST_BITS+PASS1_BITS+3)
5075 			    & RANGE_MASK];
5076     outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp2,
5077 					      CONST_BITS+PASS1_BITS+3)
5078 			    & RANGE_MASK];
5079 
5080     wsptr += 3;		/* advance pointer to next row */
5081   }
5082 }
5083 
5084 
5085 /*
5086  * Perform dequantization and inverse DCT on one block of coefficients,
5087  * producing a 2x4 output block.
5088  *
5089  * 4-point IDCT in pass 1 (columns), 2-point in pass 2 (rows).
5090  */
5091 
5092 GLOBAL(void)
jpeg_idct_2x4(j_decompress_ptr cinfo,jpeg_component_info * compptr,JCOEFPTR coef_block,JSAMPARRAY output_buf,JDIMENSION output_col)5093 jpeg_idct_2x4 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
5094 	       JCOEFPTR coef_block,
5095 	       JSAMPARRAY output_buf, JDIMENSION output_col)
5096 {
5097   INT32 tmp0, tmp2, tmp10, tmp12;
5098   INT32 z1, z2, z3;
5099   JCOEFPTR inptr;
5100   ISLOW_MULT_TYPE * quantptr;
5101   INT32 * wsptr;
5102   JSAMPROW outptr;
5103   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
5104   int ctr;
5105   INT32 workspace[2*4];	/* buffers data between passes */
5106   SHIFT_TEMPS
5107 
5108   /* Pass 1: process columns from input, store into work array.
5109    * 4-point IDCT kernel,
5110    * cK represents sqrt(2) * cos(K*pi/16) [refers to 8-point IDCT].
5111    */
5112   inptr = coef_block;
5113   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
5114   wsptr = workspace;
5115   for (ctr = 0; ctr < 2; ctr++, inptr++, quantptr++, wsptr++) {
5116     /* Even part */
5117 
5118     tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
5119     if (ctr == 0)
5120       CLAMP_DC(tmp0);
5121     tmp2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
5122 
5123     tmp10 = (tmp0 + tmp2) << CONST_BITS;
5124     tmp12 = (tmp0 - tmp2) << CONST_BITS;
5125 
5126     /* Odd part */
5127     /* Same rotation as in the even part of the 8x8 LL&M IDCT */
5128 
5129     z2 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
5130     z3 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
5131 
5132     z1 = MULTIPLY(z2 + z3, FIX_0_541196100);   /* c6 */
5133     tmp0 = z1 + MULTIPLY(z2, FIX_0_765366865); /* c2-c6 */
5134     tmp2 = z1 - MULTIPLY(z3, FIX_1_847759065); /* c2+c6 */
5135 
5136     /* Final output stage */
5137 
5138     wsptr[2*0] = tmp10 + tmp0;
5139     wsptr[2*3] = tmp10 - tmp0;
5140     wsptr[2*1] = tmp12 + tmp2;
5141     wsptr[2*2] = tmp12 - tmp2;
5142   }
5143 
5144   /* Pass 2: process 4 rows from work array, store into output array. */
5145 
5146   wsptr = workspace;
5147   for (ctr = 0; ctr < 4; ctr++) {
5148     outptr = output_buf[ctr] + output_col;
5149 
5150     /* Even part */
5151 
5152     /* Add fudge factor here for final descale. */
5153     tmp10 = wsptr[0] + (ONE << (CONST_BITS+2));
5154 
5155     /* Odd part */
5156 
5157     tmp0 = wsptr[1];
5158 
5159     /* Final output stage */
5160 
5161     outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS+3)
5162 			    & RANGE_MASK];
5163     outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS+3)
5164 			    & RANGE_MASK];
5165 
5166     wsptr += 2;		/* advance pointer to next row */
5167   }
5168 }
5169 
5170 
5171 /*
5172  * Perform dequantization and inverse DCT on one block of coefficients,
5173  * producing a 1x2 output block.
5174  *
5175  * 2-point IDCT in pass 1 (columns), 1-point in pass 2 (rows).
5176  */
5177 
5178 GLOBAL(void)
jpeg_idct_1x2(j_decompress_ptr cinfo,jpeg_component_info * compptr,JCOEFPTR coef_block,JSAMPARRAY output_buf,JDIMENSION output_col)5179 jpeg_idct_1x2 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
5180 	       JCOEFPTR coef_block,
5181 	       JSAMPARRAY output_buf, JDIMENSION output_col)
5182 {
5183   INT32 tmp0, tmp10;
5184   ISLOW_MULT_TYPE * quantptr;
5185   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
5186   SHIFT_TEMPS
5187 
5188   /* Process 1 column from input, store into output array. */
5189 
5190   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
5191 
5192   /* Even part */
5193 
5194   tmp10 = DEQUANTIZE(coef_block[DCTSIZE*0], quantptr[DCTSIZE*0]);
5195   CLAMP_DC(tmp10);
5196   /* Add fudge factor here for final descale. */
5197   tmp10 += ONE << 2;
5198 
5199   /* Odd part */
5200 
5201   tmp0 = DEQUANTIZE(coef_block[DCTSIZE*1], quantptr[DCTSIZE*1]);
5202 
5203   /* Final output stage */
5204 
5205   output_buf[0][output_col] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0, 3)
5206 					  & RANGE_MASK];
5207   output_buf[1][output_col] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0, 3)
5208 					  & RANGE_MASK];
5209 }
5210 
5211 #endif /* IDCT_SCALING_SUPPORTED */
5212 #endif /* DCT_ISLOW_SUPPORTED */
5213