1 /*
2  * jfdctint.c
3  *
4  * Copyright (C) 1991-1996, Thomas G. Lane.
5  * Modification developed 2003-2015 by Guido Vollbeding.
6  * This file is part of the Independent JPEG Group's software.
7  * For conditions of distribution and use, see the accompanying README file.
8  *
9  * This file contains a slow-but-accurate integer implementation of the
10  * forward DCT (Discrete Cosine Transform).
11  *
12  * A 2-D DCT can be done by 1-D DCT on each row followed by 1-D DCT
13  * on each column.  Direct algorithms are also available, but they are
14  * much more complex and seem not to be any faster when reduced to code.
15  *
16  * This implementation is based on an algorithm described in
17  *   C. Loeffler, A. Ligtenberg and G. Moschytz, "Practical Fast 1-D DCT
18  *   Algorithms with 11 Multiplications", Proc. Int'l. Conf. on Acoustics,
19  *   Speech, and Signal Processing 1989 (ICASSP '89), pp. 988-991.
20  * The primary algorithm described there uses 11 multiplies and 29 adds.
21  * We use their alternate method with 12 multiplies and 32 adds.
22  * The advantage of this method is that no data path contains more than one
23  * multiplication; this allows a very simple and accurate implementation in
24  * scaled fixed-point arithmetic, with a minimal number of shifts.
25  *
26  * We also provide FDCT routines with various input sample block sizes for
27  * direct resolution reduction or enlargement and for direct resolving the
28  * common 2x1 and 1x2 subsampling cases without additional resampling: NxN
29  * (N=1...16), 2NxN, and Nx2N (N=1...8) pixels for one 8x8 output DCT block.
30  *
31  * For N<8 we fill the remaining block coefficients with zero.
32  * For N>8 we apply a partial N-point FDCT on the input samples, computing
33  * just the lower 8 frequency coefficients and discarding the rest.
34  *
35  * We must scale the output coefficients of the N-point FDCT appropriately
36  * to the standard 8-point FDCT level by 8/N per 1-D pass.  This scaling
37  * is folded into the constant multipliers (pass 2) and/or final/initial
38  * shifting.
39  *
40  * CAUTION: We rely on the FIX() macro except for the N=1,2,4,8 cases
41  * since there would be too many additional constants to pre-calculate.
42  */
43 
44 #define JPEG_INTERNALS
45 #include "jinclude.h"
46 #include "jpeglib.h"
47 #include "jdct.h"		/* Private declarations for DCT subsystem */
48 
49 #ifdef DCT_ISLOW_SUPPORTED
50 
51 
52 /*
53  * This module is specialized to the case DCTSIZE = 8.
54  */
55 
56 #if DCTSIZE != 8
57   Sorry, this code only copes with 8x8 DCT blocks. /* deliberate syntax err */
58 #endif
59 
60 
61 /*
62  * The poop on this scaling stuff is as follows:
63  *
64  * Each 1-D DCT step produces outputs which are a factor of sqrt(N)
65  * larger than the true DCT outputs.  The final outputs are therefore
66  * a factor of N larger than desired; since N=8 this can be cured by
67  * a simple right shift at the end of the algorithm.  The advantage of
68  * this arrangement is that we save two multiplications per 1-D DCT,
69  * because the y0 and y4 outputs need not be divided by sqrt(N).
70  * In the IJG code, this factor of 8 is removed by the quantization step
71  * (in jcdctmgr.c), NOT in this module.
72  *
73  * We have to do addition and subtraction of the integer inputs, which
74  * is no problem, and multiplication by fractional constants, which is
75  * a problem to do in integer arithmetic.  We multiply all the constants
76  * by CONST_SCALE and convert them to integer constants (thus retaining
77  * CONST_BITS bits of precision in the constants).  After doing a
78  * multiplication we have to divide the product by CONST_SCALE, with proper
79  * rounding, to produce the correct output.  This division can be done
80  * cheaply as a right shift of CONST_BITS bits.  We postpone shifting
81  * as long as possible so that partial sums can be added together with
82  * full fractional precision.
83  *
84  * The outputs of the first pass are scaled up by PASS1_BITS bits so that
85  * they are represented to better-than-integral precision.  These outputs
86  * require BITS_IN_JSAMPLE + PASS1_BITS + 3 bits; this fits in a 16-bit word
87  * with the recommended scaling.  (For 12-bit sample data, the intermediate
88  * array is INT32 anyway.)
89  *
90  * To avoid overflow of the 32-bit intermediate results in pass 2, we must
91  * have BITS_IN_JSAMPLE + CONST_BITS + PASS1_BITS <= 26.  Error analysis
92  * shows that the values given below are the most effective.
93  */
94 
95 #if BITS_IN_JSAMPLE == 8
96 #define CONST_BITS  13
97 #define PASS1_BITS  2
98 #else
99 #define CONST_BITS  13
100 #define PASS1_BITS  1		/* lose a little precision to avoid overflow */
101 #endif
102 
103 /* Some C compilers fail to reduce "FIX(constant)" at compile time, thus
104  * causing a lot of useless floating-point operations at run time.
105  * To get around this we use the following pre-calculated constants.
106  * If you change CONST_BITS you may want to add appropriate values.
107  * (With a reasonable C compiler, you can just rely on the FIX() macro...)
108  */
109 
110 #if CONST_BITS == 13
111 #define FIX_0_298631336  ((INT32)  2446)	/* FIX(0.298631336) */
112 #define FIX_0_390180644  ((INT32)  3196)	/* FIX(0.390180644) */
113 #define FIX_0_541196100  ((INT32)  4433)	/* FIX(0.541196100) */
114 #define FIX_0_765366865  ((INT32)  6270)	/* FIX(0.765366865) */
115 #define FIX_0_899976223  ((INT32)  7373)	/* FIX(0.899976223) */
116 #define FIX_1_175875602  ((INT32)  9633)	/* FIX(1.175875602) */
117 #define FIX_1_501321110  ((INT32)  12299)	/* FIX(1.501321110) */
118 #define FIX_1_847759065  ((INT32)  15137)	/* FIX(1.847759065) */
119 #define FIX_1_961570560  ((INT32)  16069)	/* FIX(1.961570560) */
120 #define FIX_2_053119869  ((INT32)  16819)	/* FIX(2.053119869) */
121 #define FIX_2_562915447  ((INT32)  20995)	/* FIX(2.562915447) */
122 #define FIX_3_072711026  ((INT32)  25172)	/* FIX(3.072711026) */
123 #else
124 #define FIX_0_298631336  FIX(0.298631336)
125 #define FIX_0_390180644  FIX(0.390180644)
126 #define FIX_0_541196100  FIX(0.541196100)
127 #define FIX_0_765366865  FIX(0.765366865)
128 #define FIX_0_899976223  FIX(0.899976223)
129 #define FIX_1_175875602  FIX(1.175875602)
130 #define FIX_1_501321110  FIX(1.501321110)
131 #define FIX_1_847759065  FIX(1.847759065)
132 #define FIX_1_961570560  FIX(1.961570560)
133 #define FIX_2_053119869  FIX(2.053119869)
134 #define FIX_2_562915447  FIX(2.562915447)
135 #define FIX_3_072711026  FIX(3.072711026)
136 #endif
137 
138 
139 /* Multiply an INT32 variable by an INT32 constant to yield an INT32 result.
140  * For 8-bit samples with the recommended scaling, all the variable
141  * and constant values involved are no more than 16 bits wide, so a
142  * 16x16->32 bit multiply can be used instead of a full 32x32 multiply.
143  * For 12-bit samples, a full 32-bit multiplication will be needed.
144  */
145 
146 #if BITS_IN_JSAMPLE == 8
147 #define MULTIPLY(var,const)  MULTIPLY16C16(var,const)
148 #else
149 #define MULTIPLY(var,const)  ((var) * (const))
150 #endif
151 
152 
153 /*
154  * Perform the forward DCT on one block of samples.
155  */
156 
157 GLOBAL(void)
158 jpeg_fdct_islow (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
159 {
160   INT32 tmp0, tmp1, tmp2, tmp3;
161   INT32 tmp10, tmp11, tmp12, tmp13;
162   INT32 z1;
163   DCTELEM *dataptr;
164   JSAMPROW elemptr;
165   int ctr;
166   SHIFT_TEMPS
167 
168   /* Pass 1: process rows.
169    * Note results are scaled up by sqrt(8) compared to a true DCT;
170    * furthermore, we scale the results by 2**PASS1_BITS.
171    * cK represents sqrt(2) * cos(K*pi/16).
172    */
173 
174   dataptr = data;
175   for (ctr = 0; ctr < DCTSIZE; ctr++) {
176     elemptr = sample_data[ctr] + start_col;
177 
178     /* Even part per LL&M figure 1 --- note that published figure is faulty;
179      * rotator "c1" should be "c6".
180      */
181 
182     tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[7]);
183     tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[6]);
184     tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[5]);
185     tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[4]);
186 
187     tmp10 = tmp0 + tmp3;
188     tmp12 = tmp0 - tmp3;
189     tmp11 = tmp1 + tmp2;
190     tmp13 = tmp1 - tmp2;
191 
192     tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[7]);
193     tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[6]);
194     tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[5]);
195     tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[4]);
196 
197     /* Apply unsigned->signed conversion. */
198     dataptr[0] = (DCTELEM) ((tmp10 + tmp11 - 8 * CENTERJSAMPLE) << PASS1_BITS);
199     dataptr[4] = (DCTELEM) ((tmp10 - tmp11) << PASS1_BITS);
200 
201     z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);       /* c6 */
202     /* Add fudge factor here for final descale. */
203     z1 += ONE << (CONST_BITS-PASS1_BITS-1);
204 
205     dataptr[2] = (DCTELEM)
206       RIGHT_SHIFT(z1 + MULTIPLY(tmp12, FIX_0_765366865), /* c2-c6 */
207 		  CONST_BITS-PASS1_BITS);
208     dataptr[6] = (DCTELEM)
209       RIGHT_SHIFT(z1 - MULTIPLY(tmp13, FIX_1_847759065), /* c2+c6 */
210 		  CONST_BITS-PASS1_BITS);
211 
212     /* Odd part per figure 8 --- note paper omits factor of sqrt(2).
213      * i0..i3 in the paper are tmp0..tmp3 here.
214      */
215 
216     tmp12 = tmp0 + tmp2;
217     tmp13 = tmp1 + tmp3;
218 
219     z1 = MULTIPLY(tmp12 + tmp13, FIX_1_175875602);       /*  c3 */
220     /* Add fudge factor here for final descale. */
221     z1 += ONE << (CONST_BITS-PASS1_BITS-1);
222 
223     tmp12 = MULTIPLY(tmp12, - FIX_0_390180644);          /* -c3+c5 */
224     tmp13 = MULTIPLY(tmp13, - FIX_1_961570560);          /* -c3-c5 */
225     tmp12 += z1;
226     tmp13 += z1;
227 
228     z1 = MULTIPLY(tmp0 + tmp3, - FIX_0_899976223);       /* -c3+c7 */
229     tmp0 = MULTIPLY(tmp0, FIX_1_501321110);              /*  c1+c3-c5-c7 */
230     tmp3 = MULTIPLY(tmp3, FIX_0_298631336);              /* -c1+c3+c5-c7 */
231     tmp0 += z1 + tmp12;
232     tmp3 += z1 + tmp13;
233 
234     z1 = MULTIPLY(tmp1 + tmp2, - FIX_2_562915447);       /* -c1-c3 */
235     tmp1 = MULTIPLY(tmp1, FIX_3_072711026);              /*  c1+c3+c5-c7 */
236     tmp2 = MULTIPLY(tmp2, FIX_2_053119869);              /*  c1+c3-c5+c7 */
237     tmp1 += z1 + tmp13;
238     tmp2 += z1 + tmp12;
239 
240     dataptr[1] = (DCTELEM) RIGHT_SHIFT(tmp0, CONST_BITS-PASS1_BITS);
241     dataptr[3] = (DCTELEM) RIGHT_SHIFT(tmp1, CONST_BITS-PASS1_BITS);
242     dataptr[5] = (DCTELEM) RIGHT_SHIFT(tmp2, CONST_BITS-PASS1_BITS);
243     dataptr[7] = (DCTELEM) RIGHT_SHIFT(tmp3, CONST_BITS-PASS1_BITS);
244 
245     dataptr += DCTSIZE;		/* advance pointer to next row */
246   }
247 
248   /* Pass 2: process columns.
249    * We remove the PASS1_BITS scaling, but leave the results scaled up
250    * by an overall factor of 8.
251    * cK represents sqrt(2) * cos(K*pi/16).
252    */
253 
254   dataptr = data;
255   for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
256     /* Even part per LL&M figure 1 --- note that published figure is faulty;
257      * rotator "c1" should be "c6".
258      */
259 
260     tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*7];
261     tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*6];
262     tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*5];
263     tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*4];
264 
265     /* Add fudge factor here for final descale. */
266     tmp10 = tmp0 + tmp3 + (ONE << (PASS1_BITS-1));
267     tmp12 = tmp0 - tmp3;
268     tmp11 = tmp1 + tmp2;
269     tmp13 = tmp1 - tmp2;
270 
271     tmp0 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*7];
272     tmp1 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*6];
273     tmp2 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*5];
274     tmp3 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*4];
275 
276     dataptr[DCTSIZE*0] = (DCTELEM) RIGHT_SHIFT(tmp10 + tmp11, PASS1_BITS);
277     dataptr[DCTSIZE*4] = (DCTELEM) RIGHT_SHIFT(tmp10 - tmp11, PASS1_BITS);
278 
279     z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);       /* c6 */
280     /* Add fudge factor here for final descale. */
281     z1 += ONE << (CONST_BITS+PASS1_BITS-1);
282 
283     dataptr[DCTSIZE*2] = (DCTELEM)
284       RIGHT_SHIFT(z1 + MULTIPLY(tmp12, FIX_0_765366865), /* c2-c6 */
285 		  CONST_BITS+PASS1_BITS);
286     dataptr[DCTSIZE*6] = (DCTELEM)
287       RIGHT_SHIFT(z1 - MULTIPLY(tmp13, FIX_1_847759065), /* c2+c6 */
288 		  CONST_BITS+PASS1_BITS);
289 
290     /* Odd part per figure 8 --- note paper omits factor of sqrt(2).
291      * i0..i3 in the paper are tmp0..tmp3 here.
292      */
293 
294     tmp12 = tmp0 + tmp2;
295     tmp13 = tmp1 + tmp3;
296 
297     z1 = MULTIPLY(tmp12 + tmp13, FIX_1_175875602);       /*  c3 */
298     /* Add fudge factor here for final descale. */
299     z1 += ONE << (CONST_BITS+PASS1_BITS-1);
300 
301     tmp12 = MULTIPLY(tmp12, - FIX_0_390180644);          /* -c3+c5 */
302     tmp13 = MULTIPLY(tmp13, - FIX_1_961570560);          /* -c3-c5 */
303     tmp12 += z1;
304     tmp13 += z1;
305 
306     z1 = MULTIPLY(tmp0 + tmp3, - FIX_0_899976223);       /* -c3+c7 */
307     tmp0 = MULTIPLY(tmp0, FIX_1_501321110);              /*  c1+c3-c5-c7 */
308     tmp3 = MULTIPLY(tmp3, FIX_0_298631336);              /* -c1+c3+c5-c7 */
309     tmp0 += z1 + tmp12;
310     tmp3 += z1 + tmp13;
311 
312     z1 = MULTIPLY(tmp1 + tmp2, - FIX_2_562915447);       /* -c1-c3 */
313     tmp1 = MULTIPLY(tmp1, FIX_3_072711026);              /*  c1+c3+c5-c7 */
314     tmp2 = MULTIPLY(tmp2, FIX_2_053119869);              /*  c1+c3-c5+c7 */
315     tmp1 += z1 + tmp13;
316     tmp2 += z1 + tmp12;
317 
318     dataptr[DCTSIZE*1] = (DCTELEM) RIGHT_SHIFT(tmp0, CONST_BITS+PASS1_BITS);
319     dataptr[DCTSIZE*3] = (DCTELEM) RIGHT_SHIFT(tmp1, CONST_BITS+PASS1_BITS);
320     dataptr[DCTSIZE*5] = (DCTELEM) RIGHT_SHIFT(tmp2, CONST_BITS+PASS1_BITS);
321     dataptr[DCTSIZE*7] = (DCTELEM) RIGHT_SHIFT(tmp3, CONST_BITS+PASS1_BITS);
322 
323     dataptr++;			/* advance pointer to next column */
324   }
325 }
326 
327 #ifdef DCT_SCALING_SUPPORTED
328 
329 
330 /*
331  * Perform the forward DCT on a 7x7 sample block.
332  */
333 
334 GLOBAL(void)
jpeg_fdct_7x7(DCTELEM * data,JSAMPARRAY sample_data,JDIMENSION start_col)335 jpeg_fdct_7x7 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
336 {
337   INT32 tmp0, tmp1, tmp2, tmp3;
338   INT32 tmp10, tmp11, tmp12;
339   INT32 z1, z2, z3;
340   DCTELEM *dataptr;
341   JSAMPROW elemptr;
342   int ctr;
343   SHIFT_TEMPS
344 
345   /* Pre-zero output coefficient block. */
346   MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
347 
348   /* Pass 1: process rows.
349    * Note results are scaled up by sqrt(8) compared to a true DCT;
350    * furthermore, we scale the results by 2**PASS1_BITS.
351    * cK represents sqrt(2) * cos(K*pi/14).
352    */
353 
354   dataptr = data;
355   for (ctr = 0; ctr < 7; ctr++) {
356     elemptr = sample_data[ctr] + start_col;
357 
358     /* Even part */
359 
360     tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[6]);
361     tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[5]);
362     tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[4]);
363     tmp3 = GETJSAMPLE(elemptr[3]);
364 
365     tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[6]);
366     tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[5]);
367     tmp12 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[4]);
368 
369     z1 = tmp0 + tmp2;
370     /* Apply unsigned->signed conversion. */
371     dataptr[0] = (DCTELEM)
372       ((z1 + tmp1 + tmp3 - 7 * CENTERJSAMPLE) << PASS1_BITS);
373     tmp3 += tmp3;
374     z1 -= tmp3;
375     z1 -= tmp3;
376     z1 = MULTIPLY(z1, FIX(0.353553391));                /* (c2+c6-c4)/2 */
377     z2 = MULTIPLY(tmp0 - tmp2, FIX(0.920609002));       /* (c2+c4-c6)/2 */
378     z3 = MULTIPLY(tmp1 - tmp2, FIX(0.314692123));       /* c6 */
379     dataptr[2] = (DCTELEM) DESCALE(z1 + z2 + z3, CONST_BITS-PASS1_BITS);
380     z1 -= z2;
381     z2 = MULTIPLY(tmp0 - tmp1, FIX(0.881747734));       /* c4 */
382     dataptr[4] = (DCTELEM)
383       DESCALE(z2 + z3 - MULTIPLY(tmp1 - tmp3, FIX(0.707106781)), /* c2+c6-c4 */
384 	      CONST_BITS-PASS1_BITS);
385     dataptr[6] = (DCTELEM) DESCALE(z1 + z2, CONST_BITS-PASS1_BITS);
386 
387     /* Odd part */
388 
389     tmp1 = MULTIPLY(tmp10 + tmp11, FIX(0.935414347));   /* (c3+c1-c5)/2 */
390     tmp2 = MULTIPLY(tmp10 - tmp11, FIX(0.170262339));   /* (c3+c5-c1)/2 */
391     tmp0 = tmp1 - tmp2;
392     tmp1 += tmp2;
393     tmp2 = MULTIPLY(tmp11 + tmp12, - FIX(1.378756276)); /* -c1 */
394     tmp1 += tmp2;
395     tmp3 = MULTIPLY(tmp10 + tmp12, FIX(0.613604268));   /* c5 */
396     tmp0 += tmp3;
397     tmp2 += tmp3 + MULTIPLY(tmp12, FIX(1.870828693));   /* c3+c1-c5 */
398 
399     dataptr[1] = (DCTELEM) DESCALE(tmp0, CONST_BITS-PASS1_BITS);
400     dataptr[3] = (DCTELEM) DESCALE(tmp1, CONST_BITS-PASS1_BITS);
401     dataptr[5] = (DCTELEM) DESCALE(tmp2, CONST_BITS-PASS1_BITS);
402 
403     dataptr += DCTSIZE;		/* advance pointer to next row */
404   }
405 
406   /* Pass 2: process columns.
407    * We remove the PASS1_BITS scaling, but leave the results scaled up
408    * by an overall factor of 8.
409    * We must also scale the output by (8/7)**2 = 64/49, which we fold
410    * into the constant multipliers:
411    * cK now represents sqrt(2) * cos(K*pi/14) * 64/49.
412    */
413 
414   dataptr = data;
415   for (ctr = 0; ctr < 7; ctr++) {
416     /* Even part */
417 
418     tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*6];
419     tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*5];
420     tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*4];
421     tmp3 = dataptr[DCTSIZE*3];
422 
423     tmp10 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*6];
424     tmp11 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*5];
425     tmp12 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*4];
426 
427     z1 = tmp0 + tmp2;
428     dataptr[DCTSIZE*0] = (DCTELEM)
429       DESCALE(MULTIPLY(z1 + tmp1 + tmp3, FIX(1.306122449)), /* 64/49 */
430 	      CONST_BITS+PASS1_BITS);
431     tmp3 += tmp3;
432     z1 -= tmp3;
433     z1 -= tmp3;
434     z1 = MULTIPLY(z1, FIX(0.461784020));                /* (c2+c6-c4)/2 */
435     z2 = MULTIPLY(tmp0 - tmp2, FIX(1.202428084));       /* (c2+c4-c6)/2 */
436     z3 = MULTIPLY(tmp1 - tmp2, FIX(0.411026446));       /* c6 */
437     dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(z1 + z2 + z3, CONST_BITS+PASS1_BITS);
438     z1 -= z2;
439     z2 = MULTIPLY(tmp0 - tmp1, FIX(1.151670509));       /* c4 */
440     dataptr[DCTSIZE*4] = (DCTELEM)
441       DESCALE(z2 + z3 - MULTIPLY(tmp1 - tmp3, FIX(0.923568041)), /* c2+c6-c4 */
442 	      CONST_BITS+PASS1_BITS);
443     dataptr[DCTSIZE*6] = (DCTELEM) DESCALE(z1 + z2, CONST_BITS+PASS1_BITS);
444 
445     /* Odd part */
446 
447     tmp1 = MULTIPLY(tmp10 + tmp11, FIX(1.221765677));   /* (c3+c1-c5)/2 */
448     tmp2 = MULTIPLY(tmp10 - tmp11, FIX(0.222383464));   /* (c3+c5-c1)/2 */
449     tmp0 = tmp1 - tmp2;
450     tmp1 += tmp2;
451     tmp2 = MULTIPLY(tmp11 + tmp12, - FIX(1.800824523)); /* -c1 */
452     tmp1 += tmp2;
453     tmp3 = MULTIPLY(tmp10 + tmp12, FIX(0.801442310));   /* c5 */
454     tmp0 += tmp3;
455     tmp2 += tmp3 + MULTIPLY(tmp12, FIX(2.443531355));   /* c3+c1-c5 */
456 
457     dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp0, CONST_BITS+PASS1_BITS);
458     dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp1, CONST_BITS+PASS1_BITS);
459     dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp2, CONST_BITS+PASS1_BITS);
460 
461     dataptr++;			/* advance pointer to next column */
462   }
463 }
464 
465 
466 /*
467  * Perform the forward DCT on a 6x6 sample block.
468  */
469 
470 GLOBAL(void)
jpeg_fdct_6x6(DCTELEM * data,JSAMPARRAY sample_data,JDIMENSION start_col)471 jpeg_fdct_6x6 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
472 {
473   INT32 tmp0, tmp1, tmp2;
474   INT32 tmp10, tmp11, tmp12;
475   DCTELEM *dataptr;
476   JSAMPROW elemptr;
477   int ctr;
478   SHIFT_TEMPS
479 
480   /* Pre-zero output coefficient block. */
481   MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
482 
483   /* Pass 1: process rows.
484    * Note results are scaled up by sqrt(8) compared to a true DCT;
485    * furthermore, we scale the results by 2**PASS1_BITS.
486    * cK represents sqrt(2) * cos(K*pi/12).
487    */
488 
489   dataptr = data;
490   for (ctr = 0; ctr < 6; ctr++) {
491     elemptr = sample_data[ctr] + start_col;
492 
493     /* Even part */
494 
495     tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[5]);
496     tmp11 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[4]);
497     tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[3]);
498 
499     tmp10 = tmp0 + tmp2;
500     tmp12 = tmp0 - tmp2;
501 
502     tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[5]);
503     tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[4]);
504     tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[3]);
505 
506     /* Apply unsigned->signed conversion. */
507     dataptr[0] = (DCTELEM)
508       ((tmp10 + tmp11 - 6 * CENTERJSAMPLE) << PASS1_BITS);
509     dataptr[2] = (DCTELEM)
510       DESCALE(MULTIPLY(tmp12, FIX(1.224744871)),                 /* c2 */
511 	      CONST_BITS-PASS1_BITS);
512     dataptr[4] = (DCTELEM)
513       DESCALE(MULTIPLY(tmp10 - tmp11 - tmp11, FIX(0.707106781)), /* c4 */
514 	      CONST_BITS-PASS1_BITS);
515 
516     /* Odd part */
517 
518     tmp10 = DESCALE(MULTIPLY(tmp0 + tmp2, FIX(0.366025404)),     /* c5 */
519 		    CONST_BITS-PASS1_BITS);
520 
521     dataptr[1] = (DCTELEM) (tmp10 + ((tmp0 + tmp1) << PASS1_BITS));
522     dataptr[3] = (DCTELEM) ((tmp0 - tmp1 - tmp2) << PASS1_BITS);
523     dataptr[5] = (DCTELEM) (tmp10 + ((tmp2 - tmp1) << PASS1_BITS));
524 
525     dataptr += DCTSIZE;		/* advance pointer to next row */
526   }
527 
528   /* Pass 2: process columns.
529    * We remove the PASS1_BITS scaling, but leave the results scaled up
530    * by an overall factor of 8.
531    * We must also scale the output by (8/6)**2 = 16/9, which we fold
532    * into the constant multipliers:
533    * cK now represents sqrt(2) * cos(K*pi/12) * 16/9.
534    */
535 
536   dataptr = data;
537   for (ctr = 0; ctr < 6; ctr++) {
538     /* Even part */
539 
540     tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*5];
541     tmp11 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*4];
542     tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*3];
543 
544     tmp10 = tmp0 + tmp2;
545     tmp12 = tmp0 - tmp2;
546 
547     tmp0 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*5];
548     tmp1 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*4];
549     tmp2 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*3];
550 
551     dataptr[DCTSIZE*0] = (DCTELEM)
552       DESCALE(MULTIPLY(tmp10 + tmp11, FIX(1.777777778)),         /* 16/9 */
553 	      CONST_BITS+PASS1_BITS);
554     dataptr[DCTSIZE*2] = (DCTELEM)
555       DESCALE(MULTIPLY(tmp12, FIX(2.177324216)),                 /* c2 */
556 	      CONST_BITS+PASS1_BITS);
557     dataptr[DCTSIZE*4] = (DCTELEM)
558       DESCALE(MULTIPLY(tmp10 - tmp11 - tmp11, FIX(1.257078722)), /* c4 */
559 	      CONST_BITS+PASS1_BITS);
560 
561     /* Odd part */
562 
563     tmp10 = MULTIPLY(tmp0 + tmp2, FIX(0.650711829));             /* c5 */
564 
565     dataptr[DCTSIZE*1] = (DCTELEM)
566       DESCALE(tmp10 + MULTIPLY(tmp0 + tmp1, FIX(1.777777778)),   /* 16/9 */
567 	      CONST_BITS+PASS1_BITS);
568     dataptr[DCTSIZE*3] = (DCTELEM)
569       DESCALE(MULTIPLY(tmp0 - tmp1 - tmp2, FIX(1.777777778)),    /* 16/9 */
570 	      CONST_BITS+PASS1_BITS);
571     dataptr[DCTSIZE*5] = (DCTELEM)
572       DESCALE(tmp10 + MULTIPLY(tmp2 - tmp1, FIX(1.777777778)),   /* 16/9 */
573 	      CONST_BITS+PASS1_BITS);
574 
575     dataptr++;			/* advance pointer to next column */
576   }
577 }
578 
579 
580 /*
581  * Perform the forward DCT on a 5x5 sample block.
582  */
583 
584 GLOBAL(void)
jpeg_fdct_5x5(DCTELEM * data,JSAMPARRAY sample_data,JDIMENSION start_col)585 jpeg_fdct_5x5 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
586 {
587   INT32 tmp0, tmp1, tmp2;
588   INT32 tmp10, tmp11;
589   DCTELEM *dataptr;
590   JSAMPROW elemptr;
591   int ctr;
592   SHIFT_TEMPS
593 
594   /* Pre-zero output coefficient block. */
595   MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
596 
597   /* Pass 1: process rows.
598    * Note results are scaled up by sqrt(8) compared to a true DCT;
599    * furthermore, we scale the results by 2**PASS1_BITS.
600    * We scale the results further by 2 as part of output adaption
601    * scaling for different DCT size.
602    * cK represents sqrt(2) * cos(K*pi/10).
603    */
604 
605   dataptr = data;
606   for (ctr = 0; ctr < 5; ctr++) {
607     elemptr = sample_data[ctr] + start_col;
608 
609     /* Even part */
610 
611     tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[4]);
612     tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[3]);
613     tmp2 = GETJSAMPLE(elemptr[2]);
614 
615     tmp10 = tmp0 + tmp1;
616     tmp11 = tmp0 - tmp1;
617 
618     tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[4]);
619     tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[3]);
620 
621     /* Apply unsigned->signed conversion. */
622     dataptr[0] = (DCTELEM)
623       ((tmp10 + tmp2 - 5 * CENTERJSAMPLE) << (PASS1_BITS+1));
624     tmp11 = MULTIPLY(tmp11, FIX(0.790569415));          /* (c2+c4)/2 */
625     tmp10 -= tmp2 << 2;
626     tmp10 = MULTIPLY(tmp10, FIX(0.353553391));          /* (c2-c4)/2 */
627     dataptr[2] = (DCTELEM) DESCALE(tmp11 + tmp10, CONST_BITS-PASS1_BITS-1);
628     dataptr[4] = (DCTELEM) DESCALE(tmp11 - tmp10, CONST_BITS-PASS1_BITS-1);
629 
630     /* Odd part */
631 
632     tmp10 = MULTIPLY(tmp0 + tmp1, FIX(0.831253876));    /* c3 */
633 
634     dataptr[1] = (DCTELEM)
635       DESCALE(tmp10 + MULTIPLY(tmp0, FIX(0.513743148)), /* c1-c3 */
636 	      CONST_BITS-PASS1_BITS-1);
637     dataptr[3] = (DCTELEM)
638       DESCALE(tmp10 - MULTIPLY(tmp1, FIX(2.176250899)), /* c1+c3 */
639 	      CONST_BITS-PASS1_BITS-1);
640 
641     dataptr += DCTSIZE;		/* advance pointer to next row */
642   }
643 
644   /* Pass 2: process columns.
645    * We remove the PASS1_BITS scaling, but leave the results scaled up
646    * by an overall factor of 8.
647    * We must also scale the output by (8/5)**2 = 64/25, which we partially
648    * fold into the constant multipliers (other part was done in pass 1):
649    * cK now represents sqrt(2) * cos(K*pi/10) * 32/25.
650    */
651 
652   dataptr = data;
653   for (ctr = 0; ctr < 5; ctr++) {
654     /* Even part */
655 
656     tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*4];
657     tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*3];
658     tmp2 = dataptr[DCTSIZE*2];
659 
660     tmp10 = tmp0 + tmp1;
661     tmp11 = tmp0 - tmp1;
662 
663     tmp0 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*4];
664     tmp1 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*3];
665 
666     dataptr[DCTSIZE*0] = (DCTELEM)
667       DESCALE(MULTIPLY(tmp10 + tmp2, FIX(1.28)),        /* 32/25 */
668 	      CONST_BITS+PASS1_BITS);
669     tmp11 = MULTIPLY(tmp11, FIX(1.011928851));          /* (c2+c4)/2 */
670     tmp10 -= tmp2 << 2;
671     tmp10 = MULTIPLY(tmp10, FIX(0.452548340));          /* (c2-c4)/2 */
672     dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(tmp11 + tmp10, CONST_BITS+PASS1_BITS);
673     dataptr[DCTSIZE*4] = (DCTELEM) DESCALE(tmp11 - tmp10, CONST_BITS+PASS1_BITS);
674 
675     /* Odd part */
676 
677     tmp10 = MULTIPLY(tmp0 + tmp1, FIX(1.064004961));    /* c3 */
678 
679     dataptr[DCTSIZE*1] = (DCTELEM)
680       DESCALE(tmp10 + MULTIPLY(tmp0, FIX(0.657591230)), /* c1-c3 */
681 	      CONST_BITS+PASS1_BITS);
682     dataptr[DCTSIZE*3] = (DCTELEM)
683       DESCALE(tmp10 - MULTIPLY(tmp1, FIX(2.785601151)), /* c1+c3 */
684 	      CONST_BITS+PASS1_BITS);
685 
686     dataptr++;			/* advance pointer to next column */
687   }
688 }
689 
690 
691 /*
692  * Perform the forward DCT on a 4x4 sample block.
693  */
694 
695 GLOBAL(void)
jpeg_fdct_4x4(DCTELEM * data,JSAMPARRAY sample_data,JDIMENSION start_col)696 jpeg_fdct_4x4 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
697 {
698   INT32 tmp0, tmp1;
699   INT32 tmp10, tmp11;
700   DCTELEM *dataptr;
701   JSAMPROW elemptr;
702   int ctr;
703   SHIFT_TEMPS
704 
705   /* Pre-zero output coefficient block. */
706   MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
707 
708   /* Pass 1: process rows.
709    * Note results are scaled up by sqrt(8) compared to a true DCT;
710    * furthermore, we scale the results by 2**PASS1_BITS.
711    * We must also scale the output by (8/4)**2 = 2**2, which we add here.
712    * cK represents sqrt(2) * cos(K*pi/16) [refers to 8-point FDCT].
713    */
714 
715   dataptr = data;
716   for (ctr = 0; ctr < 4; ctr++) {
717     elemptr = sample_data[ctr] + start_col;
718 
719     /* Even part */
720 
721     tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[3]);
722     tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[2]);
723 
724     tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[3]);
725     tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[2]);
726 
727     /* Apply unsigned->signed conversion. */
728     dataptr[0] = (DCTELEM)
729       ((tmp0 + tmp1 - 4 * CENTERJSAMPLE) << (PASS1_BITS+2));
730     dataptr[2] = (DCTELEM) ((tmp0 - tmp1) << (PASS1_BITS+2));
731 
732     /* Odd part */
733 
734     tmp0 = MULTIPLY(tmp10 + tmp11, FIX_0_541196100);       /* c6 */
735     /* Add fudge factor here for final descale. */
736     tmp0 += ONE << (CONST_BITS-PASS1_BITS-3);
737 
738     dataptr[1] = (DCTELEM)
739       RIGHT_SHIFT(tmp0 + MULTIPLY(tmp10, FIX_0_765366865), /* c2-c6 */
740 		  CONST_BITS-PASS1_BITS-2);
741     dataptr[3] = (DCTELEM)
742       RIGHT_SHIFT(tmp0 - MULTIPLY(tmp11, FIX_1_847759065), /* c2+c6 */
743 		  CONST_BITS-PASS1_BITS-2);
744 
745     dataptr += DCTSIZE;		/* advance pointer to next row */
746   }
747 
748   /* Pass 2: process columns.
749    * We remove the PASS1_BITS scaling, but leave the results scaled up
750    * by an overall factor of 8.
751    * cK represents sqrt(2) * cos(K*pi/16) [refers to 8-point FDCT].
752    */
753 
754   dataptr = data;
755   for (ctr = 0; ctr < 4; ctr++) {
756     /* Even part */
757 
758     /* Add fudge factor here for final descale. */
759     tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*3] + (ONE << (PASS1_BITS-1));
760     tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*2];
761 
762     tmp10 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*3];
763     tmp11 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*2];
764 
765     dataptr[DCTSIZE*0] = (DCTELEM) RIGHT_SHIFT(tmp0 + tmp1, PASS1_BITS);
766     dataptr[DCTSIZE*2] = (DCTELEM) RIGHT_SHIFT(tmp0 - tmp1, PASS1_BITS);
767 
768     /* Odd part */
769 
770     tmp0 = MULTIPLY(tmp10 + tmp11, FIX_0_541196100);       /* c6 */
771     /* Add fudge factor here for final descale. */
772     tmp0 += ONE << (CONST_BITS+PASS1_BITS-1);
773 
774     dataptr[DCTSIZE*1] = (DCTELEM)
775       RIGHT_SHIFT(tmp0 + MULTIPLY(tmp10, FIX_0_765366865), /* c2-c6 */
776 		  CONST_BITS+PASS1_BITS);
777     dataptr[DCTSIZE*3] = (DCTELEM)
778       RIGHT_SHIFT(tmp0 - MULTIPLY(tmp11, FIX_1_847759065), /* c2+c6 */
779 		  CONST_BITS+PASS1_BITS);
780 
781     dataptr++;			/* advance pointer to next column */
782   }
783 }
784 
785 
786 /*
787  * Perform the forward DCT on a 3x3 sample block.
788  */
789 
790 GLOBAL(void)
jpeg_fdct_3x3(DCTELEM * data,JSAMPARRAY sample_data,JDIMENSION start_col)791 jpeg_fdct_3x3 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
792 {
793   INT32 tmp0, tmp1, tmp2;
794   DCTELEM *dataptr;
795   JSAMPROW elemptr;
796   int ctr;
797   SHIFT_TEMPS
798 
799   /* Pre-zero output coefficient block. */
800   MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
801 
802   /* Pass 1: process rows.
803    * Note results are scaled up by sqrt(8) compared to a true DCT;
804    * furthermore, we scale the results by 2**PASS1_BITS.
805    * We scale the results further by 2**2 as part of output adaption
806    * scaling for different DCT size.
807    * cK represents sqrt(2) * cos(K*pi/6).
808    */
809 
810   dataptr = data;
811   for (ctr = 0; ctr < 3; ctr++) {
812     elemptr = sample_data[ctr] + start_col;
813 
814     /* Even part */
815 
816     tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[2]);
817     tmp1 = GETJSAMPLE(elemptr[1]);
818 
819     tmp2 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[2]);
820 
821     /* Apply unsigned->signed conversion. */
822     dataptr[0] = (DCTELEM)
823       ((tmp0 + tmp1 - 3 * CENTERJSAMPLE) << (PASS1_BITS+2));
824     dataptr[2] = (DCTELEM)
825       DESCALE(MULTIPLY(tmp0 - tmp1 - tmp1, FIX(0.707106781)), /* c2 */
826 	      CONST_BITS-PASS1_BITS-2);
827 
828     /* Odd part */
829 
830     dataptr[1] = (DCTELEM)
831       DESCALE(MULTIPLY(tmp2, FIX(1.224744871)),               /* c1 */
832 	      CONST_BITS-PASS1_BITS-2);
833 
834     dataptr += DCTSIZE;		/* advance pointer to next row */
835   }
836 
837   /* Pass 2: process columns.
838    * We remove the PASS1_BITS scaling, but leave the results scaled up
839    * by an overall factor of 8.
840    * We must also scale the output by (8/3)**2 = 64/9, which we partially
841    * fold into the constant multipliers (other part was done in pass 1):
842    * cK now represents sqrt(2) * cos(K*pi/6) * 16/9.
843    */
844 
845   dataptr = data;
846   for (ctr = 0; ctr < 3; ctr++) {
847     /* Even part */
848 
849     tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*2];
850     tmp1 = dataptr[DCTSIZE*1];
851 
852     tmp2 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*2];
853 
854     dataptr[DCTSIZE*0] = (DCTELEM)
855       DESCALE(MULTIPLY(tmp0 + tmp1, FIX(1.777777778)),        /* 16/9 */
856 	      CONST_BITS+PASS1_BITS);
857     dataptr[DCTSIZE*2] = (DCTELEM)
858       DESCALE(MULTIPLY(tmp0 - tmp1 - tmp1, FIX(1.257078722)), /* c2 */
859 	      CONST_BITS+PASS1_BITS);
860 
861     /* Odd part */
862 
863     dataptr[DCTSIZE*1] = (DCTELEM)
864       DESCALE(MULTIPLY(tmp2, FIX(2.177324216)),               /* c1 */
865 	      CONST_BITS+PASS1_BITS);
866 
867     dataptr++;			/* advance pointer to next column */
868   }
869 }
870 
871 
872 /*
873  * Perform the forward DCT on a 2x2 sample block.
874  */
875 
876 GLOBAL(void)
jpeg_fdct_2x2(DCTELEM * data,JSAMPARRAY sample_data,JDIMENSION start_col)877 jpeg_fdct_2x2 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
878 {
879   DCTELEM tmp0, tmp1, tmp2, tmp3;
880   JSAMPROW elemptr;
881 
882   /* Pre-zero output coefficient block. */
883   MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
884 
885   /* Pass 1: process rows.
886    * Note results are scaled up by sqrt(8) compared to a true DCT.
887    */
888 
889   /* Row 0 */
890   elemptr = sample_data[0] + start_col;
891 
892   tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[1]);
893   tmp1 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[1]);
894 
895   /* Row 1 */
896   elemptr = sample_data[1] + start_col;
897 
898   tmp2 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[1]);
899   tmp3 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[1]);
900 
901   /* Pass 2: process columns.
902    * We leave the results scaled up by an overall factor of 8.
903    * We must also scale the output by (8/2)**2 = 2**4.
904    */
905 
906   /* Column 0 */
907   /* Apply unsigned->signed conversion. */
908   data[DCTSIZE*0] = (tmp0 + tmp2 - 4 * CENTERJSAMPLE) << 4;
909   data[DCTSIZE*1] = (tmp0 - tmp2) << 4;
910 
911   /* Column 1 */
912   data[DCTSIZE*0+1] = (tmp1 + tmp3) << 4;
913   data[DCTSIZE*1+1] = (tmp1 - tmp3) << 4;
914 }
915 
916 
917 /*
918  * Perform the forward DCT on a 1x1 sample block.
919  */
920 
921 GLOBAL(void)
jpeg_fdct_1x1(DCTELEM * data,JSAMPARRAY sample_data,JDIMENSION start_col)922 jpeg_fdct_1x1 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
923 {
924   DCTELEM dcval;
925 
926   /* Pre-zero output coefficient block. */
927   MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
928 
929   dcval = GETJSAMPLE(sample_data[0][start_col]);
930 
931   /* We leave the result scaled up by an overall factor of 8. */
932   /* We must also scale the output by (8/1)**2 = 2**6. */
933   /* Apply unsigned->signed conversion. */
934   data[0] = (dcval - CENTERJSAMPLE) << 6;
935 }
936 
937 
938 /*
939  * Perform the forward DCT on a 9x9 sample block.
940  */
941 
942 GLOBAL(void)
jpeg_fdct_9x9(DCTELEM * data,JSAMPARRAY sample_data,JDIMENSION start_col)943 jpeg_fdct_9x9 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
944 {
945   INT32 tmp0, tmp1, tmp2, tmp3, tmp4;
946   INT32 tmp10, tmp11, tmp12, tmp13;
947   INT32 z1, z2;
948   DCTELEM workspace[8];
949   DCTELEM *dataptr;
950   DCTELEM *wsptr;
951   JSAMPROW elemptr;
952   int ctr;
953   SHIFT_TEMPS
954 
955   /* Pass 1: process rows.
956    * Note results are scaled up by sqrt(8) compared to a true DCT;
957    * we scale the results further by 2 as part of output adaption
958    * scaling for different DCT size.
959    * cK represents sqrt(2) * cos(K*pi/18).
960    */
961 
962   dataptr = data;
963   ctr = 0;
964   for (;;) {
965     elemptr = sample_data[ctr] + start_col;
966 
967     /* Even part */
968 
969     tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[8]);
970     tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[7]);
971     tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[6]);
972     tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[5]);
973     tmp4 = GETJSAMPLE(elemptr[4]);
974 
975     tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[8]);
976     tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[7]);
977     tmp12 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[6]);
978     tmp13 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[5]);
979 
980     z1 = tmp0 + tmp2 + tmp3;
981     z2 = tmp1 + tmp4;
982     /* Apply unsigned->signed conversion. */
983     dataptr[0] = (DCTELEM) ((z1 + z2 - 9 * CENTERJSAMPLE) << 1);
984     dataptr[6] = (DCTELEM)
985       DESCALE(MULTIPLY(z1 - z2 - z2, FIX(0.707106781)),  /* c6 */
986 	      CONST_BITS-1);
987     z1 = MULTIPLY(tmp0 - tmp2, FIX(1.328926049));        /* c2 */
988     z2 = MULTIPLY(tmp1 - tmp4 - tmp4, FIX(0.707106781)); /* c6 */
989     dataptr[2] = (DCTELEM)
990       DESCALE(MULTIPLY(tmp2 - tmp3, FIX(1.083350441))    /* c4 */
991 	      + z1 + z2, CONST_BITS-1);
992     dataptr[4] = (DCTELEM)
993       DESCALE(MULTIPLY(tmp3 - tmp0, FIX(0.245575608))    /* c8 */
994 	      + z1 - z2, CONST_BITS-1);
995 
996     /* Odd part */
997 
998     dataptr[3] = (DCTELEM)
999       DESCALE(MULTIPLY(tmp10 - tmp12 - tmp13, FIX(1.224744871)), /* c3 */
1000 	      CONST_BITS-1);
1001 
1002     tmp11 = MULTIPLY(tmp11, FIX(1.224744871));        /* c3 */
1003     tmp0 = MULTIPLY(tmp10 + tmp12, FIX(0.909038955)); /* c5 */
1004     tmp1 = MULTIPLY(tmp10 + tmp13, FIX(0.483689525)); /* c7 */
1005 
1006     dataptr[1] = (DCTELEM) DESCALE(tmp11 + tmp0 + tmp1, CONST_BITS-1);
1007 
1008     tmp2 = MULTIPLY(tmp12 - tmp13, FIX(1.392728481)); /* c1 */
1009 
1010     dataptr[5] = (DCTELEM) DESCALE(tmp0 - tmp11 - tmp2, CONST_BITS-1);
1011     dataptr[7] = (DCTELEM) DESCALE(tmp1 - tmp11 + tmp2, CONST_BITS-1);
1012 
1013     ctr++;
1014 
1015     if (ctr != DCTSIZE) {
1016       if (ctr == 9)
1017 	break;			/* Done. */
1018       dataptr += DCTSIZE;	/* advance pointer to next row */
1019     } else
1020       dataptr = workspace;	/* switch pointer to extended workspace */
1021   }
1022 
1023   /* Pass 2: process columns.
1024    * We leave the results scaled up by an overall factor of 8.
1025    * We must also scale the output by (8/9)**2 = 64/81, which we partially
1026    * fold into the constant multipliers and final/initial shifting:
1027    * cK now represents sqrt(2) * cos(K*pi/18) * 128/81.
1028    */
1029 
1030   dataptr = data;
1031   wsptr = workspace;
1032   for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
1033     /* Even part */
1034 
1035     tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*0];
1036     tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*7];
1037     tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*6];
1038     tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*5];
1039     tmp4 = dataptr[DCTSIZE*4];
1040 
1041     tmp10 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*0];
1042     tmp11 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*7];
1043     tmp12 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*6];
1044     tmp13 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*5];
1045 
1046     z1 = tmp0 + tmp2 + tmp3;
1047     z2 = tmp1 + tmp4;
1048     dataptr[DCTSIZE*0] = (DCTELEM)
1049       DESCALE(MULTIPLY(z1 + z2, FIX(1.580246914)),       /* 128/81 */
1050 	      CONST_BITS+2);
1051     dataptr[DCTSIZE*6] = (DCTELEM)
1052       DESCALE(MULTIPLY(z1 - z2 - z2, FIX(1.117403309)),  /* c6 */
1053 	      CONST_BITS+2);
1054     z1 = MULTIPLY(tmp0 - tmp2, FIX(2.100031287));        /* c2 */
1055     z2 = MULTIPLY(tmp1 - tmp4 - tmp4, FIX(1.117403309)); /* c6 */
1056     dataptr[DCTSIZE*2] = (DCTELEM)
1057       DESCALE(MULTIPLY(tmp2 - tmp3, FIX(1.711961190))    /* c4 */
1058 	      + z1 + z2, CONST_BITS+2);
1059     dataptr[DCTSIZE*4] = (DCTELEM)
1060       DESCALE(MULTIPLY(tmp3 - tmp0, FIX(0.388070096))    /* c8 */
1061 	      + z1 - z2, CONST_BITS+2);
1062 
1063     /* Odd part */
1064 
1065     dataptr[DCTSIZE*3] = (DCTELEM)
1066       DESCALE(MULTIPLY(tmp10 - tmp12 - tmp13, FIX(1.935399303)), /* c3 */
1067 	      CONST_BITS+2);
1068 
1069     tmp11 = MULTIPLY(tmp11, FIX(1.935399303));        /* c3 */
1070     tmp0 = MULTIPLY(tmp10 + tmp12, FIX(1.436506004)); /* c5 */
1071     tmp1 = MULTIPLY(tmp10 + tmp13, FIX(0.764348879)); /* c7 */
1072 
1073     dataptr[DCTSIZE*1] = (DCTELEM)
1074       DESCALE(tmp11 + tmp0 + tmp1, CONST_BITS+2);
1075 
1076     tmp2 = MULTIPLY(tmp12 - tmp13, FIX(2.200854883)); /* c1 */
1077 
1078     dataptr[DCTSIZE*5] = (DCTELEM)
1079       DESCALE(tmp0 - tmp11 - tmp2, CONST_BITS+2);
1080     dataptr[DCTSIZE*7] = (DCTELEM)
1081       DESCALE(tmp1 - tmp11 + tmp2, CONST_BITS+2);
1082 
1083     dataptr++;			/* advance pointer to next column */
1084     wsptr++;			/* advance pointer to next column */
1085   }
1086 }
1087 
1088 
1089 /*
1090  * Perform the forward DCT on a 10x10 sample block.
1091  */
1092 
1093 GLOBAL(void)
jpeg_fdct_10x10(DCTELEM * data,JSAMPARRAY sample_data,JDIMENSION start_col)1094 jpeg_fdct_10x10 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
1095 {
1096   INT32 tmp0, tmp1, tmp2, tmp3, tmp4;
1097   INT32 tmp10, tmp11, tmp12, tmp13, tmp14;
1098   DCTELEM workspace[8*2];
1099   DCTELEM *dataptr;
1100   DCTELEM *wsptr;
1101   JSAMPROW elemptr;
1102   int ctr;
1103   SHIFT_TEMPS
1104 
1105   /* Pass 1: process rows.
1106    * Note results are scaled up by sqrt(8) compared to a true DCT;
1107    * we scale the results further by 2 as part of output adaption
1108    * scaling for different DCT size.
1109    * cK represents sqrt(2) * cos(K*pi/20).
1110    */
1111 
1112   dataptr = data;
1113   ctr = 0;
1114   for (;;) {
1115     elemptr = sample_data[ctr] + start_col;
1116 
1117     /* Even part */
1118 
1119     tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[9]);
1120     tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[8]);
1121     tmp12 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[7]);
1122     tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[6]);
1123     tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[5]);
1124 
1125     tmp10 = tmp0 + tmp4;
1126     tmp13 = tmp0 - tmp4;
1127     tmp11 = tmp1 + tmp3;
1128     tmp14 = tmp1 - tmp3;
1129 
1130     tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[9]);
1131     tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[8]);
1132     tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[7]);
1133     tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[6]);
1134     tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[5]);
1135 
1136     /* Apply unsigned->signed conversion. */
1137     dataptr[0] = (DCTELEM)
1138       ((tmp10 + tmp11 + tmp12 - 10 * CENTERJSAMPLE) << 1);
1139     tmp12 += tmp12;
1140     dataptr[4] = (DCTELEM)
1141       DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.144122806)) - /* c4 */
1142 	      MULTIPLY(tmp11 - tmp12, FIX(0.437016024)),  /* c8 */
1143 	      CONST_BITS-1);
1144     tmp10 = MULTIPLY(tmp13 + tmp14, FIX(0.831253876));    /* c6 */
1145     dataptr[2] = (DCTELEM)
1146       DESCALE(tmp10 + MULTIPLY(tmp13, FIX(0.513743148)),  /* c2-c6 */
1147 	      CONST_BITS-1);
1148     dataptr[6] = (DCTELEM)
1149       DESCALE(tmp10 - MULTIPLY(tmp14, FIX(2.176250899)),  /* c2+c6 */
1150 	      CONST_BITS-1);
1151 
1152     /* Odd part */
1153 
1154     tmp10 = tmp0 + tmp4;
1155     tmp11 = tmp1 - tmp3;
1156     dataptr[5] = (DCTELEM) ((tmp10 - tmp11 - tmp2) << 1);
1157     tmp2 <<= CONST_BITS;
1158     dataptr[1] = (DCTELEM)
1159       DESCALE(MULTIPLY(tmp0, FIX(1.396802247)) +          /* c1 */
1160 	      MULTIPLY(tmp1, FIX(1.260073511)) + tmp2 +   /* c3 */
1161 	      MULTIPLY(tmp3, FIX(0.642039522)) +          /* c7 */
1162 	      MULTIPLY(tmp4, FIX(0.221231742)),           /* c9 */
1163 	      CONST_BITS-1);
1164     tmp12 = MULTIPLY(tmp0 - tmp4, FIX(0.951056516)) -     /* (c3+c7)/2 */
1165 	    MULTIPLY(tmp1 + tmp3, FIX(0.587785252));      /* (c1-c9)/2 */
1166     tmp13 = MULTIPLY(tmp10 + tmp11, FIX(0.309016994)) +   /* (c3-c7)/2 */
1167 	    (tmp11 << (CONST_BITS - 1)) - tmp2;
1168     dataptr[3] = (DCTELEM) DESCALE(tmp12 + tmp13, CONST_BITS-1);
1169     dataptr[7] = (DCTELEM) DESCALE(tmp12 - tmp13, CONST_BITS-1);
1170 
1171     ctr++;
1172 
1173     if (ctr != DCTSIZE) {
1174       if (ctr == 10)
1175 	break;			/* Done. */
1176       dataptr += DCTSIZE;	/* advance pointer to next row */
1177     } else
1178       dataptr = workspace;	/* switch pointer to extended workspace */
1179   }
1180 
1181   /* Pass 2: process columns.
1182    * We leave the results scaled up by an overall factor of 8.
1183    * We must also scale the output by (8/10)**2 = 16/25, which we partially
1184    * fold into the constant multipliers and final/initial shifting:
1185    * cK now represents sqrt(2) * cos(K*pi/20) * 32/25.
1186    */
1187 
1188   dataptr = data;
1189   wsptr = workspace;
1190   for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
1191     /* Even part */
1192 
1193     tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*1];
1194     tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*0];
1195     tmp12 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*7];
1196     tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*6];
1197     tmp4 = dataptr[DCTSIZE*4] + dataptr[DCTSIZE*5];
1198 
1199     tmp10 = tmp0 + tmp4;
1200     tmp13 = tmp0 - tmp4;
1201     tmp11 = tmp1 + tmp3;
1202     tmp14 = tmp1 - tmp3;
1203 
1204     tmp0 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*1];
1205     tmp1 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*0];
1206     tmp2 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*7];
1207     tmp3 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*6];
1208     tmp4 = dataptr[DCTSIZE*4] - dataptr[DCTSIZE*5];
1209 
1210     dataptr[DCTSIZE*0] = (DCTELEM)
1211       DESCALE(MULTIPLY(tmp10 + tmp11 + tmp12, FIX(1.28)), /* 32/25 */
1212 	      CONST_BITS+2);
1213     tmp12 += tmp12;
1214     dataptr[DCTSIZE*4] = (DCTELEM)
1215       DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.464477191)) - /* c4 */
1216 	      MULTIPLY(tmp11 - tmp12, FIX(0.559380511)),  /* c8 */
1217 	      CONST_BITS+2);
1218     tmp10 = MULTIPLY(tmp13 + tmp14, FIX(1.064004961));    /* c6 */
1219     dataptr[DCTSIZE*2] = (DCTELEM)
1220       DESCALE(tmp10 + MULTIPLY(tmp13, FIX(0.657591230)),  /* c2-c6 */
1221 	      CONST_BITS+2);
1222     dataptr[DCTSIZE*6] = (DCTELEM)
1223       DESCALE(tmp10 - MULTIPLY(tmp14, FIX(2.785601151)),  /* c2+c6 */
1224 	      CONST_BITS+2);
1225 
1226     /* Odd part */
1227 
1228     tmp10 = tmp0 + tmp4;
1229     tmp11 = tmp1 - tmp3;
1230     dataptr[DCTSIZE*5] = (DCTELEM)
1231       DESCALE(MULTIPLY(tmp10 - tmp11 - tmp2, FIX(1.28)),  /* 32/25 */
1232 	      CONST_BITS+2);
1233     tmp2 = MULTIPLY(tmp2, FIX(1.28));                     /* 32/25 */
1234     dataptr[DCTSIZE*1] = (DCTELEM)
1235       DESCALE(MULTIPLY(tmp0, FIX(1.787906876)) +          /* c1 */
1236 	      MULTIPLY(tmp1, FIX(1.612894094)) + tmp2 +   /* c3 */
1237 	      MULTIPLY(tmp3, FIX(0.821810588)) +          /* c7 */
1238 	      MULTIPLY(tmp4, FIX(0.283176630)),           /* c9 */
1239 	      CONST_BITS+2);
1240     tmp12 = MULTIPLY(tmp0 - tmp4, FIX(1.217352341)) -     /* (c3+c7)/2 */
1241 	    MULTIPLY(tmp1 + tmp3, FIX(0.752365123));      /* (c1-c9)/2 */
1242     tmp13 = MULTIPLY(tmp10 + tmp11, FIX(0.395541753)) +   /* (c3-c7)/2 */
1243 	    MULTIPLY(tmp11, FIX(0.64)) - tmp2;            /* 16/25 */
1244     dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp12 + tmp13, CONST_BITS+2);
1245     dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp12 - tmp13, CONST_BITS+2);
1246 
1247     dataptr++;			/* advance pointer to next column */
1248     wsptr++;			/* advance pointer to next column */
1249   }
1250 }
1251 
1252 
1253 /*
1254  * Perform the forward DCT on an 11x11 sample block.
1255  */
1256 
1257 GLOBAL(void)
jpeg_fdct_11x11(DCTELEM * data,JSAMPARRAY sample_data,JDIMENSION start_col)1258 jpeg_fdct_11x11 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
1259 {
1260   INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
1261   INT32 tmp10, tmp11, tmp12, tmp13, tmp14;
1262   INT32 z1, z2, z3;
1263   DCTELEM workspace[8*3];
1264   DCTELEM *dataptr;
1265   DCTELEM *wsptr;
1266   JSAMPROW elemptr;
1267   int ctr;
1268   SHIFT_TEMPS
1269 
1270   /* Pass 1: process rows.
1271    * Note results are scaled up by sqrt(8) compared to a true DCT;
1272    * we scale the results further by 2 as part of output adaption
1273    * scaling for different DCT size.
1274    * cK represents sqrt(2) * cos(K*pi/22).
1275    */
1276 
1277   dataptr = data;
1278   ctr = 0;
1279   for (;;) {
1280     elemptr = sample_data[ctr] + start_col;
1281 
1282     /* Even part */
1283 
1284     tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[10]);
1285     tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[9]);
1286     tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[8]);
1287     tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[7]);
1288     tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[6]);
1289     tmp5 = GETJSAMPLE(elemptr[5]);
1290 
1291     tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[10]);
1292     tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[9]);
1293     tmp12 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[8]);
1294     tmp13 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[7]);
1295     tmp14 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[6]);
1296 
1297     /* Apply unsigned->signed conversion. */
1298     dataptr[0] = (DCTELEM)
1299       ((tmp0 + tmp1 + tmp2 + tmp3 + tmp4 + tmp5 - 11 * CENTERJSAMPLE) << 1);
1300     tmp5 += tmp5;
1301     tmp0 -= tmp5;
1302     tmp1 -= tmp5;
1303     tmp2 -= tmp5;
1304     tmp3 -= tmp5;
1305     tmp4 -= tmp5;
1306     z1 = MULTIPLY(tmp0 + tmp3, FIX(1.356927976)) +       /* c2 */
1307 	 MULTIPLY(tmp2 + tmp4, FIX(0.201263574));        /* c10 */
1308     z2 = MULTIPLY(tmp1 - tmp3, FIX(0.926112931));        /* c6 */
1309     z3 = MULTIPLY(tmp0 - tmp1, FIX(1.189712156));        /* c4 */
1310     dataptr[2] = (DCTELEM)
1311       DESCALE(z1 + z2 - MULTIPLY(tmp3, FIX(1.018300590)) /* c2+c8-c6 */
1312 	      - MULTIPLY(tmp4, FIX(1.390975730)),        /* c4+c10 */
1313 	      CONST_BITS-1);
1314     dataptr[4] = (DCTELEM)
1315       DESCALE(z2 + z3 + MULTIPLY(tmp1, FIX(0.062335650)) /* c4-c6-c10 */
1316 	      - MULTIPLY(tmp2, FIX(1.356927976))         /* c2 */
1317 	      + MULTIPLY(tmp4, FIX(0.587485545)),        /* c8 */
1318 	      CONST_BITS-1);
1319     dataptr[6] = (DCTELEM)
1320       DESCALE(z1 + z3 - MULTIPLY(tmp0, FIX(1.620527200)) /* c2+c4-c6 */
1321 	      - MULTIPLY(tmp2, FIX(0.788749120)),        /* c8+c10 */
1322 	      CONST_BITS-1);
1323 
1324     /* Odd part */
1325 
1326     tmp1 = MULTIPLY(tmp10 + tmp11, FIX(1.286413905));    /* c3 */
1327     tmp2 = MULTIPLY(tmp10 + tmp12, FIX(1.068791298));    /* c5 */
1328     tmp3 = MULTIPLY(tmp10 + tmp13, FIX(0.764581576));    /* c7 */
1329     tmp0 = tmp1 + tmp2 + tmp3 - MULTIPLY(tmp10, FIX(1.719967871)) /* c7+c5+c3-c1 */
1330 	   + MULTIPLY(tmp14, FIX(0.398430003));          /* c9 */
1331     tmp4 = MULTIPLY(tmp11 + tmp12, - FIX(0.764581576));  /* -c7 */
1332     tmp5 = MULTIPLY(tmp11 + tmp13, - FIX(1.399818907));  /* -c1 */
1333     tmp1 += tmp4 + tmp5 + MULTIPLY(tmp11, FIX(1.276416582)) /* c9+c7+c1-c3 */
1334 	    - MULTIPLY(tmp14, FIX(1.068791298));         /* c5 */
1335     tmp10 = MULTIPLY(tmp12 + tmp13, FIX(0.398430003));   /* c9 */
1336     tmp2 += tmp4 + tmp10 - MULTIPLY(tmp12, FIX(1.989053629)) /* c9+c5+c3-c7 */
1337 	    + MULTIPLY(tmp14, FIX(1.399818907));         /* c1 */
1338     tmp3 += tmp5 + tmp10 + MULTIPLY(tmp13, FIX(1.305598626)) /* c1+c5-c9-c7 */
1339 	    - MULTIPLY(tmp14, FIX(1.286413905));         /* c3 */
1340 
1341     dataptr[1] = (DCTELEM) DESCALE(tmp0, CONST_BITS-1);
1342     dataptr[3] = (DCTELEM) DESCALE(tmp1, CONST_BITS-1);
1343     dataptr[5] = (DCTELEM) DESCALE(tmp2, CONST_BITS-1);
1344     dataptr[7] = (DCTELEM) DESCALE(tmp3, CONST_BITS-1);
1345 
1346     ctr++;
1347 
1348     if (ctr != DCTSIZE) {
1349       if (ctr == 11)
1350 	break;			/* Done. */
1351       dataptr += DCTSIZE;	/* advance pointer to next row */
1352     } else
1353       dataptr = workspace;	/* switch pointer to extended workspace */
1354   }
1355 
1356   /* Pass 2: process columns.
1357    * We leave the results scaled up by an overall factor of 8.
1358    * We must also scale the output by (8/11)**2 = 64/121, which we partially
1359    * fold into the constant multipliers and final/initial shifting:
1360    * cK now represents sqrt(2) * cos(K*pi/22) * 128/121.
1361    */
1362 
1363   dataptr = data;
1364   wsptr = workspace;
1365   for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
1366     /* Even part */
1367 
1368     tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*2];
1369     tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*1];
1370     tmp2 = dataptr[DCTSIZE*2] + wsptr[DCTSIZE*0];
1371     tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*7];
1372     tmp4 = dataptr[DCTSIZE*4] + dataptr[DCTSIZE*6];
1373     tmp5 = dataptr[DCTSIZE*5];
1374 
1375     tmp10 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*2];
1376     tmp11 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*1];
1377     tmp12 = dataptr[DCTSIZE*2] - wsptr[DCTSIZE*0];
1378     tmp13 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*7];
1379     tmp14 = dataptr[DCTSIZE*4] - dataptr[DCTSIZE*6];
1380 
1381     dataptr[DCTSIZE*0] = (DCTELEM)
1382       DESCALE(MULTIPLY(tmp0 + tmp1 + tmp2 + tmp3 + tmp4 + tmp5,
1383 		       FIX(1.057851240)),                /* 128/121 */
1384 	      CONST_BITS+2);
1385     tmp5 += tmp5;
1386     tmp0 -= tmp5;
1387     tmp1 -= tmp5;
1388     tmp2 -= tmp5;
1389     tmp3 -= tmp5;
1390     tmp4 -= tmp5;
1391     z1 = MULTIPLY(tmp0 + tmp3, FIX(1.435427942)) +       /* c2 */
1392 	 MULTIPLY(tmp2 + tmp4, FIX(0.212906922));        /* c10 */
1393     z2 = MULTIPLY(tmp1 - tmp3, FIX(0.979689713));        /* c6 */
1394     z3 = MULTIPLY(tmp0 - tmp1, FIX(1.258538479));        /* c4 */
1395     dataptr[DCTSIZE*2] = (DCTELEM)
1396       DESCALE(z1 + z2 - MULTIPLY(tmp3, FIX(1.077210542)) /* c2+c8-c6 */
1397 	      - MULTIPLY(tmp4, FIX(1.471445400)),        /* c4+c10 */
1398 	      CONST_BITS+2);
1399     dataptr[DCTSIZE*4] = (DCTELEM)
1400       DESCALE(z2 + z3 + MULTIPLY(tmp1, FIX(0.065941844)) /* c4-c6-c10 */
1401 	      - MULTIPLY(tmp2, FIX(1.435427942))         /* c2 */
1402 	      + MULTIPLY(tmp4, FIX(0.621472312)),        /* c8 */
1403 	      CONST_BITS+2);
1404     dataptr[DCTSIZE*6] = (DCTELEM)
1405       DESCALE(z1 + z3 - MULTIPLY(tmp0, FIX(1.714276708)) /* c2+c4-c6 */
1406 	      - MULTIPLY(tmp2, FIX(0.834379234)),        /* c8+c10 */
1407 	      CONST_BITS+2);
1408 
1409     /* Odd part */
1410 
1411     tmp1 = MULTIPLY(tmp10 + tmp11, FIX(1.360834544));    /* c3 */
1412     tmp2 = MULTIPLY(tmp10 + tmp12, FIX(1.130622199));    /* c5 */
1413     tmp3 = MULTIPLY(tmp10 + tmp13, FIX(0.808813568));    /* c7 */
1414     tmp0 = tmp1 + tmp2 + tmp3 - MULTIPLY(tmp10, FIX(1.819470145)) /* c7+c5+c3-c1 */
1415 	   + MULTIPLY(tmp14, FIX(0.421479672));          /* c9 */
1416     tmp4 = MULTIPLY(tmp11 + tmp12, - FIX(0.808813568));  /* -c7 */
1417     tmp5 = MULTIPLY(tmp11 + tmp13, - FIX(1.480800167));  /* -c1 */
1418     tmp1 += tmp4 + tmp5 + MULTIPLY(tmp11, FIX(1.350258864)) /* c9+c7+c1-c3 */
1419 	    - MULTIPLY(tmp14, FIX(1.130622199));         /* c5 */
1420     tmp10 = MULTIPLY(tmp12 + tmp13, FIX(0.421479672));   /* c9 */
1421     tmp2 += tmp4 + tmp10 - MULTIPLY(tmp12, FIX(2.104122847)) /* c9+c5+c3-c7 */
1422 	    + MULTIPLY(tmp14, FIX(1.480800167));         /* c1 */
1423     tmp3 += tmp5 + tmp10 + MULTIPLY(tmp13, FIX(1.381129125)) /* c1+c5-c9-c7 */
1424 	    - MULTIPLY(tmp14, FIX(1.360834544));         /* c3 */
1425 
1426     dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp0, CONST_BITS+2);
1427     dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp1, CONST_BITS+2);
1428     dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp2, CONST_BITS+2);
1429     dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp3, CONST_BITS+2);
1430 
1431     dataptr++;			/* advance pointer to next column */
1432     wsptr++;			/* advance pointer to next column */
1433   }
1434 }
1435 
1436 
1437 /*
1438  * Perform the forward DCT on a 12x12 sample block.
1439  */
1440 
1441 GLOBAL(void)
jpeg_fdct_12x12(DCTELEM * data,JSAMPARRAY sample_data,JDIMENSION start_col)1442 jpeg_fdct_12x12 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
1443 {
1444   INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
1445   INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
1446   DCTELEM workspace[8*4];
1447   DCTELEM *dataptr;
1448   DCTELEM *wsptr;
1449   JSAMPROW elemptr;
1450   int ctr;
1451   SHIFT_TEMPS
1452 
1453   /* Pass 1: process rows.
1454    * Note results are scaled up by sqrt(8) compared to a true DCT.
1455    * cK represents sqrt(2) * cos(K*pi/24).
1456    */
1457 
1458   dataptr = data;
1459   ctr = 0;
1460   for (;;) {
1461     elemptr = sample_data[ctr] + start_col;
1462 
1463     /* Even part */
1464 
1465     tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[11]);
1466     tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[10]);
1467     tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[9]);
1468     tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[8]);
1469     tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[7]);
1470     tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[6]);
1471 
1472     tmp10 = tmp0 + tmp5;
1473     tmp13 = tmp0 - tmp5;
1474     tmp11 = tmp1 + tmp4;
1475     tmp14 = tmp1 - tmp4;
1476     tmp12 = tmp2 + tmp3;
1477     tmp15 = tmp2 - tmp3;
1478 
1479     tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[11]);
1480     tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[10]);
1481     tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[9]);
1482     tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[8]);
1483     tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[7]);
1484     tmp5 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[6]);
1485 
1486     /* Apply unsigned->signed conversion. */
1487     dataptr[0] = (DCTELEM) (tmp10 + tmp11 + tmp12 - 12 * CENTERJSAMPLE);
1488     dataptr[6] = (DCTELEM) (tmp13 - tmp14 - tmp15);
1489     dataptr[4] = (DCTELEM)
1490       DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.224744871)), /* c4 */
1491 	      CONST_BITS);
1492     dataptr[2] = (DCTELEM)
1493       DESCALE(tmp14 - tmp15 + MULTIPLY(tmp13 + tmp15, FIX(1.366025404)), /* c2 */
1494 	      CONST_BITS);
1495 
1496     /* Odd part */
1497 
1498     tmp10 = MULTIPLY(tmp1 + tmp4, FIX_0_541196100);    /* c9 */
1499     tmp14 = tmp10 + MULTIPLY(tmp1, FIX_0_765366865);   /* c3-c9 */
1500     tmp15 = tmp10 - MULTIPLY(tmp4, FIX_1_847759065);   /* c3+c9 */
1501     tmp12 = MULTIPLY(tmp0 + tmp2, FIX(1.121971054));   /* c5 */
1502     tmp13 = MULTIPLY(tmp0 + tmp3, FIX(0.860918669));   /* c7 */
1503     tmp10 = tmp12 + tmp13 + tmp14 - MULTIPLY(tmp0, FIX(0.580774953)) /* c5+c7-c1 */
1504 	    + MULTIPLY(tmp5, FIX(0.184591911));        /* c11 */
1505     tmp11 = MULTIPLY(tmp2 + tmp3, - FIX(0.184591911)); /* -c11 */
1506     tmp12 += tmp11 - tmp15 - MULTIPLY(tmp2, FIX(2.339493912)) /* c1+c5-c11 */
1507 	    + MULTIPLY(tmp5, FIX(0.860918669));        /* c7 */
1508     tmp13 += tmp11 - tmp14 + MULTIPLY(tmp3, FIX(0.725788011)) /* c1+c11-c7 */
1509 	    - MULTIPLY(tmp5, FIX(1.121971054));        /* c5 */
1510     tmp11 = tmp15 + MULTIPLY(tmp0 - tmp3, FIX(1.306562965)) /* c3 */
1511 	    - MULTIPLY(tmp2 + tmp5, FIX_0_541196100);  /* c9 */
1512 
1513     dataptr[1] = (DCTELEM) DESCALE(tmp10, CONST_BITS);
1514     dataptr[3] = (DCTELEM) DESCALE(tmp11, CONST_BITS);
1515     dataptr[5] = (DCTELEM) DESCALE(tmp12, CONST_BITS);
1516     dataptr[7] = (DCTELEM) DESCALE(tmp13, CONST_BITS);
1517 
1518     ctr++;
1519 
1520     if (ctr != DCTSIZE) {
1521       if (ctr == 12)
1522 	break;			/* Done. */
1523       dataptr += DCTSIZE;	/* advance pointer to next row */
1524     } else
1525       dataptr = workspace;	/* switch pointer to extended workspace */
1526   }
1527 
1528   /* Pass 2: process columns.
1529    * We leave the results scaled up by an overall factor of 8.
1530    * We must also scale the output by (8/12)**2 = 4/9, which we partially
1531    * fold into the constant multipliers and final shifting:
1532    * cK now represents sqrt(2) * cos(K*pi/24) * 8/9.
1533    */
1534 
1535   dataptr = data;
1536   wsptr = workspace;
1537   for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
1538     /* Even part */
1539 
1540     tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*3];
1541     tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*2];
1542     tmp2 = dataptr[DCTSIZE*2] + wsptr[DCTSIZE*1];
1543     tmp3 = dataptr[DCTSIZE*3] + wsptr[DCTSIZE*0];
1544     tmp4 = dataptr[DCTSIZE*4] + dataptr[DCTSIZE*7];
1545     tmp5 = dataptr[DCTSIZE*5] + dataptr[DCTSIZE*6];
1546 
1547     tmp10 = tmp0 + tmp5;
1548     tmp13 = tmp0 - tmp5;
1549     tmp11 = tmp1 + tmp4;
1550     tmp14 = tmp1 - tmp4;
1551     tmp12 = tmp2 + tmp3;
1552     tmp15 = tmp2 - tmp3;
1553 
1554     tmp0 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*3];
1555     tmp1 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*2];
1556     tmp2 = dataptr[DCTSIZE*2] - wsptr[DCTSIZE*1];
1557     tmp3 = dataptr[DCTSIZE*3] - wsptr[DCTSIZE*0];
1558     tmp4 = dataptr[DCTSIZE*4] - dataptr[DCTSIZE*7];
1559     tmp5 = dataptr[DCTSIZE*5] - dataptr[DCTSIZE*6];
1560 
1561     dataptr[DCTSIZE*0] = (DCTELEM)
1562       DESCALE(MULTIPLY(tmp10 + tmp11 + tmp12, FIX(0.888888889)), /* 8/9 */
1563 	      CONST_BITS+1);
1564     dataptr[DCTSIZE*6] = (DCTELEM)
1565       DESCALE(MULTIPLY(tmp13 - tmp14 - tmp15, FIX(0.888888889)), /* 8/9 */
1566 	      CONST_BITS+1);
1567     dataptr[DCTSIZE*4] = (DCTELEM)
1568       DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.088662108)),         /* c4 */
1569 	      CONST_BITS+1);
1570     dataptr[DCTSIZE*2] = (DCTELEM)
1571       DESCALE(MULTIPLY(tmp14 - tmp15, FIX(0.888888889)) +        /* 8/9 */
1572 	      MULTIPLY(tmp13 + tmp15, FIX(1.214244803)),         /* c2 */
1573 	      CONST_BITS+1);
1574 
1575     /* Odd part */
1576 
1577     tmp10 = MULTIPLY(tmp1 + tmp4, FIX(0.481063200));   /* c9 */
1578     tmp14 = tmp10 + MULTIPLY(tmp1, FIX(0.680326102));  /* c3-c9 */
1579     tmp15 = tmp10 - MULTIPLY(tmp4, FIX(1.642452502));  /* c3+c9 */
1580     tmp12 = MULTIPLY(tmp0 + tmp2, FIX(0.997307603));   /* c5 */
1581     tmp13 = MULTIPLY(tmp0 + tmp3, FIX(0.765261039));   /* c7 */
1582     tmp10 = tmp12 + tmp13 + tmp14 - MULTIPLY(tmp0, FIX(0.516244403)) /* c5+c7-c1 */
1583 	    + MULTIPLY(tmp5, FIX(0.164081699));        /* c11 */
1584     tmp11 = MULTIPLY(tmp2 + tmp3, - FIX(0.164081699)); /* -c11 */
1585     tmp12 += tmp11 - tmp15 - MULTIPLY(tmp2, FIX(2.079550144)) /* c1+c5-c11 */
1586 	    + MULTIPLY(tmp5, FIX(0.765261039));        /* c7 */
1587     tmp13 += tmp11 - tmp14 + MULTIPLY(tmp3, FIX(0.645144899)) /* c1+c11-c7 */
1588 	    - MULTIPLY(tmp5, FIX(0.997307603));        /* c5 */
1589     tmp11 = tmp15 + MULTIPLY(tmp0 - tmp3, FIX(1.161389302)) /* c3 */
1590 	    - MULTIPLY(tmp2 + tmp5, FIX(0.481063200)); /* c9 */
1591 
1592     dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp10, CONST_BITS+1);
1593     dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp11, CONST_BITS+1);
1594     dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp12, CONST_BITS+1);
1595     dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp13, CONST_BITS+1);
1596 
1597     dataptr++;			/* advance pointer to next column */
1598     wsptr++;			/* advance pointer to next column */
1599   }
1600 }
1601 
1602 
1603 /*
1604  * Perform the forward DCT on a 13x13 sample block.
1605  */
1606 
1607 GLOBAL(void)
jpeg_fdct_13x13(DCTELEM * data,JSAMPARRAY sample_data,JDIMENSION start_col)1608 jpeg_fdct_13x13 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
1609 {
1610   INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
1611   INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
1612   INT32 z1, z2;
1613   DCTELEM workspace[8*5];
1614   DCTELEM *dataptr;
1615   DCTELEM *wsptr;
1616   JSAMPROW elemptr;
1617   int ctr;
1618   SHIFT_TEMPS
1619 
1620   /* Pass 1: process rows.
1621    * Note results are scaled up by sqrt(8) compared to a true DCT.
1622    * cK represents sqrt(2) * cos(K*pi/26).
1623    */
1624 
1625   dataptr = data;
1626   ctr = 0;
1627   for (;;) {
1628     elemptr = sample_data[ctr] + start_col;
1629 
1630     /* Even part */
1631 
1632     tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[12]);
1633     tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[11]);
1634     tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[10]);
1635     tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[9]);
1636     tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[8]);
1637     tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[7]);
1638     tmp6 = GETJSAMPLE(elemptr[6]);
1639 
1640     tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[12]);
1641     tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[11]);
1642     tmp12 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[10]);
1643     tmp13 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[9]);
1644     tmp14 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[8]);
1645     tmp15 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[7]);
1646 
1647     /* Apply unsigned->signed conversion. */
1648     dataptr[0] = (DCTELEM)
1649       (tmp0 + tmp1 + tmp2 + tmp3 + tmp4 + tmp5 + tmp6 - 13 * CENTERJSAMPLE);
1650     tmp6 += tmp6;
1651     tmp0 -= tmp6;
1652     tmp1 -= tmp6;
1653     tmp2 -= tmp6;
1654     tmp3 -= tmp6;
1655     tmp4 -= tmp6;
1656     tmp5 -= tmp6;
1657     dataptr[2] = (DCTELEM)
1658       DESCALE(MULTIPLY(tmp0, FIX(1.373119086)) +   /* c2 */
1659 	      MULTIPLY(tmp1, FIX(1.058554052)) +   /* c6 */
1660 	      MULTIPLY(tmp2, FIX(0.501487041)) -   /* c10 */
1661 	      MULTIPLY(tmp3, FIX(0.170464608)) -   /* c12 */
1662 	      MULTIPLY(tmp4, FIX(0.803364869)) -   /* c8 */
1663 	      MULTIPLY(tmp5, FIX(1.252223920)),    /* c4 */
1664 	      CONST_BITS);
1665     z1 = MULTIPLY(tmp0 - tmp2, FIX(1.155388986)) - /* (c4+c6)/2 */
1666 	 MULTIPLY(tmp3 - tmp4, FIX(0.435816023)) - /* (c2-c10)/2 */
1667 	 MULTIPLY(tmp1 - tmp5, FIX(0.316450131));  /* (c8-c12)/2 */
1668     z2 = MULTIPLY(tmp0 + tmp2, FIX(0.096834934)) - /* (c4-c6)/2 */
1669 	 MULTIPLY(tmp3 + tmp4, FIX(0.937303064)) + /* (c2+c10)/2 */
1670 	 MULTIPLY(tmp1 + tmp5, FIX(0.486914739));  /* (c8+c12)/2 */
1671 
1672     dataptr[4] = (DCTELEM) DESCALE(z1 + z2, CONST_BITS);
1673     dataptr[6] = (DCTELEM) DESCALE(z1 - z2, CONST_BITS);
1674 
1675     /* Odd part */
1676 
1677     tmp1 = MULTIPLY(tmp10 + tmp11, FIX(1.322312651));   /* c3 */
1678     tmp2 = MULTIPLY(tmp10 + tmp12, FIX(1.163874945));   /* c5 */
1679     tmp3 = MULTIPLY(tmp10 + tmp13, FIX(0.937797057)) +  /* c7 */
1680 	   MULTIPLY(tmp14 + tmp15, FIX(0.338443458));   /* c11 */
1681     tmp0 = tmp1 + tmp2 + tmp3 -
1682 	   MULTIPLY(tmp10, FIX(2.020082300)) +          /* c3+c5+c7-c1 */
1683 	   MULTIPLY(tmp14, FIX(0.318774355));           /* c9-c11 */
1684     tmp4 = MULTIPLY(tmp14 - tmp15, FIX(0.937797057)) -  /* c7 */
1685 	   MULTIPLY(tmp11 + tmp12, FIX(0.338443458));   /* c11 */
1686     tmp5 = MULTIPLY(tmp11 + tmp13, - FIX(1.163874945)); /* -c5 */
1687     tmp1 += tmp4 + tmp5 +
1688 	    MULTIPLY(tmp11, FIX(0.837223564)) -         /* c5+c9+c11-c3 */
1689 	    MULTIPLY(tmp14, FIX(2.341699410));          /* c1+c7 */
1690     tmp6 = MULTIPLY(tmp12 + tmp13, - FIX(0.657217813)); /* -c9 */
1691     tmp2 += tmp4 + tmp6 -
1692 	    MULTIPLY(tmp12, FIX(1.572116027)) +         /* c1+c5-c9-c11 */
1693 	    MULTIPLY(tmp15, FIX(2.260109708));          /* c3+c7 */
1694     tmp3 += tmp5 + tmp6 +
1695 	    MULTIPLY(tmp13, FIX(2.205608352)) -         /* c3+c5+c9-c7 */
1696 	    MULTIPLY(tmp15, FIX(1.742345811));          /* c1+c11 */
1697 
1698     dataptr[1] = (DCTELEM) DESCALE(tmp0, CONST_BITS);
1699     dataptr[3] = (DCTELEM) DESCALE(tmp1, CONST_BITS);
1700     dataptr[5] = (DCTELEM) DESCALE(tmp2, CONST_BITS);
1701     dataptr[7] = (DCTELEM) DESCALE(tmp3, CONST_BITS);
1702 
1703     ctr++;
1704 
1705     if (ctr != DCTSIZE) {
1706       if (ctr == 13)
1707 	break;			/* Done. */
1708       dataptr += DCTSIZE;	/* advance pointer to next row */
1709     } else
1710       dataptr = workspace;	/* switch pointer to extended workspace */
1711   }
1712 
1713   /* Pass 2: process columns.
1714    * We leave the results scaled up by an overall factor of 8.
1715    * We must also scale the output by (8/13)**2 = 64/169, which we partially
1716    * fold into the constant multipliers and final shifting:
1717    * cK now represents sqrt(2) * cos(K*pi/26) * 128/169.
1718    */
1719 
1720   dataptr = data;
1721   wsptr = workspace;
1722   for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
1723     /* Even part */
1724 
1725     tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*4];
1726     tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*3];
1727     tmp2 = dataptr[DCTSIZE*2] + wsptr[DCTSIZE*2];
1728     tmp3 = dataptr[DCTSIZE*3] + wsptr[DCTSIZE*1];
1729     tmp4 = dataptr[DCTSIZE*4] + wsptr[DCTSIZE*0];
1730     tmp5 = dataptr[DCTSIZE*5] + dataptr[DCTSIZE*7];
1731     tmp6 = dataptr[DCTSIZE*6];
1732 
1733     tmp10 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*4];
1734     tmp11 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*3];
1735     tmp12 = dataptr[DCTSIZE*2] - wsptr[DCTSIZE*2];
1736     tmp13 = dataptr[DCTSIZE*3] - wsptr[DCTSIZE*1];
1737     tmp14 = dataptr[DCTSIZE*4] - wsptr[DCTSIZE*0];
1738     tmp15 = dataptr[DCTSIZE*5] - dataptr[DCTSIZE*7];
1739 
1740     dataptr[DCTSIZE*0] = (DCTELEM)
1741       DESCALE(MULTIPLY(tmp0 + tmp1 + tmp2 + tmp3 + tmp4 + tmp5 + tmp6,
1742 		       FIX(0.757396450)),          /* 128/169 */
1743 	      CONST_BITS+1);
1744     tmp6 += tmp6;
1745     tmp0 -= tmp6;
1746     tmp1 -= tmp6;
1747     tmp2 -= tmp6;
1748     tmp3 -= tmp6;
1749     tmp4 -= tmp6;
1750     tmp5 -= tmp6;
1751     dataptr[DCTSIZE*2] = (DCTELEM)
1752       DESCALE(MULTIPLY(tmp0, FIX(1.039995521)) +   /* c2 */
1753 	      MULTIPLY(tmp1, FIX(0.801745081)) +   /* c6 */
1754 	      MULTIPLY(tmp2, FIX(0.379824504)) -   /* c10 */
1755 	      MULTIPLY(tmp3, FIX(0.129109289)) -   /* c12 */
1756 	      MULTIPLY(tmp4, FIX(0.608465700)) -   /* c8 */
1757 	      MULTIPLY(tmp5, FIX(0.948429952)),    /* c4 */
1758 	      CONST_BITS+1);
1759     z1 = MULTIPLY(tmp0 - tmp2, FIX(0.875087516)) - /* (c4+c6)/2 */
1760 	 MULTIPLY(tmp3 - tmp4, FIX(0.330085509)) - /* (c2-c10)/2 */
1761 	 MULTIPLY(tmp1 - tmp5, FIX(0.239678205));  /* (c8-c12)/2 */
1762     z2 = MULTIPLY(tmp0 + tmp2, FIX(0.073342435)) - /* (c4-c6)/2 */
1763 	 MULTIPLY(tmp3 + tmp4, FIX(0.709910013)) + /* (c2+c10)/2 */
1764 	 MULTIPLY(tmp1 + tmp5, FIX(0.368787494));  /* (c8+c12)/2 */
1765 
1766     dataptr[DCTSIZE*4] = (DCTELEM) DESCALE(z1 + z2, CONST_BITS+1);
1767     dataptr[DCTSIZE*6] = (DCTELEM) DESCALE(z1 - z2, CONST_BITS+1);
1768 
1769     /* Odd part */
1770 
1771     tmp1 = MULTIPLY(tmp10 + tmp11, FIX(1.001514908));   /* c3 */
1772     tmp2 = MULTIPLY(tmp10 + tmp12, FIX(0.881514751));   /* c5 */
1773     tmp3 = MULTIPLY(tmp10 + tmp13, FIX(0.710284161)) +  /* c7 */
1774 	   MULTIPLY(tmp14 + tmp15, FIX(0.256335874));   /* c11 */
1775     tmp0 = tmp1 + tmp2 + tmp3 -
1776 	   MULTIPLY(tmp10, FIX(1.530003162)) +          /* c3+c5+c7-c1 */
1777 	   MULTIPLY(tmp14, FIX(0.241438564));           /* c9-c11 */
1778     tmp4 = MULTIPLY(tmp14 - tmp15, FIX(0.710284161)) -  /* c7 */
1779 	   MULTIPLY(tmp11 + tmp12, FIX(0.256335874));   /* c11 */
1780     tmp5 = MULTIPLY(tmp11 + tmp13, - FIX(0.881514751)); /* -c5 */
1781     tmp1 += tmp4 + tmp5 +
1782 	    MULTIPLY(tmp11, FIX(0.634110155)) -         /* c5+c9+c11-c3 */
1783 	    MULTIPLY(tmp14, FIX(1.773594819));          /* c1+c7 */
1784     tmp6 = MULTIPLY(tmp12 + tmp13, - FIX(0.497774438)); /* -c9 */
1785     tmp2 += tmp4 + tmp6 -
1786 	    MULTIPLY(tmp12, FIX(1.190715098)) +         /* c1+c5-c9-c11 */
1787 	    MULTIPLY(tmp15, FIX(1.711799069));          /* c3+c7 */
1788     tmp3 += tmp5 + tmp6 +
1789 	    MULTIPLY(tmp13, FIX(1.670519935)) -         /* c3+c5+c9-c7 */
1790 	    MULTIPLY(tmp15, FIX(1.319646532));          /* c1+c11 */
1791 
1792     dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp0, CONST_BITS+1);
1793     dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp1, CONST_BITS+1);
1794     dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp2, CONST_BITS+1);
1795     dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp3, CONST_BITS+1);
1796 
1797     dataptr++;			/* advance pointer to next column */
1798     wsptr++;			/* advance pointer to next column */
1799   }
1800 }
1801 
1802 
1803 /*
1804  * Perform the forward DCT on a 14x14 sample block.
1805  */
1806 
1807 GLOBAL(void)
jpeg_fdct_14x14(DCTELEM * data,JSAMPARRAY sample_data,JDIMENSION start_col)1808 jpeg_fdct_14x14 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
1809 {
1810   INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
1811   INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
1812   DCTELEM workspace[8*6];
1813   DCTELEM *dataptr;
1814   DCTELEM *wsptr;
1815   JSAMPROW elemptr;
1816   int ctr;
1817   SHIFT_TEMPS
1818 
1819   /* Pass 1: process rows.
1820    * Note results are scaled up by sqrt(8) compared to a true DCT.
1821    * cK represents sqrt(2) * cos(K*pi/28).
1822    */
1823 
1824   dataptr = data;
1825   ctr = 0;
1826   for (;;) {
1827     elemptr = sample_data[ctr] + start_col;
1828 
1829     /* Even part */
1830 
1831     tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[13]);
1832     tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[12]);
1833     tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[11]);
1834     tmp13 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[10]);
1835     tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[9]);
1836     tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[8]);
1837     tmp6 = GETJSAMPLE(elemptr[6]) + GETJSAMPLE(elemptr[7]);
1838 
1839     tmp10 = tmp0 + tmp6;
1840     tmp14 = tmp0 - tmp6;
1841     tmp11 = tmp1 + tmp5;
1842     tmp15 = tmp1 - tmp5;
1843     tmp12 = tmp2 + tmp4;
1844     tmp16 = tmp2 - tmp4;
1845 
1846     tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[13]);
1847     tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[12]);
1848     tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[11]);
1849     tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[10]);
1850     tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[9]);
1851     tmp5 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[8]);
1852     tmp6 = GETJSAMPLE(elemptr[6]) - GETJSAMPLE(elemptr[7]);
1853 
1854     /* Apply unsigned->signed conversion. */
1855     dataptr[0] = (DCTELEM)
1856       (tmp10 + tmp11 + tmp12 + tmp13 - 14 * CENTERJSAMPLE);
1857     tmp13 += tmp13;
1858     dataptr[4] = (DCTELEM)
1859       DESCALE(MULTIPLY(tmp10 - tmp13, FIX(1.274162392)) + /* c4 */
1860 	      MULTIPLY(tmp11 - tmp13, FIX(0.314692123)) - /* c12 */
1861 	      MULTIPLY(tmp12 - tmp13, FIX(0.881747734)),  /* c8 */
1862 	      CONST_BITS);
1863 
1864     tmp10 = MULTIPLY(tmp14 + tmp15, FIX(1.105676686));    /* c6 */
1865 
1866     dataptr[2] = (DCTELEM)
1867       DESCALE(tmp10 + MULTIPLY(tmp14, FIX(0.273079590))   /* c2-c6 */
1868 	      + MULTIPLY(tmp16, FIX(0.613604268)),        /* c10 */
1869 	      CONST_BITS);
1870     dataptr[6] = (DCTELEM)
1871       DESCALE(tmp10 - MULTIPLY(tmp15, FIX(1.719280954))   /* c6+c10 */
1872 	      - MULTIPLY(tmp16, FIX(1.378756276)),        /* c2 */
1873 	      CONST_BITS);
1874 
1875     /* Odd part */
1876 
1877     tmp10 = tmp1 + tmp2;
1878     tmp11 = tmp5 - tmp4;
1879     dataptr[7] = (DCTELEM) (tmp0 - tmp10 + tmp3 - tmp11 - tmp6);
1880     tmp3 <<= CONST_BITS;
1881     tmp10 = MULTIPLY(tmp10, - FIX(0.158341681));          /* -c13 */
1882     tmp11 = MULTIPLY(tmp11, FIX(1.405321284));            /* c1 */
1883     tmp10 += tmp11 - tmp3;
1884     tmp11 = MULTIPLY(tmp0 + tmp2, FIX(1.197448846)) +     /* c5 */
1885 	    MULTIPLY(tmp4 + tmp6, FIX(0.752406978));      /* c9 */
1886     dataptr[5] = (DCTELEM)
1887       DESCALE(tmp10 + tmp11 - MULTIPLY(tmp2, FIX(2.373959773)) /* c3+c5-c13 */
1888 	      + MULTIPLY(tmp4, FIX(1.119999435)),         /* c1+c11-c9 */
1889 	      CONST_BITS);
1890     tmp12 = MULTIPLY(tmp0 + tmp1, FIX(1.334852607)) +     /* c3 */
1891 	    MULTIPLY(tmp5 - tmp6, FIX(0.467085129));      /* c11 */
1892     dataptr[3] = (DCTELEM)
1893       DESCALE(tmp10 + tmp12 - MULTIPLY(tmp1, FIX(0.424103948)) /* c3-c9-c13 */
1894 	      - MULTIPLY(tmp5, FIX(3.069855259)),         /* c1+c5+c11 */
1895 	      CONST_BITS);
1896     dataptr[1] = (DCTELEM)
1897       DESCALE(tmp11 + tmp12 + tmp3 + tmp6 -
1898 	      MULTIPLY(tmp0 + tmp6, FIX(1.126980169)),    /* c3+c5-c1 */
1899 	      CONST_BITS);
1900 
1901     ctr++;
1902 
1903     if (ctr != DCTSIZE) {
1904       if (ctr == 14)
1905 	break;			/* Done. */
1906       dataptr += DCTSIZE;	/* advance pointer to next row */
1907     } else
1908       dataptr = workspace;	/* switch pointer to extended workspace */
1909   }
1910 
1911   /* Pass 2: process columns.
1912    * We leave the results scaled up by an overall factor of 8.
1913    * We must also scale the output by (8/14)**2 = 16/49, which we partially
1914    * fold into the constant multipliers and final shifting:
1915    * cK now represents sqrt(2) * cos(K*pi/28) * 32/49.
1916    */
1917 
1918   dataptr = data;
1919   wsptr = workspace;
1920   for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
1921     /* Even part */
1922 
1923     tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*5];
1924     tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*4];
1925     tmp2 = dataptr[DCTSIZE*2] + wsptr[DCTSIZE*3];
1926     tmp13 = dataptr[DCTSIZE*3] + wsptr[DCTSIZE*2];
1927     tmp4 = dataptr[DCTSIZE*4] + wsptr[DCTSIZE*1];
1928     tmp5 = dataptr[DCTSIZE*5] + wsptr[DCTSIZE*0];
1929     tmp6 = dataptr[DCTSIZE*6] + dataptr[DCTSIZE*7];
1930 
1931     tmp10 = tmp0 + tmp6;
1932     tmp14 = tmp0 - tmp6;
1933     tmp11 = tmp1 + tmp5;
1934     tmp15 = tmp1 - tmp5;
1935     tmp12 = tmp2 + tmp4;
1936     tmp16 = tmp2 - tmp4;
1937 
1938     tmp0 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*5];
1939     tmp1 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*4];
1940     tmp2 = dataptr[DCTSIZE*2] - wsptr[DCTSIZE*3];
1941     tmp3 = dataptr[DCTSIZE*3] - wsptr[DCTSIZE*2];
1942     tmp4 = dataptr[DCTSIZE*4] - wsptr[DCTSIZE*1];
1943     tmp5 = dataptr[DCTSIZE*5] - wsptr[DCTSIZE*0];
1944     tmp6 = dataptr[DCTSIZE*6] - dataptr[DCTSIZE*7];
1945 
1946     dataptr[DCTSIZE*0] = (DCTELEM)
1947       DESCALE(MULTIPLY(tmp10 + tmp11 + tmp12 + tmp13,
1948 		       FIX(0.653061224)),                 /* 32/49 */
1949 	      CONST_BITS+1);
1950     tmp13 += tmp13;
1951     dataptr[DCTSIZE*4] = (DCTELEM)
1952       DESCALE(MULTIPLY(tmp10 - tmp13, FIX(0.832106052)) + /* c4 */
1953 	      MULTIPLY(tmp11 - tmp13, FIX(0.205513223)) - /* c12 */
1954 	      MULTIPLY(tmp12 - tmp13, FIX(0.575835255)),  /* c8 */
1955 	      CONST_BITS+1);
1956 
1957     tmp10 = MULTIPLY(tmp14 + tmp15, FIX(0.722074570));    /* c6 */
1958 
1959     dataptr[DCTSIZE*2] = (DCTELEM)
1960       DESCALE(tmp10 + MULTIPLY(tmp14, FIX(0.178337691))   /* c2-c6 */
1961 	      + MULTIPLY(tmp16, FIX(0.400721155)),        /* c10 */
1962 	      CONST_BITS+1);
1963     dataptr[DCTSIZE*6] = (DCTELEM)
1964       DESCALE(tmp10 - MULTIPLY(tmp15, FIX(1.122795725))   /* c6+c10 */
1965 	      - MULTIPLY(tmp16, FIX(0.900412262)),        /* c2 */
1966 	      CONST_BITS+1);
1967 
1968     /* Odd part */
1969 
1970     tmp10 = tmp1 + tmp2;
1971     tmp11 = tmp5 - tmp4;
1972     dataptr[DCTSIZE*7] = (DCTELEM)
1973       DESCALE(MULTIPLY(tmp0 - tmp10 + tmp3 - tmp11 - tmp6,
1974 		       FIX(0.653061224)),                 /* 32/49 */
1975 	      CONST_BITS+1);
1976     tmp3  = MULTIPLY(tmp3 , FIX(0.653061224));            /* 32/49 */
1977     tmp10 = MULTIPLY(tmp10, - FIX(0.103406812));          /* -c13 */
1978     tmp11 = MULTIPLY(tmp11, FIX(0.917760839));            /* c1 */
1979     tmp10 += tmp11 - tmp3;
1980     tmp11 = MULTIPLY(tmp0 + tmp2, FIX(0.782007410)) +     /* c5 */
1981 	    MULTIPLY(tmp4 + tmp6, FIX(0.491367823));      /* c9 */
1982     dataptr[DCTSIZE*5] = (DCTELEM)
1983       DESCALE(tmp10 + tmp11 - MULTIPLY(tmp2, FIX(1.550341076)) /* c3+c5-c13 */
1984 	      + MULTIPLY(tmp4, FIX(0.731428202)),         /* c1+c11-c9 */
1985 	      CONST_BITS+1);
1986     tmp12 = MULTIPLY(tmp0 + tmp1, FIX(0.871740478)) +     /* c3 */
1987 	    MULTIPLY(tmp5 - tmp6, FIX(0.305035186));      /* c11 */
1988     dataptr[DCTSIZE*3] = (DCTELEM)
1989       DESCALE(tmp10 + tmp12 - MULTIPLY(tmp1, FIX(0.276965844)) /* c3-c9-c13 */
1990 	      - MULTIPLY(tmp5, FIX(2.004803435)),         /* c1+c5+c11 */
1991 	      CONST_BITS+1);
1992     dataptr[DCTSIZE*1] = (DCTELEM)
1993       DESCALE(tmp11 + tmp12 + tmp3
1994 	      - MULTIPLY(tmp0, FIX(0.735987049))          /* c3+c5-c1 */
1995 	      - MULTIPLY(tmp6, FIX(0.082925825)),         /* c9-c11-c13 */
1996 	      CONST_BITS+1);
1997 
1998     dataptr++;			/* advance pointer to next column */
1999     wsptr++;			/* advance pointer to next column */
2000   }
2001 }
2002 
2003 
2004 /*
2005  * Perform the forward DCT on a 15x15 sample block.
2006  */
2007 
2008 GLOBAL(void)
jpeg_fdct_15x15(DCTELEM * data,JSAMPARRAY sample_data,JDIMENSION start_col)2009 jpeg_fdct_15x15 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
2010 {
2011   INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
2012   INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
2013   INT32 z1, z2, z3;
2014   DCTELEM workspace[8*7];
2015   DCTELEM *dataptr;
2016   DCTELEM *wsptr;
2017   JSAMPROW elemptr;
2018   int ctr;
2019   SHIFT_TEMPS
2020 
2021   /* Pass 1: process rows.
2022    * Note results are scaled up by sqrt(8) compared to a true DCT.
2023    * cK represents sqrt(2) * cos(K*pi/30).
2024    */
2025 
2026   dataptr = data;
2027   ctr = 0;
2028   for (;;) {
2029     elemptr = sample_data[ctr] + start_col;
2030 
2031     /* Even part */
2032 
2033     tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[14]);
2034     tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[13]);
2035     tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[12]);
2036     tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[11]);
2037     tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[10]);
2038     tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[9]);
2039     tmp6 = GETJSAMPLE(elemptr[6]) + GETJSAMPLE(elemptr[8]);
2040     tmp7 = GETJSAMPLE(elemptr[7]);
2041 
2042     tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[14]);
2043     tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[13]);
2044     tmp12 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[12]);
2045     tmp13 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[11]);
2046     tmp14 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[10]);
2047     tmp15 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[9]);
2048     tmp16 = GETJSAMPLE(elemptr[6]) - GETJSAMPLE(elemptr[8]);
2049 
2050     z1 = tmp0 + tmp4 + tmp5;
2051     z2 = tmp1 + tmp3 + tmp6;
2052     z3 = tmp2 + tmp7;
2053     /* Apply unsigned->signed conversion. */
2054     dataptr[0] = (DCTELEM) (z1 + z2 + z3 - 15 * CENTERJSAMPLE);
2055     z3 += z3;
2056     dataptr[6] = (DCTELEM)
2057       DESCALE(MULTIPLY(z1 - z3, FIX(1.144122806)) - /* c6 */
2058 	      MULTIPLY(z2 - z3, FIX(0.437016024)),  /* c12 */
2059 	      CONST_BITS);
2060     tmp2 += ((tmp1 + tmp4) >> 1) - tmp7 - tmp7;
2061     z1 = MULTIPLY(tmp3 - tmp2, FIX(1.531135173)) -  /* c2+c14 */
2062          MULTIPLY(tmp6 - tmp2, FIX(2.238241955));   /* c4+c8 */
2063     z2 = MULTIPLY(tmp5 - tmp2, FIX(0.798468008)) -  /* c8-c14 */
2064 	 MULTIPLY(tmp0 - tmp2, FIX(0.091361227));   /* c2-c4 */
2065     z3 = MULTIPLY(tmp0 - tmp3, FIX(1.383309603)) +  /* c2 */
2066 	 MULTIPLY(tmp6 - tmp5, FIX(0.946293579)) +  /* c8 */
2067 	 MULTIPLY(tmp1 - tmp4, FIX(0.790569415));   /* (c6+c12)/2 */
2068 
2069     dataptr[2] = (DCTELEM) DESCALE(z1 + z3, CONST_BITS);
2070     dataptr[4] = (DCTELEM) DESCALE(z2 + z3, CONST_BITS);
2071 
2072     /* Odd part */
2073 
2074     tmp2 = MULTIPLY(tmp10 - tmp12 - tmp13 + tmp15 + tmp16,
2075 		    FIX(1.224744871));                         /* c5 */
2076     tmp1 = MULTIPLY(tmp10 - tmp14 - tmp15, FIX(1.344997024)) + /* c3 */
2077 	   MULTIPLY(tmp11 - tmp13 - tmp16, FIX(0.831253876));  /* c9 */
2078     tmp12 = MULTIPLY(tmp12, FIX(1.224744871));                 /* c5 */
2079     tmp4 = MULTIPLY(tmp10 - tmp16, FIX(1.406466353)) +         /* c1 */
2080 	   MULTIPLY(tmp11 + tmp14, FIX(1.344997024)) +         /* c3 */
2081 	   MULTIPLY(tmp13 + tmp15, FIX(0.575212477));          /* c11 */
2082     tmp0 = MULTIPLY(tmp13, FIX(0.475753014)) -                 /* c7-c11 */
2083 	   MULTIPLY(tmp14, FIX(0.513743148)) +                 /* c3-c9 */
2084 	   MULTIPLY(tmp16, FIX(1.700497885)) + tmp4 + tmp12;   /* c1+c13 */
2085     tmp3 = MULTIPLY(tmp10, - FIX(0.355500862)) -               /* -(c1-c7) */
2086 	   MULTIPLY(tmp11, FIX(2.176250899)) -                 /* c3+c9 */
2087 	   MULTIPLY(tmp15, FIX(0.869244010)) + tmp4 - tmp12;   /* c11+c13 */
2088 
2089     dataptr[1] = (DCTELEM) DESCALE(tmp0, CONST_BITS);
2090     dataptr[3] = (DCTELEM) DESCALE(tmp1, CONST_BITS);
2091     dataptr[5] = (DCTELEM) DESCALE(tmp2, CONST_BITS);
2092     dataptr[7] = (DCTELEM) DESCALE(tmp3, CONST_BITS);
2093 
2094     ctr++;
2095 
2096     if (ctr != DCTSIZE) {
2097       if (ctr == 15)
2098 	break;			/* Done. */
2099       dataptr += DCTSIZE;	/* advance pointer to next row */
2100     } else
2101       dataptr = workspace;	/* switch pointer to extended workspace */
2102   }
2103 
2104   /* Pass 2: process columns.
2105    * We leave the results scaled up by an overall factor of 8.
2106    * We must also scale the output by (8/15)**2 = 64/225, which we partially
2107    * fold into the constant multipliers and final shifting:
2108    * cK now represents sqrt(2) * cos(K*pi/30) * 256/225.
2109    */
2110 
2111   dataptr = data;
2112   wsptr = workspace;
2113   for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
2114     /* Even part */
2115 
2116     tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*6];
2117     tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*5];
2118     tmp2 = dataptr[DCTSIZE*2] + wsptr[DCTSIZE*4];
2119     tmp3 = dataptr[DCTSIZE*3] + wsptr[DCTSIZE*3];
2120     tmp4 = dataptr[DCTSIZE*4] + wsptr[DCTSIZE*2];
2121     tmp5 = dataptr[DCTSIZE*5] + wsptr[DCTSIZE*1];
2122     tmp6 = dataptr[DCTSIZE*6] + wsptr[DCTSIZE*0];
2123     tmp7 = dataptr[DCTSIZE*7];
2124 
2125     tmp10 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*6];
2126     tmp11 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*5];
2127     tmp12 = dataptr[DCTSIZE*2] - wsptr[DCTSIZE*4];
2128     tmp13 = dataptr[DCTSIZE*3] - wsptr[DCTSIZE*3];
2129     tmp14 = dataptr[DCTSIZE*4] - wsptr[DCTSIZE*2];
2130     tmp15 = dataptr[DCTSIZE*5] - wsptr[DCTSIZE*1];
2131     tmp16 = dataptr[DCTSIZE*6] - wsptr[DCTSIZE*0];
2132 
2133     z1 = tmp0 + tmp4 + tmp5;
2134     z2 = tmp1 + tmp3 + tmp6;
2135     z3 = tmp2 + tmp7;
2136     dataptr[DCTSIZE*0] = (DCTELEM)
2137       DESCALE(MULTIPLY(z1 + z2 + z3, FIX(1.137777778)), /* 256/225 */
2138 	      CONST_BITS+2);
2139     z3 += z3;
2140     dataptr[DCTSIZE*6] = (DCTELEM)
2141       DESCALE(MULTIPLY(z1 - z3, FIX(1.301757503)) - /* c6 */
2142 	      MULTIPLY(z2 - z3, FIX(0.497227121)),  /* c12 */
2143 	      CONST_BITS+2);
2144     tmp2 += ((tmp1 + tmp4) >> 1) - tmp7 - tmp7;
2145     z1 = MULTIPLY(tmp3 - tmp2, FIX(1.742091575)) -  /* c2+c14 */
2146          MULTIPLY(tmp6 - tmp2, FIX(2.546621957));   /* c4+c8 */
2147     z2 = MULTIPLY(tmp5 - tmp2, FIX(0.908479156)) -  /* c8-c14 */
2148 	 MULTIPLY(tmp0 - tmp2, FIX(0.103948774));   /* c2-c4 */
2149     z3 = MULTIPLY(tmp0 - tmp3, FIX(1.573898926)) +  /* c2 */
2150 	 MULTIPLY(tmp6 - tmp5, FIX(1.076671805)) +  /* c8 */
2151 	 MULTIPLY(tmp1 - tmp4, FIX(0.899492312));   /* (c6+c12)/2 */
2152 
2153     dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(z1 + z3, CONST_BITS+2);
2154     dataptr[DCTSIZE*4] = (DCTELEM) DESCALE(z2 + z3, CONST_BITS+2);
2155 
2156     /* Odd part */
2157 
2158     tmp2 = MULTIPLY(tmp10 - tmp12 - tmp13 + tmp15 + tmp16,
2159 		    FIX(1.393487498));                         /* c5 */
2160     tmp1 = MULTIPLY(tmp10 - tmp14 - tmp15, FIX(1.530307725)) + /* c3 */
2161 	   MULTIPLY(tmp11 - tmp13 - tmp16, FIX(0.945782187));  /* c9 */
2162     tmp12 = MULTIPLY(tmp12, FIX(1.393487498));                 /* c5 */
2163     tmp4 = MULTIPLY(tmp10 - tmp16, FIX(1.600246161)) +         /* c1 */
2164 	   MULTIPLY(tmp11 + tmp14, FIX(1.530307725)) +         /* c3 */
2165 	   MULTIPLY(tmp13 + tmp15, FIX(0.654463974));          /* c11 */
2166     tmp0 = MULTIPLY(tmp13, FIX(0.541301207)) -                 /* c7-c11 */
2167 	   MULTIPLY(tmp14, FIX(0.584525538)) +                 /* c3-c9 */
2168 	   MULTIPLY(tmp16, FIX(1.934788705)) + tmp4 + tmp12;   /* c1+c13 */
2169     tmp3 = MULTIPLY(tmp10, - FIX(0.404480980)) -               /* -(c1-c7) */
2170 	   MULTIPLY(tmp11, FIX(2.476089912)) -                 /* c3+c9 */
2171 	   MULTIPLY(tmp15, FIX(0.989006518)) + tmp4 - tmp12;   /* c11+c13 */
2172 
2173     dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp0, CONST_BITS+2);
2174     dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp1, CONST_BITS+2);
2175     dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp2, CONST_BITS+2);
2176     dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp3, CONST_BITS+2);
2177 
2178     dataptr++;			/* advance pointer to next column */
2179     wsptr++;			/* advance pointer to next column */
2180   }
2181 }
2182 
2183 
2184 /*
2185  * Perform the forward DCT on a 16x16 sample block.
2186  */
2187 
2188 GLOBAL(void)
jpeg_fdct_16x16(DCTELEM * data,JSAMPARRAY sample_data,JDIMENSION start_col)2189 jpeg_fdct_16x16 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
2190 {
2191   INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
2192   INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16, tmp17;
2193   DCTELEM workspace[DCTSIZE2];
2194   DCTELEM *dataptr;
2195   DCTELEM *wsptr;
2196   JSAMPROW elemptr;
2197   int ctr;
2198   SHIFT_TEMPS
2199 
2200   /* Pass 1: process rows.
2201    * Note results are scaled up by sqrt(8) compared to a true DCT;
2202    * furthermore, we scale the results by 2**PASS1_BITS.
2203    * cK represents sqrt(2) * cos(K*pi/32).
2204    */
2205 
2206   dataptr = data;
2207   ctr = 0;
2208   for (;;) {
2209     elemptr = sample_data[ctr] + start_col;
2210 
2211     /* Even part */
2212 
2213     tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[15]);
2214     tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[14]);
2215     tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[13]);
2216     tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[12]);
2217     tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[11]);
2218     tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[10]);
2219     tmp6 = GETJSAMPLE(elemptr[6]) + GETJSAMPLE(elemptr[9]);
2220     tmp7 = GETJSAMPLE(elemptr[7]) + GETJSAMPLE(elemptr[8]);
2221 
2222     tmp10 = tmp0 + tmp7;
2223     tmp14 = tmp0 - tmp7;
2224     tmp11 = tmp1 + tmp6;
2225     tmp15 = tmp1 - tmp6;
2226     tmp12 = tmp2 + tmp5;
2227     tmp16 = tmp2 - tmp5;
2228     tmp13 = tmp3 + tmp4;
2229     tmp17 = tmp3 - tmp4;
2230 
2231     tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[15]);
2232     tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[14]);
2233     tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[13]);
2234     tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[12]);
2235     tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[11]);
2236     tmp5 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[10]);
2237     tmp6 = GETJSAMPLE(elemptr[6]) - GETJSAMPLE(elemptr[9]);
2238     tmp7 = GETJSAMPLE(elemptr[7]) - GETJSAMPLE(elemptr[8]);
2239 
2240     /* Apply unsigned->signed conversion. */
2241     dataptr[0] = (DCTELEM)
2242       ((tmp10 + tmp11 + tmp12 + tmp13 - 16 * CENTERJSAMPLE) << PASS1_BITS);
2243     dataptr[4] = (DCTELEM)
2244       DESCALE(MULTIPLY(tmp10 - tmp13, FIX(1.306562965)) + /* c4[16] = c2[8] */
2245 	      MULTIPLY(tmp11 - tmp12, FIX_0_541196100),   /* c12[16] = c6[8] */
2246 	      CONST_BITS-PASS1_BITS);
2247 
2248     tmp10 = MULTIPLY(tmp17 - tmp15, FIX(0.275899379)) +   /* c14[16] = c7[8] */
2249 	    MULTIPLY(tmp14 - tmp16, FIX(1.387039845));    /* c2[16] = c1[8] */
2250 
2251     dataptr[2] = (DCTELEM)
2252       DESCALE(tmp10 + MULTIPLY(tmp15, FIX(1.451774982))   /* c6+c14 */
2253 	      + MULTIPLY(tmp16, FIX(2.172734804)),        /* c2+c10 */
2254 	      CONST_BITS-PASS1_BITS);
2255     dataptr[6] = (DCTELEM)
2256       DESCALE(tmp10 - MULTIPLY(tmp14, FIX(0.211164243))   /* c2-c6 */
2257 	      - MULTIPLY(tmp17, FIX(1.061594338)),        /* c10+c14 */
2258 	      CONST_BITS-PASS1_BITS);
2259 
2260     /* Odd part */
2261 
2262     tmp11 = MULTIPLY(tmp0 + tmp1, FIX(1.353318001)) +         /* c3 */
2263 	    MULTIPLY(tmp6 - tmp7, FIX(0.410524528));          /* c13 */
2264     tmp12 = MULTIPLY(tmp0 + tmp2, FIX(1.247225013)) +         /* c5 */
2265 	    MULTIPLY(tmp5 + tmp7, FIX(0.666655658));          /* c11 */
2266     tmp13 = MULTIPLY(tmp0 + tmp3, FIX(1.093201867)) +         /* c7 */
2267 	    MULTIPLY(tmp4 - tmp7, FIX(0.897167586));          /* c9 */
2268     tmp14 = MULTIPLY(tmp1 + tmp2, FIX(0.138617169)) +         /* c15 */
2269 	    MULTIPLY(tmp6 - tmp5, FIX(1.407403738));          /* c1 */
2270     tmp15 = MULTIPLY(tmp1 + tmp3, - FIX(0.666655658)) +       /* -c11 */
2271 	    MULTIPLY(tmp4 + tmp6, - FIX(1.247225013));        /* -c5 */
2272     tmp16 = MULTIPLY(tmp2 + tmp3, - FIX(1.353318001)) +       /* -c3 */
2273 	    MULTIPLY(tmp5 - tmp4, FIX(0.410524528));          /* c13 */
2274     tmp10 = tmp11 + tmp12 + tmp13 -
2275 	    MULTIPLY(tmp0, FIX(2.286341144)) +                /* c7+c5+c3-c1 */
2276 	    MULTIPLY(tmp7, FIX(0.779653625));                 /* c15+c13-c11+c9 */
2277     tmp11 += tmp14 + tmp15 + MULTIPLY(tmp1, FIX(0.071888074)) /* c9-c3-c15+c11 */
2278 	     - MULTIPLY(tmp6, FIX(1.663905119));              /* c7+c13+c1-c5 */
2279     tmp12 += tmp14 + tmp16 - MULTIPLY(tmp2, FIX(1.125726048)) /* c7+c5+c15-c3 */
2280 	     + MULTIPLY(tmp5, FIX(1.227391138));              /* c9-c11+c1-c13 */
2281     tmp13 += tmp15 + tmp16 + MULTIPLY(tmp3, FIX(1.065388962)) /* c15+c3+c11-c7 */
2282 	     + MULTIPLY(tmp4, FIX(2.167985692));              /* c1+c13+c5-c9 */
2283 
2284     dataptr[1] = (DCTELEM) DESCALE(tmp10, CONST_BITS-PASS1_BITS);
2285     dataptr[3] = (DCTELEM) DESCALE(tmp11, CONST_BITS-PASS1_BITS);
2286     dataptr[5] = (DCTELEM) DESCALE(tmp12, CONST_BITS-PASS1_BITS);
2287     dataptr[7] = (DCTELEM) DESCALE(tmp13, CONST_BITS-PASS1_BITS);
2288 
2289     ctr++;
2290 
2291     if (ctr != DCTSIZE) {
2292       if (ctr == DCTSIZE * 2)
2293 	break;			/* Done. */
2294       dataptr += DCTSIZE;	/* advance pointer to next row */
2295     } else
2296       dataptr = workspace;	/* switch pointer to extended workspace */
2297   }
2298 
2299   /* Pass 2: process columns.
2300    * We remove the PASS1_BITS scaling, but leave the results scaled up
2301    * by an overall factor of 8.
2302    * We must also scale the output by (8/16)**2 = 1/2**2.
2303    * cK represents sqrt(2) * cos(K*pi/32).
2304    */
2305 
2306   dataptr = data;
2307   wsptr = workspace;
2308   for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
2309     /* Even part */
2310 
2311     tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*7];
2312     tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*6];
2313     tmp2 = dataptr[DCTSIZE*2] + wsptr[DCTSIZE*5];
2314     tmp3 = dataptr[DCTSIZE*3] + wsptr[DCTSIZE*4];
2315     tmp4 = dataptr[DCTSIZE*4] + wsptr[DCTSIZE*3];
2316     tmp5 = dataptr[DCTSIZE*5] + wsptr[DCTSIZE*2];
2317     tmp6 = dataptr[DCTSIZE*6] + wsptr[DCTSIZE*1];
2318     tmp7 = dataptr[DCTSIZE*7] + wsptr[DCTSIZE*0];
2319 
2320     tmp10 = tmp0 + tmp7;
2321     tmp14 = tmp0 - tmp7;
2322     tmp11 = tmp1 + tmp6;
2323     tmp15 = tmp1 - tmp6;
2324     tmp12 = tmp2 + tmp5;
2325     tmp16 = tmp2 - tmp5;
2326     tmp13 = tmp3 + tmp4;
2327     tmp17 = tmp3 - tmp4;
2328 
2329     tmp0 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*7];
2330     tmp1 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*6];
2331     tmp2 = dataptr[DCTSIZE*2] - wsptr[DCTSIZE*5];
2332     tmp3 = dataptr[DCTSIZE*3] - wsptr[DCTSIZE*4];
2333     tmp4 = dataptr[DCTSIZE*4] - wsptr[DCTSIZE*3];
2334     tmp5 = dataptr[DCTSIZE*5] - wsptr[DCTSIZE*2];
2335     tmp6 = dataptr[DCTSIZE*6] - wsptr[DCTSIZE*1];
2336     tmp7 = dataptr[DCTSIZE*7] - wsptr[DCTSIZE*0];
2337 
2338     dataptr[DCTSIZE*0] = (DCTELEM)
2339       DESCALE(tmp10 + tmp11 + tmp12 + tmp13, PASS1_BITS+2);
2340     dataptr[DCTSIZE*4] = (DCTELEM)
2341       DESCALE(MULTIPLY(tmp10 - tmp13, FIX(1.306562965)) + /* c4[16] = c2[8] */
2342 	      MULTIPLY(tmp11 - tmp12, FIX_0_541196100),   /* c12[16] = c6[8] */
2343 	      CONST_BITS+PASS1_BITS+2);
2344 
2345     tmp10 = MULTIPLY(tmp17 - tmp15, FIX(0.275899379)) +   /* c14[16] = c7[8] */
2346 	    MULTIPLY(tmp14 - tmp16, FIX(1.387039845));    /* c2[16] = c1[8] */
2347 
2348     dataptr[DCTSIZE*2] = (DCTELEM)
2349       DESCALE(tmp10 + MULTIPLY(tmp15, FIX(1.451774982))   /* c6+c14 */
2350 	      + MULTIPLY(tmp16, FIX(2.172734804)),        /* c2+10 */
2351 	      CONST_BITS+PASS1_BITS+2);
2352     dataptr[DCTSIZE*6] = (DCTELEM)
2353       DESCALE(tmp10 - MULTIPLY(tmp14, FIX(0.211164243))   /* c2-c6 */
2354 	      - MULTIPLY(tmp17, FIX(1.061594338)),        /* c10+c14 */
2355 	      CONST_BITS+PASS1_BITS+2);
2356 
2357     /* Odd part */
2358 
2359     tmp11 = MULTIPLY(tmp0 + tmp1, FIX(1.353318001)) +         /* c3 */
2360 	    MULTIPLY(tmp6 - tmp7, FIX(0.410524528));          /* c13 */
2361     tmp12 = MULTIPLY(tmp0 + tmp2, FIX(1.247225013)) +         /* c5 */
2362 	    MULTIPLY(tmp5 + tmp7, FIX(0.666655658));          /* c11 */
2363     tmp13 = MULTIPLY(tmp0 + tmp3, FIX(1.093201867)) +         /* c7 */
2364 	    MULTIPLY(tmp4 - tmp7, FIX(0.897167586));          /* c9 */
2365     tmp14 = MULTIPLY(tmp1 + tmp2, FIX(0.138617169)) +         /* c15 */
2366 	    MULTIPLY(tmp6 - tmp5, FIX(1.407403738));          /* c1 */
2367     tmp15 = MULTIPLY(tmp1 + tmp3, - FIX(0.666655658)) +       /* -c11 */
2368 	    MULTIPLY(tmp4 + tmp6, - FIX(1.247225013));        /* -c5 */
2369     tmp16 = MULTIPLY(tmp2 + tmp3, - FIX(1.353318001)) +       /* -c3 */
2370 	    MULTIPLY(tmp5 - tmp4, FIX(0.410524528));          /* c13 */
2371     tmp10 = tmp11 + tmp12 + tmp13 -
2372 	    MULTIPLY(tmp0, FIX(2.286341144)) +                /* c7+c5+c3-c1 */
2373 	    MULTIPLY(tmp7, FIX(0.779653625));                 /* c15+c13-c11+c9 */
2374     tmp11 += tmp14 + tmp15 + MULTIPLY(tmp1, FIX(0.071888074)) /* c9-c3-c15+c11 */
2375 	     - MULTIPLY(tmp6, FIX(1.663905119));              /* c7+c13+c1-c5 */
2376     tmp12 += tmp14 + tmp16 - MULTIPLY(tmp2, FIX(1.125726048)) /* c7+c5+c15-c3 */
2377 	     + MULTIPLY(tmp5, FIX(1.227391138));              /* c9-c11+c1-c13 */
2378     tmp13 += tmp15 + tmp16 + MULTIPLY(tmp3, FIX(1.065388962)) /* c15+c3+c11-c7 */
2379 	     + MULTIPLY(tmp4, FIX(2.167985692));              /* c1+c13+c5-c9 */
2380 
2381     dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp10, CONST_BITS+PASS1_BITS+2);
2382     dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp11, CONST_BITS+PASS1_BITS+2);
2383     dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp12, CONST_BITS+PASS1_BITS+2);
2384     dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp13, CONST_BITS+PASS1_BITS+2);
2385 
2386     dataptr++;			/* advance pointer to next column */
2387     wsptr++;			/* advance pointer to next column */
2388   }
2389 }
2390 
2391 
2392 /*
2393  * Perform the forward DCT on a 16x8 sample block.
2394  *
2395  * 16-point FDCT in pass 1 (rows), 8-point in pass 2 (columns).
2396  */
2397 
2398 GLOBAL(void)
jpeg_fdct_16x8(DCTELEM * data,JSAMPARRAY sample_data,JDIMENSION start_col)2399 jpeg_fdct_16x8 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
2400 {
2401   INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
2402   INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16, tmp17;
2403   INT32 z1;
2404   DCTELEM *dataptr;
2405   JSAMPROW elemptr;
2406   int ctr;
2407   SHIFT_TEMPS
2408 
2409   /* Pass 1: process rows.
2410    * Note results are scaled up by sqrt(8) compared to a true DCT;
2411    * furthermore, we scale the results by 2**PASS1_BITS.
2412    * 16-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/32).
2413    */
2414 
2415   dataptr = data;
2416   ctr = 0;
2417   for (ctr = 0; ctr < DCTSIZE; ctr++) {
2418     elemptr = sample_data[ctr] + start_col;
2419 
2420     /* Even part */
2421 
2422     tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[15]);
2423     tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[14]);
2424     tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[13]);
2425     tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[12]);
2426     tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[11]);
2427     tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[10]);
2428     tmp6 = GETJSAMPLE(elemptr[6]) + GETJSAMPLE(elemptr[9]);
2429     tmp7 = GETJSAMPLE(elemptr[7]) + GETJSAMPLE(elemptr[8]);
2430 
2431     tmp10 = tmp0 + tmp7;
2432     tmp14 = tmp0 - tmp7;
2433     tmp11 = tmp1 + tmp6;
2434     tmp15 = tmp1 - tmp6;
2435     tmp12 = tmp2 + tmp5;
2436     tmp16 = tmp2 - tmp5;
2437     tmp13 = tmp3 + tmp4;
2438     tmp17 = tmp3 - tmp4;
2439 
2440     tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[15]);
2441     tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[14]);
2442     tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[13]);
2443     tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[12]);
2444     tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[11]);
2445     tmp5 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[10]);
2446     tmp6 = GETJSAMPLE(elemptr[6]) - GETJSAMPLE(elemptr[9]);
2447     tmp7 = GETJSAMPLE(elemptr[7]) - GETJSAMPLE(elemptr[8]);
2448 
2449     /* Apply unsigned->signed conversion. */
2450     dataptr[0] = (DCTELEM)
2451       ((tmp10 + tmp11 + tmp12 + tmp13 - 16 * CENTERJSAMPLE) << PASS1_BITS);
2452     dataptr[4] = (DCTELEM)
2453       DESCALE(MULTIPLY(tmp10 - tmp13, FIX(1.306562965)) + /* c4[16] = c2[8] */
2454 	      MULTIPLY(tmp11 - tmp12, FIX_0_541196100),   /* c12[16] = c6[8] */
2455 	      CONST_BITS-PASS1_BITS);
2456 
2457     tmp10 = MULTIPLY(tmp17 - tmp15, FIX(0.275899379)) +   /* c14[16] = c7[8] */
2458 	    MULTIPLY(tmp14 - tmp16, FIX(1.387039845));    /* c2[16] = c1[8] */
2459 
2460     dataptr[2] = (DCTELEM)
2461       DESCALE(tmp10 + MULTIPLY(tmp15, FIX(1.451774982))   /* c6+c14 */
2462 	      + MULTIPLY(tmp16, FIX(2.172734804)),        /* c2+c10 */
2463 	      CONST_BITS-PASS1_BITS);
2464     dataptr[6] = (DCTELEM)
2465       DESCALE(tmp10 - MULTIPLY(tmp14, FIX(0.211164243))   /* c2-c6 */
2466 	      - MULTIPLY(tmp17, FIX(1.061594338)),        /* c10+c14 */
2467 	      CONST_BITS-PASS1_BITS);
2468 
2469     /* Odd part */
2470 
2471     tmp11 = MULTIPLY(tmp0 + tmp1, FIX(1.353318001)) +         /* c3 */
2472 	    MULTIPLY(tmp6 - tmp7, FIX(0.410524528));          /* c13 */
2473     tmp12 = MULTIPLY(tmp0 + tmp2, FIX(1.247225013)) +         /* c5 */
2474 	    MULTIPLY(tmp5 + tmp7, FIX(0.666655658));          /* c11 */
2475     tmp13 = MULTIPLY(tmp0 + tmp3, FIX(1.093201867)) +         /* c7 */
2476 	    MULTIPLY(tmp4 - tmp7, FIX(0.897167586));          /* c9 */
2477     tmp14 = MULTIPLY(tmp1 + tmp2, FIX(0.138617169)) +         /* c15 */
2478 	    MULTIPLY(tmp6 - tmp5, FIX(1.407403738));          /* c1 */
2479     tmp15 = MULTIPLY(tmp1 + tmp3, - FIX(0.666655658)) +       /* -c11 */
2480 	    MULTIPLY(tmp4 + tmp6, - FIX(1.247225013));        /* -c5 */
2481     tmp16 = MULTIPLY(tmp2 + tmp3, - FIX(1.353318001)) +       /* -c3 */
2482 	    MULTIPLY(tmp5 - tmp4, FIX(0.410524528));          /* c13 */
2483     tmp10 = tmp11 + tmp12 + tmp13 -
2484 	    MULTIPLY(tmp0, FIX(2.286341144)) +                /* c7+c5+c3-c1 */
2485 	    MULTIPLY(tmp7, FIX(0.779653625));                 /* c15+c13-c11+c9 */
2486     tmp11 += tmp14 + tmp15 + MULTIPLY(tmp1, FIX(0.071888074)) /* c9-c3-c15+c11 */
2487 	     - MULTIPLY(tmp6, FIX(1.663905119));              /* c7+c13+c1-c5 */
2488     tmp12 += tmp14 + tmp16 - MULTIPLY(tmp2, FIX(1.125726048)) /* c7+c5+c15-c3 */
2489 	     + MULTIPLY(tmp5, FIX(1.227391138));              /* c9-c11+c1-c13 */
2490     tmp13 += tmp15 + tmp16 + MULTIPLY(tmp3, FIX(1.065388962)) /* c15+c3+c11-c7 */
2491 	     + MULTIPLY(tmp4, FIX(2.167985692));              /* c1+c13+c5-c9 */
2492 
2493     dataptr[1] = (DCTELEM) DESCALE(tmp10, CONST_BITS-PASS1_BITS);
2494     dataptr[3] = (DCTELEM) DESCALE(tmp11, CONST_BITS-PASS1_BITS);
2495     dataptr[5] = (DCTELEM) DESCALE(tmp12, CONST_BITS-PASS1_BITS);
2496     dataptr[7] = (DCTELEM) DESCALE(tmp13, CONST_BITS-PASS1_BITS);
2497 
2498     dataptr += DCTSIZE;		/* advance pointer to next row */
2499   }
2500 
2501   /* Pass 2: process columns.
2502    * We remove the PASS1_BITS scaling, but leave the results scaled up
2503    * by an overall factor of 8.
2504    * We must also scale the output by 8/16 = 1/2.
2505    * 8-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/16).
2506    */
2507 
2508   dataptr = data;
2509   for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
2510     /* Even part per LL&M figure 1 --- note that published figure is faulty;
2511      * rotator "c1" should be "c6".
2512      */
2513 
2514     tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*7];
2515     tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*6];
2516     tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*5];
2517     tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*4];
2518 
2519     tmp10 = tmp0 + tmp3;
2520     tmp12 = tmp0 - tmp3;
2521     tmp11 = tmp1 + tmp2;
2522     tmp13 = tmp1 - tmp2;
2523 
2524     tmp0 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*7];
2525     tmp1 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*6];
2526     tmp2 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*5];
2527     tmp3 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*4];
2528 
2529     dataptr[DCTSIZE*0] = (DCTELEM) DESCALE(tmp10 + tmp11, PASS1_BITS+1);
2530     dataptr[DCTSIZE*4] = (DCTELEM) DESCALE(tmp10 - tmp11, PASS1_BITS+1);
2531 
2532     z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);   /* c6 */
2533     dataptr[DCTSIZE*2] = (DCTELEM)
2534       DESCALE(z1 + MULTIPLY(tmp12, FIX_0_765366865), /* c2-c6 */
2535 	      CONST_BITS+PASS1_BITS+1);
2536     dataptr[DCTSIZE*6] = (DCTELEM)
2537       DESCALE(z1 - MULTIPLY(tmp13, FIX_1_847759065), /* c2+c6 */
2538 	      CONST_BITS+PASS1_BITS+1);
2539 
2540     /* Odd part per figure 8 --- note paper omits factor of sqrt(2).
2541      * i0..i3 in the paper are tmp0..tmp3 here.
2542      */
2543 
2544     tmp12 = tmp0 + tmp2;
2545     tmp13 = tmp1 + tmp3;
2546 
2547     z1 = MULTIPLY(tmp12 + tmp13, FIX_1_175875602);   /*  c3 */
2548     tmp12 = MULTIPLY(tmp12, - FIX_0_390180644);      /* -c3+c5 */
2549     tmp13 = MULTIPLY(tmp13, - FIX_1_961570560);      /* -c3-c5 */
2550     tmp12 += z1;
2551     tmp13 += z1;
2552 
2553     z1 = MULTIPLY(tmp0 + tmp3, - FIX_0_899976223);   /* -c3+c7 */
2554     tmp0 = MULTIPLY(tmp0, FIX_1_501321110);          /*  c1+c3-c5-c7 */
2555     tmp3 = MULTIPLY(tmp3, FIX_0_298631336);          /* -c1+c3+c5-c7 */
2556     tmp0 += z1 + tmp12;
2557     tmp3 += z1 + tmp13;
2558 
2559     z1 = MULTIPLY(tmp1 + tmp2, - FIX_2_562915447);   /* -c1-c3 */
2560     tmp1 = MULTIPLY(tmp1, FIX_3_072711026);          /*  c1+c3+c5-c7 */
2561     tmp2 = MULTIPLY(tmp2, FIX_2_053119869);          /*  c1+c3-c5+c7 */
2562     tmp1 += z1 + tmp13;
2563     tmp2 += z1 + tmp12;
2564 
2565     dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp0, CONST_BITS+PASS1_BITS+1);
2566     dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp1, CONST_BITS+PASS1_BITS+1);
2567     dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp2, CONST_BITS+PASS1_BITS+1);
2568     dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp3, CONST_BITS+PASS1_BITS+1);
2569 
2570     dataptr++;			/* advance pointer to next column */
2571   }
2572 }
2573 
2574 
2575 /*
2576  * Perform the forward DCT on a 14x7 sample block.
2577  *
2578  * 14-point FDCT in pass 1 (rows), 7-point in pass 2 (columns).
2579  */
2580 
2581 GLOBAL(void)
jpeg_fdct_14x7(DCTELEM * data,JSAMPARRAY sample_data,JDIMENSION start_col)2582 jpeg_fdct_14x7 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
2583 {
2584   INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
2585   INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
2586   INT32 z1, z2, z3;
2587   DCTELEM *dataptr;
2588   JSAMPROW elemptr;
2589   int ctr;
2590   SHIFT_TEMPS
2591 
2592   /* Zero bottom row of output coefficient block. */
2593   MEMZERO(&data[DCTSIZE*7], SIZEOF(DCTELEM) * DCTSIZE);
2594 
2595   /* Pass 1: process rows.
2596    * Note results are scaled up by sqrt(8) compared to a true DCT;
2597    * furthermore, we scale the results by 2**PASS1_BITS.
2598    * 14-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/28).
2599    */
2600 
2601   dataptr = data;
2602   for (ctr = 0; ctr < 7; ctr++) {
2603     elemptr = sample_data[ctr] + start_col;
2604 
2605     /* Even part */
2606 
2607     tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[13]);
2608     tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[12]);
2609     tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[11]);
2610     tmp13 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[10]);
2611     tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[9]);
2612     tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[8]);
2613     tmp6 = GETJSAMPLE(elemptr[6]) + GETJSAMPLE(elemptr[7]);
2614 
2615     tmp10 = tmp0 + tmp6;
2616     tmp14 = tmp0 - tmp6;
2617     tmp11 = tmp1 + tmp5;
2618     tmp15 = tmp1 - tmp5;
2619     tmp12 = tmp2 + tmp4;
2620     tmp16 = tmp2 - tmp4;
2621 
2622     tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[13]);
2623     tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[12]);
2624     tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[11]);
2625     tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[10]);
2626     tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[9]);
2627     tmp5 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[8]);
2628     tmp6 = GETJSAMPLE(elemptr[6]) - GETJSAMPLE(elemptr[7]);
2629 
2630     /* Apply unsigned->signed conversion. */
2631     dataptr[0] = (DCTELEM)
2632       ((tmp10 + tmp11 + tmp12 + tmp13 - 14 * CENTERJSAMPLE) << PASS1_BITS);
2633     tmp13 += tmp13;
2634     dataptr[4] = (DCTELEM)
2635       DESCALE(MULTIPLY(tmp10 - tmp13, FIX(1.274162392)) + /* c4 */
2636 	      MULTIPLY(tmp11 - tmp13, FIX(0.314692123)) - /* c12 */
2637 	      MULTIPLY(tmp12 - tmp13, FIX(0.881747734)),  /* c8 */
2638 	      CONST_BITS-PASS1_BITS);
2639 
2640     tmp10 = MULTIPLY(tmp14 + tmp15, FIX(1.105676686));    /* c6 */
2641 
2642     dataptr[2] = (DCTELEM)
2643       DESCALE(tmp10 + MULTIPLY(tmp14, FIX(0.273079590))   /* c2-c6 */
2644 	      + MULTIPLY(tmp16, FIX(0.613604268)),        /* c10 */
2645 	      CONST_BITS-PASS1_BITS);
2646     dataptr[6] = (DCTELEM)
2647       DESCALE(tmp10 - MULTIPLY(tmp15, FIX(1.719280954))   /* c6+c10 */
2648 	      - MULTIPLY(tmp16, FIX(1.378756276)),        /* c2 */
2649 	      CONST_BITS-PASS1_BITS);
2650 
2651     /* Odd part */
2652 
2653     tmp10 = tmp1 + tmp2;
2654     tmp11 = tmp5 - tmp4;
2655     dataptr[7] = (DCTELEM) ((tmp0 - tmp10 + tmp3 - tmp11 - tmp6) << PASS1_BITS);
2656     tmp3 <<= CONST_BITS;
2657     tmp10 = MULTIPLY(tmp10, - FIX(0.158341681));          /* -c13 */
2658     tmp11 = MULTIPLY(tmp11, FIX(1.405321284));            /* c1 */
2659     tmp10 += tmp11 - tmp3;
2660     tmp11 = MULTIPLY(tmp0 + tmp2, FIX(1.197448846)) +     /* c5 */
2661 	    MULTIPLY(tmp4 + tmp6, FIX(0.752406978));      /* c9 */
2662     dataptr[5] = (DCTELEM)
2663       DESCALE(tmp10 + tmp11 - MULTIPLY(tmp2, FIX(2.373959773)) /* c3+c5-c13 */
2664 	      + MULTIPLY(tmp4, FIX(1.119999435)),         /* c1+c11-c9 */
2665 	      CONST_BITS-PASS1_BITS);
2666     tmp12 = MULTIPLY(tmp0 + tmp1, FIX(1.334852607)) +     /* c3 */
2667 	    MULTIPLY(tmp5 - tmp6, FIX(0.467085129));      /* c11 */
2668     dataptr[3] = (DCTELEM)
2669       DESCALE(tmp10 + tmp12 - MULTIPLY(tmp1, FIX(0.424103948)) /* c3-c9-c13 */
2670 	      - MULTIPLY(tmp5, FIX(3.069855259)),         /* c1+c5+c11 */
2671 	      CONST_BITS-PASS1_BITS);
2672     dataptr[1] = (DCTELEM)
2673       DESCALE(tmp11 + tmp12 + tmp3 + tmp6 -
2674 	      MULTIPLY(tmp0 + tmp6, FIX(1.126980169)),    /* c3+c5-c1 */
2675 	      CONST_BITS-PASS1_BITS);
2676 
2677     dataptr += DCTSIZE;		/* advance pointer to next row */
2678   }
2679 
2680   /* Pass 2: process columns.
2681    * We remove the PASS1_BITS scaling, but leave the results scaled up
2682    * by an overall factor of 8.
2683    * We must also scale the output by (8/14)*(8/7) = 32/49, which we
2684    * partially fold into the constant multipliers and final shifting:
2685    * 7-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/14) * 64/49.
2686    */
2687 
2688   dataptr = data;
2689   for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
2690     /* Even part */
2691 
2692     tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*6];
2693     tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*5];
2694     tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*4];
2695     tmp3 = dataptr[DCTSIZE*3];
2696 
2697     tmp10 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*6];
2698     tmp11 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*5];
2699     tmp12 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*4];
2700 
2701     z1 = tmp0 + tmp2;
2702     dataptr[DCTSIZE*0] = (DCTELEM)
2703       DESCALE(MULTIPLY(z1 + tmp1 + tmp3, FIX(1.306122449)), /* 64/49 */
2704 	      CONST_BITS+PASS1_BITS+1);
2705     tmp3 += tmp3;
2706     z1 -= tmp3;
2707     z1 -= tmp3;
2708     z1 = MULTIPLY(z1, FIX(0.461784020));                /* (c2+c6-c4)/2 */
2709     z2 = MULTIPLY(tmp0 - tmp2, FIX(1.202428084));       /* (c2+c4-c6)/2 */
2710     z3 = MULTIPLY(tmp1 - tmp2, FIX(0.411026446));       /* c6 */
2711     dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(z1 + z2 + z3, CONST_BITS+PASS1_BITS+1);
2712     z1 -= z2;
2713     z2 = MULTIPLY(tmp0 - tmp1, FIX(1.151670509));       /* c4 */
2714     dataptr[DCTSIZE*4] = (DCTELEM)
2715       DESCALE(z2 + z3 - MULTIPLY(tmp1 - tmp3, FIX(0.923568041)), /* c2+c6-c4 */
2716 	      CONST_BITS+PASS1_BITS+1);
2717     dataptr[DCTSIZE*6] = (DCTELEM) DESCALE(z1 + z2, CONST_BITS+PASS1_BITS+1);
2718 
2719     /* Odd part */
2720 
2721     tmp1 = MULTIPLY(tmp10 + tmp11, FIX(1.221765677));   /* (c3+c1-c5)/2 */
2722     tmp2 = MULTIPLY(tmp10 - tmp11, FIX(0.222383464));   /* (c3+c5-c1)/2 */
2723     tmp0 = tmp1 - tmp2;
2724     tmp1 += tmp2;
2725     tmp2 = MULTIPLY(tmp11 + tmp12, - FIX(1.800824523)); /* -c1 */
2726     tmp1 += tmp2;
2727     tmp3 = MULTIPLY(tmp10 + tmp12, FIX(0.801442310));   /* c5 */
2728     tmp0 += tmp3;
2729     tmp2 += tmp3 + MULTIPLY(tmp12, FIX(2.443531355));   /* c3+c1-c5 */
2730 
2731     dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp0, CONST_BITS+PASS1_BITS+1);
2732     dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp1, CONST_BITS+PASS1_BITS+1);
2733     dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp2, CONST_BITS+PASS1_BITS+1);
2734 
2735     dataptr++;			/* advance pointer to next column */
2736   }
2737 }
2738 
2739 
2740 /*
2741  * Perform the forward DCT on a 12x6 sample block.
2742  *
2743  * 12-point FDCT in pass 1 (rows), 6-point in pass 2 (columns).
2744  */
2745 
2746 GLOBAL(void)
jpeg_fdct_12x6(DCTELEM * data,JSAMPARRAY sample_data,JDIMENSION start_col)2747 jpeg_fdct_12x6 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
2748 {
2749   INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
2750   INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
2751   DCTELEM *dataptr;
2752   JSAMPROW elemptr;
2753   int ctr;
2754   SHIFT_TEMPS
2755 
2756   /* Zero 2 bottom rows of output coefficient block. */
2757   MEMZERO(&data[DCTSIZE*6], SIZEOF(DCTELEM) * DCTSIZE * 2);
2758 
2759   /* Pass 1: process rows.
2760    * Note results are scaled up by sqrt(8) compared to a true DCT;
2761    * furthermore, we scale the results by 2**PASS1_BITS.
2762    * 12-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/24).
2763    */
2764 
2765   dataptr = data;
2766   for (ctr = 0; ctr < 6; ctr++) {
2767     elemptr = sample_data[ctr] + start_col;
2768 
2769     /* Even part */
2770 
2771     tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[11]);
2772     tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[10]);
2773     tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[9]);
2774     tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[8]);
2775     tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[7]);
2776     tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[6]);
2777 
2778     tmp10 = tmp0 + tmp5;
2779     tmp13 = tmp0 - tmp5;
2780     tmp11 = tmp1 + tmp4;
2781     tmp14 = tmp1 - tmp4;
2782     tmp12 = tmp2 + tmp3;
2783     tmp15 = tmp2 - tmp3;
2784 
2785     tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[11]);
2786     tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[10]);
2787     tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[9]);
2788     tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[8]);
2789     tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[7]);
2790     tmp5 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[6]);
2791 
2792     /* Apply unsigned->signed conversion. */
2793     dataptr[0] = (DCTELEM)
2794       ((tmp10 + tmp11 + tmp12 - 12 * CENTERJSAMPLE) << PASS1_BITS);
2795     dataptr[6] = (DCTELEM) ((tmp13 - tmp14 - tmp15) << PASS1_BITS);
2796     dataptr[4] = (DCTELEM)
2797       DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.224744871)), /* c4 */
2798 	      CONST_BITS-PASS1_BITS);
2799     dataptr[2] = (DCTELEM)
2800       DESCALE(tmp14 - tmp15 + MULTIPLY(tmp13 + tmp15, FIX(1.366025404)), /* c2 */
2801 	      CONST_BITS-PASS1_BITS);
2802 
2803     /* Odd part */
2804 
2805     tmp10 = MULTIPLY(tmp1 + tmp4, FIX_0_541196100);    /* c9 */
2806     tmp14 = tmp10 + MULTIPLY(tmp1, FIX_0_765366865);   /* c3-c9 */
2807     tmp15 = tmp10 - MULTIPLY(tmp4, FIX_1_847759065);   /* c3+c9 */
2808     tmp12 = MULTIPLY(tmp0 + tmp2, FIX(1.121971054));   /* c5 */
2809     tmp13 = MULTIPLY(tmp0 + tmp3, FIX(0.860918669));   /* c7 */
2810     tmp10 = tmp12 + tmp13 + tmp14 - MULTIPLY(tmp0, FIX(0.580774953)) /* c5+c7-c1 */
2811 	    + MULTIPLY(tmp5, FIX(0.184591911));        /* c11 */
2812     tmp11 = MULTIPLY(tmp2 + tmp3, - FIX(0.184591911)); /* -c11 */
2813     tmp12 += tmp11 - tmp15 - MULTIPLY(tmp2, FIX(2.339493912)) /* c1+c5-c11 */
2814 	    + MULTIPLY(tmp5, FIX(0.860918669));        /* c7 */
2815     tmp13 += tmp11 - tmp14 + MULTIPLY(tmp3, FIX(0.725788011)) /* c1+c11-c7 */
2816 	    - MULTIPLY(tmp5, FIX(1.121971054));        /* c5 */
2817     tmp11 = tmp15 + MULTIPLY(tmp0 - tmp3, FIX(1.306562965)) /* c3 */
2818 	    - MULTIPLY(tmp2 + tmp5, FIX_0_541196100);  /* c9 */
2819 
2820     dataptr[1] = (DCTELEM) DESCALE(tmp10, CONST_BITS-PASS1_BITS);
2821     dataptr[3] = (DCTELEM) DESCALE(tmp11, CONST_BITS-PASS1_BITS);
2822     dataptr[5] = (DCTELEM) DESCALE(tmp12, CONST_BITS-PASS1_BITS);
2823     dataptr[7] = (DCTELEM) DESCALE(tmp13, CONST_BITS-PASS1_BITS);
2824 
2825     dataptr += DCTSIZE;		/* advance pointer to next row */
2826   }
2827 
2828   /* Pass 2: process columns.
2829    * We remove the PASS1_BITS scaling, but leave the results scaled up
2830    * by an overall factor of 8.
2831    * We must also scale the output by (8/12)*(8/6) = 8/9, which we
2832    * partially fold into the constant multipliers and final shifting:
2833    * 6-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/12) * 16/9.
2834    */
2835 
2836   dataptr = data;
2837   for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
2838     /* Even part */
2839 
2840     tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*5];
2841     tmp11 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*4];
2842     tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*3];
2843 
2844     tmp10 = tmp0 + tmp2;
2845     tmp12 = tmp0 - tmp2;
2846 
2847     tmp0 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*5];
2848     tmp1 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*4];
2849     tmp2 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*3];
2850 
2851     dataptr[DCTSIZE*0] = (DCTELEM)
2852       DESCALE(MULTIPLY(tmp10 + tmp11, FIX(1.777777778)),         /* 16/9 */
2853 	      CONST_BITS+PASS1_BITS+1);
2854     dataptr[DCTSIZE*2] = (DCTELEM)
2855       DESCALE(MULTIPLY(tmp12, FIX(2.177324216)),                 /* c2 */
2856 	      CONST_BITS+PASS1_BITS+1);
2857     dataptr[DCTSIZE*4] = (DCTELEM)
2858       DESCALE(MULTIPLY(tmp10 - tmp11 - tmp11, FIX(1.257078722)), /* c4 */
2859 	      CONST_BITS+PASS1_BITS+1);
2860 
2861     /* Odd part */
2862 
2863     tmp10 = MULTIPLY(tmp0 + tmp2, FIX(0.650711829));             /* c5 */
2864 
2865     dataptr[DCTSIZE*1] = (DCTELEM)
2866       DESCALE(tmp10 + MULTIPLY(tmp0 + tmp1, FIX(1.777777778)),   /* 16/9 */
2867 	      CONST_BITS+PASS1_BITS+1);
2868     dataptr[DCTSIZE*3] = (DCTELEM)
2869       DESCALE(MULTIPLY(tmp0 - tmp1 - tmp2, FIX(1.777777778)),    /* 16/9 */
2870 	      CONST_BITS+PASS1_BITS+1);
2871     dataptr[DCTSIZE*5] = (DCTELEM)
2872       DESCALE(tmp10 + MULTIPLY(tmp2 - tmp1, FIX(1.777777778)),   /* 16/9 */
2873 	      CONST_BITS+PASS1_BITS+1);
2874 
2875     dataptr++;			/* advance pointer to next column */
2876   }
2877 }
2878 
2879 
2880 /*
2881  * Perform the forward DCT on a 10x5 sample block.
2882  *
2883  * 10-point FDCT in pass 1 (rows), 5-point in pass 2 (columns).
2884  */
2885 
2886 GLOBAL(void)
jpeg_fdct_10x5(DCTELEM * data,JSAMPARRAY sample_data,JDIMENSION start_col)2887 jpeg_fdct_10x5 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
2888 {
2889   INT32 tmp0, tmp1, tmp2, tmp3, tmp4;
2890   INT32 tmp10, tmp11, tmp12, tmp13, tmp14;
2891   DCTELEM *dataptr;
2892   JSAMPROW elemptr;
2893   int ctr;
2894   SHIFT_TEMPS
2895 
2896   /* Zero 3 bottom rows of output coefficient block. */
2897   MEMZERO(&data[DCTSIZE*5], SIZEOF(DCTELEM) * DCTSIZE * 3);
2898 
2899   /* Pass 1: process rows.
2900    * Note results are scaled up by sqrt(8) compared to a true DCT;
2901    * furthermore, we scale the results by 2**PASS1_BITS.
2902    * 10-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/20).
2903    */
2904 
2905   dataptr = data;
2906   for (ctr = 0; ctr < 5; ctr++) {
2907     elemptr = sample_data[ctr] + start_col;
2908 
2909     /* Even part */
2910 
2911     tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[9]);
2912     tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[8]);
2913     tmp12 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[7]);
2914     tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[6]);
2915     tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[5]);
2916 
2917     tmp10 = tmp0 + tmp4;
2918     tmp13 = tmp0 - tmp4;
2919     tmp11 = tmp1 + tmp3;
2920     tmp14 = tmp1 - tmp3;
2921 
2922     tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[9]);
2923     tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[8]);
2924     tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[7]);
2925     tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[6]);
2926     tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[5]);
2927 
2928     /* Apply unsigned->signed conversion. */
2929     dataptr[0] = (DCTELEM)
2930       ((tmp10 + tmp11 + tmp12 - 10 * CENTERJSAMPLE) << PASS1_BITS);
2931     tmp12 += tmp12;
2932     dataptr[4] = (DCTELEM)
2933       DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.144122806)) - /* c4 */
2934 	      MULTIPLY(tmp11 - tmp12, FIX(0.437016024)),  /* c8 */
2935 	      CONST_BITS-PASS1_BITS);
2936     tmp10 = MULTIPLY(tmp13 + tmp14, FIX(0.831253876));    /* c6 */
2937     dataptr[2] = (DCTELEM)
2938       DESCALE(tmp10 + MULTIPLY(tmp13, FIX(0.513743148)),  /* c2-c6 */
2939 	      CONST_BITS-PASS1_BITS);
2940     dataptr[6] = (DCTELEM)
2941       DESCALE(tmp10 - MULTIPLY(tmp14, FIX(2.176250899)),  /* c2+c6 */
2942 	      CONST_BITS-PASS1_BITS);
2943 
2944     /* Odd part */
2945 
2946     tmp10 = tmp0 + tmp4;
2947     tmp11 = tmp1 - tmp3;
2948     dataptr[5] = (DCTELEM) ((tmp10 - tmp11 - tmp2) << PASS1_BITS);
2949     tmp2 <<= CONST_BITS;
2950     dataptr[1] = (DCTELEM)
2951       DESCALE(MULTIPLY(tmp0, FIX(1.396802247)) +          /* c1 */
2952 	      MULTIPLY(tmp1, FIX(1.260073511)) + tmp2 +   /* c3 */
2953 	      MULTIPLY(tmp3, FIX(0.642039522)) +          /* c7 */
2954 	      MULTIPLY(tmp4, FIX(0.221231742)),           /* c9 */
2955 	      CONST_BITS-PASS1_BITS);
2956     tmp12 = MULTIPLY(tmp0 - tmp4, FIX(0.951056516)) -     /* (c3+c7)/2 */
2957 	    MULTIPLY(tmp1 + tmp3, FIX(0.587785252));      /* (c1-c9)/2 */
2958     tmp13 = MULTIPLY(tmp10 + tmp11, FIX(0.309016994)) +   /* (c3-c7)/2 */
2959 	    (tmp11 << (CONST_BITS - 1)) - tmp2;
2960     dataptr[3] = (DCTELEM) DESCALE(tmp12 + tmp13, CONST_BITS-PASS1_BITS);
2961     dataptr[7] = (DCTELEM) DESCALE(tmp12 - tmp13, CONST_BITS-PASS1_BITS);
2962 
2963     dataptr += DCTSIZE;		/* advance pointer to next row */
2964   }
2965 
2966   /* Pass 2: process columns.
2967    * We remove the PASS1_BITS scaling, but leave the results scaled up
2968    * by an overall factor of 8.
2969    * We must also scale the output by (8/10)*(8/5) = 32/25, which we
2970    * fold into the constant multipliers:
2971    * 5-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/10) * 32/25.
2972    */
2973 
2974   dataptr = data;
2975   for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
2976     /* Even part */
2977 
2978     tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*4];
2979     tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*3];
2980     tmp2 = dataptr[DCTSIZE*2];
2981 
2982     tmp10 = tmp0 + tmp1;
2983     tmp11 = tmp0 - tmp1;
2984 
2985     tmp0 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*4];
2986     tmp1 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*3];
2987 
2988     dataptr[DCTSIZE*0] = (DCTELEM)
2989       DESCALE(MULTIPLY(tmp10 + tmp2, FIX(1.28)),        /* 32/25 */
2990 	      CONST_BITS+PASS1_BITS);
2991     tmp11 = MULTIPLY(tmp11, FIX(1.011928851));          /* (c2+c4)/2 */
2992     tmp10 -= tmp2 << 2;
2993     tmp10 = MULTIPLY(tmp10, FIX(0.452548340));          /* (c2-c4)/2 */
2994     dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(tmp11 + tmp10, CONST_BITS+PASS1_BITS);
2995     dataptr[DCTSIZE*4] = (DCTELEM) DESCALE(tmp11 - tmp10, CONST_BITS+PASS1_BITS);
2996 
2997     /* Odd part */
2998 
2999     tmp10 = MULTIPLY(tmp0 + tmp1, FIX(1.064004961));    /* c3 */
3000 
3001     dataptr[DCTSIZE*1] = (DCTELEM)
3002       DESCALE(tmp10 + MULTIPLY(tmp0, FIX(0.657591230)), /* c1-c3 */
3003 	      CONST_BITS+PASS1_BITS);
3004     dataptr[DCTSIZE*3] = (DCTELEM)
3005       DESCALE(tmp10 - MULTIPLY(tmp1, FIX(2.785601151)), /* c1+c3 */
3006 	      CONST_BITS+PASS1_BITS);
3007 
3008     dataptr++;			/* advance pointer to next column */
3009   }
3010 }
3011 
3012 
3013 /*
3014  * Perform the forward DCT on an 8x4 sample block.
3015  *
3016  * 8-point FDCT in pass 1 (rows), 4-point in pass 2 (columns).
3017  */
3018 
3019 GLOBAL(void)
jpeg_fdct_8x4(DCTELEM * data,JSAMPARRAY sample_data,JDIMENSION start_col)3020 jpeg_fdct_8x4 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
3021 {
3022   INT32 tmp0, tmp1, tmp2, tmp3;
3023   INT32 tmp10, tmp11, tmp12, tmp13;
3024   INT32 z1;
3025   DCTELEM *dataptr;
3026   JSAMPROW elemptr;
3027   int ctr;
3028   SHIFT_TEMPS
3029 
3030   /* Zero 4 bottom rows of output coefficient block. */
3031   MEMZERO(&data[DCTSIZE*4], SIZEOF(DCTELEM) * DCTSIZE * 4);
3032 
3033   /* Pass 1: process rows.
3034    * Note results are scaled up by sqrt(8) compared to a true DCT;
3035    * furthermore, we scale the results by 2**PASS1_BITS.
3036    * We must also scale the output by 8/4 = 2, which we add here.
3037    * 8-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/16).
3038    */
3039 
3040   dataptr = data;
3041   for (ctr = 0; ctr < 4; ctr++) {
3042     elemptr = sample_data[ctr] + start_col;
3043 
3044     /* Even part per LL&M figure 1 --- note that published figure is faulty;
3045      * rotator "c1" should be "c6".
3046      */
3047 
3048     tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[7]);
3049     tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[6]);
3050     tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[5]);
3051     tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[4]);
3052 
3053     tmp10 = tmp0 + tmp3;
3054     tmp12 = tmp0 - tmp3;
3055     tmp11 = tmp1 + tmp2;
3056     tmp13 = tmp1 - tmp2;
3057 
3058     tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[7]);
3059     tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[6]);
3060     tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[5]);
3061     tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[4]);
3062 
3063     /* Apply unsigned->signed conversion. */
3064     dataptr[0] = (DCTELEM)
3065       ((tmp10 + tmp11 - 8 * CENTERJSAMPLE) << (PASS1_BITS+1));
3066     dataptr[4] = (DCTELEM) ((tmp10 - tmp11) << (PASS1_BITS+1));
3067 
3068     z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);       /* c6 */
3069     /* Add fudge factor here for final descale. */
3070     z1 += ONE << (CONST_BITS-PASS1_BITS-2);
3071 
3072     dataptr[2] = (DCTELEM)
3073       RIGHT_SHIFT(z1 + MULTIPLY(tmp12, FIX_0_765366865), /* c2-c6 */
3074 		  CONST_BITS-PASS1_BITS-1);
3075     dataptr[6] = (DCTELEM)
3076       RIGHT_SHIFT(z1 - MULTIPLY(tmp13, FIX_1_847759065), /* c2+c6 */
3077 		  CONST_BITS-PASS1_BITS-1);
3078 
3079     /* Odd part per figure 8 --- note paper omits factor of sqrt(2).
3080      * i0..i3 in the paper are tmp0..tmp3 here.
3081      */
3082 
3083     tmp12 = tmp0 + tmp2;
3084     tmp13 = tmp1 + tmp3;
3085 
3086     z1 = MULTIPLY(tmp12 + tmp13, FIX_1_175875602);       /*  c3 */
3087     /* Add fudge factor here for final descale. */
3088     z1 += ONE << (CONST_BITS-PASS1_BITS-2);
3089 
3090     tmp12 = MULTIPLY(tmp12, - FIX_0_390180644);          /* -c3+c5 */
3091     tmp13 = MULTIPLY(tmp13, - FIX_1_961570560);          /* -c3-c5 */
3092     tmp12 += z1;
3093     tmp13 += z1;
3094 
3095     z1 = MULTIPLY(tmp0 + tmp3, - FIX_0_899976223);       /* -c3+c7 */
3096     tmp0 = MULTIPLY(tmp0, FIX_1_501321110);              /*  c1+c3-c5-c7 */
3097     tmp3 = MULTIPLY(tmp3, FIX_0_298631336);              /* -c1+c3+c5-c7 */
3098     tmp0 += z1 + tmp12;
3099     tmp3 += z1 + tmp13;
3100 
3101     z1 = MULTIPLY(tmp1 + tmp2, - FIX_2_562915447);       /* -c1-c3 */
3102     tmp1 = MULTIPLY(tmp1, FIX_3_072711026);              /*  c1+c3+c5-c7 */
3103     tmp2 = MULTIPLY(tmp2, FIX_2_053119869);              /*  c1+c3-c5+c7 */
3104     tmp1 += z1 + tmp13;
3105     tmp2 += z1 + tmp12;
3106 
3107     dataptr[1] = (DCTELEM) RIGHT_SHIFT(tmp0, CONST_BITS-PASS1_BITS-1);
3108     dataptr[3] = (DCTELEM) RIGHT_SHIFT(tmp1, CONST_BITS-PASS1_BITS-1);
3109     dataptr[5] = (DCTELEM) RIGHT_SHIFT(tmp2, CONST_BITS-PASS1_BITS-1);
3110     dataptr[7] = (DCTELEM) RIGHT_SHIFT(tmp3, CONST_BITS-PASS1_BITS-1);
3111 
3112     dataptr += DCTSIZE;		/* advance pointer to next row */
3113   }
3114 
3115   /* Pass 2: process columns.
3116    * We remove the PASS1_BITS scaling, but leave the results scaled up
3117    * by an overall factor of 8.
3118    * 4-point FDCT kernel,
3119    * cK represents sqrt(2) * cos(K*pi/16) [refers to 8-point FDCT].
3120    */
3121 
3122   dataptr = data;
3123   for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
3124     /* Even part */
3125 
3126     /* Add fudge factor here for final descale. */
3127     tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*3] + (ONE << (PASS1_BITS-1));
3128     tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*2];
3129 
3130     tmp10 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*3];
3131     tmp11 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*2];
3132 
3133     dataptr[DCTSIZE*0] = (DCTELEM) RIGHT_SHIFT(tmp0 + tmp1, PASS1_BITS);
3134     dataptr[DCTSIZE*2] = (DCTELEM) RIGHT_SHIFT(tmp0 - tmp1, PASS1_BITS);
3135 
3136     /* Odd part */
3137 
3138     tmp0 = MULTIPLY(tmp10 + tmp11, FIX_0_541196100);       /* c6 */
3139     /* Add fudge factor here for final descale. */
3140     tmp0 += ONE << (CONST_BITS+PASS1_BITS-1);
3141 
3142     dataptr[DCTSIZE*1] = (DCTELEM)
3143       RIGHT_SHIFT(tmp0 + MULTIPLY(tmp10, FIX_0_765366865), /* c2-c6 */
3144 		  CONST_BITS+PASS1_BITS);
3145     dataptr[DCTSIZE*3] = (DCTELEM)
3146       RIGHT_SHIFT(tmp0 - MULTIPLY(tmp11, FIX_1_847759065), /* c2+c6 */
3147 		  CONST_BITS+PASS1_BITS);
3148 
3149     dataptr++;			/* advance pointer to next column */
3150   }
3151 }
3152 
3153 
3154 /*
3155  * Perform the forward DCT on a 6x3 sample block.
3156  *
3157  * 6-point FDCT in pass 1 (rows), 3-point in pass 2 (columns).
3158  */
3159 
3160 GLOBAL(void)
jpeg_fdct_6x3(DCTELEM * data,JSAMPARRAY sample_data,JDIMENSION start_col)3161 jpeg_fdct_6x3 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
3162 {
3163   INT32 tmp0, tmp1, tmp2;
3164   INT32 tmp10, tmp11, tmp12;
3165   DCTELEM *dataptr;
3166   JSAMPROW elemptr;
3167   int ctr;
3168   SHIFT_TEMPS
3169 
3170   /* Pre-zero output coefficient block. */
3171   MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
3172 
3173   /* Pass 1: process rows.
3174    * Note results are scaled up by sqrt(8) compared to a true DCT;
3175    * furthermore, we scale the results by 2**PASS1_BITS.
3176    * We scale the results further by 2 as part of output adaption
3177    * scaling for different DCT size.
3178    * 6-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/12).
3179    */
3180 
3181   dataptr = data;
3182   for (ctr = 0; ctr < 3; ctr++) {
3183     elemptr = sample_data[ctr] + start_col;
3184 
3185     /* Even part */
3186 
3187     tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[5]);
3188     tmp11 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[4]);
3189     tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[3]);
3190 
3191     tmp10 = tmp0 + tmp2;
3192     tmp12 = tmp0 - tmp2;
3193 
3194     tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[5]);
3195     tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[4]);
3196     tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[3]);
3197 
3198     /* Apply unsigned->signed conversion. */
3199     dataptr[0] = (DCTELEM)
3200       ((tmp10 + tmp11 - 6 * CENTERJSAMPLE) << (PASS1_BITS+1));
3201     dataptr[2] = (DCTELEM)
3202       DESCALE(MULTIPLY(tmp12, FIX(1.224744871)),                 /* c2 */
3203 	      CONST_BITS-PASS1_BITS-1);
3204     dataptr[4] = (DCTELEM)
3205       DESCALE(MULTIPLY(tmp10 - tmp11 - tmp11, FIX(0.707106781)), /* c4 */
3206 	      CONST_BITS-PASS1_BITS-1);
3207 
3208     /* Odd part */
3209 
3210     tmp10 = DESCALE(MULTIPLY(tmp0 + tmp2, FIX(0.366025404)),     /* c5 */
3211 		    CONST_BITS-PASS1_BITS-1);
3212 
3213     dataptr[1] = (DCTELEM) (tmp10 + ((tmp0 + tmp1) << (PASS1_BITS+1)));
3214     dataptr[3] = (DCTELEM) ((tmp0 - tmp1 - tmp2) << (PASS1_BITS+1));
3215     dataptr[5] = (DCTELEM) (tmp10 + ((tmp2 - tmp1) << (PASS1_BITS+1)));
3216 
3217     dataptr += DCTSIZE;		/* advance pointer to next row */
3218   }
3219 
3220   /* Pass 2: process columns.
3221    * We remove the PASS1_BITS scaling, but leave the results scaled up
3222    * by an overall factor of 8.
3223    * We must also scale the output by (8/6)*(8/3) = 32/9, which we partially
3224    * fold into the constant multipliers (other part was done in pass 1):
3225    * 3-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/6) * 16/9.
3226    */
3227 
3228   dataptr = data;
3229   for (ctr = 0; ctr < 6; ctr++) {
3230     /* Even part */
3231 
3232     tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*2];
3233     tmp1 = dataptr[DCTSIZE*1];
3234 
3235     tmp2 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*2];
3236 
3237     dataptr[DCTSIZE*0] = (DCTELEM)
3238       DESCALE(MULTIPLY(tmp0 + tmp1, FIX(1.777777778)),        /* 16/9 */
3239 	      CONST_BITS+PASS1_BITS);
3240     dataptr[DCTSIZE*2] = (DCTELEM)
3241       DESCALE(MULTIPLY(tmp0 - tmp1 - tmp1, FIX(1.257078722)), /* c2 */
3242 	      CONST_BITS+PASS1_BITS);
3243 
3244     /* Odd part */
3245 
3246     dataptr[DCTSIZE*1] = (DCTELEM)
3247       DESCALE(MULTIPLY(tmp2, FIX(2.177324216)),               /* c1 */
3248 	      CONST_BITS+PASS1_BITS);
3249 
3250     dataptr++;			/* advance pointer to next column */
3251   }
3252 }
3253 
3254 
3255 /*
3256  * Perform the forward DCT on a 4x2 sample block.
3257  *
3258  * 4-point FDCT in pass 1 (rows), 2-point in pass 2 (columns).
3259  */
3260 
3261 GLOBAL(void)
jpeg_fdct_4x2(DCTELEM * data,JSAMPARRAY sample_data,JDIMENSION start_col)3262 jpeg_fdct_4x2 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
3263 {
3264   INT32 tmp0, tmp1;
3265   INT32 tmp10, tmp11;
3266   DCTELEM *dataptr;
3267   JSAMPROW elemptr;
3268   int ctr;
3269   SHIFT_TEMPS
3270 
3271   /* Pre-zero output coefficient block. */
3272   MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
3273 
3274   /* Pass 1: process rows.
3275    * Note results are scaled up by sqrt(8) compared to a true DCT;
3276    * furthermore, we scale the results by 2**PASS1_BITS.
3277    * We must also scale the output by (8/4)*(8/2) = 2**3, which we add here.
3278    * 4-point FDCT kernel,
3279    * cK represents sqrt(2) * cos(K*pi/16) [refers to 8-point FDCT].
3280    */
3281 
3282   dataptr = data;
3283   for (ctr = 0; ctr < 2; ctr++) {
3284     elemptr = sample_data[ctr] + start_col;
3285 
3286     /* Even part */
3287 
3288     tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[3]);
3289     tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[2]);
3290 
3291     tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[3]);
3292     tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[2]);
3293 
3294     /* Apply unsigned->signed conversion. */
3295     dataptr[0] = (DCTELEM)
3296       ((tmp0 + tmp1 - 4 * CENTERJSAMPLE) << (PASS1_BITS+3));
3297     dataptr[2] = (DCTELEM) ((tmp0 - tmp1) << (PASS1_BITS+3));
3298 
3299     /* Odd part */
3300 
3301     tmp0 = MULTIPLY(tmp10 + tmp11, FIX_0_541196100);       /* c6 */
3302     /* Add fudge factor here for final descale. */
3303     tmp0 += ONE << (CONST_BITS-PASS1_BITS-4);
3304 
3305     dataptr[1] = (DCTELEM)
3306       RIGHT_SHIFT(tmp0 + MULTIPLY(tmp10, FIX_0_765366865), /* c2-c6 */
3307 		  CONST_BITS-PASS1_BITS-3);
3308     dataptr[3] = (DCTELEM)
3309       RIGHT_SHIFT(tmp0 - MULTIPLY(tmp11, FIX_1_847759065), /* c2+c6 */
3310 		  CONST_BITS-PASS1_BITS-3);
3311 
3312     dataptr += DCTSIZE;		/* advance pointer to next row */
3313   }
3314 
3315   /* Pass 2: process columns.
3316    * We remove the PASS1_BITS scaling, but leave the results scaled up
3317    * by an overall factor of 8.
3318    */
3319 
3320   dataptr = data;
3321   for (ctr = 0; ctr < 4; ctr++) {
3322     /* Even part */
3323 
3324     /* Add fudge factor here for final descale. */
3325     tmp0 = dataptr[DCTSIZE*0] + (ONE << (PASS1_BITS-1));
3326     tmp1 = dataptr[DCTSIZE*1];
3327 
3328     dataptr[DCTSIZE*0] = (DCTELEM) RIGHT_SHIFT(tmp0 + tmp1, PASS1_BITS);
3329 
3330     /* Odd part */
3331 
3332     dataptr[DCTSIZE*1] = (DCTELEM) RIGHT_SHIFT(tmp0 - tmp1, PASS1_BITS);
3333 
3334     dataptr++;			/* advance pointer to next column */
3335   }
3336 }
3337 
3338 
3339 /*
3340  * Perform the forward DCT on a 2x1 sample block.
3341  *
3342  * 2-point FDCT in pass 1 (rows), 1-point in pass 2 (columns).
3343  */
3344 
3345 GLOBAL(void)
jpeg_fdct_2x1(DCTELEM * data,JSAMPARRAY sample_data,JDIMENSION start_col)3346 jpeg_fdct_2x1 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
3347 {
3348   DCTELEM tmp0, tmp1;
3349   JSAMPROW elemptr;
3350 
3351   /* Pre-zero output coefficient block. */
3352   MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
3353 
3354   elemptr = sample_data[0] + start_col;
3355 
3356   tmp0 = GETJSAMPLE(elemptr[0]);
3357   tmp1 = GETJSAMPLE(elemptr[1]);
3358 
3359   /* We leave the results scaled up by an overall factor of 8.
3360    * We must also scale the output by (8/2)*(8/1) = 2**5.
3361    */
3362 
3363   /* Even part */
3364 
3365   /* Apply unsigned->signed conversion. */
3366   data[0] = (tmp0 + tmp1 - 2 * CENTERJSAMPLE) << 5;
3367 
3368   /* Odd part */
3369 
3370   data[1] = (tmp0 - tmp1) << 5;
3371 }
3372 
3373 
3374 /*
3375  * Perform the forward DCT on an 8x16 sample block.
3376  *
3377  * 8-point FDCT in pass 1 (rows), 16-point in pass 2 (columns).
3378  */
3379 
3380 GLOBAL(void)
jpeg_fdct_8x16(DCTELEM * data,JSAMPARRAY sample_data,JDIMENSION start_col)3381 jpeg_fdct_8x16 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
3382 {
3383   INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
3384   INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16, tmp17;
3385   INT32 z1;
3386   DCTELEM workspace[DCTSIZE2];
3387   DCTELEM *dataptr;
3388   DCTELEM *wsptr;
3389   JSAMPROW elemptr;
3390   int ctr;
3391   SHIFT_TEMPS
3392 
3393   /* Pass 1: process rows.
3394    * Note results are scaled up by sqrt(8) compared to a true DCT;
3395    * furthermore, we scale the results by 2**PASS1_BITS.
3396    * 8-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/16).
3397    */
3398 
3399   dataptr = data;
3400   ctr = 0;
3401   for (;;) {
3402     elemptr = sample_data[ctr] + start_col;
3403 
3404     /* Even part per LL&M figure 1 --- note that published figure is faulty;
3405      * rotator "c1" should be "c6".
3406      */
3407 
3408     tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[7]);
3409     tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[6]);
3410     tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[5]);
3411     tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[4]);
3412 
3413     tmp10 = tmp0 + tmp3;
3414     tmp12 = tmp0 - tmp3;
3415     tmp11 = tmp1 + tmp2;
3416     tmp13 = tmp1 - tmp2;
3417 
3418     tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[7]);
3419     tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[6]);
3420     tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[5]);
3421     tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[4]);
3422 
3423     /* Apply unsigned->signed conversion. */
3424     dataptr[0] = (DCTELEM) ((tmp10 + tmp11 - 8 * CENTERJSAMPLE) << PASS1_BITS);
3425     dataptr[4] = (DCTELEM) ((tmp10 - tmp11) << PASS1_BITS);
3426 
3427     z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);   /* c6 */
3428     dataptr[2] = (DCTELEM)
3429       DESCALE(z1 + MULTIPLY(tmp12, FIX_0_765366865), /* c2-c6 */
3430 	      CONST_BITS-PASS1_BITS);
3431     dataptr[6] = (DCTELEM)
3432       DESCALE(z1 - MULTIPLY(tmp13, FIX_1_847759065), /* c2+c6 */
3433 	      CONST_BITS-PASS1_BITS);
3434 
3435     /* Odd part per figure 8 --- note paper omits factor of sqrt(2).
3436      * i0..i3 in the paper are tmp0..tmp3 here.
3437      */
3438 
3439     tmp12 = tmp0 + tmp2;
3440     tmp13 = tmp1 + tmp3;
3441 
3442     z1 = MULTIPLY(tmp12 + tmp13, FIX_1_175875602);   /*  c3 */
3443     tmp12 = MULTIPLY(tmp12, - FIX_0_390180644);      /* -c3+c5 */
3444     tmp13 = MULTIPLY(tmp13, - FIX_1_961570560);      /* -c3-c5 */
3445     tmp12 += z1;
3446     tmp13 += z1;
3447 
3448     z1 = MULTIPLY(tmp0 + tmp3, - FIX_0_899976223);   /* -c3+c7 */
3449     tmp0 = MULTIPLY(tmp0, FIX_1_501321110);          /*  c1+c3-c5-c7 */
3450     tmp3 = MULTIPLY(tmp3, FIX_0_298631336);          /* -c1+c3+c5-c7 */
3451     tmp0 += z1 + tmp12;
3452     tmp3 += z1 + tmp13;
3453 
3454     z1 = MULTIPLY(tmp1 + tmp2, - FIX_2_562915447);   /* -c1-c3 */
3455     tmp1 = MULTIPLY(tmp1, FIX_3_072711026);          /*  c1+c3+c5-c7 */
3456     tmp2 = MULTIPLY(tmp2, FIX_2_053119869);          /*  c1+c3-c5+c7 */
3457     tmp1 += z1 + tmp13;
3458     tmp2 += z1 + tmp12;
3459 
3460     dataptr[1] = (DCTELEM) DESCALE(tmp0, CONST_BITS-PASS1_BITS);
3461     dataptr[3] = (DCTELEM) DESCALE(tmp1, CONST_BITS-PASS1_BITS);
3462     dataptr[5] = (DCTELEM) DESCALE(tmp2, CONST_BITS-PASS1_BITS);
3463     dataptr[7] = (DCTELEM) DESCALE(tmp3, CONST_BITS-PASS1_BITS);
3464 
3465     ctr++;
3466 
3467     if (ctr != DCTSIZE) {
3468       if (ctr == DCTSIZE * 2)
3469 	break;			/* Done. */
3470       dataptr += DCTSIZE;	/* advance pointer to next row */
3471     } else
3472       dataptr = workspace;	/* switch pointer to extended workspace */
3473   }
3474 
3475   /* Pass 2: process columns.
3476    * We remove the PASS1_BITS scaling, but leave the results scaled up
3477    * by an overall factor of 8.
3478    * We must also scale the output by 8/16 = 1/2.
3479    * 16-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/32).
3480    */
3481 
3482   dataptr = data;
3483   wsptr = workspace;
3484   for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
3485     /* Even part */
3486 
3487     tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*7];
3488     tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*6];
3489     tmp2 = dataptr[DCTSIZE*2] + wsptr[DCTSIZE*5];
3490     tmp3 = dataptr[DCTSIZE*3] + wsptr[DCTSIZE*4];
3491     tmp4 = dataptr[DCTSIZE*4] + wsptr[DCTSIZE*3];
3492     tmp5 = dataptr[DCTSIZE*5] + wsptr[DCTSIZE*2];
3493     tmp6 = dataptr[DCTSIZE*6] + wsptr[DCTSIZE*1];
3494     tmp7 = dataptr[DCTSIZE*7] + wsptr[DCTSIZE*0];
3495 
3496     tmp10 = tmp0 + tmp7;
3497     tmp14 = tmp0 - tmp7;
3498     tmp11 = tmp1 + tmp6;
3499     tmp15 = tmp1 - tmp6;
3500     tmp12 = tmp2 + tmp5;
3501     tmp16 = tmp2 - tmp5;
3502     tmp13 = tmp3 + tmp4;
3503     tmp17 = tmp3 - tmp4;
3504 
3505     tmp0 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*7];
3506     tmp1 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*6];
3507     tmp2 = dataptr[DCTSIZE*2] - wsptr[DCTSIZE*5];
3508     tmp3 = dataptr[DCTSIZE*3] - wsptr[DCTSIZE*4];
3509     tmp4 = dataptr[DCTSIZE*4] - wsptr[DCTSIZE*3];
3510     tmp5 = dataptr[DCTSIZE*5] - wsptr[DCTSIZE*2];
3511     tmp6 = dataptr[DCTSIZE*6] - wsptr[DCTSIZE*1];
3512     tmp7 = dataptr[DCTSIZE*7] - wsptr[DCTSIZE*0];
3513 
3514     dataptr[DCTSIZE*0] = (DCTELEM)
3515       DESCALE(tmp10 + tmp11 + tmp12 + tmp13, PASS1_BITS+1);
3516     dataptr[DCTSIZE*4] = (DCTELEM)
3517       DESCALE(MULTIPLY(tmp10 - tmp13, FIX(1.306562965)) + /* c4[16] = c2[8] */
3518 	      MULTIPLY(tmp11 - tmp12, FIX_0_541196100),   /* c12[16] = c6[8] */
3519 	      CONST_BITS+PASS1_BITS+1);
3520 
3521     tmp10 = MULTIPLY(tmp17 - tmp15, FIX(0.275899379)) +   /* c14[16] = c7[8] */
3522 	    MULTIPLY(tmp14 - tmp16, FIX(1.387039845));    /* c2[16] = c1[8] */
3523 
3524     dataptr[DCTSIZE*2] = (DCTELEM)
3525       DESCALE(tmp10 + MULTIPLY(tmp15, FIX(1.451774982))   /* c6+c14 */
3526 	      + MULTIPLY(tmp16, FIX(2.172734804)),        /* c2+c10 */
3527 	      CONST_BITS+PASS1_BITS+1);
3528     dataptr[DCTSIZE*6] = (DCTELEM)
3529       DESCALE(tmp10 - MULTIPLY(tmp14, FIX(0.211164243))   /* c2-c6 */
3530 	      - MULTIPLY(tmp17, FIX(1.061594338)),        /* c10+c14 */
3531 	      CONST_BITS+PASS1_BITS+1);
3532 
3533     /* Odd part */
3534 
3535     tmp11 = MULTIPLY(tmp0 + tmp1, FIX(1.353318001)) +         /* c3 */
3536 	    MULTIPLY(tmp6 - tmp7, FIX(0.410524528));          /* c13 */
3537     tmp12 = MULTIPLY(tmp0 + tmp2, FIX(1.247225013)) +         /* c5 */
3538 	    MULTIPLY(tmp5 + tmp7, FIX(0.666655658));          /* c11 */
3539     tmp13 = MULTIPLY(tmp0 + tmp3, FIX(1.093201867)) +         /* c7 */
3540 	    MULTIPLY(tmp4 - tmp7, FIX(0.897167586));          /* c9 */
3541     tmp14 = MULTIPLY(tmp1 + tmp2, FIX(0.138617169)) +         /* c15 */
3542 	    MULTIPLY(tmp6 - tmp5, FIX(1.407403738));          /* c1 */
3543     tmp15 = MULTIPLY(tmp1 + tmp3, - FIX(0.666655658)) +       /* -c11 */
3544 	    MULTIPLY(tmp4 + tmp6, - FIX(1.247225013));        /* -c5 */
3545     tmp16 = MULTIPLY(tmp2 + tmp3, - FIX(1.353318001)) +       /* -c3 */
3546 	    MULTIPLY(tmp5 - tmp4, FIX(0.410524528));          /* c13 */
3547     tmp10 = tmp11 + tmp12 + tmp13 -
3548 	    MULTIPLY(tmp0, FIX(2.286341144)) +                /* c7+c5+c3-c1 */
3549 	    MULTIPLY(tmp7, FIX(0.779653625));                 /* c15+c13-c11+c9 */
3550     tmp11 += tmp14 + tmp15 + MULTIPLY(tmp1, FIX(0.071888074)) /* c9-c3-c15+c11 */
3551 	     - MULTIPLY(tmp6, FIX(1.663905119));              /* c7+c13+c1-c5 */
3552     tmp12 += tmp14 + tmp16 - MULTIPLY(tmp2, FIX(1.125726048)) /* c7+c5+c15-c3 */
3553 	     + MULTIPLY(tmp5, FIX(1.227391138));              /* c9-c11+c1-c13 */
3554     tmp13 += tmp15 + tmp16 + MULTIPLY(tmp3, FIX(1.065388962)) /* c15+c3+c11-c7 */
3555 	     + MULTIPLY(tmp4, FIX(2.167985692));              /* c1+c13+c5-c9 */
3556 
3557     dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp10, CONST_BITS+PASS1_BITS+1);
3558     dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp11, CONST_BITS+PASS1_BITS+1);
3559     dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp12, CONST_BITS+PASS1_BITS+1);
3560     dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp13, CONST_BITS+PASS1_BITS+1);
3561 
3562     dataptr++;			/* advance pointer to next column */
3563     wsptr++;			/* advance pointer to next column */
3564   }
3565 }
3566 
3567 
3568 /*
3569  * Perform the forward DCT on a 7x14 sample block.
3570  *
3571  * 7-point FDCT in pass 1 (rows), 14-point in pass 2 (columns).
3572  */
3573 
3574 GLOBAL(void)
jpeg_fdct_7x14(DCTELEM * data,JSAMPARRAY sample_data,JDIMENSION start_col)3575 jpeg_fdct_7x14 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
3576 {
3577   INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
3578   INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
3579   INT32 z1, z2, z3;
3580   DCTELEM workspace[8*6];
3581   DCTELEM *dataptr;
3582   DCTELEM *wsptr;
3583   JSAMPROW elemptr;
3584   int ctr;
3585   SHIFT_TEMPS
3586 
3587   /* Pre-zero output coefficient block. */
3588   MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
3589 
3590   /* Pass 1: process rows.
3591    * Note results are scaled up by sqrt(8) compared to a true DCT;
3592    * furthermore, we scale the results by 2**PASS1_BITS.
3593    * 7-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/14).
3594    */
3595 
3596   dataptr = data;
3597   ctr = 0;
3598   for (;;) {
3599     elemptr = sample_data[ctr] + start_col;
3600 
3601     /* Even part */
3602 
3603     tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[6]);
3604     tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[5]);
3605     tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[4]);
3606     tmp3 = GETJSAMPLE(elemptr[3]);
3607 
3608     tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[6]);
3609     tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[5]);
3610     tmp12 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[4]);
3611 
3612     z1 = tmp0 + tmp2;
3613     /* Apply unsigned->signed conversion. */
3614     dataptr[0] = (DCTELEM)
3615       ((z1 + tmp1 + tmp3 - 7 * CENTERJSAMPLE) << PASS1_BITS);
3616     tmp3 += tmp3;
3617     z1 -= tmp3;
3618     z1 -= tmp3;
3619     z1 = MULTIPLY(z1, FIX(0.353553391));                /* (c2+c6-c4)/2 */
3620     z2 = MULTIPLY(tmp0 - tmp2, FIX(0.920609002));       /* (c2+c4-c6)/2 */
3621     z3 = MULTIPLY(tmp1 - tmp2, FIX(0.314692123));       /* c6 */
3622     dataptr[2] = (DCTELEM) DESCALE(z1 + z2 + z3, CONST_BITS-PASS1_BITS);
3623     z1 -= z2;
3624     z2 = MULTIPLY(tmp0 - tmp1, FIX(0.881747734));       /* c4 */
3625     dataptr[4] = (DCTELEM)
3626       DESCALE(z2 + z3 - MULTIPLY(tmp1 - tmp3, FIX(0.707106781)), /* c2+c6-c4 */
3627 	      CONST_BITS-PASS1_BITS);
3628     dataptr[6] = (DCTELEM) DESCALE(z1 + z2, CONST_BITS-PASS1_BITS);
3629 
3630     /* Odd part */
3631 
3632     tmp1 = MULTIPLY(tmp10 + tmp11, FIX(0.935414347));   /* (c3+c1-c5)/2 */
3633     tmp2 = MULTIPLY(tmp10 - tmp11, FIX(0.170262339));   /* (c3+c5-c1)/2 */
3634     tmp0 = tmp1 - tmp2;
3635     tmp1 += tmp2;
3636     tmp2 = MULTIPLY(tmp11 + tmp12, - FIX(1.378756276)); /* -c1 */
3637     tmp1 += tmp2;
3638     tmp3 = MULTIPLY(tmp10 + tmp12, FIX(0.613604268));   /* c5 */
3639     tmp0 += tmp3;
3640     tmp2 += tmp3 + MULTIPLY(tmp12, FIX(1.870828693));   /* c3+c1-c5 */
3641 
3642     dataptr[1] = (DCTELEM) DESCALE(tmp0, CONST_BITS-PASS1_BITS);
3643     dataptr[3] = (DCTELEM) DESCALE(tmp1, CONST_BITS-PASS1_BITS);
3644     dataptr[5] = (DCTELEM) DESCALE(tmp2, CONST_BITS-PASS1_BITS);
3645 
3646     ctr++;
3647 
3648     if (ctr != DCTSIZE) {
3649       if (ctr == 14)
3650 	break;			/* Done. */
3651       dataptr += DCTSIZE;	/* advance pointer to next row */
3652     } else
3653       dataptr = workspace;	/* switch pointer to extended workspace */
3654   }
3655 
3656   /* Pass 2: process columns.
3657    * We remove the PASS1_BITS scaling, but leave the results scaled up
3658    * by an overall factor of 8.
3659    * We must also scale the output by (8/7)*(8/14) = 32/49, which we
3660    * fold into the constant multipliers:
3661    * 14-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/28) * 32/49.
3662    */
3663 
3664   dataptr = data;
3665   wsptr = workspace;
3666   for (ctr = 0; ctr < 7; ctr++) {
3667     /* Even part */
3668 
3669     tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*5];
3670     tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*4];
3671     tmp2 = dataptr[DCTSIZE*2] + wsptr[DCTSIZE*3];
3672     tmp13 = dataptr[DCTSIZE*3] + wsptr[DCTSIZE*2];
3673     tmp4 = dataptr[DCTSIZE*4] + wsptr[DCTSIZE*1];
3674     tmp5 = dataptr[DCTSIZE*5] + wsptr[DCTSIZE*0];
3675     tmp6 = dataptr[DCTSIZE*6] + dataptr[DCTSIZE*7];
3676 
3677     tmp10 = tmp0 + tmp6;
3678     tmp14 = tmp0 - tmp6;
3679     tmp11 = tmp1 + tmp5;
3680     tmp15 = tmp1 - tmp5;
3681     tmp12 = tmp2 + tmp4;
3682     tmp16 = tmp2 - tmp4;
3683 
3684     tmp0 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*5];
3685     tmp1 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*4];
3686     tmp2 = dataptr[DCTSIZE*2] - wsptr[DCTSIZE*3];
3687     tmp3 = dataptr[DCTSIZE*3] - wsptr[DCTSIZE*2];
3688     tmp4 = dataptr[DCTSIZE*4] - wsptr[DCTSIZE*1];
3689     tmp5 = dataptr[DCTSIZE*5] - wsptr[DCTSIZE*0];
3690     tmp6 = dataptr[DCTSIZE*6] - dataptr[DCTSIZE*7];
3691 
3692     dataptr[DCTSIZE*0] = (DCTELEM)
3693       DESCALE(MULTIPLY(tmp10 + tmp11 + tmp12 + tmp13,
3694 		       FIX(0.653061224)),                 /* 32/49 */
3695 	      CONST_BITS+PASS1_BITS);
3696     tmp13 += tmp13;
3697     dataptr[DCTSIZE*4] = (DCTELEM)
3698       DESCALE(MULTIPLY(tmp10 - tmp13, FIX(0.832106052)) + /* c4 */
3699 	      MULTIPLY(tmp11 - tmp13, FIX(0.205513223)) - /* c12 */
3700 	      MULTIPLY(tmp12 - tmp13, FIX(0.575835255)),  /* c8 */
3701 	      CONST_BITS+PASS1_BITS);
3702 
3703     tmp10 = MULTIPLY(tmp14 + tmp15, FIX(0.722074570));    /* c6 */
3704 
3705     dataptr[DCTSIZE*2] = (DCTELEM)
3706       DESCALE(tmp10 + MULTIPLY(tmp14, FIX(0.178337691))   /* c2-c6 */
3707 	      + MULTIPLY(tmp16, FIX(0.400721155)),        /* c10 */
3708 	      CONST_BITS+PASS1_BITS);
3709     dataptr[DCTSIZE*6] = (DCTELEM)
3710       DESCALE(tmp10 - MULTIPLY(tmp15, FIX(1.122795725))   /* c6+c10 */
3711 	      - MULTIPLY(tmp16, FIX(0.900412262)),        /* c2 */
3712 	      CONST_BITS+PASS1_BITS);
3713 
3714     /* Odd part */
3715 
3716     tmp10 = tmp1 + tmp2;
3717     tmp11 = tmp5 - tmp4;
3718     dataptr[DCTSIZE*7] = (DCTELEM)
3719       DESCALE(MULTIPLY(tmp0 - tmp10 + tmp3 - tmp11 - tmp6,
3720 		       FIX(0.653061224)),                 /* 32/49 */
3721 	      CONST_BITS+PASS1_BITS);
3722     tmp3  = MULTIPLY(tmp3 , FIX(0.653061224));            /* 32/49 */
3723     tmp10 = MULTIPLY(tmp10, - FIX(0.103406812));          /* -c13 */
3724     tmp11 = MULTIPLY(tmp11, FIX(0.917760839));            /* c1 */
3725     tmp10 += tmp11 - tmp3;
3726     tmp11 = MULTIPLY(tmp0 + tmp2, FIX(0.782007410)) +     /* c5 */
3727 	    MULTIPLY(tmp4 + tmp6, FIX(0.491367823));      /* c9 */
3728     dataptr[DCTSIZE*5] = (DCTELEM)
3729       DESCALE(tmp10 + tmp11 - MULTIPLY(tmp2, FIX(1.550341076)) /* c3+c5-c13 */
3730 	      + MULTIPLY(tmp4, FIX(0.731428202)),         /* c1+c11-c9 */
3731 	      CONST_BITS+PASS1_BITS);
3732     tmp12 = MULTIPLY(tmp0 + tmp1, FIX(0.871740478)) +     /* c3 */
3733 	    MULTIPLY(tmp5 - tmp6, FIX(0.305035186));      /* c11 */
3734     dataptr[DCTSIZE*3] = (DCTELEM)
3735       DESCALE(tmp10 + tmp12 - MULTIPLY(tmp1, FIX(0.276965844)) /* c3-c9-c13 */
3736 	      - MULTIPLY(tmp5, FIX(2.004803435)),         /* c1+c5+c11 */
3737 	      CONST_BITS+PASS1_BITS);
3738     dataptr[DCTSIZE*1] = (DCTELEM)
3739       DESCALE(tmp11 + tmp12 + tmp3
3740 	      - MULTIPLY(tmp0, FIX(0.735987049))          /* c3+c5-c1 */
3741 	      - MULTIPLY(tmp6, FIX(0.082925825)),         /* c9-c11-c13 */
3742 	      CONST_BITS+PASS1_BITS);
3743 
3744     dataptr++;			/* advance pointer to next column */
3745     wsptr++;			/* advance pointer to next column */
3746   }
3747 }
3748 
3749 
3750 /*
3751  * Perform the forward DCT on a 6x12 sample block.
3752  *
3753  * 6-point FDCT in pass 1 (rows), 12-point in pass 2 (columns).
3754  */
3755 
3756 GLOBAL(void)
jpeg_fdct_6x12(DCTELEM * data,JSAMPARRAY sample_data,JDIMENSION start_col)3757 jpeg_fdct_6x12 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
3758 {
3759   INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
3760   INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
3761   DCTELEM workspace[8*4];
3762   DCTELEM *dataptr;
3763   DCTELEM *wsptr;
3764   JSAMPROW elemptr;
3765   int ctr;
3766   SHIFT_TEMPS
3767 
3768   /* Pre-zero output coefficient block. */
3769   MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
3770 
3771   /* Pass 1: process rows.
3772    * Note results are scaled up by sqrt(8) compared to a true DCT;
3773    * furthermore, we scale the results by 2**PASS1_BITS.
3774    * 6-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/12).
3775    */
3776 
3777   dataptr = data;
3778   ctr = 0;
3779   for (;;) {
3780     elemptr = sample_data[ctr] + start_col;
3781 
3782     /* Even part */
3783 
3784     tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[5]);
3785     tmp11 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[4]);
3786     tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[3]);
3787 
3788     tmp10 = tmp0 + tmp2;
3789     tmp12 = tmp0 - tmp2;
3790 
3791     tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[5]);
3792     tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[4]);
3793     tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[3]);
3794 
3795     /* Apply unsigned->signed conversion. */
3796     dataptr[0] = (DCTELEM)
3797       ((tmp10 + tmp11 - 6 * CENTERJSAMPLE) << PASS1_BITS);
3798     dataptr[2] = (DCTELEM)
3799       DESCALE(MULTIPLY(tmp12, FIX(1.224744871)),                 /* c2 */
3800 	      CONST_BITS-PASS1_BITS);
3801     dataptr[4] = (DCTELEM)
3802       DESCALE(MULTIPLY(tmp10 - tmp11 - tmp11, FIX(0.707106781)), /* c4 */
3803 	      CONST_BITS-PASS1_BITS);
3804 
3805     /* Odd part */
3806 
3807     tmp10 = DESCALE(MULTIPLY(tmp0 + tmp2, FIX(0.366025404)),     /* c5 */
3808 		    CONST_BITS-PASS1_BITS);
3809 
3810     dataptr[1] = (DCTELEM) (tmp10 + ((tmp0 + tmp1) << PASS1_BITS));
3811     dataptr[3] = (DCTELEM) ((tmp0 - tmp1 - tmp2) << PASS1_BITS);
3812     dataptr[5] = (DCTELEM) (tmp10 + ((tmp2 - tmp1) << PASS1_BITS));
3813 
3814     ctr++;
3815 
3816     if (ctr != DCTSIZE) {
3817       if (ctr == 12)
3818 	break;			/* Done. */
3819       dataptr += DCTSIZE;	/* advance pointer to next row */
3820     } else
3821       dataptr = workspace;	/* switch pointer to extended workspace */
3822   }
3823 
3824   /* Pass 2: process columns.
3825    * We remove the PASS1_BITS scaling, but leave the results scaled up
3826    * by an overall factor of 8.
3827    * We must also scale the output by (8/6)*(8/12) = 8/9, which we
3828    * fold into the constant multipliers:
3829    * 12-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/24) * 8/9.
3830    */
3831 
3832   dataptr = data;
3833   wsptr = workspace;
3834   for (ctr = 0; ctr < 6; ctr++) {
3835     /* Even part */
3836 
3837     tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*3];
3838     tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*2];
3839     tmp2 = dataptr[DCTSIZE*2] + wsptr[DCTSIZE*1];
3840     tmp3 = dataptr[DCTSIZE*3] + wsptr[DCTSIZE*0];
3841     tmp4 = dataptr[DCTSIZE*4] + dataptr[DCTSIZE*7];
3842     tmp5 = dataptr[DCTSIZE*5] + dataptr[DCTSIZE*6];
3843 
3844     tmp10 = tmp0 + tmp5;
3845     tmp13 = tmp0 - tmp5;
3846     tmp11 = tmp1 + tmp4;
3847     tmp14 = tmp1 - tmp4;
3848     tmp12 = tmp2 + tmp3;
3849     tmp15 = tmp2 - tmp3;
3850 
3851     tmp0 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*3];
3852     tmp1 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*2];
3853     tmp2 = dataptr[DCTSIZE*2] - wsptr[DCTSIZE*1];
3854     tmp3 = dataptr[DCTSIZE*3] - wsptr[DCTSIZE*0];
3855     tmp4 = dataptr[DCTSIZE*4] - dataptr[DCTSIZE*7];
3856     tmp5 = dataptr[DCTSIZE*5] - dataptr[DCTSIZE*6];
3857 
3858     dataptr[DCTSIZE*0] = (DCTELEM)
3859       DESCALE(MULTIPLY(tmp10 + tmp11 + tmp12, FIX(0.888888889)), /* 8/9 */
3860 	      CONST_BITS+PASS1_BITS);
3861     dataptr[DCTSIZE*6] = (DCTELEM)
3862       DESCALE(MULTIPLY(tmp13 - tmp14 - tmp15, FIX(0.888888889)), /* 8/9 */
3863 	      CONST_BITS+PASS1_BITS);
3864     dataptr[DCTSIZE*4] = (DCTELEM)
3865       DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.088662108)),         /* c4 */
3866 	      CONST_BITS+PASS1_BITS);
3867     dataptr[DCTSIZE*2] = (DCTELEM)
3868       DESCALE(MULTIPLY(tmp14 - tmp15, FIX(0.888888889)) +        /* 8/9 */
3869 	      MULTIPLY(tmp13 + tmp15, FIX(1.214244803)),         /* c2 */
3870 	      CONST_BITS+PASS1_BITS);
3871 
3872     /* Odd part */
3873 
3874     tmp10 = MULTIPLY(tmp1 + tmp4, FIX(0.481063200));   /* c9 */
3875     tmp14 = tmp10 + MULTIPLY(tmp1, FIX(0.680326102));  /* c3-c9 */
3876     tmp15 = tmp10 - MULTIPLY(tmp4, FIX(1.642452502));  /* c3+c9 */
3877     tmp12 = MULTIPLY(tmp0 + tmp2, FIX(0.997307603));   /* c5 */
3878     tmp13 = MULTIPLY(tmp0 + tmp3, FIX(0.765261039));   /* c7 */
3879     tmp10 = tmp12 + tmp13 + tmp14 - MULTIPLY(tmp0, FIX(0.516244403)) /* c5+c7-c1 */
3880 	    + MULTIPLY(tmp5, FIX(0.164081699));        /* c11 */
3881     tmp11 = MULTIPLY(tmp2 + tmp3, - FIX(0.164081699)); /* -c11 */
3882     tmp12 += tmp11 - tmp15 - MULTIPLY(tmp2, FIX(2.079550144)) /* c1+c5-c11 */
3883 	    + MULTIPLY(tmp5, FIX(0.765261039));        /* c7 */
3884     tmp13 += tmp11 - tmp14 + MULTIPLY(tmp3, FIX(0.645144899)) /* c1+c11-c7 */
3885 	    - MULTIPLY(tmp5, FIX(0.997307603));        /* c5 */
3886     tmp11 = tmp15 + MULTIPLY(tmp0 - tmp3, FIX(1.161389302)) /* c3 */
3887 	    - MULTIPLY(tmp2 + tmp5, FIX(0.481063200)); /* c9 */
3888 
3889     dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp10, CONST_BITS+PASS1_BITS);
3890     dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp11, CONST_BITS+PASS1_BITS);
3891     dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp12, CONST_BITS+PASS1_BITS);
3892     dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp13, CONST_BITS+PASS1_BITS);
3893 
3894     dataptr++;			/* advance pointer to next column */
3895     wsptr++;			/* advance pointer to next column */
3896   }
3897 }
3898 
3899 
3900 /*
3901  * Perform the forward DCT on a 5x10 sample block.
3902  *
3903  * 5-point FDCT in pass 1 (rows), 10-point in pass 2 (columns).
3904  */
3905 
3906 GLOBAL(void)
jpeg_fdct_5x10(DCTELEM * data,JSAMPARRAY sample_data,JDIMENSION start_col)3907 jpeg_fdct_5x10 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
3908 {
3909   INT32 tmp0, tmp1, tmp2, tmp3, tmp4;
3910   INT32 tmp10, tmp11, tmp12, tmp13, tmp14;
3911   DCTELEM workspace[8*2];
3912   DCTELEM *dataptr;
3913   DCTELEM *wsptr;
3914   JSAMPROW elemptr;
3915   int ctr;
3916   SHIFT_TEMPS
3917 
3918   /* Pre-zero output coefficient block. */
3919   MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
3920 
3921   /* Pass 1: process rows.
3922    * Note results are scaled up by sqrt(8) compared to a true DCT;
3923    * furthermore, we scale the results by 2**PASS1_BITS.
3924    * 5-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/10).
3925    */
3926 
3927   dataptr = data;
3928   ctr = 0;
3929   for (;;) {
3930     elemptr = sample_data[ctr] + start_col;
3931 
3932     /* Even part */
3933 
3934     tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[4]);
3935     tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[3]);
3936     tmp2 = GETJSAMPLE(elemptr[2]);
3937 
3938     tmp10 = tmp0 + tmp1;
3939     tmp11 = tmp0 - tmp1;
3940 
3941     tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[4]);
3942     tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[3]);
3943 
3944     /* Apply unsigned->signed conversion. */
3945     dataptr[0] = (DCTELEM)
3946       ((tmp10 + tmp2 - 5 * CENTERJSAMPLE) << PASS1_BITS);
3947     tmp11 = MULTIPLY(tmp11, FIX(0.790569415));          /* (c2+c4)/2 */
3948     tmp10 -= tmp2 << 2;
3949     tmp10 = MULTIPLY(tmp10, FIX(0.353553391));          /* (c2-c4)/2 */
3950     dataptr[2] = (DCTELEM) DESCALE(tmp11 + tmp10, CONST_BITS-PASS1_BITS);
3951     dataptr[4] = (DCTELEM) DESCALE(tmp11 - tmp10, CONST_BITS-PASS1_BITS);
3952 
3953     /* Odd part */
3954 
3955     tmp10 = MULTIPLY(tmp0 + tmp1, FIX(0.831253876));    /* c3 */
3956 
3957     dataptr[1] = (DCTELEM)
3958       DESCALE(tmp10 + MULTIPLY(tmp0, FIX(0.513743148)), /* c1-c3 */
3959 	      CONST_BITS-PASS1_BITS);
3960     dataptr[3] = (DCTELEM)
3961       DESCALE(tmp10 - MULTIPLY(tmp1, FIX(2.176250899)), /* c1+c3 */
3962 	      CONST_BITS-PASS1_BITS);
3963 
3964     ctr++;
3965 
3966     if (ctr != DCTSIZE) {
3967       if (ctr == 10)
3968 	break;			/* Done. */
3969       dataptr += DCTSIZE;	/* advance pointer to next row */
3970     } else
3971       dataptr = workspace;	/* switch pointer to extended workspace */
3972   }
3973 
3974   /* Pass 2: process columns.
3975    * We remove the PASS1_BITS scaling, but leave the results scaled up
3976    * by an overall factor of 8.
3977    * We must also scale the output by (8/5)*(8/10) = 32/25, which we
3978    * fold into the constant multipliers:
3979    * 10-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/20) * 32/25.
3980    */
3981 
3982   dataptr = data;
3983   wsptr = workspace;
3984   for (ctr = 0; ctr < 5; ctr++) {
3985     /* Even part */
3986 
3987     tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*1];
3988     tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*0];
3989     tmp12 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*7];
3990     tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*6];
3991     tmp4 = dataptr[DCTSIZE*4] + dataptr[DCTSIZE*5];
3992 
3993     tmp10 = tmp0 + tmp4;
3994     tmp13 = tmp0 - tmp4;
3995     tmp11 = tmp1 + tmp3;
3996     tmp14 = tmp1 - tmp3;
3997 
3998     tmp0 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*1];
3999     tmp1 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*0];
4000     tmp2 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*7];
4001     tmp3 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*6];
4002     tmp4 = dataptr[DCTSIZE*4] - dataptr[DCTSIZE*5];
4003 
4004     dataptr[DCTSIZE*0] = (DCTELEM)
4005       DESCALE(MULTIPLY(tmp10 + tmp11 + tmp12, FIX(1.28)), /* 32/25 */
4006 	      CONST_BITS+PASS1_BITS);
4007     tmp12 += tmp12;
4008     dataptr[DCTSIZE*4] = (DCTELEM)
4009       DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.464477191)) - /* c4 */
4010 	      MULTIPLY(tmp11 - tmp12, FIX(0.559380511)),  /* c8 */
4011 	      CONST_BITS+PASS1_BITS);
4012     tmp10 = MULTIPLY(tmp13 + tmp14, FIX(1.064004961));    /* c6 */
4013     dataptr[DCTSIZE*2] = (DCTELEM)
4014       DESCALE(tmp10 + MULTIPLY(tmp13, FIX(0.657591230)),  /* c2-c6 */
4015 	      CONST_BITS+PASS1_BITS);
4016     dataptr[DCTSIZE*6] = (DCTELEM)
4017       DESCALE(tmp10 - MULTIPLY(tmp14, FIX(2.785601151)),  /* c2+c6 */
4018 	      CONST_BITS+PASS1_BITS);
4019 
4020     /* Odd part */
4021 
4022     tmp10 = tmp0 + tmp4;
4023     tmp11 = tmp1 - tmp3;
4024     dataptr[DCTSIZE*5] = (DCTELEM)
4025       DESCALE(MULTIPLY(tmp10 - tmp11 - tmp2, FIX(1.28)),  /* 32/25 */
4026 	      CONST_BITS+PASS1_BITS);
4027     tmp2 = MULTIPLY(tmp2, FIX(1.28));                     /* 32/25 */
4028     dataptr[DCTSIZE*1] = (DCTELEM)
4029       DESCALE(MULTIPLY(tmp0, FIX(1.787906876)) +          /* c1 */
4030 	      MULTIPLY(tmp1, FIX(1.612894094)) + tmp2 +   /* c3 */
4031 	      MULTIPLY(tmp3, FIX(0.821810588)) +          /* c7 */
4032 	      MULTIPLY(tmp4, FIX(0.283176630)),           /* c9 */
4033 	      CONST_BITS+PASS1_BITS);
4034     tmp12 = MULTIPLY(tmp0 - tmp4, FIX(1.217352341)) -     /* (c3+c7)/2 */
4035 	    MULTIPLY(tmp1 + tmp3, FIX(0.752365123));      /* (c1-c9)/2 */
4036     tmp13 = MULTIPLY(tmp10 + tmp11, FIX(0.395541753)) +   /* (c3-c7)/2 */
4037 	    MULTIPLY(tmp11, FIX(0.64)) - tmp2;            /* 16/25 */
4038     dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp12 + tmp13, CONST_BITS+PASS1_BITS);
4039     dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp12 - tmp13, CONST_BITS+PASS1_BITS);
4040 
4041     dataptr++;			/* advance pointer to next column */
4042     wsptr++;			/* advance pointer to next column */
4043   }
4044 }
4045 
4046 
4047 /*
4048  * Perform the forward DCT on a 4x8 sample block.
4049  *
4050  * 4-point FDCT in pass 1 (rows), 8-point in pass 2 (columns).
4051  */
4052 
4053 GLOBAL(void)
jpeg_fdct_4x8(DCTELEM * data,JSAMPARRAY sample_data,JDIMENSION start_col)4054 jpeg_fdct_4x8 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
4055 {
4056   INT32 tmp0, tmp1, tmp2, tmp3;
4057   INT32 tmp10, tmp11, tmp12, tmp13;
4058   INT32 z1;
4059   DCTELEM *dataptr;
4060   JSAMPROW elemptr;
4061   int ctr;
4062   SHIFT_TEMPS
4063 
4064   /* Pre-zero output coefficient block. */
4065   MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
4066 
4067   /* Pass 1: process rows.
4068    * Note results are scaled up by sqrt(8) compared to a true DCT;
4069    * furthermore, we scale the results by 2**PASS1_BITS.
4070    * We must also scale the output by 8/4 = 2, which we add here.
4071    * 4-point FDCT kernel,
4072    * cK represents sqrt(2) * cos(K*pi/16) [refers to 8-point FDCT].
4073    */
4074 
4075   dataptr = data;
4076   for (ctr = 0; ctr < DCTSIZE; ctr++) {
4077     elemptr = sample_data[ctr] + start_col;
4078 
4079     /* Even part */
4080 
4081     tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[3]);
4082     tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[2]);
4083 
4084     tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[3]);
4085     tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[2]);
4086 
4087     /* Apply unsigned->signed conversion. */
4088     dataptr[0] = (DCTELEM)
4089       ((tmp0 + tmp1 - 4 * CENTERJSAMPLE) << (PASS1_BITS+1));
4090     dataptr[2] = (DCTELEM) ((tmp0 - tmp1) << (PASS1_BITS+1));
4091 
4092     /* Odd part */
4093 
4094     tmp0 = MULTIPLY(tmp10 + tmp11, FIX_0_541196100);       /* c6 */
4095     /* Add fudge factor here for final descale. */
4096     tmp0 += ONE << (CONST_BITS-PASS1_BITS-2);
4097 
4098     dataptr[1] = (DCTELEM)
4099       RIGHT_SHIFT(tmp0 + MULTIPLY(tmp10, FIX_0_765366865), /* c2-c6 */
4100 		  CONST_BITS-PASS1_BITS-1);
4101     dataptr[3] = (DCTELEM)
4102       RIGHT_SHIFT(tmp0 - MULTIPLY(tmp11, FIX_1_847759065), /* c2+c6 */
4103 		  CONST_BITS-PASS1_BITS-1);
4104 
4105     dataptr += DCTSIZE;		/* advance pointer to next row */
4106   }
4107 
4108   /* Pass 2: process columns.
4109    * We remove the PASS1_BITS scaling, but leave the results scaled up
4110    * by an overall factor of 8.
4111    * 8-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/16).
4112    */
4113 
4114   dataptr = data;
4115   for (ctr = 0; ctr < 4; ctr++) {
4116     /* Even part per LL&M figure 1 --- note that published figure is faulty;
4117      * rotator "c1" should be "c6".
4118      */
4119 
4120     tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*7];
4121     tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*6];
4122     tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*5];
4123     tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*4];
4124 
4125     /* Add fudge factor here for final descale. */
4126     tmp10 = tmp0 + tmp3 + (ONE << (PASS1_BITS-1));
4127     tmp12 = tmp0 - tmp3;
4128     tmp11 = tmp1 + tmp2;
4129     tmp13 = tmp1 - tmp2;
4130 
4131     tmp0 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*7];
4132     tmp1 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*6];
4133     tmp2 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*5];
4134     tmp3 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*4];
4135 
4136     dataptr[DCTSIZE*0] = (DCTELEM) RIGHT_SHIFT(tmp10 + tmp11, PASS1_BITS);
4137     dataptr[DCTSIZE*4] = (DCTELEM) RIGHT_SHIFT(tmp10 - tmp11, PASS1_BITS);
4138 
4139     z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);       /* c6 */
4140     /* Add fudge factor here for final descale. */
4141     z1 += ONE << (CONST_BITS+PASS1_BITS-1);
4142 
4143     dataptr[DCTSIZE*2] = (DCTELEM)
4144       RIGHT_SHIFT(z1 + MULTIPLY(tmp12, FIX_0_765366865), /* c2-c6 */
4145 		  CONST_BITS+PASS1_BITS);
4146     dataptr[DCTSIZE*6] = (DCTELEM)
4147       RIGHT_SHIFT(z1 - MULTIPLY(tmp13, FIX_1_847759065), /* c2+c6 */
4148 		  CONST_BITS+PASS1_BITS);
4149 
4150     /* Odd part per figure 8 --- note paper omits factor of sqrt(2).
4151      * i0..i3 in the paper are tmp0..tmp3 here.
4152      */
4153 
4154     tmp12 = tmp0 + tmp2;
4155     tmp13 = tmp1 + tmp3;
4156 
4157     z1 = MULTIPLY(tmp12 + tmp13, FIX_1_175875602);       /*  c3 */
4158     /* Add fudge factor here for final descale. */
4159     z1 += ONE << (CONST_BITS+PASS1_BITS-1);
4160 
4161     tmp12 = MULTIPLY(tmp12, - FIX_0_390180644);          /* -c3+c5 */
4162     tmp13 = MULTIPLY(tmp13, - FIX_1_961570560);          /* -c3-c5 */
4163     tmp12 += z1;
4164     tmp13 += z1;
4165 
4166     z1 = MULTIPLY(tmp0 + tmp3, - FIX_0_899976223);       /* -c3+c7 */
4167     tmp0 = MULTIPLY(tmp0, FIX_1_501321110);              /*  c1+c3-c5-c7 */
4168     tmp3 = MULTIPLY(tmp3, FIX_0_298631336);              /* -c1+c3+c5-c7 */
4169     tmp0 += z1 + tmp12;
4170     tmp3 += z1 + tmp13;
4171 
4172     z1 = MULTIPLY(tmp1 + tmp2, - FIX_2_562915447);       /* -c1-c3 */
4173     tmp1 = MULTIPLY(tmp1, FIX_3_072711026);              /*  c1+c3+c5-c7 */
4174     tmp2 = MULTIPLY(tmp2, FIX_2_053119869);              /*  c1+c3-c5+c7 */
4175     tmp1 += z1 + tmp13;
4176     tmp2 += z1 + tmp12;
4177 
4178     dataptr[DCTSIZE*1] = (DCTELEM) RIGHT_SHIFT(tmp0, CONST_BITS+PASS1_BITS);
4179     dataptr[DCTSIZE*3] = (DCTELEM) RIGHT_SHIFT(tmp1, CONST_BITS+PASS1_BITS);
4180     dataptr[DCTSIZE*5] = (DCTELEM) RIGHT_SHIFT(tmp2, CONST_BITS+PASS1_BITS);
4181     dataptr[DCTSIZE*7] = (DCTELEM) RIGHT_SHIFT(tmp3, CONST_BITS+PASS1_BITS);
4182 
4183     dataptr++;			/* advance pointer to next column */
4184   }
4185 }
4186 
4187 
4188 /*
4189  * Perform the forward DCT on a 3x6 sample block.
4190  *
4191  * 3-point FDCT in pass 1 (rows), 6-point in pass 2 (columns).
4192  */
4193 
4194 GLOBAL(void)
jpeg_fdct_3x6(DCTELEM * data,JSAMPARRAY sample_data,JDIMENSION start_col)4195 jpeg_fdct_3x6 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
4196 {
4197   INT32 tmp0, tmp1, tmp2;
4198   INT32 tmp10, tmp11, tmp12;
4199   DCTELEM *dataptr;
4200   JSAMPROW elemptr;
4201   int ctr;
4202   SHIFT_TEMPS
4203 
4204   /* Pre-zero output coefficient block. */
4205   MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
4206 
4207   /* Pass 1: process rows.
4208    * Note results are scaled up by sqrt(8) compared to a true DCT;
4209    * furthermore, we scale the results by 2**PASS1_BITS.
4210    * We scale the results further by 2 as part of output adaption
4211    * scaling for different DCT size.
4212    * 3-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/6).
4213    */
4214 
4215   dataptr = data;
4216   for (ctr = 0; ctr < 6; ctr++) {
4217     elemptr = sample_data[ctr] + start_col;
4218 
4219     /* Even part */
4220 
4221     tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[2]);
4222     tmp1 = GETJSAMPLE(elemptr[1]);
4223 
4224     tmp2 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[2]);
4225 
4226     /* Apply unsigned->signed conversion. */
4227     dataptr[0] = (DCTELEM)
4228       ((tmp0 + tmp1 - 3 * CENTERJSAMPLE) << (PASS1_BITS+1));
4229     dataptr[2] = (DCTELEM)
4230       DESCALE(MULTIPLY(tmp0 - tmp1 - tmp1, FIX(0.707106781)), /* c2 */
4231 	      CONST_BITS-PASS1_BITS-1);
4232 
4233     /* Odd part */
4234 
4235     dataptr[1] = (DCTELEM)
4236       DESCALE(MULTIPLY(tmp2, FIX(1.224744871)),               /* c1 */
4237 	      CONST_BITS-PASS1_BITS-1);
4238 
4239     dataptr += DCTSIZE;		/* advance pointer to next row */
4240   }
4241 
4242   /* Pass 2: process columns.
4243    * We remove the PASS1_BITS scaling, but leave the results scaled up
4244    * by an overall factor of 8.
4245    * We must also scale the output by (8/6)*(8/3) = 32/9, which we partially
4246    * fold into the constant multipliers (other part was done in pass 1):
4247    * 6-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/12) * 16/9.
4248    */
4249 
4250   dataptr = data;
4251   for (ctr = 0; ctr < 3; ctr++) {
4252     /* Even part */
4253 
4254     tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*5];
4255     tmp11 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*4];
4256     tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*3];
4257 
4258     tmp10 = tmp0 + tmp2;
4259     tmp12 = tmp0 - tmp2;
4260 
4261     tmp0 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*5];
4262     tmp1 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*4];
4263     tmp2 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*3];
4264 
4265     dataptr[DCTSIZE*0] = (DCTELEM)
4266       DESCALE(MULTIPLY(tmp10 + tmp11, FIX(1.777777778)),         /* 16/9 */
4267 	      CONST_BITS+PASS1_BITS);
4268     dataptr[DCTSIZE*2] = (DCTELEM)
4269       DESCALE(MULTIPLY(tmp12, FIX(2.177324216)),                 /* c2 */
4270 	      CONST_BITS+PASS1_BITS);
4271     dataptr[DCTSIZE*4] = (DCTELEM)
4272       DESCALE(MULTIPLY(tmp10 - tmp11 - tmp11, FIX(1.257078722)), /* c4 */
4273 	      CONST_BITS+PASS1_BITS);
4274 
4275     /* Odd part */
4276 
4277     tmp10 = MULTIPLY(tmp0 + tmp2, FIX(0.650711829));             /* c5 */
4278 
4279     dataptr[DCTSIZE*1] = (DCTELEM)
4280       DESCALE(tmp10 + MULTIPLY(tmp0 + tmp1, FIX(1.777777778)),   /* 16/9 */
4281 	      CONST_BITS+PASS1_BITS);
4282     dataptr[DCTSIZE*3] = (DCTELEM)
4283       DESCALE(MULTIPLY(tmp0 - tmp1 - tmp2, FIX(1.777777778)),    /* 16/9 */
4284 	      CONST_BITS+PASS1_BITS);
4285     dataptr[DCTSIZE*5] = (DCTELEM)
4286       DESCALE(tmp10 + MULTIPLY(tmp2 - tmp1, FIX(1.777777778)),   /* 16/9 */
4287 	      CONST_BITS+PASS1_BITS);
4288 
4289     dataptr++;			/* advance pointer to next column */
4290   }
4291 }
4292 
4293 
4294 /*
4295  * Perform the forward DCT on a 2x4 sample block.
4296  *
4297  * 2-point FDCT in pass 1 (rows), 4-point in pass 2 (columns).
4298  */
4299 
4300 GLOBAL(void)
jpeg_fdct_2x4(DCTELEM * data,JSAMPARRAY sample_data,JDIMENSION start_col)4301 jpeg_fdct_2x4 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
4302 {
4303   INT32 tmp0, tmp1;
4304   INT32 tmp10, tmp11;
4305   DCTELEM *dataptr;
4306   JSAMPROW elemptr;
4307   int ctr;
4308   SHIFT_TEMPS
4309 
4310   /* Pre-zero output coefficient block. */
4311   MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
4312 
4313   /* Pass 1: process rows.
4314    * Note results are scaled up by sqrt(8) compared to a true DCT.
4315    * We must also scale the output by (8/2)*(8/4) = 2**3, which we add here.
4316    */
4317 
4318   dataptr = data;
4319   for (ctr = 0; ctr < 4; ctr++) {
4320     elemptr = sample_data[ctr] + start_col;
4321 
4322     /* Even part */
4323 
4324     tmp0 = GETJSAMPLE(elemptr[0]);
4325     tmp1 = GETJSAMPLE(elemptr[1]);
4326 
4327     /* Apply unsigned->signed conversion. */
4328     dataptr[0] = (DCTELEM) ((tmp0 + tmp1 - 2 * CENTERJSAMPLE) << 3);
4329 
4330     /* Odd part */
4331 
4332     dataptr[1] = (DCTELEM) ((tmp0 - tmp1) << 3);
4333 
4334     dataptr += DCTSIZE;		/* advance pointer to next row */
4335   }
4336 
4337   /* Pass 2: process columns.
4338    * We leave the results scaled up by an overall factor of 8.
4339    * 4-point FDCT kernel,
4340    * cK represents sqrt(2) * cos(K*pi/16) [refers to 8-point FDCT].
4341    */
4342 
4343   dataptr = data;
4344   for (ctr = 0; ctr < 2; ctr++) {
4345     /* Even part */
4346 
4347     tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*3];
4348     tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*2];
4349 
4350     tmp10 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*3];
4351     tmp11 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*2];
4352 
4353     dataptr[DCTSIZE*0] = (DCTELEM) (tmp0 + tmp1);
4354     dataptr[DCTSIZE*2] = (DCTELEM) (tmp0 - tmp1);
4355 
4356     /* Odd part */
4357 
4358     tmp0 = MULTIPLY(tmp10 + tmp11, FIX_0_541196100);       /* c6 */
4359     /* Add fudge factor here for final descale. */
4360     tmp0 += ONE << (CONST_BITS-1);
4361 
4362     dataptr[DCTSIZE*1] = (DCTELEM)
4363       RIGHT_SHIFT(tmp0 + MULTIPLY(tmp10, FIX_0_765366865), /* c2-c6 */
4364 		  CONST_BITS);
4365     dataptr[DCTSIZE*3] = (DCTELEM)
4366       RIGHT_SHIFT(tmp0 - MULTIPLY(tmp11, FIX_1_847759065), /* c2+c6 */
4367 		  CONST_BITS);
4368 
4369     dataptr++;			/* advance pointer to next column */
4370   }
4371 }
4372 
4373 
4374 /*
4375  * Perform the forward DCT on a 1x2 sample block.
4376  *
4377  * 1-point FDCT in pass 1 (rows), 2-point in pass 2 (columns).
4378  */
4379 
4380 GLOBAL(void)
jpeg_fdct_1x2(DCTELEM * data,JSAMPARRAY sample_data,JDIMENSION start_col)4381 jpeg_fdct_1x2 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
4382 {
4383   DCTELEM tmp0, tmp1;
4384 
4385   /* Pre-zero output coefficient block. */
4386   MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
4387 
4388   /* Pass 1: empty. */
4389 
4390   /* Pass 2: process columns.
4391    * We leave the results scaled up by an overall factor of 8.
4392    * We must also scale the output by (8/1)*(8/2) = 2**5.
4393    */
4394 
4395   /* Even part */
4396 
4397   tmp0 = GETJSAMPLE(sample_data[0][start_col]);
4398   tmp1 = GETJSAMPLE(sample_data[1][start_col]);
4399 
4400   /* Apply unsigned->signed conversion. */
4401   data[DCTSIZE*0] = (tmp0 + tmp1 - 2 * CENTERJSAMPLE) << 5;
4402 
4403   /* Odd part */
4404 
4405   data[DCTSIZE*1] = (tmp0 - tmp1) << 5;
4406 }
4407 
4408 #endif /* DCT_SCALING_SUPPORTED */
4409 #endif /* DCT_ISLOW_SUPPORTED */
4410