1 /*
2  * Copyright (c) 1994 Regents of the University of California.
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  * 3. All advertising materials mentioning features or use of this software
14  *    must display the following acknowledgement:
15  *	This product includes software developed by the Network Research
16  *	Group at Lawrence Berkeley Laboratory.
17  * 4. Neither the name of the University nor of the Laboratory may be used
18  *    to endorse or promote products derived from this software without
19  *    specific prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
22  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
25  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31  * SUCH DAMAGE.
32  */
33 
34 /************ Change log
35  *
36  * $Log: dct.cxx,v $
37  * Revision 1.3  2006/08/10 07:05:46  csoutheren
38  * Fixed compile warnings on VC 2005
39  *
40  * Revision 1.2  2006/07/31 09:09:21  csoutheren
41  * Checkin of validated codec used during development
42  *
43  * Revision 1.1.2.1  2006/04/06 01:17:17  csoutheren
44  * Initial version of H.261 video codec plugin for OPAL
45  *
46  * Revision 2.1  2003/03/15 23:42:59  robertj
47  * Update to OpenH323 v1.11.7
48  *
49  * Revision 1.14  2003/03/14 07:25:55  robertj
50  * Removed $header keyword so is not different on alternate repositories
51  *
52  * Revision 1.13  2002/10/24 21:05:26  dereks
53  * Fix compile time warning.
54  *
55  * Revision 1.12  2002/05/17 01:47:33  dereks
56  * backout the integer maths in the h261 codec.
57  *
58  * Revision 1.11  2002/02/15 03:54:31  yurik
59  * Warnings removed during compilation, patch courtesy of Jehan Bing, jehan@bravobrava.com
60  *
61  * Revision 1.10  2001/10/24 20:24:32  dereks
62  * Remove green stripes under windows for INT_64. Thanks to Robert Lupa.
63  *
64  * Revision 1.9  2001/10/17 03:52:39  robertj
65  * Fixed MSVC compatibility
66  *
67  * Revision 1.8  2001/10/17 01:54:36  yurik
68  * Fixed clash with CE includes for INT32 type
69  *
70  * Revision 1.7  2001/10/16 23:51:42  dereks
71  * Change vic's fdct() from floating-point to fix-point. Improves performance
72  * for h261 video significantly on some machines. Thanks to Cosmos Jiang
73  *
74  * Revision 1.6  2001/10/16 21:20:07  yurik
75  * Removed warnings on Windows CE. Submitted by Jehan Bing, jehan@bravobrava.com
76  *
77  * Revision 1.3  2000/12/19 22:22:34  dereks
78  * Remove connection to grabber-OS.cxx files. grabber-OS.cxx files no longer used.
79  * Video data is now read from a video channel, using the pwlib classes.
80  *
81  * Revision 1.2  2000/08/25 03:18:49  dereks
82  * Add change log facility (Thanks Robert for the info on implementation)
83  *
84  *
85  *
86  ********/
87 
88 #include "bsd-endian.h"
89 #include "dct.h"
90 
91 /*
92  * Macros for fix-point (integer) arithmetic.  FP_NBITS gives the number
93  * of binary digits past the decimal point.  FP_MUL computes the product
94  * of two fixed point numbers.  A fixed point number and an integer
95  * can be directly multiplied to give a fixed point number.  FP_SCALE
96  * converts a floating point number to fixed point (and is used only
97  * at startup, not by the dct engine).  FP_NORM converts a fixed
98  * point number to scalar by rounding to the closest integer.
99  * FP_JNORM is similar except it folds the jpeg bias of 128 into the
100  * rounding addition.
101  */
102 #define FP_NBITS 15
103 #define FP_MUL(a, b)	((((a) >> 5) * ((b) >> 5)) >> (FP_NBITS - 10))
104 #define FP_SCALE(v)	(int)((double)(v) * double(1 << FP_NBITS) + 0.5)
105 #define FP_NORM(v)	(((v) + (1 << (FP_NBITS-1))) >> FP_NBITS)
106 #define FP_JNORM(v)	(((v) + (257 << (FP_NBITS-1))) >> FP_NBITS)
107 
108 #define M(n) ((m0 >> (n)) & 1)
109 
110 /*
111  * This macro stolen from nv.
112  */
113 /* Sick little macro which will limit x to [0..255] with logical ops */
114 #define LIMIT8(x, t) ((t = (x)), (t &= ~(t>>31)), (t | ~((t-256) >> 31)))
115 #define LIMIT(x, t) (LIMIT8((x), t) & 0xff)
116 
/* row order */
/*
 * Zig-zag scan orders: entry k is the index, within a linearized 8x8
 * block, of the coefficient at scan position k.  ROWZAG is for blocks
 * stored in row order, COLZAG for column order.  Entries 64..79 are
 * zero padding — presumably so a scan index that runs slightly past
 * the end of a block reads a harmless 0 rather than trash; confirm
 * against the users of these tables.
 */
const u_char ROWZAG[] = {
	0,  1,  8, 16,  9,  2,  3, 10,
	17, 24, 32, 25, 18, 11,  4,  5,
	12, 19, 26, 33, 40, 48, 41, 34,
	27, 20, 13,  6,  7, 14, 21, 28,
	35, 42, 49, 56, 57, 50, 43, 36,
	29, 22, 15, 23, 30, 37, 44, 51,
	58, 59, 52, 45, 38, 31, 39, 46,
	53, 60, 61, 54, 47, 55, 62, 63,
	0,  0,  0,  0,  0,  0,  0,  0,
	0,  0,  0,  0,  0,  0,  0,  0
};
/* column order */
const u_char COLZAG[] = {
	0, 8, 1, 2, 9, 16, 24, 17,
	10, 3, 4, 11, 18, 25, 32, 40,
	33, 26, 19, 12, 5, 6, 13, 20,
	27, 34, 41, 48, 56, 49, 42, 35,
	28, 21, 14, 7, 15, 22, 29, 36,
	43, 50, 57, 58, 51, 44, 37, 30,
	23, 31, 38, 45, 52, 59, 60, 53,
	46, 39, 47, 54, 61, 62, 55, 63,
	0,  0,  0,  0,  0,  0,  0,  0,
	0,  0,  0,  0,  0,  0,  0,  0
};
143 
/*
 * Fixed-point butterfly constants for the AA&N flowgraph:
 * A1 = A3 = cos(pi/4), A2 = cos(pi/8) - sin(pi/8),
 * A4 = cos(pi/8) + sin(pi/8), A5 = sin(pi/8).
 */
#define A1 FP_SCALE(0.7071068)
#define A2 FP_SCALE(0.5411961)
#define A3 A1
#define A4 FP_SCALE(1.3065630)
#define A5 FP_SCALE(0.3826834)

/* The same constants in single precision, for the float fdct(). */
#define FA1 (0.707106781f)
#define FA2 (0.541196100f)
#define FA3 FA1
#define FA4 (1.306562965f)
#define FA5 (0.382683433f)

/* B0 may already be defined by a system header (e.g. the termios baud
 * rate constant on some platforms) — make sure our definition wins. */
#ifdef B0
#undef B0
#endif
/*
 * these magic numbers are scaling factors for each coef of the 1-d
 * AA&N DCT.  The scale factor for coef 0 is 1 and coef 1<=n<=7 is
 * cos(n*PI/16)*sqrt(2).  There is also a normalization of sqrt(8).
 * Formally you divide by the scale factor but we multiply by the
 * inverse because it's faster.  So the numbers below are the inverse
 * of what was just described.
 */
#define B0 0.35355339059327376220
#define B1 0.25489778955207958447
#define B2 0.27059805007309849220
#define B3 0.30067244346752264027
#define B4 0.35355339059327376220
#define B5 0.44998811156820785231
#define B6 0.65328148243818826392
#define B7 1.28145772387075308943

/*
 * Output multipliers for AA&N DCT
 * (i.e., first stage multipliers for inverse DCT).
 */
static const double first_stage[8] = { B0, B1, B2, B3, B4, B5, B6, B7, };
181 
182 /*
183  * The first_stage array crossed with itself.  This allows us
184  * to embed the first stage multipliers of the row pass by
185  * computing scaled versions of the columns.
186  */
187 static const int cross_stage[64] = {
188 	FP_SCALE(B0 * B0),
189 	FP_SCALE(B0 * B1),
190 	FP_SCALE(B0 * B2),
191 	FP_SCALE(B0 * B3),
192 	FP_SCALE(B0 * B4),
193 	FP_SCALE(B0 * B5),
194 	FP_SCALE(B0 * B6),
195 	FP_SCALE(B0 * B7),
196 
197 	FP_SCALE(B1 * B0),
198 	FP_SCALE(B1 * B1),
199 	FP_SCALE(B1 * B2),
200 	FP_SCALE(B1 * B3),
201 	FP_SCALE(B1 * B4),
202 	FP_SCALE(B1 * B5),
203 	FP_SCALE(B1 * B6),
204 	FP_SCALE(B1 * B7),
205 
206 	FP_SCALE(B2 * B0),
207 	FP_SCALE(B2 * B1),
208 	FP_SCALE(B2 * B2),
209 	FP_SCALE(B2 * B3),
210 	FP_SCALE(B2 * B4),
211 	FP_SCALE(B2 * B5),
212 	FP_SCALE(B2 * B6),
213 	FP_SCALE(B2 * B7),
214 
215 	FP_SCALE(B3 * B0),
216 	FP_SCALE(B3 * B1),
217 	FP_SCALE(B3 * B2),
218 	FP_SCALE(B3 * B3),
219 	FP_SCALE(B3 * B4),
220 	FP_SCALE(B3 * B5),
221 	FP_SCALE(B3 * B6),
222 	FP_SCALE(B3 * B7),
223 
224 	FP_SCALE(B4 * B0),
225 	FP_SCALE(B4 * B1),
226 	FP_SCALE(B4 * B2),
227 	FP_SCALE(B4 * B3),
228 	FP_SCALE(B4 * B4),
229 	FP_SCALE(B4 * B5),
230 	FP_SCALE(B4 * B6),
231 	FP_SCALE(B4 * B7),
232 
233 	FP_SCALE(B5 * B0),
234 	FP_SCALE(B5 * B1),
235 	FP_SCALE(B5 * B2),
236 	FP_SCALE(B5 * B3),
237 	FP_SCALE(B5 * B4),
238 	FP_SCALE(B5 * B5),
239 	FP_SCALE(B5 * B6),
240 	FP_SCALE(B5 * B7),
241 
242 	FP_SCALE(B6 * B0),
243 	FP_SCALE(B6 * B1),
244 	FP_SCALE(B6 * B2),
245 	FP_SCALE(B6 * B3),
246 	FP_SCALE(B6 * B4),
247 	FP_SCALE(B6 * B5),
248 	FP_SCALE(B6 * B6),
249 	FP_SCALE(B6 * B7),
250 
251 	FP_SCALE(B7 * B0),
252 	FP_SCALE(B7 * B1),
253 	FP_SCALE(B7 * B2),
254 	FP_SCALE(B7 * B3),
255 	FP_SCALE(B7 * B4),
256 	FP_SCALE(B7 * B5),
257 	FP_SCALE(B7 * B6),
258 	FP_SCALE(B7 * B7),
259 };
260 static const float f_cross_stage[64] = {
261 	(float)(B0 * B0),
262 	(float)(B0 * B1),
263 	(float)(B0 * B2),
264 	(float)(B0 * B3),
265 	(float)(B0 * B4),
266 	(float)(B0 * B5),
267 	(float)(B0 * B6),
268 	(float)(B0 * B7),
269 
270 	(float)(B1 * B0),
271 	(float)(B1 * B1),
272 	(float)(B1 * B2),
273 	(float)(B1 * B3),
274 	(float)(B1 * B4),
275 	(float)(B1 * B5),
276 	(float)(B1 * B6),
277 	(float)(B1 * B7),
278 
279 	(float)(B2 * B0),
280 	(float)(B2 * B1),
281 	(float)(B2 * B2),
282 	(float)(B2 * B3),
283 	(float)(B2 * B4),
284 	(float)(B2 * B5),
285 	(float)(B2 * B6),
286 	(float)(B2 * B7),
287 
288 	(float)(B3 * B0),
289 	(float)(B3 * B1),
290 	(float)(B3 * B2),
291 	(float)(B3 * B3),
292 	(float)(B3 * B4),
293 	(float)(B3 * B5),
294 	(float)(B3 * B6),
295 	(float)(B3 * B7),
296 
297 	(float)(B4 * B0),
298 	(float)(B4 * B1),
299 	(float)(B4 * B2),
300 	(float)(B4 * B3),
301 	(float)(B4 * B4),
302 	(float)(B4 * B5),
303 	(float)(B4 * B6),
304 	(float)(B4 * B7),
305 
306 	(float)(B5 * B0),
307 	(float)(B5 * B1),
308 	(float)(B5 * B2),
309 	(float)(B5 * B3),
310 	(float)(B5 * B4),
311 	(float)(B5 * B5),
312 	(float)(B5 * B6),
313 	(float)(B5 * B7),
314 
315 	(float)(B6 * B0),
316 	(float)(B6 * B1),
317 	(float)(B6 * B2),
318 	(float)(B6 * B3),
319 	(float)(B6 * B4),
320 	(float)(B6 * B5),
321 	(float)(B6 * B6),
322 	(float)(B6 * B7),
323 
324 	(float)(B7 * B0),
325 	(float)(B7 * B1),
326 	(float)(B7 * B2),
327 	(float)(B7 * B3),
328 	(float)(B7 * B4),
329 	(float)(B7 * B5),
330 	(float)(B7 * B6),
331 	(float)(B7 * B7),
332 };
333 
334 /*
335  * Map a quantization table in natural, row-order,
336  * into the qt input expected by rdct().
337  */
338 void
rdct_fold_q(const int * in,int * out)339 rdct_fold_q(const int* in, int* out)
340 {
341 	for (int i = 0; i < 64; ++i) {
342 		/*
343 		 * Fold column and row passes of the dct.
344 		 * By scaling each column DCT independently,
345 		 * we pre-bias all the row DCT's so the
346 		 * first multiplier is already embedded
347 		 * in the temporary result.  Thanks to
348 		 * Martin Vetterli for explaining how
349 		 * to do this.
350 		 */
351 		double v = double(in[i]);
352 		v *= first_stage[i & 7];
353 		v *= first_stage[i >> 3];
354 		out[i] = FP_SCALE(v);
355 	}
356 }
357 
358 /*
359  * Just like rdct_fold_q() but we divide by the quantizer.
360  */
fdct_fold_q(const int * in,float * out)361 void fdct_fold_q(const int* in, float* out)
362 {
363 	for (int i = 0; i < 64; ++i) {
364 		double v = first_stage[i >> 3];
365 		v *= first_stage[i & 7];
366 		double q = double(in[i]);
367 		out[i] = (float)(v / q);
368 	}
369 }
370 
/*
 * Add the constant "dc" to every pixel of an 8x8 block, clamping each
 * sum to [0,255].  "in" and "out" point at the top-left pixel of the
 * source and destination blocks; successive rows are "stride" bytes
 * apart.  Pixels are packed and processed a machine word at a time.
 * NOTE(review): dereferences in/out as word pointers, so it assumes
 * suitably aligned blocks — confirm against the callers.
 */
void dcsum(int dc, u_char* in, u_char* out, int stride)
{
	for (int k = 8; --k >= 0; ) {
		int t;
#ifdef INT_64
		/*XXX assume little-endian */
		INT_64 i = *(INT_64*)in;
		INT_64 o = (INT_64)LIMIT(dc + (int)(i >> 56 & 0xff), t) << 56;
		o |=  (INT_64)LIMIT(dc + (int)(i >> 48 & 0xff), t) << 48;
		o |=  (INT_64)LIMIT(dc + (int)(i >> 40 & 0xff), t) << 40;
		o |=  (INT_64)LIMIT(dc + (int)(i >> 32 & 0xff), t) << 32;
		o |=  (INT_64)LIMIT(dc + (int)(i >> 24 & 0xff), t) << 24;
		o |=  (INT_64)LIMIT(dc + (int)(i >> 16 & 0xff), t) << 16;
		o |=  (INT_64)LIMIT(dc + (int)(i >> 8 & 0xff), t) << 8;
		o |=  (INT_64)LIMIT(dc + (int)(i & 0xff), t);
		*(INT_64*)out = o;
#else
		/* 32-bit path: two words per row.  SPLICE/EXTRACT come
		 * from bsd-endian.h and hide the byte order. */
		u_int o = 0;
		u_int i = *(u_int*)in;
		SPLICE(o, LIMIT(dc + EXTRACT(i, 24), t), 24);
		SPLICE(o, LIMIT(dc + EXTRACT(i, 16), t), 16);
		SPLICE(o, LIMIT(dc + EXTRACT(i, 8), t), 8);
		SPLICE(o, LIMIT(dc + EXTRACT(i, 0), t), 0);
		*(u_int*)out = o;

		o = 0;
		i = *(u_int*)(in + 4);
		SPLICE(o, LIMIT(dc + EXTRACT(i, 24),  t), 24);
		SPLICE(o, LIMIT(dc + EXTRACT(i, 16), t), 16);
		SPLICE(o, LIMIT(dc + EXTRACT(i, 8), t), 8);
		SPLICE(o, LIMIT(dc + EXTRACT(i, 0), t), 0);
		*(u_int*)(out + 4) = o;
#endif
		in += stride;
		out += stride;
	}
}
408 
dcsum2(int dc,u_char * in,u_char * out,int stride)409 void dcsum2(int dc, u_char* in, u_char* out, int stride)
410 {
411 	for (int k = 8; --k >= 0; ) {
412 		int t;
413 		u_int o = 0;
414 		SPLICE(o, LIMIT(dc + in[0], t), 24);
415 		SPLICE(o, LIMIT(dc + in[1], t), 16);
416 		SPLICE(o, LIMIT(dc + in[2], t), 8);
417 		SPLICE(o, LIMIT(dc + in[3], t), 0);
418 		*(u_int*)out = o;
419 
420 		o = 0;
421 		SPLICE(o, LIMIT(dc + in[4], t), 24);
422 		SPLICE(o, LIMIT(dc + in[5], t), 16);
423 		SPLICE(o, LIMIT(dc + in[6], t), 8);
424 		SPLICE(o, LIMIT(dc + in[7], t), 0);
425 		*(u_int*)(out + 4) = o;
426 
427 		in += stride;
428 		out += stride;
429 	}
430 }
431 
dcfill(int DC,u_char * out,int stride)432 void dcfill(int DC, u_char* out, int stride)
433 {
434 	int t;
435 	u_int dc = DC;
436 	dc = LIMIT(dc, t);
437 	dc |= dc << 8;
438 	dc |= dc << 16;
439 #ifdef INT_64
440 	INT_64 xdc = dc;
441 	xdc |= xdc << 32;
442 	*(INT_64 *)out = xdc;
443 	out += stride;
444 	*(INT_64 *)out = xdc;
445 	out += stride;
446 	*(INT_64 *)out = xdc;
447 	out += stride;
448 	*(INT_64 *)out = xdc;
449 	out += stride;
450 	*(INT_64 *)out = xdc;
451 	out += stride;
452 	*(INT_64 *)out = xdc;
453 	out += stride;
454 	*(INT_64 *)out = xdc;
455 	out += stride;
456 	*(INT_64 *)out = xdc;
457 #else
458 	*(u_int*)out = dc;
459 	*(u_int*)(out + 4) = dc;
460 	out += stride;
461 	*(u_int*)out = dc;
462 	*(u_int*)(out + 4) = dc;
463 	out += stride;
464 	*(u_int*)out = dc;
465 	*(u_int*)(out + 4) = dc;
466 	out += stride;
467 	*(u_int*)out = dc;
468 	*(u_int*)(out + 4) = dc;
469 	out += stride;
470 	*(u_int*)out = dc;
471 	*(u_int*)(out + 4) = dc;
472 	out += stride;
473 	*(u_int*)out = dc;
474 	*(u_int*)(out + 4) = dc;
475 	out += stride;
476 	*(u_int*)out = dc;
477 	*(u_int*)(out + 4) = dc;
478 	out += stride;
479 	*(u_int*)out = dc;
480 	*(u_int*)(out + 4) = dc;
481 #endif
482 }
483 
484 /*
485  * This routine mixes the DC & AC components of an 8x8 block of
486  * pixels.  This routine is called for every block decoded so it
487  * needs to be efficient.  It tries to do as many pixels in parallel
488  * as will fit in a word.  The one complication is that it has to
489  * deal with overflow (sum > 255) and underflow (sum < 0).  Underflow
490  * & overflow are only possible if both terms have the same sign and
491  * are indicated by the result having a different sign than the terms.
492  * Note that underflow is more worrisome than overflow since it results
493  * in bright white dots in a black field.
494  * The DC term and sum are biased by 128 so a negative number has the
495  * 2^7 bit = 0.  The AC term is not biased so a negative number has
496  * the 2^7 bit = 1.  So underflow is indicated by (DC & AC & sum) != 0;
497  */
498 #define MIX_LOGIC(sum, a, b, omask, uflo) \
499 { \
500 	sum = a + b; \
501 	uflo = (a ^ b) & (a ^ sum) & omask; \
502 	if (uflo) { \
503 		if ((b = uflo & a) != 0) { \
504 			/* integer overflows */ \
505 			b |= b >> 1; \
506 			b |= b >> 2; \
507 			b |= b >> 4; \
508 			sum |= b; \
509 		} \
510 		if ((uflo &=~ b) != 0) { \
511 			/* integer underflow(s) */ \
512 			uflo |= uflo >> 1; \
513 			uflo |= uflo >> 2; \
514 			uflo |= uflo >> 4; \
515 			sum &= ~uflo; \
516 		} \
517 	} \
518 }
519 /*
520  * Table of products of 8-bit scaled coefficients
521  * and idct coefficients (there are only 33 unique
522  * coefficients so we index via a compact ID).
523  */
524 extern "C" u_char multab[];
525 /*
526  * Array of coefficient ID's used to index multab.
527  */
528 extern "C" u_int dct_basis[64][64 / sizeof(u_int)];
529 
530 /*XXX*/
531 #define LIMIT_512(s) ((s) > 511 ? 511 : (s) < -512 ? -512 : (s))
532 
/*
 * Reconstruct an 8x8 block from a DC term plus a single AC
 * coefficient bp[acx], writing the pixels to "out" (rows "stride"
 * bytes apart).  The AC coefficient is clamped to [-512,511] and
 * reduced to 8 bits, selecting a 128-byte row of precomputed products
 * in multab; dct_basis[acx] supplies the per-pixel coefficient IDs.
 * Four pixels are mixed per word with MIX_LOGIC, which saturates any
 * byte lanes that over/underflow.
 * NOTE(review): acx is presumably a column-order coefficient index —
 * confirm against the layout of dct_basis.
 */
void
bv_rdct1(int dc, short* bp, int acx, u_char* out, int stride)
{
	u_int omask = 0x80808080;
	u_int uflo;
	u_int* vp = dct_basis[acx];
	int s = LIMIT_512(bp[acx]);
	s = (s >> 2) & 0xff;
	/* 66 unique coefficients require 7 bits */
	char* mt = (char*)&multab[s << 7];

	/* replicate the dc byte through all four lanes */
	dc |= dc << 8;
	dc |= dc << 16;
	for (int k = 8; --k >= 0; ) {
		u_int v = *vp++;
		u_int m = mt[v >> 24] << SHIFT(24) |
			mt[v >> 16 & 0xff] << SHIFT(16) |
			mt[v >> 8 & 0xff] << SHIFT(8) |
			mt[v & 0xff] << SHIFT(0);
		MIX_LOGIC(v, dc, m, omask, uflo);
		*(u_int*)out = v;
		v = *vp++;
		m = mt[v >> 24] << SHIFT(24) |
			mt[v >> 16 & 0xff] << SHIFT(16) |
			mt[v >> 8 & 0xff] << SHIFT(8) |
			mt[v & 0xff] << SHIFT(0);
		MIX_LOGIC(v, dc, m, omask, uflo);
		*(u_int*)(out + 4) = v;
		out += stride;
	}
}
564 
/* XXX this version has to be exact */
/*
 * Like bv_rdct1() (DC plus a single AC coefficient, index ac0), but
 * additionally sums in a reference 8x8 block "in" — presumably a
 * motion-compensation prediction; confirm against the callers.  Each
 * output word is mixed twice: once to combine dc with the AC products,
 * then again to fold in the reference pixels.
 */
void
bv_rdct2(int dc, short* bp, int ac0, u_char* in, u_char* out, int stride)
{
	int s0 = LIMIT_512(bp[ac0]);
	s0 = (s0 >> 2) & 0xff;
	/* 66 unique coefficients require 7 bits */
	const char* mt = (const char*)&multab[s0 << 7];
	const u_int* vp0 = dct_basis[ac0];

	/* replicate the dc byte through all four lanes */
	dc |= dc << 8;
	dc |= dc << 16;
	u_int omask = 0x80808080;
	u_int uflo;
	for (int k = 8; --k >= 0; ) {
		u_int v, m, i;
		v = *vp0++;
		m = mt[v >> 24] << SHIFT(24) | mt[v >> 16 & 0xff] << SHIFT(16) |
		    mt[v >> 8 & 0xff] << SHIFT(8) | mt[v & 0xff] << SHIFT(0);
		MIX_LOGIC(v, dc, m, omask, uflo);
		i = in[0] << SHIFT(24) | in[1] << SHIFT(16) |
		    in[2] << SHIFT(8) | in[3] << SHIFT(0);
		MIX_LOGIC(m, i, v, omask, uflo);
		*(u_int*)out = m;

		v = *vp0++;
		m = mt[v >> 24] << SHIFT(24) | mt[v >> 16 & 0xff] << SHIFT(16) |
		    mt[v >> 8 & 0xff] << SHIFT(8) | mt[v & 0xff] << SHIFT(0);
		MIX_LOGIC(v, dc, m, omask, uflo);
		i = in[4] << SHIFT(24) | in[5] << SHIFT(16) |
		    in[6] << SHIFT(8) | in[7] << SHIFT(0);
		MIX_LOGIC(m, i, v, omask, uflo);
		*(u_int*)(out + 4) = m;
		out += stride;
		in += stride;
	}
}
602 
/* XXX this version has to be exact */
/*
 * Like bv_rdct2() but with two AC coefficients (ac0 and ac1).  Each
 * output pixel is multab-product(ac0) + multab-product(ac1) + in[] +
 * dc, clamped per byte with LIMIT — no word-parallel trick here,
 * since the result must be exact.
 */
void
bv_rdct3(int dc, short* bp, int ac0, int ac1, u_char* in, u_char* out, int stride)
{
	int s0 = LIMIT_512(bp[ac0]);
	s0 = (s0 >> 2) & 0xff;
	/* 66 unique coefficients require 7 bits */
	char* mt0 = (char*)&multab[s0 << 7];

	int s1 = LIMIT_512(bp[ac1]);
	s1 = (s1 >> 2) & 0xff;
	char* mt1 = (char*)&multab[s1 << 7];

	u_int* vp0 = dct_basis[ac0];
	u_int* vp1 = dct_basis[ac1];

	for (int k = 8; --k >= 0; ) {
		int t;
		u_int v0 = *vp0++;
		u_int v1 = *vp1++;
		s0 = mt0[v0 >> 24] + mt1[v1 >> 24] + in[0] + dc;
		u_int m = LIMIT(s0, t) << SHIFT(24);
		s0 = mt0[v0 >> 16 & 0xff] + mt1[v1 >> 16 & 0xff] + in[1] + dc;
		m |= LIMIT(s0, t) << SHIFT(16);
		s0 = mt0[v0 >> 8 & 0xff] + mt1[v1 >> 8 & 0xff] + in[2] + dc;
		m |= LIMIT(s0, t) << SHIFT(8);
		s0 = mt0[v0 & 0xff] + mt1[v1 & 0xff] + in[3] + dc;
		m |= LIMIT(s0, t) << SHIFT(0);
		*(u_int*)out = m;

		/* second half of the row */
		v0 = *vp0++;
		v1 = *vp1++;
		s0 = mt0[v0 >> 24] + mt1[v1 >> 24] + in[4] + dc;
		m = 0;
		m |= LIMIT(s0, t) << SHIFT(24);
		s0 = mt0[v0 >> 16 & 0xff] + mt1[v1 >> 16 & 0xff] + in[5] + dc;
		m |= LIMIT(s0, t) << SHIFT(16);
		s0 = mt0[v0 >> 8 & 0xff] + mt1[v1 >> 8 & 0xff] + in[6] + dc;
		m |= LIMIT(s0, t) << SHIFT(8);
		s0 = mt0[v0 & 0xff] + mt1[v1 & 0xff] + in[7] + dc;
		m |= LIMIT(s0, t) << SHIFT(0);
		*(u_int*)(out + 4) = m;

		out += stride;
		in += stride;
	}
}
650 
/*
 * Pixel-splicing helpers shared by the two rdct() variants below.
 * PIXDEF declares the pixel accumulator(s); DOPIX/DOJPIX convert one
 * fixed-point result to a pixel (DOJPIX adds the jpeg bias of 128 via
 * FP_JNORM) and OR the value into "oflo" so an out-of-range result
 * can be detected afterwards.  The *LIMIT forms clamp each pixel
 * explicitly and are used only on the rare overflow path.  The
 * DOPIXIN* forms also add the corresponding byte of a reference
 * block "in".
 */
#ifdef INT_64
/*XXX assume little-endian */
#define PSPLICE(v, n) pix |= (INT_64)(v) << ((n)*8)
#define DID4PIX
#define PSTORE ((INT_64*)p)[0] = pix
#define PIXDEF INT_64 pix = 0; int v, oflo = 0
#else
#define PSPLICE(v, n) SPLICE(pix, (v), (3 - ((n)&3)) * 8)
#define DID4PIX pix0 = pix; pix = 0
#define PSTORE ((u_int*)p)[0] = pix0; ((u_int*)p)[1] = pix
#define PIXDEF	u_int pix0, pix = 0; int v, oflo = 0
#endif
#define DOJPIX(val, n) v = FP_JNORM(val); oflo |= v; PSPLICE(v, n)
#define DOJPIXLIMIT(val, n) PSPLICE(LIMIT(FP_JNORM(val),t), n)
#define DOPIX(val, n) v = FP_NORM(val); oflo |= v; PSPLICE(v, n)
#define DOPIXLIMIT(val, n) PSPLICE(LIMIT(FP_NORM(val),t), n)
#define DOPIXIN(val, n) v = FP_NORM(val) + in[n]; oflo |= v; PSPLICE(v, n)
#define DOPIXINLIMIT(val, n) PSPLICE(LIMIT(FP_NORM(val) + in[n], t), n)
669 
670 /*
671  * A 2D Inverse DCT based on a column-row decomposition using
672  * Arai, Agui, and Nakajmia's 8pt 1D Inverse DCT, from Fig. 4-8
673  * Pennebaker & Mitchell (i.e., the pink JPEG book).  This figure
674  * is the forward transform; reverse the flowgraph for the inverse
675  * (you need to draw a picture).  The outputs are DFT coefficients
676  * and need to be scaled according to Eq [4-3].
677  *
678  * The input coefficients are, counter to tradition, in column-order.
679  * The bit mask indicates which coefficients are non-zero.  If the
680  * corresponding bit is zero, then the coefficient is assumed zero
681  * and the input coefficient is not referenced and need not be defined.
682  * The 8-bit outputs are computed in row order and placed in the
683  * output array pointed to by p, with each of the eight 8-byte lines
684  * offset by "stride" bytes.
685  *
686  * qt is the inverse quantization table in column order.  These
687  * coefficients are the product of the inverse quantization factor,
688  * specified by the jpeg quantization table, and the first multiplier
689  * in the inverse DCT flow graph.  This first multiplier is actually
690  * biased by the row term so that the columns are pre-scaled to
691  * eliminate the first row multiplication stage.
692  *
693  * The output is biased by 128, i.e., [-128,127] is mapped to [0,255],
694  * which is relevant to jpeg.
695  */
696 void
697 #ifdef INT_64
rdct(register short * bp,INT_64 m0,u_char * p,int stride,const int * qt)698 rdct(register short *bp, INT_64 m0, u_char* p, int stride, const int* qt)
699 #else
700 rdct(register short *bp, u_int m0, u_int m1, u_char* p,
701      int stride, const int* qt)
702 #endif
703 {
704 	/*
705 	 * First pass is 1D transform over the columns.  Note that
706 	 * the coefficients are stored in column order, so even
707 	 * though we're accessing the columns, we access them
708 	 * in a row-like fashion.  We use a column-row decomposition
709 	 * instead of a row-column decomposition so we can splice
710 	 * pixels in an efficient, row-wise order in the second
711 	 * stage.
712 	 *
713 	 * The inverse quantization is folded together with the
714 	 * first stage multiplier.  This reduces the number of
715 	 * multiplies (per 8-pt transform) from 11 to 5 (ignoring
716 	 * the inverse quantization which must be done anyway).
717 	 *
718 	 * Because we compute appropriately scaled column DCTs,
719 	 * the row DCTs require only 5 multiplies per row total.
720 	 * So the total number of multiplies for the 8x8 DCT is 80
721 	 * (ignoring the checks for zero coefficients).
722 	 */
723 	int tmp[64];
724 	int* tp = tmp;
725 	int i;
726 	for (i = 8; --i >= 0; ) {
727 		if ((m0 & 0xfe) == 0) {
728 			/* AC terms all zero */
729 			int v = M(0) ? qt[0] * bp[0] : 0;
730 			tp[0] = v;
731 			tp[1] = v;
732 			tp[2] = v;
733 			tp[3] = v;
734 			tp[4] = v;
735 			tp[5] = v;
736 			tp[6] = v;
737 			tp[7] = v;
738 		} else {
739 			int t0, t1, t2, t3, t4, t5, t6, t7;
740 
741 			if ((m0 & 0xaa) == 0)
742 				t4 = t5 = t6 = t7 = 0;
743 			else {
744 				t0 = M(5) ? qt[5] * bp[5] : 0;
745 				t2 = M(1) ? qt[1] * bp[1] : 0;
746 				t6 = M(7) ? qt[7] * bp[7] : 0;
747 				t7 = M(3) ? qt[3] * bp[3] : 0;
748 
749 				t4 = t0 - t7;
750 				t1 = t2 + t6;
751 				t6 = t2 - t6;
752 				t7 += t0;
753 
754 				t5 = t1 - t7;
755 				t7 += t1;
756 
757 				t2 = FP_MUL(-A5, t4 + t6);
758 				t4 = FP_MUL(-A2, t4);
759 				t0 = t4 + t2;
760 				t1 = FP_MUL(A3, t5);
761 				t2 += FP_MUL(A4, t6);
762 
763 				t4 = -t0;
764 				t5 = t1 - t0;
765 				t6 = t1 + t2;
766 				t7 += t2;
767 			}
768 
769 #ifdef notdef
770 			if ((m0 & 0x55) == 0)
771 				t0 = t1 = t2 = t3 = 0;
772 			else {
773 #endif
774 				t0 = M(0) ? qt[0] * bp[0] : 0;
775 				t1 = M(4) ? qt[4] * bp[4] : 0;
776 				int x0 = t0 + t1;
777 				int x1 = t0 - t1;
778 
779 				t0 = M(2) ? qt[2] * bp[2] : 0;
780 				t3 = M(6) ? qt[6] * bp[6] : 0;
781 				t2 = t0 - t3;
782 				t3 += t0;
783 
784 				t2 = FP_MUL(A1, t2);
785 				t3 += t2;
786 
787 				t0 = x0 + t3;
788 				t1 = x1 + t2;
789 				t2 = x1 - t2;
790 				t3 = x0 - t3;
791 #ifdef notdef
792 			}
793 #endif
794 
795 			tp[0] = t0 + t7;
796 			tp[7] = t0 - t7;
797 			tp[1] = t1 + t6;
798 			tp[6] = t1 - t6;
799 			tp[2] = t2 + t5;
800 			tp[5] = t2 - t5;
801 			tp[3] = t3 + t4;
802 			tp[4] = t3 - t4;
803 		}
804 		tp += 8;
805 		bp += 8;
806 		qt += 8;
807 
808 		m0 >>= 8;
809 #ifndef INT_64
810 		m0 |= m1 << 24;
811 		m1 >>= 8;
812 #endif
813 	}
814 	tp -= 64;
815 	/*
816 	 * Second pass is 1D transform over the rows.  Note that
817 	 * the coefficients are stored in column order, so even
818 	 * though we're accessing the rows, we access them
819 	 * in a column-like fashion.
820 	 *
821 	 * The pass above already computed the first multiplier
822 	 * in the flow graph.
823 	 */
824 	for (i = 8; --i >= 0; ) {
825 		int t0, t1, t2, t3, t4, t5, t6, t7;
826 
827 		t0 = tp[8*5];
828 		t2 = tp[8*1];
829 		t6 = tp[8*7];
830 		t7 = tp[8*3];
831 
832 #ifdef notdef
833 		if ((t0 | t2 | t6 | t7) == 0) {
834 			t4 = t5 = 0;
835 		} else
836 #endif
837 {
838 			t4 = t0 - t7;
839 			t1 = t2 + t6;
840 			t6 = t2 - t6;
841 			t7 += t0;
842 
843 			t5 = t1 - t7;
844 			t7 += t1;
845 
846 			t2 = FP_MUL(-A5, t4 + t6);
847 			t4 = FP_MUL(-A2, t4);
848 			t0 = t4 + t2;
849 			t1 = FP_MUL(A3, t5);
850 			t2 += FP_MUL(A4, t6);
851 
852 			t4 = -t0;
853 			t5 = t1 - t0;
854 			t6 = t1 + t2;
855 			t7 += t2;
856 		}
857 
858 		t0 = tp[8*0];
859 		t1 = tp[8*4];
860 		int x0 = t0 + t1;
861 		int x1 = t0 - t1;
862 
863 		t0 = tp[8*2];
864 		t3 = tp[8*6];
865 		t2 = t0 - t3;
866 		t3 += t0;
867 
868 		t2 = FP_MUL(A1, t2);
869 		t3 += t2;
870 
871 		t0 = x0 + t3;
872 		t1 = x1 + t2;
873 		t2 = x1 - t2;
874 		t3 = x0 - t3;
875 
876 		PIXDEF;
877 		DOJPIX(t0 + t7, 0);
878 		DOJPIX(t1 + t6, 1);
879 		DOJPIX(t2 + t5, 2);
880 		DOJPIX(t3 + t4, 3);
881 		DID4PIX;
882 		DOJPIX(t3 - t4, 4);
883 		DOJPIX(t2 - t5, 5);
884 		DOJPIX(t1 - t6, 6);
885 		DOJPIX(t0 - t7, 7);
886 		if (oflo & ~0xff) {
887 			int t;
888 			pix = 0;
889 			DOJPIXLIMIT(t0 + t7, 0);
890 			DOJPIXLIMIT(t1 + t6, 1);
891 			DOJPIXLIMIT(t2 + t5, 2);
892 			DOJPIXLIMIT(t3 + t4, 3);
893 			DID4PIX;
894 			DOJPIXLIMIT(t3 - t4, 4);
895 			DOJPIXLIMIT(t2 - t5, 5);
896 			DOJPIXLIMIT(t1 - t6, 6);
897 			DOJPIXLIMIT(t0 - t7, 7);
898 		}
899 		PSTORE;
900 
901 		++tp;
902 		p += stride;
903 	}
904 }
905 
906 /*
907  * Inverse 2-D transform, similar to routine above (see comment above),
908  * but more appropriate for H.261 instead of JPEG.  This routine does
909  * not bias the output by 128, and has an additional argument which is
910  * an input array which gets summed together with the inverse-transform.
911  * For example, this allows motion-compensation to be folded in here,
912  * saving an extra traversal of the block.  The input pointer can be
913  * null, if motion-compensation is not needed.
914  *
915  * This routine does not take a quantization table, since the H.261
916  * inverse quantizer is easily implemented via table lookup in the decoder.
917  */
918 void
919 #ifdef INT_64
rdct(register short * bp,INT_64 m0,u_char * p,int stride,const u_char * in)920 rdct(register short *bp, INT_64 m0, u_char* p, int stride, const u_char* in)
921 #else
922 rdct(register short *bp, u_int m0, u_int m1, u_char* p, int stride, const u_char *in)
923 #endif
924 {
925 	int tmp[64];
926 	int* tp = tmp;
927 	const int* qt = cross_stage;
928 	/*
929 	 * First pass is 1D transform over the rows of the input array.
930 	 */
931 	int i;
932 	for (i = 8; --i >= 0; ) {
933 		if ((m0 & 0xfe) == 0) {
934 			/*
935 			 * All ac terms are zero.
936 			 */
937 			int v = 0;
938 			if (M(0))
939 				v = qt[0] * bp[0];
940 			tp[0] = v;
941 			tp[1] = v;
942 			tp[2] = v;
943 			tp[3] = v;
944 			tp[4] = v;
945 			tp[5] = v;
946 			tp[6] = v;
947 			tp[7] = v;
948 		} else {
949 			int t4 = 0, t5 = 0, t6 = 0, t7 = 0;
950 			if (m0 & 0xaa) {
951 				/* odd part */
952 				if (M(1))
953 					t4 = qt[1] * bp[1];
954 				if (M(3))
955 					t5 = qt[3] * bp[3];
956 				if (M(5))
957 					t6 = qt[5] * bp[5];
958 				if (M(7))
959 					t7 = qt[7] * bp[7];
960 
961 				int x0 = t6 - t5;
962 				t6 += t5;
963 				int x1 = t4 - t7;
964 				t7 += t4;
965 
966 				t5 = FP_MUL(t7 - t6, A3);
967 				t7 += t6;
968 
969 				t4 = FP_MUL(x1 + x0, A5);
970 				t6 = FP_MUL(x1, A4) - t4;
971 				t4 += FP_MUL(x0, A2);
972 
973 				t7 += t6;
974 				t6 += t5;
975 				t5 += t4;
976 			}
977 			int t0 = 0, t1 = 0, t2 = 0, t3 = 0;
978 			if (m0 & 0x55) {
979 				/* even part */
980 				if (M(0))
981 					t0 = qt[0] * bp[0];
982 				if (M(2))
983 					t1 = qt[2] * bp[2];
984 				if (M(4))
985 					t2 = qt[4] * bp[4];
986 				if (M(6))
987 					t3 = qt[6] * bp[6];
988 
989 				int x0 = FP_MUL(t1 - t3, A1);
990 				t3 += t1;
991 				t1 = t0 - t2;
992 				t0 += t2;
993 				t2 = t3 + x0;
994 				t3 = t0 - t2;
995 				t0 += t2;
996 				t2 = t1 - x0;
997 				t1 += x0;
998 			}
999 			tp[0] = t0 + t7;
1000 			tp[1] = t1 + t6;
1001 			tp[2] = t2 + t5;
1002 			tp[3] = t3 + t4;
1003 			tp[4] = t3 - t4;
1004 			tp[5] = t2 - t5;
1005 			tp[6] = t1 - t6;
1006 			tp[7] = t0 - t7;
1007 		}
1008 		qt += 8;
1009 		tp += 8;
1010 		bp += 8;
1011 		m0 >>= 8;
1012 #ifndef INT_64
1013 		m0 |= m1 << 24;
1014 		m1 >>= 8;
1015 #endif
1016 	}
1017 	tp -= 64;
1018 	/*
1019 	 * Second pass is 1D transform over the rows of the temp array.
1020 	 */
1021 	for (i = 8; --i >= 0; ) {
1022 		int t4 = tp[8*1];
1023 		int t5 = tp[8*3];
1024 		int t6 = tp[8*5];
1025 		int t7 = tp[8*7];
1026 		if ((t4|t5|t6|t7) != 0) {
1027 			/* odd part */
1028 			int x0 = t6 - t5;
1029 			t6 += t5;
1030 			int x1 = t4 - t7;
1031 			t7 += t4;
1032 
1033 			t5 = FP_MUL(t7 - t6, A3);
1034 			t7 += t6;
1035 
1036 			t4 = FP_MUL(x1 + x0, A5);
1037 			t6 = FP_MUL(x1, A4) - t4;
1038 			t4 += FP_MUL(x0, A2);
1039 
1040 			t7 += t6;
1041 			t6 += t5;
1042 			t5 += t4;
1043 		}
1044 		int t0 = tp[8*0];
1045 		int t1 = tp[8*2];
1046 		int t2 = tp[8*4];
1047 		int t3 = tp[8*6];
1048 		if ((t0|t1|t2|t3) != 0) {
1049 			/* even part */
1050 			int x0 = FP_MUL(t1 - t3, A1);
1051 			t3 += t1;
1052 			t1 = t0 - t2;
1053 			t0 += t2;
1054 			t2 = t3 + x0;
1055 			t3 = t0 - t2;
1056 			t0 += t2;
1057 			t2 = t1 - x0;
1058 			t1 += x0;
1059 		}
1060 		if (in != 0) {
1061 			PIXDEF;
1062 			DOPIXIN(t0 + t7, 0);
1063 			DOPIXIN(t1 + t6, 1);
1064 			DOPIXIN(t2 + t5, 2);
1065 			DOPIXIN(t3 + t4, 3);
1066 			DID4PIX;
1067 			DOPIXIN(t3 - t4, 4);
1068 			DOPIXIN(t2 - t5, 5);
1069 			DOPIXIN(t1 - t6, 6);
1070 			DOPIXIN(t0 - t7, 7);
1071 			if (oflo & ~0xff) {
1072 				int t;
1073 				pix = 0;
1074 				DOPIXINLIMIT(t0 + t7, 0);
1075 				DOPIXINLIMIT(t1 + t6, 1);
1076 				DOPIXINLIMIT(t2 + t5, 2);
1077 				DOPIXINLIMIT(t3 + t4, 3);
1078 				DID4PIX;
1079 				DOPIXINLIMIT(t3 - t4, 4);
1080 				DOPIXINLIMIT(t2 - t5, 5);
1081 				DOPIXINLIMIT(t1 - t6, 6);
1082 				DOPIXINLIMIT(t0 - t7, 7);
1083 			}
1084 			PSTORE;
1085 			in += stride;
1086 		} else {
1087 			PIXDEF;
1088 			DOPIX(t0 + t7, 0);
1089 			DOPIX(t1 + t6, 1);
1090 			DOPIX(t2 + t5, 2);
1091 			DOPIX(t3 + t4, 3);
1092 			DID4PIX;
1093 			DOPIX(t3 - t4, 4);
1094 			DOPIX(t2 - t5, 5);
1095 			DOPIX(t1 - t6, 6);
1096 			DOPIX(t0 - t7, 7);
1097 			if (oflo & ~0xff) {
1098 				int t;
1099 				pix = 0;
1100 				DOPIXLIMIT(t0 + t7, 0);
1101 				DOPIXLIMIT(t1 + t6, 1);
1102 				DOPIXLIMIT(t2 + t5, 2);
1103 				DOPIXLIMIT(t3 + t4, 3);
1104 				DID4PIX;
1105 				DOPIXLIMIT(t3 - t4, 4);
1106 				DOPIXLIMIT(t2 - t5, 5);
1107 				DOPIXLIMIT(t1 - t6, 6);
1108 				DOPIXLIMIT(t0 - t7, 7);
1109 			}
1110 			PSTORE;
1111 		}
1112 		tp += 1;
1113 		p += stride;
1114 	}
1115 }
1116 
1117 /*
1118  * This macro does the combined descale-and-quantize
1119  * multiply.  It truncates rather than rounds to give
1120  * the behavior required for the h.261 deadband quantizer.
1121  */
1122 
1123 #define FWD_DandQ(v, iq) short((v) * qt[iq])
1124 
fdct(const u_char * in,int stride,short * out,const float * qt)1125 void fdct(const u_char* in, int stride, short* out, const float* qt)
1126 {
1127 	float tmp[64];
1128 	float* tp = tmp;
1129 
1130 	int i;
1131 	for (i = 8; --i >= 0; ) {
1132                float x0, x1, x2, x3, t0, t1, t2, t3, t4, t5, t6, t7;
1133                t0 = float(in[0] + in[7]);
1134                t7 = float(in[0] - in[7]);
1135                t1 = float(in[1] + in[6]);
1136                t6 = float(in[1] - in[6]);
1137                t2 = float(in[2] + in[5]);
1138                t5 = float(in[2] - in[5]);
1139                t3 = float(in[3] + in[4]);
1140                t4 = float(in[3] - in[4]);
1141 
1142 
1143 		/* even part */
1144 		x0 = t0 + t3;
1145 		x2 = t1 + t2;
1146 		tp[8*0] = x0 + x2;
1147 		tp[8*4] = x0 - x2;
1148 
1149 		x1 = t0 - t3;
1150 		x3 = t1 - t2;
1151 		t0 = (x1 + x3) * FA1;
1152 		tp[8*2] = x1 + t0;
1153 		tp[8*6] = x1 - t0;
1154 
1155 		/* odd part */
1156 		x0 = t4 + t5;
1157 		x1 = t5 + t6;
1158 		x2 = t6 + t7;
1159 
1160 		t3 = x1 * FA1;
1161 		t4 = t7 - t3;
1162 
1163 		t0 = (x0 - x2) * FA5;
1164 		t1 = x0 * FA2 + t0;
1165 		tp[8*3] = t4 - t1;
1166 		tp[8*5] = t4 + t1;
1167 
1168 		t7 += t3;
1169 		t2 = x2 * FA4 + t0;
1170 		tp[8*1] = t7 + t2;
1171 		tp[8*7] = t7 - t2;
1172 
1173 		in += stride;
1174 		tp += 1;
1175 	}
1176 	tp -= 8;
1177 
1178 	for (i = 8; --i >= 0; ) {
1179 		float x0, x1, x2, x3, t0, t1, t2, t3, t4, t5, t6, t7;
1180 		t0 = tp[0] + tp[7];
1181 		t7 = tp[0] - tp[7];
1182 		t1 = tp[1] + tp[6];
1183 		t6 = tp[1] - tp[6];
1184 		t2 = tp[2] + tp[5];
1185 		t5 = tp[2] - tp[5];
1186 		t3 = tp[3] + tp[4];
1187 		t4 = tp[3] - tp[4];
1188 
1189 		/* even part */
1190 		x0 = t0 + t3;
1191 		x2 = t1 + t2;
1192 		out[0] = FWD_DandQ(x0 + x2, 0);
1193 		out[4] = FWD_DandQ(x0 - x2, 4);
1194 
1195 		x1 = t0 - t3;
1196 		x3 = t1 - t2;
1197 		t0 = (x1 + x3) * FA1;
1198 		out[2] = FWD_DandQ(x1 + t0, 2);
1199 		out[6] = FWD_DandQ(x1 - t0, 6);
1200 
1201 		/* odd part */
1202 		x0 = t4 + t5;
1203 		x1 = t5 + t6;
1204 		x2 = t6 + t7;
1205 
1206 		t3 = x1 * FA1;
1207 		t4 = t7 - t3;
1208 
1209 		t0 = (x0 - x2) * FA5;
1210 		t1 =  x0 * FA2 + t0;
1211 		out[3] = FWD_DandQ(t4 - t1, 3);
1212 		out[5] = FWD_DandQ(t4 + t1, 5);
1213 
1214 		t7 += t3;
1215 		t2 = x2 * FA4 + t0;
1216 		out[1] = FWD_DandQ(t7 + t2, 1);
1217 		out[7] = FWD_DandQ(t7 - t2, 7);
1218 
1219 		out += 8;
1220 		tp += 8;
1221 		qt += 8;
1222 	}
1223 }
1224 
1225 /*
1226  * decimate the *rows* of the two input 8x8 DCT matrices into
1227  * a single output matrix.  we decimate rows rather than
1228  * columns even though we want column decimation because
1229  * the DCTs are stored in column order.
1230  */
/*
 * Decimate the *rows* of two 8x8 DCT coefficient matrices into a
 * single output matrix.  Rows rather than columns are decimated
 * (even though column decimation is the goal) because the DCTs are
 * stored in column order.  Only the first four coefficients of each
 * input row contribute; the mixing weights below are fixed-point
 * approximations (X_N = 4 fractional bits) of the matrix in the
 * comment.
 */
void
dct_decimate(const short* in0, const short* in1, short* o)
{
/* Fixed-point helpers: X_<frac>(v) scales v by that fraction in
 * 1/16ths; X_NORM drops the X_N fraction bits at the end. */
#define X_N 4
#define X_5(v)  ((v) << (X_N - 1))
#define X_25(v)  ((v) << (X_N - 2))
#define X_125(v)  ((v) << (X_N - 3))
#define X_0625(v)  ((v) << (X_N - 4))
#define X_375(v) (X_25(v) + X_125(v))
#define X_625(v) (X_5(v) + X_125(v))
#define X_75(v) (X_5(v) + X_25(v))
#define X_6875(v) (X_5(v) + X_125(v) + X_0625(v))
#define X_1875(v) (X_125(v) + X_0625(v))
#define X_NORM(v) ((v) >> X_N)

	/*
	 * 0.50000000  0.09011998  0.00000000 0.10630376
	 * 	0.50000000  0.09011998  0.00000000  0.10630376
	 * 0.45306372  0.28832037  0.03732892 0.08667963
	 * 	-0.45306372  0.11942621  0.10630376 -0.06764951
	 * 0.00000000  0.49039264  0.17677670 0.00000000
	 * 	0.00000000 -0.49039264 -0.17677670  0.00000000
	 * -0.15909482  0.34009707  0.38408888 0.05735049
	 *	0.15909482  0.43576792 -0.09011998 -0.13845632
	 * 0.00000000 -0.03732892  0.46193977 0.25663998
	 * 	0.00000000 -0.03732892  0.46193977  0.25663998
	 * 0.10630376 -0.18235049  0.25663998 0.42361940
	 *	-0.10630376 -0.16332037 -0.45306372 -0.01587282
	 * 0.00000000  0.00000000 -0.07322330 0.41573481
	 * 	0.00000000  0.00000000  0.07322330 -0.41573481
	 * -0.09011998  0.13399123 -0.18766514 0.24442621
	 *	0.09011998  0.13845632  0.15909482  0.47539609
	 */
	for (int row = 0; row < 8; ++row) {
		/* a* come from the first block's row, b* from the second's */
		int a0 = in0[0];
		int a1 = in0[1];
		int a2 = in0[2];
		int a3 = in0[3];
		int b0 = in1[0];
		int b1 = in1[1];
		int b2 = in1[2];
		int b3 = in1[3];

		o[0] = X_NORM(X_5(a0 + b0) + X_0625(a1 + b1) +
			      X_125(a3 + b3));
		o[1] = X_NORM(X_5(a0 - b0) + X_25(a1) + X_0625(a3) +
			      X_125(b1 + b2));
		o[2] = X_NORM(X_5(a1 - b1) + X_1875(a2 + b2));
		o[3] = X_NORM(X_1875(b0 - a0) + X_375(a1 + a2) +
			      X_5(b1) - X_125(b3));
		o[4] = X_NORM(X_5(a2 + b2) + X_25(a3 + b3));
		o[5] = X_NORM(X_125(a0 - b0) - X_1875(a1 + b1) +
			      X_25(a2) + X_5(a3 - b2));
		o[6] = X_NORM(X_625(b2 - a2) + X_375(a3 + b3));
		o[7] = X_NORM(X_125(a1 - a0 + b1 + b0 + b2) +
			      X_1875(a2) + X_25(a3) + X_5(b3));

		o += 8;
		in0 += 8;
		in1 += 8;
	}
}
1293