1 /*
2 * Copyright (c) 1994 Regents of the University of California.
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * 3. All advertising materials mentioning features or use of this software
14 * must display the following acknowledgement:
15 * This product includes software developed by the Network Research
16 * Group at Lawrence Berkeley Laboratory.
17 * 4. Neither the name of the University nor of the Laboratory may be used
18 * to endorse or promote products derived from this software without
19 * specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
25 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31 * SUCH DAMAGE.
32 */
33
34 /************ Change log
35 *
36 * $Log: dct.cxx,v $
37 * Revision 1.3 2006/08/10 07:05:46 csoutheren
38 * Fixed compile warnings on VC 2005
39 *
40 * Revision 1.2 2006/07/31 09:09:21 csoutheren
41 * Checkin of validated codec used during development
42 *
43 * Revision 1.1.2.1 2006/04/06 01:17:17 csoutheren
44 * Initial version of H.261 video codec plugin for OPAL
45 *
46 * Revision 2.1 2003/03/15 23:42:59 robertj
47 * Update to OpenH323 v1.11.7
48 *
49 * Revision 1.14 2003/03/14 07:25:55 robertj
50 * Removed $header keyword so is not different on alternate repositories
51 *
52 * Revision 1.13 2002/10/24 21:05:26 dereks
53 * Fix compile time warning.
54 *
55 * Revision 1.12 2002/05/17 01:47:33 dereks
56 * backout the integer maths in the h261 codec.
57 *
58 * Revision 1.11 2002/02/15 03:54:31 yurik
59 * Warnings removed during compilation, patch courtesy of Jehan Bing, jehan@bravobrava.com
60 *
61 * Revision 1.10 2001/10/24 20:24:32 dereks
62 * Remove green stripes under windows for INT_64. Thanks to Robert Lupa.
63 *
64 * Revision 1.9 2001/10/17 03:52:39 robertj
65 * Fixed MSVC compatibility
66 *
67 * Revision 1.8 2001/10/17 01:54:36 yurik
68 * Fixed clash with CE includes for INT32 type
69 *
70 * Revision 1.7 2001/10/16 23:51:42 dereks
71 * Change vic's fdct() from floating-point to fix-point. Improves performance
72 * for h261 video significantly on some machines. Thanks to Cosmos Jiang
73 *
74 * Revision 1.6 2001/10/16 21:20:07 yurik
75 * Removed warnings on Windows CE. Submitted by Jehan Bing, jehan@bravobrava.com
76 *
77 * Revision 1.3 2000/12/19 22:22:34 dereks
78 * Remove connection to grabber-OS.cxx files. grabber-OS.cxx files no longer used.
79 * Video data is now read from a video channel, using the pwlib classes.
80 *
81 * Revision 1.2 2000/08/25 03:18:49 dereks
82 * Add change log facility (Thanks Robert for the info on implementation)
83 *
84 *
85 *
86 ********/
87
88 #include "bsd-endian.h"
89 #include "dct.h"
90
/*
 * Macros for fix-point (integer) arithmetic. FP_NBITS gives the number
 * of binary digits past the decimal point. FP_MUL computes the product
 * of two fixed point numbers. A fixed point number and an integer
 * can be directly multiplied to give a fixed point number. FP_SCALE
 * converts a floating point number to fixed point (and is used only
 * at startup, not by the dct engine). FP_NORM converts a fixed
 * point number to scalar by rounding to the closest integer.
 * FP_JNORM is similar except it folds the jpeg bias of 128 into the
 * rounding addition.
 */
#define FP_NBITS 15
/*
 * Each operand is pre-shifted right by 5 before the 32-bit multiply
 * (discarding 10 fraction bits total), leaving only FP_NBITS-10 bits
 * to shift out of the product: this trades low-order precision for
 * headroom against 32-bit overflow.
 */
#define FP_MUL(a, b) ((((a) >> 5) * ((b) >> 5)) >> (FP_NBITS - 10))
#define FP_SCALE(v) (int)((double)(v) * double(1 << FP_NBITS) + 0.5)
#define FP_NORM(v) (((v) + (1 << (FP_NBITS-1))) >> FP_NBITS)
/* 257 = 2*128 + 1: the +128 jpeg bias plus the usual +0.5 rounding term */
#define FP_JNORM(v) (((v) + (257 << (FP_NBITS-1))) >> FP_NBITS)

/* Test bit n of the coefficient bitmask; "m0" must be in scope at the use site. */
#define M(n) ((m0 >> (n)) & 1)

/*
 * This macro stolen from nv.
 */
/* Sick little macro which will limit x to [0..255] with logical ops */
/* "t" is a scratch int lvalue: LIMIT8 saturates, LIMIT masks to the low byte. */
#define LIMIT8(x, t) ((t = (x)), (t &= ~(t>>31)), (t | ~((t-256) >> 31)))
#define LIMIT(x, t) (LIMIT8((x), t) & 0xff)
116
/*
 * Zig-zag scan orders for an 8x8 coefficient block: each table maps a
 * scan position to an index in the 64-entry coefficient array.
 * The 16 trailing zero entries pad each table past 64 slots —
 * presumably guard entries against scan-index overrun in the
 * decoders that consume these tables; TODO confirm against callers.
 */
/* row order */
const u_char ROWZAG[] = {
	0, 1, 8, 16, 9, 2, 3, 10,
	17, 24, 32, 25, 18, 11, 4, 5,
	12, 19, 26, 33, 40, 48, 41, 34,
	27, 20, 13, 6, 7, 14, 21, 28,
	35, 42, 49, 56, 57, 50, 43, 36,
	29, 22, 15, 23, 30, 37, 44, 51,
	58, 59, 52, 45, 38, 31, 39, 46,
	53, 60, 61, 54, 47, 55, 62, 63,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0
};
/* column order (transpose of ROWZAG) */
const u_char COLZAG[] = {
	0, 8, 1, 2, 9, 16, 24, 17,
	10, 3, 4, 11, 18, 25, 32, 40,
	33, 26, 19, 12, 5, 6, 13, 20,
	27, 34, 41, 48, 56, 49, 42, 35,
	28, 21, 14, 7, 15, 22, 29, 36,
	43, 50, 57, 58, 51, 44, 37, 30,
	23, 31, 38, 45, 52, 59, 60, 53,
	46, 39, 47, 54, 61, 62, 55, 63,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0
};
143
/*
 * Butterfly multipliers for the AA&N 8-point DCT, in fixed-point
 * (A1..A5) and single-precision float (FA1..FA5) form:
 *   A1 = 1/sqrt(2), A2 = sqrt(2)*cos(3pi/8), A4 = sqrt(2)*cos(pi/8),
 *   A5 = cos(3pi/8); A3 is numerically the same as A1.
 */
#define A1 FP_SCALE(0.7071068)
#define A2 FP_SCALE(0.5411961)
#define A3 A1
#define A4 FP_SCALE(1.3065630)
#define A5 FP_SCALE(0.3826834)

#define FA1 (0.707106781f)
#define FA2 (0.541196100f)
#define FA3 FA1
#define FA4 (1.306562965f)
#define FA5 (0.382683433f)

/* B0 may already be defined by system headers (it is a termios
 * baud-rate constant on many systems) — clear it before reuse. */
#ifdef B0
#undef B0
#endif
/*
 * these magic numbers are scaling factors for each coef of the 1-d
 * AA&N DCT. The scale factor for coef 0 is 1 and coef 1<=n<=7 is
 * cos(n*PI/16)*sqrt(2). There is also a normalization of sqrt(8).
 * Formally you divide by the scale factor but we multiply by the
 * inverse because it's faster. So the numbers below are the inverse
 * of what was just described.
 */
#define B0 0.35355339059327376220
#define B1 0.25489778955207958447
#define B2 0.27059805007309849220
#define B3 0.30067244346752264027
#define B4 0.35355339059327376220
#define B5 0.44998811156820785231
#define B6 0.65328148243818826392
#define B7 1.28145772387075308943

/*
 * Output multipliers for AA&N DCT
 * (i.e., first stage multipliers for inverse DCT).
 */
static const double first_stage[8] = { B0, B1, B2, B3, B4, B5, B6, B7, };
181
/*
 * The first_stage array crossed with itself. This allows us
 * to embed the first stage multipliers of the row pass by
 * computing scaled versions of the columns.
 * Entry [8*i + j] is B_i * B_j; cross_stage holds the products in
 * FP_NBITS fixed point, f_cross_stage holds the same products as floats.
 */
static const int cross_stage[64] = {
	FP_SCALE(B0 * B0),
	FP_SCALE(B0 * B1),
	FP_SCALE(B0 * B2),
	FP_SCALE(B0 * B3),
	FP_SCALE(B0 * B4),
	FP_SCALE(B0 * B5),
	FP_SCALE(B0 * B6),
	FP_SCALE(B0 * B7),

	FP_SCALE(B1 * B0),
	FP_SCALE(B1 * B1),
	FP_SCALE(B1 * B2),
	FP_SCALE(B1 * B3),
	FP_SCALE(B1 * B4),
	FP_SCALE(B1 * B5),
	FP_SCALE(B1 * B6),
	FP_SCALE(B1 * B7),

	FP_SCALE(B2 * B0),
	FP_SCALE(B2 * B1),
	FP_SCALE(B2 * B2),
	FP_SCALE(B2 * B3),
	FP_SCALE(B2 * B4),
	FP_SCALE(B2 * B5),
	FP_SCALE(B2 * B6),
	FP_SCALE(B2 * B7),

	FP_SCALE(B3 * B0),
	FP_SCALE(B3 * B1),
	FP_SCALE(B3 * B2),
	FP_SCALE(B3 * B3),
	FP_SCALE(B3 * B4),
	FP_SCALE(B3 * B5),
	FP_SCALE(B3 * B6),
	FP_SCALE(B3 * B7),

	FP_SCALE(B4 * B0),
	FP_SCALE(B4 * B1),
	FP_SCALE(B4 * B2),
	FP_SCALE(B4 * B3),
	FP_SCALE(B4 * B4),
	FP_SCALE(B4 * B5),
	FP_SCALE(B4 * B6),
	FP_SCALE(B4 * B7),

	FP_SCALE(B5 * B0),
	FP_SCALE(B5 * B1),
	FP_SCALE(B5 * B2),
	FP_SCALE(B5 * B3),
	FP_SCALE(B5 * B4),
	FP_SCALE(B5 * B5),
	FP_SCALE(B5 * B6),
	FP_SCALE(B5 * B7),

	FP_SCALE(B6 * B0),
	FP_SCALE(B6 * B1),
	FP_SCALE(B6 * B2),
	FP_SCALE(B6 * B3),
	FP_SCALE(B6 * B4),
	FP_SCALE(B6 * B5),
	FP_SCALE(B6 * B6),
	FP_SCALE(B6 * B7),

	FP_SCALE(B7 * B0),
	FP_SCALE(B7 * B1),
	FP_SCALE(B7 * B2),
	FP_SCALE(B7 * B3),
	FP_SCALE(B7 * B4),
	FP_SCALE(B7 * B5),
	FP_SCALE(B7 * B6),
	FP_SCALE(B7 * B7),
};
/* Float version of cross_stage, for the floating-point forward DCT path. */
static const float f_cross_stage[64] = {
	(float)(B0 * B0),
	(float)(B0 * B1),
	(float)(B0 * B2),
	(float)(B0 * B3),
	(float)(B0 * B4),
	(float)(B0 * B5),
	(float)(B0 * B6),
	(float)(B0 * B7),

	(float)(B1 * B0),
	(float)(B1 * B1),
	(float)(B1 * B2),
	(float)(B1 * B3),
	(float)(B1 * B4),
	(float)(B1 * B5),
	(float)(B1 * B6),
	(float)(B1 * B7),

	(float)(B2 * B0),
	(float)(B2 * B1),
	(float)(B2 * B2),
	(float)(B2 * B3),
	(float)(B2 * B4),
	(float)(B2 * B5),
	(float)(B2 * B6),
	(float)(B2 * B7),

	(float)(B3 * B0),
	(float)(B3 * B1),
	(float)(B3 * B2),
	(float)(B3 * B3),
	(float)(B3 * B4),
	(float)(B3 * B5),
	(float)(B3 * B6),
	(float)(B3 * B7),

	(float)(B4 * B0),
	(float)(B4 * B1),
	(float)(B4 * B2),
	(float)(B4 * B3),
	(float)(B4 * B4),
	(float)(B4 * B5),
	(float)(B4 * B6),
	(float)(B4 * B7),

	(float)(B5 * B0),
	(float)(B5 * B1),
	(float)(B5 * B2),
	(float)(B5 * B3),
	(float)(B5 * B4),
	(float)(B5 * B5),
	(float)(B5 * B6),
	(float)(B5 * B7),

	(float)(B6 * B0),
	(float)(B6 * B1),
	(float)(B6 * B2),
	(float)(B6 * B3),
	(float)(B6 * B4),
	(float)(B6 * B5),
	(float)(B6 * B6),
	(float)(B6 * B7),

	(float)(B7 * B0),
	(float)(B7 * B1),
	(float)(B7 * B2),
	(float)(B7 * B3),
	(float)(B7 * B4),
	(float)(B7 * B5),
	(float)(B7 * B6),
	(float)(B7 * B7),
};
333
334 /*
335 * Map a quantization table in natural, row-order,
336 * into the qt input expected by rdct().
337 */
338 void
rdct_fold_q(const int * in,int * out)339 rdct_fold_q(const int* in, int* out)
340 {
341 for (int i = 0; i < 64; ++i) {
342 /*
343 * Fold column and row passes of the dct.
344 * By scaling each column DCT independently,
345 * we pre-bias all the row DCT's so the
346 * first multiplier is already embedded
347 * in the temporary result. Thanks to
348 * Martin Vetterli for explaining how
349 * to do this.
350 */
351 double v = double(in[i]);
352 v *= first_stage[i & 7];
353 v *= first_stage[i >> 3];
354 out[i] = FP_SCALE(v);
355 }
356 }
357
358 /*
359 * Just like rdct_fold_q() but we divide by the quantizer.
360 */
fdct_fold_q(const int * in,float * out)361 void fdct_fold_q(const int* in, float* out)
362 {
363 for (int i = 0; i < 64; ++i) {
364 double v = first_stage[i >> 3];
365 v *= first_stage[i & 7];
366 double q = double(in[i]);
367 out[i] = (float)(v / q);
368 }
369 }
370
/*
 * Add the DC term "dc" to every pixel of the 8x8 block at "in",
 * clamp each sum to [0,255], and store the result at "out".
 * Successive rows are "stride" bytes apart. Pixels are loaded and
 * stored a machine word at a time — NOTE(review): the word casts
 * imply both pointers must be suitably aligned; confirm callers
 * guarantee this.
 */
void dcsum(int dc, u_char* in, u_char* out, int stride)
{
	for (int k = 8; --k >= 0; ) {
		int t;	/* scratch lvalue for LIMIT() */
#ifdef INT_64
		/*XXX assume little-endian */
		INT_64 i = *(INT_64*)in;
		/* peel each byte lane, add dc, clamp, re-splice into the word */
		INT_64 o = (INT_64)LIMIT(dc + (int)(i >> 56 & 0xff), t) << 56;
		o |= (INT_64)LIMIT(dc + (int)(i >> 48 & 0xff), t) << 48;
		o |= (INT_64)LIMIT(dc + (int)(i >> 40 & 0xff), t) << 40;
		o |= (INT_64)LIMIT(dc + (int)(i >> 32 & 0xff), t) << 32;
		o |= (INT_64)LIMIT(dc + (int)(i >> 24 & 0xff), t) << 24;
		o |= (INT_64)LIMIT(dc + (int)(i >> 16 & 0xff), t) << 16;
		o |= (INT_64)LIMIT(dc + (int)(i >> 8 & 0xff), t) << 8;
		o |= (INT_64)LIMIT(dc + (int)(i & 0xff), t);
		*(INT_64*)out = o;
#else
		/* 32-bit path: two 4-byte words per row; EXTRACT/SPLICE handle
		 * byte order (see bsd-endian.h) */
		u_int o = 0;
		u_int i = *(u_int*)in;
		SPLICE(o, LIMIT(dc + EXTRACT(i, 24), t), 24);
		SPLICE(o, LIMIT(dc + EXTRACT(i, 16), t), 16);
		SPLICE(o, LIMIT(dc + EXTRACT(i, 8), t), 8);
		SPLICE(o, LIMIT(dc + EXTRACT(i, 0), t), 0);
		*(u_int*)out = o;

		o = 0;
		i = *(u_int*)(in + 4);
		SPLICE(o, LIMIT(dc + EXTRACT(i, 24), t), 24);
		SPLICE(o, LIMIT(dc + EXTRACT(i, 16), t), 16);
		SPLICE(o, LIMIT(dc + EXTRACT(i, 8), t), 8);
		SPLICE(o, LIMIT(dc + EXTRACT(i, 0), t), 0);
		*(u_int*)(out + 4) = o;
#endif
		in += stride;
		out += stride;
	}
}
408
/*
 * Same as dcsum(), but reads the input one byte at a time, so "in"
 * need not be word-aligned (output is still stored word-at-a-time
 * and must be aligned for u_int stores).
 */
void dcsum2(int dc, u_char* in, u_char* out, int stride)
{
	for (int k = 8; --k >= 0; ) {
		int t;	/* scratch lvalue for LIMIT() */
		u_int o = 0;
		SPLICE(o, LIMIT(dc + in[0], t), 24);
		SPLICE(o, LIMIT(dc + in[1], t), 16);
		SPLICE(o, LIMIT(dc + in[2], t), 8);
		SPLICE(o, LIMIT(dc + in[3], t), 0);
		*(u_int*)out = o;

		o = 0;
		SPLICE(o, LIMIT(dc + in[4], t), 24);
		SPLICE(o, LIMIT(dc + in[5], t), 16);
		SPLICE(o, LIMIT(dc + in[6], t), 8);
		SPLICE(o, LIMIT(dc + in[7], t), 0);
		*(u_int*)(out + 4) = o;

		in += stride;
		out += stride;
	}
}
431
dcfill(int DC,u_char * out,int stride)432 void dcfill(int DC, u_char* out, int stride)
433 {
434 int t;
435 u_int dc = DC;
436 dc = LIMIT(dc, t);
437 dc |= dc << 8;
438 dc |= dc << 16;
439 #ifdef INT_64
440 INT_64 xdc = dc;
441 xdc |= xdc << 32;
442 *(INT_64 *)out = xdc;
443 out += stride;
444 *(INT_64 *)out = xdc;
445 out += stride;
446 *(INT_64 *)out = xdc;
447 out += stride;
448 *(INT_64 *)out = xdc;
449 out += stride;
450 *(INT_64 *)out = xdc;
451 out += stride;
452 *(INT_64 *)out = xdc;
453 out += stride;
454 *(INT_64 *)out = xdc;
455 out += stride;
456 *(INT_64 *)out = xdc;
457 #else
458 *(u_int*)out = dc;
459 *(u_int*)(out + 4) = dc;
460 out += stride;
461 *(u_int*)out = dc;
462 *(u_int*)(out + 4) = dc;
463 out += stride;
464 *(u_int*)out = dc;
465 *(u_int*)(out + 4) = dc;
466 out += stride;
467 *(u_int*)out = dc;
468 *(u_int*)(out + 4) = dc;
469 out += stride;
470 *(u_int*)out = dc;
471 *(u_int*)(out + 4) = dc;
472 out += stride;
473 *(u_int*)out = dc;
474 *(u_int*)(out + 4) = dc;
475 out += stride;
476 *(u_int*)out = dc;
477 *(u_int*)(out + 4) = dc;
478 out += stride;
479 *(u_int*)out = dc;
480 *(u_int*)(out + 4) = dc;
481 #endif
482 }
483
/*
 * This routine mixes the DC & AC components of an 8x8 block of
 * pixels. This routine is called for every block decoded so it
 * needs to be efficient. It tries to do as many pixels in parallel
 * as will fit in a word. The one complication is that it has to
 * deal with overflow (sum > 255) and underflow (sum < 0). Underflow
 * & overflow are only possible if both terms have the same sign and
 * are indicated by the result having a different sign than the terms.
 * Note that underflow is more worrisome than overflow since it results
 * in bright white dots in a black field.
 * The DC term and sum are biased by 128 so a negative number has the
 * 2^7 bit = 0. The AC term is not biased so a negative number has
 * the 2^7 bit = 1. So underflow is indicated by (DC & AC & sum) != 0;
 */
/*
 * Saturating parallel byte add: sum = a + b per byte lane.
 * "omask" masks the per-lane sign bits (0x80808080); "uflo" collects
 * lanes whose sign flipped. On the rare overflow/underflow the
 * offending lane's bits are smeared down to all-ones (overflow) or
 * cleared (underflow).
 */
#define MIX_LOGIC(sum, a, b, omask, uflo) \
{ \
	sum = a + b; \
	uflo = (a ^ b) & (a ^ sum) & omask; \
	if (uflo) { \
		if ((b = uflo & a) != 0) { \
			/* integer overflows */ \
			b |= b >> 1; \
			b |= b >> 2; \
			b |= b >> 4; \
			sum |= b; \
		} \
		if ((uflo &=~ b) != 0) { \
			/* integer underflow(s) */ \
			uflo |= uflo >> 1; \
			uflo |= uflo >> 2; \
			uflo |= uflo >> 4; \
			sum &= ~uflo; \
		} \
	} \
}
/*
 * Table of products of 8-bit scaled coefficients
 * and idct coefficients (there are only 33 unique
 * coefficients so we index via a compact ID).
 * NOTE(review): the bv_rdct*() routines below say "66 unique
 * coefficients require 7 bits" — one of the two counts is stale;
 * confirm against the table generator.
 */
extern "C" u_char multab[];
/*
 * Array of coefficient ID's used to index multab.
 */
extern "C" u_int dct_basis[64][64 / sizeof(u_int)];

/*XXX*/
/* Clamp a raw coefficient to the 10-bit signed range before table lookup. */
#define LIMIT_512(s) ((s) > 511 ? 511 : (s) < -512 ? -512 : (s))
532
/*
 * Reconstruct an 8x8 block that has a DC term plus exactly one AC
 * coefficient (index "acx" in bp): look up the precomputed basis
 * vector for acx in dct_basis/multab, and mix it with the replicated
 * DC byte using the saturating MIX_LOGIC add, four pixels per word.
 * Rows of "out" are "stride" bytes apart.
 */
void
bv_rdct1(int dc, short* bp, int acx, u_char* out, int stride)
{
	u_int omask = 0x80808080;
	u_int uflo;
	u_int* vp = dct_basis[acx];
	int s = LIMIT_512(bp[acx]);
	s = (s >> 2) & 0xff;
	/* 66 unique coefficients require 7 bits */
	char* mt = (char*)&multab[s << 7];

	/* replicate dc into all four byte lanes */
	dc |= dc << 8;
	dc |= dc << 16;
	for (int k = 8; --k >= 0; ) {
		u_int v = *vp++;
		/* translate the four coefficient IDs to scaled AC samples */
		u_int m = mt[v >> 24] << SHIFT(24) |
			mt[v >> 16 & 0xff] << SHIFT(16) |
			mt[v >> 8 & 0xff] << SHIFT(8) |
			mt[v & 0xff] << SHIFT(0);
		MIX_LOGIC(v, dc, m, omask, uflo);
		*(u_int*)out = v;
		v = *vp++;
		m = mt[v >> 24] << SHIFT(24) |
			mt[v >> 16 & 0xff] << SHIFT(16) |
			mt[v >> 8 & 0xff] << SHIFT(8) |
			mt[v & 0xff] << SHIFT(0);
		MIX_LOGIC(v, dc, m, omask, uflo);
		*(u_int*)(out + 4) = v;
		out += stride;
	}
}
564
565 /* XXX this version has to be exact */
/*
 * Like bv_rdct1() (DC plus one AC basis vector), but additionally
 * sums in the reference pixels read byte-wise from "in" — e.g. a
 * prediction block — using a second saturating MIX_LOGIC pass.
 * Byte-wise input reads mean "in" need not be word-aligned.
 */
void
bv_rdct2(int dc, short* bp, int ac0, u_char* in, u_char* out, int stride)
{
	int s0 = LIMIT_512(bp[ac0]);
	s0 = (s0 >> 2) & 0xff;
	/* 66 unique coefficients require 7 bits */
	const char* mt = (const char*)&multab[s0 << 7];
	const u_int* vp0 = dct_basis[ac0];

	/* replicate dc into all four byte lanes */
	dc |= dc << 8;
	dc |= dc << 16;
	u_int omask = 0x80808080;
	u_int uflo;
	for (int k = 8; --k >= 0; ) {
		u_int v, m, i;
		v = *vp0++;
		m = mt[v >> 24] << SHIFT(24) | mt[v >> 16 & 0xff] << SHIFT(16) |
			mt[v >> 8 & 0xff] << SHIFT(8) | mt[v & 0xff] << SHIFT(0);
		MIX_LOGIC(v, dc, m, omask, uflo);
		/* fold in the reference pixels, then saturate again */
		i = in[0] << SHIFT(24) | in[1] << SHIFT(16) |
			in[2] << SHIFT(8) | in[3] << SHIFT(0);
		MIX_LOGIC(m, i, v, omask, uflo);
		*(u_int*)out = m;

		v = *vp0++;
		m = mt[v >> 24] << SHIFT(24) | mt[v >> 16 & 0xff] << SHIFT(16) |
			mt[v >> 8 & 0xff] << SHIFT(8) | mt[v & 0xff] << SHIFT(0);
		MIX_LOGIC(v, dc, m, omask, uflo);
		i = in[4] << SHIFT(24) | in[5] << SHIFT(16) |
			in[6] << SHIFT(8) | in[7] << SHIFT(0);
		MIX_LOGIC(m, i, v, omask, uflo);
		*(u_int*)(out + 4) = m;
		out += stride;
		in += stride;
	}
}
602
603 /* XXX this version has to be exact */
/*
 * Like bv_rdct2(), but with two AC basis vectors (ac0 and ac1).
 * With four terms per pixel the parallel saturating add is no longer
 * safe, so each pixel is summed in a plain int and clamped with
 * LIMIT() individually.
 */
void
bv_rdct3(int dc, short* bp, int ac0, int ac1, u_char* in, u_char* out, int stride)
{
	int s0 = LIMIT_512(bp[ac0]);
	s0 = (s0 >> 2) & 0xff;
	/* 66 unique coefficients require 7 bits */
	char* mt0 = (char*)&multab[s0 << 7];

	int s1 = LIMIT_512(bp[ac1]);
	s1 = (s1 >> 2) & 0xff;
	char* mt1 = (char*)&multab[s1 << 7];

	u_int* vp0 = dct_basis[ac0];
	u_int* vp1 = dct_basis[ac1];

	for (int k = 8; --k >= 0; ) {
		int t;	/* scratch lvalue for LIMIT() */
		u_int v0 = *vp0++;
		u_int v1 = *vp1++;
		/* pixel = basis0 + basis1 + reference + dc, clamped to [0,255] */
		s0 = mt0[v0 >> 24] + mt1[v1 >> 24] + in[0] + dc;
		u_int m = LIMIT(s0, t) << SHIFT(24);
		s0 = mt0[v0 >> 16 & 0xff] + mt1[v1 >> 16 & 0xff] + in[1] + dc;
		m |= LIMIT(s0, t) << SHIFT(16);
		s0 = mt0[v0 >> 8 & 0xff] + mt1[v1 >> 8 & 0xff] + in[2] + dc;
		m |= LIMIT(s0, t) << SHIFT(8);
		s0 = mt0[v0 & 0xff] + mt1[v1 & 0xff] + in[3] + dc;
		m |= LIMIT(s0, t) << SHIFT(0);
		*(u_int*)out = m;

		v0 = *vp0++;
		v1 = *vp1++;
		s0 = mt0[v0 >> 24] + mt1[v1 >> 24] + in[4] + dc;
		m = 0;
		m |= LIMIT(s0, t) << SHIFT(24);
		s0 = mt0[v0 >> 16 & 0xff] + mt1[v1 >> 16 & 0xff] + in[5] + dc;
		m |= LIMIT(s0, t) << SHIFT(16);
		s0 = mt0[v0 >> 8 & 0xff] + mt1[v1 >> 8 & 0xff] + in[6] + dc;
		m |= LIMIT(s0, t) << SHIFT(8);
		s0 = mt0[v0 & 0xff] + mt1[v1 & 0xff] + in[7] + dc;
		m |= LIMIT(s0, t) << SHIFT(0);
		*(u_int*)(out + 4) = m;

		out += stride;
		in += stride;
	}
}
650
/*
 * Pixel-splicing helpers for the inverse transforms below. They
 * accumulate eight 8-bit pixels into one 64-bit word (INT_64 builds)
 * or two 32-bit words, then store the full row at once. "oflo" ORs
 * together every intermediate pixel value so that a single range test
 * (oflo & ~0xff) after the row decides whether the slow, clamped
 * *LIMIT variants must be rerun.
 */
#ifdef INT_64
/*XXX assume little-endian */
#define PSPLICE(v, n) pix |= (INT_64)(v) << ((n)*8)
#define DID4PIX
#define PSTORE ((INT_64*)p)[0] = pix
#define PIXDEF INT_64 pix = 0; int v, oflo = 0
#else
#define PSPLICE(v, n) SPLICE(pix, (v), (3 - ((n)&3)) * 8)
/* bank the first four pixels and restart accumulation for the next four */
#define DID4PIX pix0 = pix; pix = 0
#define PSTORE ((u_int*)p)[0] = pix0; ((u_int*)p)[1] = pix
#define PIXDEF u_int pix0, pix = 0; int v, oflo = 0
#endif
/* DOJPIX*: descale with the +128 jpeg bias; DOPIX*: without bias;
 * DOPIXIN*: also add the reference pixel in[n]. */
#define DOJPIX(val, n) v = FP_JNORM(val); oflo |= v; PSPLICE(v, n)
#define DOJPIXLIMIT(val, n) PSPLICE(LIMIT(FP_JNORM(val),t), n)
#define DOPIX(val, n) v = FP_NORM(val); oflo |= v; PSPLICE(v, n)
#define DOPIXLIMIT(val, n) PSPLICE(LIMIT(FP_NORM(val),t), n)
#define DOPIXIN(val, n) v = FP_NORM(val) + in[n]; oflo |= v; PSPLICE(v, n)
#define DOPIXINLIMIT(val, n) PSPLICE(LIMIT(FP_NORM(val) + in[n], t), n)
669
/*
 * A 2D Inverse DCT based on a column-row decomposition using
 * Arai, Agui, and Nakajima's 8pt 1D Inverse DCT, from Fig. 4-8
 * Pennebaker & Mitchell (i.e., the pink JPEG book). This figure
 * is the forward transform; reverse the flowgraph for the inverse
 * (you need to draw a picture). The outputs are DFT coefficients
 * and need to be scaled according to Eq [4-3].
 *
 * The input coefficients are, counter to tradition, in column-order.
 * The bit mask indicates which coefficients are non-zero. If the
 * corresponding bit is zero, then the coefficient is assumed zero
 * and the input coefficient is not referenced and need not be defined.
 * The 8-bit outputs are computed in row order and placed in the
 * output array pointed to by p, with each of the eight 8-byte lines
 * offset by "stride" bytes.
 *
 * qt is the inverse quantization table in column order. These
 * coefficients are the product of the inverse quantization factor,
 * specified by the jpeg quantization table, and the first multiplier
 * in the inverse DCT flow graph. This first multiplier is actually
 * biased by the row term so that the columns are pre-scaled to
 * eliminate the first row multiplication stage.
 *
 * The output is biased by 128, i.e., [-128,127] is mapped to [0,255],
 * which is relevant to jpeg.
 */
void
#ifdef INT_64
rdct(register short *bp, INT_64 m0, u_char* p, int stride, const int* qt)
#else
rdct(register short *bp, u_int m0, u_int m1, u_char* p,
	int stride, const int* qt)
#endif
{
	/*
	 * First pass is 1D transform over the columns. Note that
	 * the coefficients are stored in column order, so even
	 * though we're accessing the columns, we access them
	 * in a row-like fashion. We use a column-row decomposition
	 * instead of a row-column decomposition so we can splice
	 * pixels in an efficient, row-wise order in the second
	 * stage.
	 *
	 * The inverse quantization is folded together with the
	 * first stage multiplier. This reduces the number of
	 * multiplies (per 8-pt transform) from 11 to 5 (ignoring
	 * the inverse quantization which must be done anyway).
	 *
	 * Because we compute appropriately scaled column DCTs,
	 * the row DCTs require only 5 multiplies per row total.
	 * So the total number of multiplies for the 8x8 DCT is 80
	 * (ignoring the checks for zero coefficients).
	 */
	int tmp[64];	/* intermediate, column-transformed coefficients */
	int* tp = tmp;
	int i;
	for (i = 8; --i >= 0; ) {
		/* low byte of m0 masks the current 8 coefficients */
		if ((m0 & 0xfe) == 0) {
			/* AC terms all zero */
			int v = M(0) ? qt[0] * bp[0] : 0;
			tp[0] = v;
			tp[1] = v;
			tp[2] = v;
			tp[3] = v;
			tp[4] = v;
			tp[5] = v;
			tp[6] = v;
			tp[7] = v;
		} else {
			int t0, t1, t2, t3, t4, t5, t6, t7;

			/* odd coefficients (1,3,5,7) all zero? */
			if ((m0 & 0xaa) == 0)
				t4 = t5 = t6 = t7 = 0;
			else {
				t0 = M(5) ? qt[5] * bp[5] : 0;
				t2 = M(1) ? qt[1] * bp[1] : 0;
				t6 = M(7) ? qt[7] * bp[7] : 0;
				t7 = M(3) ? qt[3] * bp[3] : 0;

				t4 = t0 - t7;
				t1 = t2 + t6;
				t6 = t2 - t6;
				t7 += t0;

				t5 = t1 - t7;
				t7 += t1;

				t2 = FP_MUL(-A5, t4 + t6);
				t4 = FP_MUL(-A2, t4);
				t0 = t4 + t2;
				t1 = FP_MUL(A3, t5);
				t2 += FP_MUL(A4, t6);

				t4 = -t0;
				t5 = t1 - t0;
				t6 = t1 + t2;
				t7 += t2;
			}

#ifdef notdef
			if ((m0 & 0x55) == 0)
				t0 = t1 = t2 = t3 = 0;
			else {
#endif
				/* even part: coefficients 0,2,4,6 */
				t0 = M(0) ? qt[0] * bp[0] : 0;
				t1 = M(4) ? qt[4] * bp[4] : 0;
				int x0 = t0 + t1;
				int x1 = t0 - t1;

				t0 = M(2) ? qt[2] * bp[2] : 0;
				t3 = M(6) ? qt[6] * bp[6] : 0;
				t2 = t0 - t3;
				t3 += t0;

				t2 = FP_MUL(A1, t2);
				t3 += t2;

				t0 = x0 + t3;
				t1 = x1 + t2;
				t2 = x1 - t2;
				t3 = x0 - t3;
#ifdef notdef
			}
#endif

			/* final butterfly: combine even and odd halves */
			tp[0] = t0 + t7;
			tp[7] = t0 - t7;
			tp[1] = t1 + t6;
			tp[6] = t1 - t6;
			tp[2] = t2 + t5;
			tp[5] = t2 - t5;
			tp[3] = t3 + t4;
			tp[4] = t3 - t4;
		}
		tp += 8;
		bp += 8;
		qt += 8;

		/* advance to the next column's 8 mask bits */
		m0 >>= 8;
#ifndef INT_64
		m0 |= m1 << 24;
		m1 >>= 8;
#endif
	}
	tp -= 64;
	/*
	 * Second pass is 1D transform over the rows. Note that
	 * the coefficients are stored in column order, so even
	 * though we're accessing the rows, we access them
	 * in a column-like fashion.
	 *
	 * The pass above already computed the first multiplier
	 * in the flow graph.
	 */
	for (i = 8; --i >= 0; ) {
		int t0, t1, t2, t3, t4, t5, t6, t7;

		t0 = tp[8*5];
		t2 = tp[8*1];
		t6 = tp[8*7];
		t7 = tp[8*3];

#ifdef notdef
		if ((t0 | t2 | t6 | t7) == 0) {
			t4 = t5 = 0;
		} else
#endif
		{
			t4 = t0 - t7;
			t1 = t2 + t6;
			t6 = t2 - t6;
			t7 += t0;

			t5 = t1 - t7;
			t7 += t1;

			t2 = FP_MUL(-A5, t4 + t6);
			t4 = FP_MUL(-A2, t4);
			t0 = t4 + t2;
			t1 = FP_MUL(A3, t5);
			t2 += FP_MUL(A4, t6);

			t4 = -t0;
			t5 = t1 - t0;
			t6 = t1 + t2;
			t7 += t2;
		}

		t0 = tp[8*0];
		t1 = tp[8*4];
		int x0 = t0 + t1;
		int x1 = t0 - t1;

		t0 = tp[8*2];
		t3 = tp[8*6];
		t2 = t0 - t3;
		t3 += t0;

		t2 = FP_MUL(A1, t2);
		t3 += t2;

		t0 = x0 + t3;
		t1 = x1 + t2;
		t2 = x1 - t2;
		t3 = x0 - t3;

		/* fast path: splice all 8 pixels, clamp-free; oflo detects
		 * any out-of-range value and triggers the clamped redo */
		PIXDEF;
		DOJPIX(t0 + t7, 0);
		DOJPIX(t1 + t6, 1);
		DOJPIX(t2 + t5, 2);
		DOJPIX(t3 + t4, 3);
		DID4PIX;
		DOJPIX(t3 - t4, 4);
		DOJPIX(t2 - t5, 5);
		DOJPIX(t1 - t6, 6);
		DOJPIX(t0 - t7, 7);
		if (oflo & ~0xff) {
			int t;
			pix = 0;
			DOJPIXLIMIT(t0 + t7, 0);
			DOJPIXLIMIT(t1 + t6, 1);
			DOJPIXLIMIT(t2 + t5, 2);
			DOJPIXLIMIT(t3 + t4, 3);
			DID4PIX;
			DOJPIXLIMIT(t3 - t4, 4);
			DOJPIXLIMIT(t2 - t5, 5);
			DOJPIXLIMIT(t1 - t6, 6);
			DOJPIXLIMIT(t0 - t7, 7);
		}
		PSTORE;

		++tp;
		p += stride;
	}
}
905
/*
 * Inverse 2-D transform, similar to routine above (see comment above),
 * but more appropriate for H.261 instead of JPEG. This routine does
 * not bias the output by 128, and has an additional argument which is
 * an input array which gets summed together with the inverse-transform.
 * For example, this allows motion-compensation to be folded in here,
 * saving an extra traversal of the block. The input pointer can be
 * null, if motion-compensation is not needed.
 *
 * This routine does not take a quantization table, since the H.261
 * inverse quantizer is easily implemented via table lookup in the decoder.
 */
void
#ifdef INT_64
rdct(register short *bp, INT_64 m0, u_char* p, int stride, const u_char* in)
#else
rdct(register short *bp, u_int m0, u_int m1, u_char* p, int stride, const u_char *in)
#endif
{
	int tmp[64];	/* intermediate coefficients after the first pass */
	int* tp = tmp;
	/* first-stage multipliers only (no jpeg quantizer to fold in) */
	const int* qt = cross_stage;
	/*
	 * First pass is 1D transform over the rows of the input array.
	 */
	int i;
	for (i = 8; --i >= 0; ) {
		/* low byte of m0 masks the current 8 coefficients */
		if ((m0 & 0xfe) == 0) {
			/*
			 * All ac terms are zero.
			 */
			int v = 0;
			if (M(0))
				v = qt[0] * bp[0];
			tp[0] = v;
			tp[1] = v;
			tp[2] = v;
			tp[3] = v;
			tp[4] = v;
			tp[5] = v;
			tp[6] = v;
			tp[7] = v;
		} else {
			int t4 = 0, t5 = 0, t6 = 0, t7 = 0;
			if (m0 & 0xaa) {
				/* odd part */
				if (M(1))
					t4 = qt[1] * bp[1];
				if (M(3))
					t5 = qt[3] * bp[3];
				if (M(5))
					t6 = qt[5] * bp[5];
				if (M(7))
					t7 = qt[7] * bp[7];

				int x0 = t6 - t5;
				t6 += t5;
				int x1 = t4 - t7;
				t7 += t4;

				t5 = FP_MUL(t7 - t6, A3);
				t7 += t6;

				t4 = FP_MUL(x1 + x0, A5);
				t6 = FP_MUL(x1, A4) - t4;
				t4 += FP_MUL(x0, A2);

				t7 += t6;
				t6 += t5;
				t5 += t4;
			}
			int t0 = 0, t1 = 0, t2 = 0, t3 = 0;
			if (m0 & 0x55) {
				/* even part */
				if (M(0))
					t0 = qt[0] * bp[0];
				if (M(2))
					t1 = qt[2] * bp[2];
				if (M(4))
					t2 = qt[4] * bp[4];
				if (M(6))
					t3 = qt[6] * bp[6];

				int x0 = FP_MUL(t1 - t3, A1);
				t3 += t1;
				t1 = t0 - t2;
				t0 += t2;
				t2 = t3 + x0;
				t3 = t0 - t2;
				t0 += t2;
				t2 = t1 - x0;
				t1 += x0;
			}
			/* final butterfly: combine even and odd halves */
			tp[0] = t0 + t7;
			tp[1] = t1 + t6;
			tp[2] = t2 + t5;
			tp[3] = t3 + t4;
			tp[4] = t3 - t4;
			tp[5] = t2 - t5;
			tp[6] = t1 - t6;
			tp[7] = t0 - t7;
		}
		qt += 8;
		tp += 8;
		bp += 8;
		/* advance to the next 8 mask bits */
		m0 >>= 8;
#ifndef INT_64
		m0 |= m1 << 24;
		m1 >>= 8;
#endif
	}
	tp -= 64;
	/*
	 * Second pass is 1D transform over the rows of the temp array.
	 */
	for (i = 8; --i >= 0; ) {
		int t4 = tp[8*1];
		int t5 = tp[8*3];
		int t6 = tp[8*5];
		int t7 = tp[8*7];
		if ((t4|t5|t6|t7) != 0) {
			/* odd part */
			int x0 = t6 - t5;
			t6 += t5;
			int x1 = t4 - t7;
			t7 += t4;

			t5 = FP_MUL(t7 - t6, A3);
			t7 += t6;

			t4 = FP_MUL(x1 + x0, A5);
			t6 = FP_MUL(x1, A4) - t4;
			t4 += FP_MUL(x0, A2);

			t7 += t6;
			t6 += t5;
			t5 += t4;
		}
		int t0 = tp[8*0];
		int t1 = tp[8*2];
		int t2 = tp[8*4];
		int t3 = tp[8*6];
		if ((t0|t1|t2|t3) != 0) {
			/* even part */
			int x0 = FP_MUL(t1 - t3, A1);
			t3 += t1;
			t1 = t0 - t2;
			t0 += t2;
			t2 = t3 + x0;
			t3 = t0 - t2;
			t0 += t2;
			t2 = t1 - x0;
			t1 += x0;
		}
		if (in != 0) {
			/* sum the reference block (e.g. motion compensation)
			 * into the reconstruction as we splice pixels */
			PIXDEF;
			DOPIXIN(t0 + t7, 0);
			DOPIXIN(t1 + t6, 1);
			DOPIXIN(t2 + t5, 2);
			DOPIXIN(t3 + t4, 3);
			DID4PIX;
			DOPIXIN(t3 - t4, 4);
			DOPIXIN(t2 - t5, 5);
			DOPIXIN(t1 - t6, 6);
			DOPIXIN(t0 - t7, 7);
			if (oflo & ~0xff) {
				/* some pixel left [0,255]: redo row with clamping */
				int t;
				pix = 0;
				DOPIXINLIMIT(t0 + t7, 0);
				DOPIXINLIMIT(t1 + t6, 1);
				DOPIXINLIMIT(t2 + t5, 2);
				DOPIXINLIMIT(t3 + t4, 3);
				DID4PIX;
				DOPIXINLIMIT(t3 - t4, 4);
				DOPIXINLIMIT(t2 - t5, 5);
				DOPIXINLIMIT(t1 - t6, 6);
				DOPIXINLIMIT(t0 - t7, 7);
			}
			PSTORE;
			in += stride;
		} else {
			PIXDEF;
			DOPIX(t0 + t7, 0);
			DOPIX(t1 + t6, 1);
			DOPIX(t2 + t5, 2);
			DOPIX(t3 + t4, 3);
			DID4PIX;
			DOPIX(t3 - t4, 4);
			DOPIX(t2 - t5, 5);
			DOPIX(t1 - t6, 6);
			DOPIX(t0 - t7, 7);
			if (oflo & ~0xff) {
				/* some pixel left [0,255]: redo row with clamping */
				int t;
				pix = 0;
				DOPIXLIMIT(t0 + t7, 0);
				DOPIXLIMIT(t1 + t6, 1);
				DOPIXLIMIT(t2 + t5, 2);
				DOPIXLIMIT(t3 + t4, 3);
				DID4PIX;
				DOPIXLIMIT(t3 - t4, 4);
				DOPIXLIMIT(t2 - t5, 5);
				DOPIXLIMIT(t1 - t6, 6);
				DOPIXLIMIT(t0 - t7, 7);
			}
			PSTORE;
		}
		tp += 1;
		p += stride;
	}
}
1116
/*
 * This macro does the combined descale-and-quantize
 * multiply. It truncates rather than rounds to give
 * the behavior required for the h.261 deadband quantizer.
 * "qt" (the folded scale/quantize table from fdct_fold_q) must be
 * in scope at the use site; "iq" is the coefficient index.
 */

#define FWD_DandQ(v, iq) short((v) * qt[iq])
1124
/*
 * Forward 8x8 DCT (floating point, AA&N flowgraph) with quantization.
 * Reads the 8x8 pixel block at "in" (rows "stride" bytes apart),
 * performs a column pass into a temp array, then a row pass whose
 * outputs are descaled-and-quantized via FWD_DandQ/qt and stored as
 * 64 shorts at "out". "qt" is the table built by fdct_fold_q().
 */
void fdct(const u_char* in, int stride, short* out, const float* qt)
{
	float tmp[64];	/* column-transformed intermediates */
	float* tp = tmp;

	/* first pass: 1D DCT down each of the 8 columns */
	int i;
	for (i = 8; --i >= 0; ) {
		float x0, x1, x2, x3, t0, t1, t2, t3, t4, t5, t6, t7;
		/* butterfly of mirrored input pairs */
		t0 = float(in[0] + in[7]);
		t7 = float(in[0] - in[7]);
		t1 = float(in[1] + in[6]);
		t6 = float(in[1] - in[6]);
		t2 = float(in[2] + in[5]);
		t5 = float(in[2] - in[5]);
		t3 = float(in[3] + in[4]);
		t4 = float(in[3] - in[4]);


		/* even part */
		x0 = t0 + t3;
		x2 = t1 + t2;
		tp[8*0] = x0 + x2;
		tp[8*4] = x0 - x2;

		x1 = t0 - t3;
		x3 = t1 - t2;
		t0 = (x1 + x3) * FA1;
		tp[8*2] = x1 + t0;
		tp[8*6] = x1 - t0;

		/* odd part */
		x0 = t4 + t5;
		x1 = t5 + t6;
		x2 = t6 + t7;

		t3 = x1 * FA1;
		t4 = t7 - t3;

		t0 = (x0 - x2) * FA5;
		t1 = x0 * FA2 + t0;
		tp[8*3] = t4 - t1;
		tp[8*5] = t4 + t1;

		t7 += t3;
		t2 = x2 * FA4 + t0;
		tp[8*1] = t7 + t2;
		tp[8*7] = t7 - t2;

		in += stride;
		tp += 1;
	}
	tp -= 8;

	/* second pass: 1D DCT along each row, quantizing on output */
	for (i = 8; --i >= 0; ) {
		float x0, x1, x2, x3, t0, t1, t2, t3, t4, t5, t6, t7;
		t0 = tp[0] + tp[7];
		t7 = tp[0] - tp[7];
		t1 = tp[1] + tp[6];
		t6 = tp[1] - tp[6];
		t2 = tp[2] + tp[5];
		t5 = tp[2] - tp[5];
		t3 = tp[3] + tp[4];
		t4 = tp[3] - tp[4];

		/* even part */
		x0 = t0 + t3;
		x2 = t1 + t2;
		out[0] = FWD_DandQ(x0 + x2, 0);
		out[4] = FWD_DandQ(x0 - x2, 4);

		x1 = t0 - t3;
		x3 = t1 - t2;
		t0 = (x1 + x3) * FA1;
		out[2] = FWD_DandQ(x1 + t0, 2);
		out[6] = FWD_DandQ(x1 - t0, 6);

		/* odd part */
		x0 = t4 + t5;
		x1 = t5 + t6;
		x2 = t6 + t7;

		t3 = x1 * FA1;
		t4 = t7 - t3;

		t0 = (x0 - x2) * FA5;
		t1 = x0 * FA2 + t0;
		out[3] = FWD_DandQ(t4 - t1, 3);
		out[5] = FWD_DandQ(t4 + t1, 5);

		t7 += t3;
		t2 = x2 * FA4 + t0;
		out[1] = FWD_DandQ(t7 + t2, 1);
		out[7] = FWD_DandQ(t7 - t2, 7);

		out += 8;
		tp += 8;
		qt += 8;
	}
}
1224
1225 /*
1226 * decimate the *rows* of the two input 8x8 DCT matrices into
1227 * a single output matrix. we decimate rows rather than
1228 * columns even though we want column decimation because
1229 * the DCTs are stored in column order.
1230 */
void
dct_decimate(const short* in0, const short* in1, short* o)
{
	/*
	 * Fixed-point helpers with X_N fractional bits.  Only the first
	 * four coefficients of each input row are used (the decimation
	 * discards the high-frequency half).
	 *
	 * The weights are expressed as multiplications rather than left
	 * shifts: DCT coefficients are signed, and left-shifting a
	 * negative int is undefined behavior (C11 6.5.7).  On two's
	 * complement targets the multiply produces the identical value,
	 * so results are unchanged.
	 *
	 * X_NORM keeps the arithmetic right shift: for negative sums it
	 * rounds toward -inf, which matches the original behavior (a
	 * division by 16 would round toward zero instead).
	 */
#define X_N 4
#define X_5(v)    ((v) * (1 << (X_N - 1)))
#define X_25(v)   ((v) * (1 << (X_N - 2)))
#define X_125(v)  ((v) * (1 << (X_N - 3)))
#define X_0625(v) ((v) * (1 << (X_N - 4)))
#define X_375(v)  (X_25(v) + X_125(v))
#define X_625(v)  (X_5(v) + X_125(v))
#define X_75(v)   (X_5(v) + X_25(v))
#define X_6875(v) (X_5(v) + X_125(v) + X_0625(v))
#define X_1875(v) (X_125(v) + X_0625(v))
#define X_NORM(v) ((v) >> X_N)

	for (int k = 0; k < 8; ++k) {
		/* Widen to int so the weighted sums cannot overflow short. */
		int x00 = in0[0];
		int x01 = in0[1];
		int x02 = in0[2];
		int x03 = in0[3];
		int x10 = in1[0];
		int x11 = in1[1];
		int x12 = in1[2];
		int x13 = in1[3];

		/*
		 * Exact decimation matrix being approximated (rows pair up
		 * as [in0 coef, in1 coef] contributions per output):
		 *
		 *  0.50000000  0.09011998  0.00000000  0.10630376
		 *  0.50000000  0.09011998  0.00000000  0.10630376
		 *  0.45306372  0.28832037  0.03732892  0.08667963
		 * -0.45306372  0.11942621  0.10630376 -0.06764951
		 *  0.00000000  0.49039264  0.17677670  0.00000000
		 *  0.00000000 -0.49039264 -0.17677670  0.00000000
		 * -0.15909482  0.34009707  0.38408888  0.05735049
		 *  0.15909482  0.43576792 -0.09011998 -0.13845632
		 *  0.00000000 -0.03732892  0.46193977  0.25663998
		 *  0.00000000 -0.03732892  0.46193977  0.25663998
		 *  0.10630376 -0.18235049  0.25663998  0.42361940
		 * -0.10630376 -0.16332037 -0.45306372 -0.01587282
		 *  0.00000000  0.00000000 -0.07322330  0.41573481
		 *  0.00000000  0.00000000  0.07322330 -0.41573481
		 * -0.09011998  0.13399123 -0.18766514  0.24442621
		 *  0.09011998  0.13845632  0.15909482  0.47539609
		 */

		o[0] = X_NORM(X_5(x00 + x10) + X_0625(x01 + x11) +
			      X_125(x03 + x13));
		o[1] = X_NORM(X_5(x00 - x10) + X_25(x01) + X_0625(x03) +
			      X_125(x11 + x12));
		o[2] = X_NORM(X_5(x01 - x11) + X_1875(x02 + x12));
		o[3] = X_NORM(X_1875(x10 - x00) + X_375(x01 + x02) +
			      X_5(x11) - X_125(x13));
		o[4] = X_NORM(X_5(x02 + x12) + X_25(x03 + x13));
		o[5] = X_NORM(X_125(x00 - x10) - X_1875(x01 + x11) +
			      X_25(x02) + X_5(x03 - x12));
		o[6] = X_NORM(X_625(x12 - x02) + X_375(x03 + x13));
		o[7] = X_NORM(X_125(x01 - x00 + x11 + x10 + x12) +
			      X_1875(x02) + X_25(x03) + X_5(x13));

		o += 8;
		in0 += 8;
		in1 += 8;
	}
}
1293