1 /* fdct_idct.c, this file is part of the
2 * AltiVec optimized library for MJPEG tools MPEG-1/2 Video Encoder
3 * Copyright (C) 2002 James Klicman <james@klicman.org>
4 *
5 * This library is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 */
19
20 /***************************************************************
21 *
22 * AltiVec code optimizations from http://www.motorola.com
23 * /SPS/RISC/smartnetworks/arch/powerpc/features/altivec/avcode.htm
24 *
25 * Adapted Aug 20, 2001 James Klicman <james@klicman.org>
26 *
27 * Update Jan 30, 2002: code samples moved to http://e-www.motorola.com
28 * /webapp/sps/site/taxonomy.jsp?nodeId=03M943030450467M0ymKF9DH
29 *
30 */
31
32 #ifdef HAVE_CONFIG_H
33 #include <config.h>
34 #endif
35
36 #include "vectorize.h"
37
38 #ifdef HAVE_ALTIVEC_H
39 /* include last to ensure AltiVec type semantics, especially for bool. */
40 #include <altivec.h>
41 #endif
42
43 /***************************************************************
44 *
45 * Copyright: (c) Copyright Motorola Inc. 1998
46 *
47 * Date: April 17, 1998
48 *
49 * Function: Matrix_Transpose
50 *
51 * Description: The following Matrix Transpose is adapted
52 * from an algorithm developed by Brett Olsson
53 * from IBM. It performs a 8x8 16-bit element
54 * full matrix transpose.
55 *
56 * Inputs: array elements stored in input
57 * input[0] = [ 00 01 02 03 04 05 06 07 ]
58 * input[1] = [ 10 11 12 13 14 15 16 17 ]
59 * input[2] = [ 20 21 22 23 24 25 26 27 ]
60 * input[3] = [ 30 31 32 33 34 35 36 37 ]
61 * input[4] = [ 40 41 42 43 44 45 46 47 ]
62 * input[5] = [ 50 51 52 53 54 55 56 57 ]
63 * input[6] = [ 60 61 62 63 64 65 66 67 ]
64 * input[7] = [ 70 71 72 73 74 75 76 77 ]
65 *
66 * Outputs: transposed elements in output
67 *
68 **************************************************************/
69
Matrix_Transpose(vector signed short * input,vector signed short * output)70 static inline void Matrix_Transpose ( vector signed short *input,
71 vector signed short *output )
72 {
73 vector signed short a0, a1, a2, a3, a4, a5, a6, a7;
74 vector signed short b0, b1, b2, b3, b4, b5, b6, b7;
75
76 b0 = vec_mergeh( input[0], input[4] ); /* [ 00 40 01 41 02 42 03 43 ]*/
77 b1 = vec_mergel( input[0], input[4] ); /* [ 04 44 05 45 06 46 07 47 ]*/
78 b2 = vec_mergeh( input[1], input[5] ); /* [ 10 50 11 51 12 52 13 53 ]*/
79 b3 = vec_mergel( input[1], input[5] ); /* [ 14 54 15 55 16 56 17 57 ]*/
80 b4 = vec_mergeh( input[2], input[6] ); /* [ 20 60 21 61 22 62 23 63 ]*/
81 b5 = vec_mergel( input[2], input[6] ); /* [ 24 64 25 65 26 66 27 67 ]*/
82 b6 = vec_mergeh( input[3], input[7] ); /* [ 30 70 31 71 32 72 33 73 ]*/
83 b7 = vec_mergel( input[3], input[7] ); /* [ 34 74 35 75 36 76 37 77 ]*/
84
85 a0 = vec_mergeh( b0, b4 ); /* [ 00 20 40 60 01 21 41 61 ]*/
86 a1 = vec_mergel( b0, b4 ); /* [ 02 22 42 62 03 23 43 63 ]*/
87 a2 = vec_mergeh( b1, b5 ); /* [ 04 24 44 64 05 25 45 65 ]*/
88 a3 = vec_mergel( b1, b5 ); /* [ 06 26 46 66 07 27 47 67 ]*/
89 a4 = vec_mergeh( b2, b6 ); /* [ 10 30 50 70 11 31 51 71 ]*/
90 a5 = vec_mergel( b2, b6 ); /* [ 12 32 52 72 13 33 53 73 ]*/
91 a6 = vec_mergeh( b3, b7 ); /* [ 14 34 54 74 15 35 55 75 ]*/
92 a7 = vec_mergel( b3, b7 ); /* [ 16 36 56 76 17 37 57 77 ]*/
93
94 output[0] = vec_mergeh( a0, a4 ); /* [ 00 10 20 30 40 50 60 70 ]*/
95 output[1] = vec_mergel( a0, a4 ); /* [ 01 11 21 31 41 51 61 71 ]*/
96 output[2] = vec_mergeh( a1, a5 ); /* [ 02 12 22 32 42 52 62 72 ]*/
97 output[3] = vec_mergel( a1, a5 ); /* [ 03 13 23 33 43 53 63 73 ]*/
98 output[4] = vec_mergeh( a2, a6 ); /* [ 04 14 24 34 44 54 64 74 ]*/
99 output[5] = vec_mergel( a2, a6 ); /* [ 05 15 25 35 45 55 65 75 ]*/
100 output[6] = vec_mergeh( a3, a7 ); /* [ 06 16 26 36 46 56 66 76 ]*/
101 output[7] = vec_mergel( a3, a7 ); /* [ 07 17 27 37 47 57 67 77 ]*/
102
103 }
104 /***************************************************************
105 *
106 * Copyright: (c) Copyright Motorola Inc. 1998
107 *
108 * Date: April 15, 1998
109 *
110 * Macro: DCT_Transform
111 *
112 * Description: Discrete Cosign Transform implemented by the
113 * Scaled Chen (II) Algorithm developed by Haifa
114 * Research Lab. The major differnce between this
115 * algorithm and the Scaled Chen (I) is that
116 * certain multiply-subtracts are replaced by
117 * multiply adds. A full description of the
118 * Scaled Chen (I) algorithm can be found in:
119 * W.C.Chen, C.H.Smith and S.C.Fralick, "A Fast
120 * Computational Algorithm for the Discrete Cosine
121 * Transform", IEEE Transactions on Cummnuications,
122 * Vol. COM-25, No. 9, pp 1004-1009, Sept. 1997.
123 *
124 * Inputs: vx : array of vector short
125 * t1-t10 : temporary vector variables set up by caller
126 * c4 : cos(4*pi/16)
127 * mc4 : -c4
128 * a0 : c6/c2
129 * a1 : c7/c1
130 * a2 : c5/c3
131 * ma2 : -a2
132 * zero : an array of zero elements
133 *
134 * Outputs: vy : array of vector short
135 *
136 **************************************************************/
137
138 #define DCT_Transform(vx,vy) \
139 \
140 /* 1st stage. */ \
141 t8 = vec_adds( vx[0], vx[7] ); /* t0 + t7 */ \
142 t9 = vec_subs( vx[0], vx[7] ); /* t0 - t7 */ \
143 t0 = vec_adds( vx[1], vx[6] ); /* t1 + t6 */ \
144 t7 = vec_subs( vx[1], vx[6] ); /* t1 - t6 */ \
145 t1 = vec_adds( vx[2], vx[5] ); /* t2 + t6 */ \
146 t6 = vec_subs( vx[2], vx[5] ); /* t2 - t6 */ \
147 t2 = vec_adds( vx[3], vx[4] ); /* t3 + t4 */ \
148 t5 = vec_subs( vx[3], vx[4] ); /* t3 - t4 */ \
149 \
150 /* 2nd stage. */ \
151 t3 = vec_adds( t8, t2 ); /* (t0+t7) + (t3+t4) */ \
152 t4 = vec_subs( t8, t2 ); /* (t0+t7) - (t3+t4) */ \
153 t2 = vec_adds( t0, t1 ); /* (t1+t6) + (t2+t5) */ \
154 t8 = vec_subs( t0, t1 ); /* (t1+t6) - (t2+t5) */ \
155 \
156 t1 = vec_adds( t7, t6 ); /* (t1-t6) + (t2-t5) */ \
157 t0 = vec_subs( t7, t6 ); /* (t1-t6) - (t2-t5) */ \
158 \
159 /* 3rd stage */ \
160 vy[0] = vec_adds( t3, t2 ); /* y0 = t3 + t2 */ \
161 vy[4] = vec_subs( t3, t2 ); /* y4 = t3 + t2 */ \
162 vy[2] = vec_mradds( t8, a0, t4 ); /* y2 = t8 * (a0) + t4 */ \
163 t10 = vec_mradds( t4, a0, zero ); \
164 vy[6] = vec_subs( t10, t8 ); /* y6 = t4 * (a0) - t8 */ \
165 \
166 t6 = vec_mradds( t0, c4, t5 ); /* t6 = t0 * (c4) + t5 */ \
167 t7 = vec_mradds( t0, mc4, t5 ); /* t7 = t0 * (-c4) + t5 */ \
168 t2 = vec_mradds( t1, mc4, t9 ); /* t2 = t1 * (-c4) + t9 */ \
169 t3 = vec_mradds( t1, c4, t9 ); /* t3 = t1 * (c4) + t9 */ \
170 \
171 /* 4th stage. */ \
172 vy[1] = vec_mradds( t6, a1, t3 ); /* y1 = t6 * (a1) + t3 */ \
173 t9 = vec_mradds( t3, a1, zero ); \
174 vy[7] = vec_subs( t9, t6 ) ; /* y7 = t3 * (a1) - t6 */ \
175 vy[5] = vec_mradds( t2, a2, t7 ); /* y5 = t2 + (a2) + t7 */ \
176 vy[3] = vec_mradds( t7, ma2, t2 ); /* y3 = t7 * (-a2) + t2 */
177
178 /* Post-scaling matrix -- scaled by 1 */
179 static const vector signed short PostScale[8] = {
180 (vector signed short)VCONST(4095, 5681, 5351, 4816, 4095, 4816, 5351, 5681),
181 (vector signed short)VCONST(5681, 7880, 7422, 6680, 5681, 6680, 7422, 7880),
182 (vector signed short)VCONST(5351, 7422, 6992, 6292, 5351, 6292, 6992, 7422),
183 (vector signed short)VCONST(4816, 6680, 6292, 5663, 4816, 5663, 6292, 6680),
184 (vector signed short)VCONST(4095, 5681, 5351, 4816, 4095, 4816, 5351, 5681),
185 (vector signed short)VCONST(4816, 6680, 6292, 5663, 4816, 5663, 6292, 6680),
186 (vector signed short)VCONST(5351, 7422, 6992, 6292, 5351, 6292, 6992, 7422),
187 (vector signed short)VCONST(5681, 7880, 7422, 6680, 5681, 6680, 7422, 7880)
188 };
189
190 /***************************************************************
191 *
192 * Copyright: (c) Copyright Motorola Inc. 1998
193 *
194 * Date: April 17, 1998
195 *
196 * Function: DCT
197 *
198 * Description: Scaled Chen (II) algorithm for DCT
199 * Arithmetic is 16-bit fixed point.
200 *
201 * Inputs: input - Pointer to input data (short), which
202 * must be between -255 to +255.
203 * It is assumed that the allocated array
204 * has been 128-bit aligned and contains
205 * 8x8 short elements.
206 *
207 * Outputs: output - Pointer to output area for the transfored
208 * data. The output values are between -2040
209 * and 2040. It is assumed that a 128-bit
210 * aligned 8x8 array of short has been
211 * pre-allocated.
212 *
213 * Return: None
214 *
215 ***************************************************************/
216
217 /* Note: these constants could all be loaded directly, but using the
218 * SpecialConstants approach causes vsplth instructions to be generated instead
219 * of lvx which is more efficient given the remainder of the instruction mix.
220 */
221 static const vector signed short SpecialConstants =
222 (const vector signed short)VCONST(23170, 13573, 6518, 21895, -23170, -21895, 0, 0);
223
224 #ifdef ORIGINAL_SOURCE
DCT(short * input,short * output)225 void DCT(short *input, short *output) {
226 #else
227 void fdct_altivec_old(short *blocks) {
228 #endif
229
230 vector signed short t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10;
231 vector signed short a0, a1, a2, ma2, c4, mc4, zero;
232 vector signed short vx[8], vy[8];
233 vector signed short *vec_ptr; /* used for conversion between
234 arrays of short and vector
235 signed short array. */
236
237 /* load the multiplication constants */
238 c4 = vec_splat( SpecialConstants, 0 ); /* c4 = cos(4*pi/16) */
239 a0 = vec_splat( SpecialConstants, 1 ); /* a0 = c6/c2 */
240 a1 = vec_splat( SpecialConstants, 2 ); /* a1 = c7/c1 */
241 a2 = vec_splat( SpecialConstants, 3 ); /* a2 = c5/c3 */
242 mc4 = vec_splat( SpecialConstants, 4 ); /* -c4 */
243 ma2 = vec_splat( SpecialConstants, 5 ); /* -a2 */
244 zero = vec_splat_s16(0);
245
246 /* copy the rows of input data */
247 #ifdef ORIGINAL_SOURCE
248 vec_ptr = ( vector signed short * ) input;
249 #else
250 vec_ptr = ( vector signed short * ) blocks;
251 #endif
252 vx[0] = vec_ptr[0];
253 vx[1] = vec_ptr[1];
254 vx[2] = vec_ptr[2];
255 vx[3] = vec_ptr[3];
256 vx[4] = vec_ptr[4];
257 vx[5] = vec_ptr[5];
258 vx[6] = vec_ptr[6];
259 vx[7] = vec_ptr[7];
260
261 /* Perform DCT first on the 8 columns */
262 DCT_Transform( vx, vy );
263
264 /* Transpose matrix to work on rows */
265 Matrix_Transpose( vy, vx );
266
267 /* Perform DCT first on the 8 rows */
268 DCT_Transform( vx, vy );
269
270 #ifndef ORIGINAL_SOURCE
271 /* mpeg2enc requires the matrix transposed back */
272 Matrix_Transpose( vy, vx );
273 #endif
274
275 /* Post-scale and store result. */
276 #ifdef ORIGINAL_SOURCE
277 vec_ptr = (vector signed short *) output;
278 #endif
279 vec_ptr[0] = vec_mradds( PostScale[0], vx[0], zero );
280 vec_ptr[1] = vec_mradds( PostScale[1], vx[1], zero );
281 vec_ptr[2] = vec_mradds( PostScale[2], vx[2], zero );
282 vec_ptr[3] = vec_mradds( PostScale[3], vx[3], zero );
283 vec_ptr[4] = vec_mradds( PostScale[4], vx[4], zero );
284 vec_ptr[5] = vec_mradds( PostScale[5], vx[5], zero );
285 vec_ptr[6] = vec_mradds( PostScale[6], vx[6], zero );
286 vec_ptr[7] = vec_mradds( PostScale[7], vx[7], zero );
287 }
288
289
290 /***************************************************************
291 *
292 * Copyright: (c) Copyright Motorola Inc. 1998
293 *
294 * Date: April 20, 1998
295 *
296 * Macro: IDCT_Transform
297 *
298 * Description: Discrete Cosign Transform implemented by the
299 * Scaled Chen (III) Algorithm developed by Haifa
300 * Research Lab. The major difference between this
301 * algorithm and the Scaled Chen (I) is that
302 * certain multiply-subtracts are replaced by
303 * multiply adds. A full description of the
304 * Scaled Chen (I) algorithm can be found in:
305 * W.C.Chen, C.H.Smith and S.C.Fralick, "A Fast
306 * Computational Algorithm for the Discrete Cosine
307 * Transform", IEEE Transactions on Commnuications,
308 * Vol. COM-25, No. 9, pp 1004-1009, Sept. 1997.
309 *
310 * Inputs: vx : array of vector short
311 * t1-t10 : temporary vector variables set up by caller
312 * c4 : cos(4*pi/16)
313 * mc4 : -c4
314 * a0 : c6/c2
315 * a1 : c7/c1
316 * a2 : c5/c3
317 * ma2 : -a2
318 * zero : an array of zero elements
319 *
320 * Outputs: vy : array of vector short
321 *
322 **************************************************************/
323
324 #define IDCT_Transform(vx,vy) \
325 \
326 /* 1st stage. */ \
327 t9 = vec_mradds( a1, vx[1], zero ); /* t8 = (a1) * x1 - x7 */ \
328 t8 = vec_subs( t9, vx[7]); \
329 t1 = vec_mradds( a1, vx[7], vx[1] ); /* t1 = (a1) * x7 + x1 */ \
330 t7 = vec_mradds( a2, vx[5], vx[3] ); /* t7 = (a2) * x5 + x3 */ \
331 t3 = vec_mradds( ma2, vx[3], vx[5] );/* t3 = (-a2) * x5 + x3 */ \
332 \
333 /* 2nd stage */ \
334 t5 = vec_adds( vx[0], vx[4] ); /* t5 = x0 + x4 */ \
335 t0 = vec_subs( vx[0], vx[4] ); /* t0 = x0 - x4 */ \
336 t9 = vec_mradds( a0, vx[2], zero ); /* t4 = (a0) * x2 - x6 */ \
337 t4 = vec_subs( t9, vx[6] ); \
338 t2 = vec_mradds( a0, vx[6], vx[2] ); /* t2 = (a0) * x6 + x2 */ \
339 \
340 t6 = vec_adds( t8, t3 ); /* t6 = t8 + t3 */ \
341 t3 = vec_subs( t8, t3 ); /* t3 = t8 - t3 */ \
342 t8 = vec_subs( t1, t7 ); /* t8 = t1 - t7 */ \
343 t1 = vec_adds( t1, t7 ); /* t1 = t1 + t7 */ \
344 \
345 /* 3rd stage. */ \
346 t7 = vec_adds( t5, t2 ); /* t7 = t5 + t2 */ \
347 t2 = vec_subs( t5, t2 ); /* t2 = t5 - t2 */ \
348 t5 = vec_adds( t0, t4 ); /* t5 = t0 + t4 */ \
349 t0 = vec_subs( t0, t4 ); /* t0 = t0 - t4 */ \
350 \
351 t4 = vec_subs( t8, t3 ); /* t4 = t8 - t3 */ \
352 t3 = vec_adds( t8, t3 ); /* t3 = t8 + t3 */ \
353 \
354 /* 4th stage. */ \
355 vy[0] = vec_adds( t7, t1 ); /* y0 = t7 + t1 */ \
356 vy[7] = vec_subs( t7, t1 ); /* y7 = t7 - t1 */ \
357 vy[1] = vec_mradds( c4, t3, t5 ); /* y1 = (c4) * t3 + t5 */ \
358 vy[6] = vec_mradds( mc4, t3, t5 ); /* y6 = (-c4) * t3 + t5 */ \
359 vy[2] = vec_mradds( c4, t4, t0 ); /* y2 = (c4) * t4 + t0 */ \
360 vy[5] = vec_mradds( mc4, t4, t0 ); /* y5 = (-c4) * t4 + t0 */ \
361 vy[3] = vec_adds( t2, t6 ); /* y3 = t2 + t6 */ \
362 vy[4] = vec_subs( t2, t6 ); /* y4 = t2 - t6 */
363
364
365 /* Pre-Scaling matrix -- scaled by 1 */
366 static const vector signed short PreScale[8] = {
367 (vector signed short)VCONST(4095, 5681, 5351, 4816, 4095, 4816, 5351, 5681),
368 (vector signed short)VCONST(5681, 7880, 7422, 6680, 5681, 6680, 7422, 7880),
369 (vector signed short)VCONST(5351, 7422, 6992, 6292, 5351, 6292, 6992, 7422),
370 (vector signed short)VCONST(4816, 6680, 6292, 5663, 4816, 5663, 6292, 6680),
371 (vector signed short)VCONST(4095, 5681, 5351, 4816, 4095, 4816, 5351, 5681),
372 (vector signed short)VCONST(4816, 6680, 6292, 5663, 4816, 5663, 6292, 6680),
373 (vector signed short)VCONST(5351, 7422, 6992, 6292, 5351, 6292, 6992, 7422),
374 (vector signed short)VCONST(5681, 7880, 7422, 6680, 5681, 6680, 7422, 7880)
375 };
376
377
378 /***************************************************************
379 *
380 * Copyright: (c) Copyright Motorola Inc. 1998
381 *
382 * Date: April 17, 1998
383 *
384 * Function: IDCT
385 *
386 * Description: Scaled Chen (III) algorithm for IDCT
387 * Arithmetic is 16-bit fixed point.
388 *
389 * Inputs: input - Pointer to input data (short), which
390 * must be between -2048 to +2047.
391 * It is assumed that the allocated array
392 * has been 128-bit aligned and contains
393 * 8x8 short elements.
394 *
395 * Outputs: output - Pointer to output area for the transfored
396 * data. The output values are between -255
397 * and 255 . It is assumed that a 128-bit
398 * aligned 8x8 array of short has been
399 * pre-allocated.
400 *
401 * Return: None
402 *
403 ***************************************************************/
404
405 #ifdef ORIGINAL_SOURCE
406 void IDCT(short *input, short *output) {
407 #else
408 void idct_altivec_old(short *block) {
409 #endif
410
411 vector signed short t0, t1, t2, t3, t4, t5, t6, t7, t8, t9;
412 vector signed short a0, a1, a2, ma2, c4, mc4, zero;
413 vector signed short vx[8], vy[8];
414 vector signed short *vec_ptr; /* used for conversion between
415 arrays of short and vector
416 signed short array. */
417
418
419 /* Load the multiplication constants. */
420 c4 = vec_splat( SpecialConstants, 0 ); /* c4 = cos(4*pi/16) */
421 a0 = vec_splat( SpecialConstants, 1 ); /* a0 = c6/c2 */
422 a1 = vec_splat( SpecialConstants, 2 ); /* a1 = c7/c1 */
423 a2 = vec_splat( SpecialConstants, 3 ); /* a2 = c5/c3 */
424 mc4 = vec_splat( SpecialConstants, 4 ); /* -c4 */
425 ma2 = vec_splat( SpecialConstants, 5 ); /* -a2 */
426 zero = vec_splat_s16(0);
427
428 /* Load the rows of input data and Pre-Scale them. */
429 #ifdef ORIGINAL_SOURCE
430 vec_ptr = ( vector signed short * ) input;
431 #else
432 vec_ptr = ( vector signed short * ) block;
433 #endif
434 vx[0] = vec_mradds( vec_ptr[0], PreScale[0], zero );
435 vx[1] = vec_mradds( vec_ptr[1], PreScale[1], zero );
436 vx[2] = vec_mradds( vec_ptr[2], PreScale[2], zero );
437 vx[3] = vec_mradds( vec_ptr[3], PreScale[3], zero );
438 vx[4] = vec_mradds( vec_ptr[4], PreScale[4], zero );
439 vx[5] = vec_mradds( vec_ptr[5], PreScale[5], zero );
440 vx[6] = vec_mradds( vec_ptr[6], PreScale[6], zero );
441 vx[7] = vec_mradds( vec_ptr[7], PreScale[7], zero );
442
443 /* Perform IDCT first on the 8 columns */
444 IDCT_Transform( vx, vy );
445
446 /* Transpose matrix to work on rows */
447 Matrix_Transpose( vy, vx );
448
449 /* Perform IDCT next on the 8 rows */
450 IDCT_Transform( vx, vy );
451
452 #ifndef ORIGINAL_SOURCE
453 /* mpeg2enc requires the matrix transposed back */
454 Matrix_Transpose( vy, vx );
455 #endif
456
457 /* Post-scale and store result. */
458 #ifdef ORIGINAL_SOURCE
459 vec_ptr = (vector signed short *) output;
460 #endif
461 vec_ptr[0] = vx[0];
462 vec_ptr[1] = vx[1];
463 vec_ptr[2] = vx[2];
464 vec_ptr[3] = vx[3];
465 vec_ptr[4] = vx[4];
466 vec_ptr[5] = vx[5];
467 vec_ptr[6] = vx[6];
468 vec_ptr[7] = vx[7];
469 }
470