1 /* fdct_idct.c, this file is part of the
2  * AltiVec optimized library for MJPEG tools MPEG-1/2 Video Encoder
3  * Copyright (C) 2002  James Klicman <james@klicman.org>
4  *
5  * This library is free software; you can redistribute it and/or modify
6  * it under the terms of the GNU General Public License as published by
7  * the Free Software Foundation; either version 2 of the License, or
8  * (at your option) any later version.
9  *
10  * This program is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13  * GNU General Public License for more details.
14  *
15  * You should have received a copy of the GNU General Public License
16  * along with this program; if not, write to the Free Software
17  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
18  */
19 
20 /***************************************************************
21  *
22  * AltiVec code optimizations from http://www.motorola.com
23  * /SPS/RISC/smartnetworks/arch/powerpc/features/altivec/avcode.htm
24  *
25  * Adapted Aug 20, 2001 James Klicman <james@klicman.org>
26  *
27  * Update Jan 30, 2002: code samples moved to http://e-www.motorola.com
28  * /webapp/sps/site/taxonomy.jsp?nodeId=03M943030450467M0ymKF9DH
29  *
30  */
31 
32 #ifdef HAVE_CONFIG_H
33 #include <config.h>
34 #endif
35 
36 #include "vectorize.h"
37 
38 #ifdef HAVE_ALTIVEC_H
39 /* include last to ensure AltiVec type semantics, especially for bool. */
40 #include <altivec.h>
41 #endif
42 
43 /***************************************************************
44  *
45  * Copyright:   (c) Copyright Motorola Inc. 1998
46  *
47  * Date:        April 17, 1998
48  *
49  * Function:    Matrix_Transpose
50  *
51  * Description: The following Matrix Transpose is adapted
52  *              from an algorithm developed by Brett Olsson
53  *              from IBM. It performs a 8x8 16-bit element
54  *              full matrix transpose.
55  *
56  * Inputs:      array elements stored in input
57  *               input[0] = [ 00 01 02 03 04 05 06 07 ]
58  *               input[1] = [ 10 11 12 13 14 15 16 17 ]
59  *               input[2] = [ 20 21 22 23 24 25 26 27 ]
60  *               input[3] = [ 30 31 32 33 34 35 36 37 ]
61  *               input[4] = [ 40 41 42 43 44 45 46 47 ]
62  *               input[5] = [ 50 51 52 53 54 55 56 57 ]
63  *               input[6] = [ 60 61 62 63 64 65 66 67 ]
64  *               input[7] = [ 70 71 72 73 74 75 76 77 ]
65  *
66  * Outputs:     transposed elements in output
67  *
68  **************************************************************/
69 
Matrix_Transpose(vector signed short * input,vector signed short * output)70 static inline void Matrix_Transpose ( vector signed short *input,
71                                       vector signed short *output )
72 {
73   vector signed short a0, a1, a2, a3, a4, a5, a6, a7;
74   vector signed short b0, b1, b2, b3, b4, b5, b6, b7;
75 
76   b0 = vec_mergeh( input[0], input[4] );     /* [ 00 40 01 41 02 42 03 43 ]*/
77   b1 = vec_mergel( input[0], input[4] );     /* [ 04 44 05 45 06 46 07 47 ]*/
78   b2 = vec_mergeh( input[1], input[5] );     /* [ 10 50 11 51 12 52 13 53 ]*/
79   b3 = vec_mergel( input[1], input[5] );     /* [ 14 54 15 55 16 56 17 57 ]*/
80   b4 = vec_mergeh( input[2], input[6] );     /* [ 20 60 21 61 22 62 23 63 ]*/
81   b5 = vec_mergel( input[2], input[6] );     /* [ 24 64 25 65 26 66 27 67 ]*/
82   b6 = vec_mergeh( input[3], input[7] );     /* [ 30 70 31 71 32 72 33 73 ]*/
83   b7 = vec_mergel( input[3], input[7] );     /* [ 34 74 35 75 36 76 37 77 ]*/
84 
85   a0 = vec_mergeh( b0, b4 );                 /* [ 00 20 40 60 01 21 41 61 ]*/
86   a1 = vec_mergel( b0, b4 );                 /* [ 02 22 42 62 03 23 43 63 ]*/
87   a2 = vec_mergeh( b1, b5 );                 /* [ 04 24 44 64 05 25 45 65 ]*/
88   a3 = vec_mergel( b1, b5 );                 /* [ 06 26 46 66 07 27 47 67 ]*/
89   a4 = vec_mergeh( b2, b6 );                 /* [ 10 30 50 70 11 31 51 71 ]*/
90   a5 = vec_mergel( b2, b6 );                 /* [ 12 32 52 72 13 33 53 73 ]*/
91   a6 = vec_mergeh( b3, b7 );                 /* [ 14 34 54 74 15 35 55 75 ]*/
92   a7 = vec_mergel( b3, b7 );                 /* [ 16 36 56 76 17 37 57 77 ]*/
93 
94   output[0] = vec_mergeh( a0, a4 );          /* [ 00 10 20 30 40 50 60 70 ]*/
95   output[1] = vec_mergel( a0, a4 );          /* [ 01 11 21 31 41 51 61 71 ]*/
96   output[2] = vec_mergeh( a1, a5 );          /* [ 02 12 22 32 42 52 62 72 ]*/
97   output[3] = vec_mergel( a1, a5 );          /* [ 03 13 23 33 43 53 63 73 ]*/
98   output[4] = vec_mergeh( a2, a6 );          /* [ 04 14 24 34 44 54 64 74 ]*/
99   output[5] = vec_mergel( a2, a6 );          /* [ 05 15 25 35 45 55 65 75 ]*/
100   output[6] = vec_mergeh( a3, a7 );          /* [ 06 16 26 36 46 56 66 76 ]*/
101   output[7] = vec_mergel( a3, a7 );          /* [ 07 17 27 37 47 57 67 77 ]*/
102 
103 }
104 /***************************************************************
105  *
106  * Copyright:   (c) Copyright Motorola Inc. 1998
107  *
108  * Date:        April 15, 1998
109  *
110  * Macro:       DCT_Transform
111  *
112  * Description: Discrete Cosign Transform implemented by the
113  *              Scaled Chen (II) Algorithm developed by Haifa
114  *              Research Lab.  The major differnce between this
115  *              algorithm and the Scaled Chen (I) is that
116  *              certain multiply-subtracts are replaced by
117  *              multiply adds.  A full description of the
118  *              Scaled Chen (I) algorithm can be found in:
119  *              W.C.Chen, C.H.Smith and S.C.Fralick, "A Fast
120  *              Computational Algorithm for the Discrete Cosine
121  *              Transform", IEEE Transactions on Cummnuications,
122  *              Vol. COM-25, No. 9, pp 1004-1009, Sept. 1997.
123  *
124  * Inputs:      vx     : array of vector short
125  *              t1-t10 : temporary vector variables set up by caller
126  *              c4     : cos(4*pi/16)
127  *              mc4    : -c4
128  *              a0     : c6/c2
129  *              a1     : c7/c1
130  *              a2     : c5/c3
131  *              ma2    : -a2
132  *              zero   : an array of zero elements
133  *
134  * Outputs:     vy     : array of vector short
135  *
136  **************************************************************/
137 
138 #define DCT_Transform(vx,vy) \
139                                                                    \
140   /* 1st stage. */                                                 \
141   t8 = vec_adds( vx[0], vx[7] );     /* t0 + t7 */                 \
142   t9 = vec_subs( vx[0], vx[7] );     /* t0 - t7 */                 \
143   t0 = vec_adds( vx[1], vx[6] );     /* t1 + t6 */                 \
144   t7 = vec_subs( vx[1], vx[6] );     /* t1 - t6 */                 \
145   t1 = vec_adds( vx[2], vx[5] );     /* t2 + t6 */                 \
146   t6 = vec_subs( vx[2], vx[5] );     /* t2 - t6 */                 \
147   t2 = vec_adds( vx[3], vx[4] );     /* t3 + t4 */                 \
148   t5 = vec_subs( vx[3], vx[4] );     /* t3 - t4 */                 \
149                                                                    \
150   /* 2nd stage. */                                                 \
151   t3 = vec_adds( t8, t2 );           /* (t0+t7) + (t3+t4) */       \
152   t4 = vec_subs( t8, t2 );           /* (t0+t7) - (t3+t4) */       \
153   t2 = vec_adds( t0, t1 );           /* (t1+t6) + (t2+t5) */       \
154   t8 = vec_subs( t0, t1 );           /* (t1+t6) - (t2+t5) */       \
155                                                                    \
156   t1 = vec_adds( t7, t6 );           /* (t1-t6) + (t2-t5) */       \
157   t0 = vec_subs( t7, t6 );           /* (t1-t6) - (t2-t5) */       \
158                                                                    \
159   /* 3rd stage */                                                  \
160   vy[0] = vec_adds( t3, t2 );        /* y0 = t3 + t2 */            \
161   vy[4] = vec_subs( t3, t2 );        /* y4 = t3 + t2 */            \
162   vy[2] = vec_mradds( t8, a0, t4 );  /* y2 = t8 * (a0) + t4 */     \
163   t10 = vec_mradds( t4, a0, zero );                                \
164   vy[6]  = vec_subs( t10, t8 );       /* y6 = t4 * (a0) - t8 */    \
165                                                                    \
166   t6 = vec_mradds( t0, c4, t5 );     /* t6 = t0 * (c4) + t5  */    \
167   t7 = vec_mradds( t0, mc4, t5 );    /* t7 = t0 * (-c4) + t5 */    \
168   t2 = vec_mradds( t1, mc4, t9 );    /* t2 = t1 * (-c4) + t9 */    \
169   t3 = vec_mradds( t1, c4, t9 );     /* t3 = t1 * (c4) + t9  */    \
170                                                                    \
171   /* 4th stage. */                                                 \
172   vy[1] = vec_mradds( t6, a1, t3 );    /* y1 = t6 * (a1) + t3  */  \
173   t9 = vec_mradds( t3, a1, zero );                                 \
174   vy[7] = vec_subs( t9, t6 ) ;         /* y7 = t3 * (a1) - t6  */  \
175   vy[5] = vec_mradds( t2, a2, t7 );    /* y5 = t2 + (a2) + t7  */  \
176   vy[3] = vec_mradds( t7, ma2, t2 );   /* y3 = t7 * (-a2) + t2 */
177 
178 /* Post-scaling matrix -- scaled by 1 */
179 static const vector signed short PostScale[8] = {
180     (vector signed short)VCONST(4095, 5681, 5351, 4816, 4095, 4816, 5351, 5681),
181     (vector signed short)VCONST(5681, 7880, 7422, 6680, 5681, 6680, 7422, 7880),
182     (vector signed short)VCONST(5351, 7422, 6992, 6292, 5351, 6292, 6992, 7422),
183     (vector signed short)VCONST(4816, 6680, 6292, 5663, 4816, 5663, 6292, 6680),
184     (vector signed short)VCONST(4095, 5681, 5351, 4816, 4095, 4816, 5351, 5681),
185     (vector signed short)VCONST(4816, 6680, 6292, 5663, 4816, 5663, 6292, 6680),
186     (vector signed short)VCONST(5351, 7422, 6992, 6292, 5351, 6292, 6992, 7422),
187     (vector signed short)VCONST(5681, 7880, 7422, 6680, 5681, 6680, 7422, 7880)
188 };
189 
190 /***************************************************************
191  *
192  * Copyright:   (c) Copyright Motorola Inc. 1998
193  *
194  * Date:        April 17, 1998
195  *
196  * Function:    DCT
197  *
198  * Description: Scaled Chen (II) algorithm for DCT
199  *              Arithmetic is 16-bit fixed point.
200  *
201  * Inputs:      input - Pointer to input data (short), which
202  *                      must be between -255 to +255.
203  *                      It is assumed that the allocated array
204  *                      has been 128-bit aligned and contains
205  *                      8x8 short elements.
206  *
207  * Outputs:     output - Pointer to output area for the transfored
208  *                       data. The output values are between -2040
209  *                       and 2040. It is assumed that a 128-bit
210  *                       aligned 8x8 array of short has been
211  *                       pre-allocated.
212  *
213  * Return:      None
214  *
215  ***************************************************************/
216 
217 /* Note: these constants could all be loaded directly, but using the
218  * SpecialConstants approach causes vsplth instructions to be generated instead
219  * of lvx which is more efficient given the remainder of the instruction mix.
220  */
221 static const vector signed short SpecialConstants =
222   (const vector signed short)VCONST(23170, 13573, 6518, 21895, -23170, -21895, 0, 0);
223 
224 #ifdef ORIGINAL_SOURCE
DCT(short * input,short * output)225 void DCT(short *input, short *output) {
226 #else
227 void fdct_altivec_old(short *blocks) {
228 #endif
229 
230   vector signed short t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10;
231   vector signed short a0, a1, a2, ma2, c4, mc4, zero;
232   vector signed short vx[8], vy[8];
233   vector signed short *vec_ptr;  /* used for conversion between
234                                     arrays of short and vector
235                                     signed short array.  */
236 
237   /* load the multiplication constants */
238   c4   = vec_splat( SpecialConstants, 0 );  /* c4 = cos(4*pi/16)  */
239   a0   = vec_splat( SpecialConstants, 1 );  /* a0 = c6/c2         */
240   a1   = vec_splat( SpecialConstants, 2 );  /* a1 = c7/c1         */
241   a2   = vec_splat( SpecialConstants, 3 );  /* a2 = c5/c3         */
242   mc4  = vec_splat( SpecialConstants, 4 );  /* -c4                */
243   ma2  = vec_splat( SpecialConstants, 5 );  /* -a2                */
244   zero = vec_splat_s16(0);
245 
246   /* copy the rows of input data */
247 #ifdef ORIGINAL_SOURCE
248   vec_ptr = ( vector signed short * ) input;
249 #else
250   vec_ptr = ( vector signed short * ) blocks;
251 #endif
252   vx[0] = vec_ptr[0];
253   vx[1] = vec_ptr[1];
254   vx[2] = vec_ptr[2];
255   vx[3] = vec_ptr[3];
256   vx[4] = vec_ptr[4];
257   vx[5] = vec_ptr[5];
258   vx[6] = vec_ptr[6];
259   vx[7] = vec_ptr[7];
260 
261   /* Perform DCT first on the 8 columns */
262   DCT_Transform( vx, vy );
263 
264   /* Transpose matrix to work on rows */
265   Matrix_Transpose( vy, vx );
266 
267   /* Perform DCT first on the 8 rows */
268   DCT_Transform( vx, vy );
269 
270 #ifndef ORIGINAL_SOURCE
271   /* mpeg2enc requires the matrix transposed back */
272   Matrix_Transpose( vy, vx );
273 #endif
274 
275   /* Post-scale and store result. */
276 #ifdef ORIGINAL_SOURCE
277   vec_ptr = (vector signed short *) output;
278 #endif
279   vec_ptr[0] = vec_mradds( PostScale[0], vx[0], zero );
280   vec_ptr[1] = vec_mradds( PostScale[1], vx[1], zero );
281   vec_ptr[2] = vec_mradds( PostScale[2], vx[2], zero );
282   vec_ptr[3] = vec_mradds( PostScale[3], vx[3], zero );
283   vec_ptr[4] = vec_mradds( PostScale[4], vx[4], zero );
284   vec_ptr[5] = vec_mradds( PostScale[5], vx[5], zero );
285   vec_ptr[6] = vec_mradds( PostScale[6], vx[6], zero );
286   vec_ptr[7] = vec_mradds( PostScale[7], vx[7], zero );
287 }
288 
289 
290 /***************************************************************
291  *
292  * Copyright:   (c) Copyright Motorola Inc. 1998
293  *
294  * Date:        April 20, 1998
295  *
296  * Macro:       IDCT_Transform
297  *
298  * Description: Discrete Cosign Transform implemented by the
299  *              Scaled Chen (III) Algorithm developed by Haifa
300  *              Research Lab.  The major difference between this
301  *              algorithm and the Scaled Chen (I) is that
302  *              certain multiply-subtracts are replaced by
303  *              multiply adds.  A full description of the
304  *              Scaled Chen (I) algorithm can be found in:
305  *              W.C.Chen, C.H.Smith and S.C.Fralick, "A Fast
306  *              Computational Algorithm for the Discrete Cosine
307  *              Transform", IEEE Transactions on Commnuications,
308  *              Vol. COM-25, No. 9, pp 1004-1009, Sept. 1997.
309  *
310  * Inputs:      vx     : array of vector short
311  *              t1-t10 : temporary vector variables set up by caller
312  *              c4     : cos(4*pi/16)
313  *              mc4    : -c4
314  *              a0     : c6/c2
315  *              a1     : c7/c1
316  *              a2     : c5/c3
317  *              ma2    : -a2
318  *              zero   : an array of zero elements
319  *
320  * Outputs:     vy     : array of vector short
321  *
322  **************************************************************/
323 
324 #define IDCT_Transform(vx,vy) \
325                                                                   \
326   /* 1st stage. */                                                \
327   t9 = vec_mradds( a1, vx[1], zero );  /* t8 = (a1) * x1 - x7  */ \
328   t8 = vec_subs( t9, vx[7]);                                      \
329   t1 = vec_mradds( a1, vx[7], vx[1] ); /* t1 = (a1) * x7 + x1  */ \
330   t7 = vec_mradds( a2, vx[5], vx[3] ); /* t7 = (a2) * x5 + x3  */ \
331   t3 = vec_mradds( ma2, vx[3], vx[5] );/* t3 = (-a2) * x5 + x3 */ \
332                                                                   \
333   /* 2nd stage */                                                 \
334   t5 = vec_adds( vx[0], vx[4] );        /* t5 = x0 + x4 */        \
335   t0 = vec_subs( vx[0], vx[4] );        /* t0 = x0 - x4 */        \
336   t9 = vec_mradds( a0, vx[2], zero );   /* t4 = (a0) * x2 - x6 */ \
337   t4 = vec_subs( t9, vx[6] );                                     \
338   t2 = vec_mradds( a0, vx[6], vx[2] );  /* t2 = (a0) * x6 + x2 */ \
339                                                                   \
340   t6 = vec_adds( t8, t3 );              /* t6 = t8 + t3 */        \
341   t3 = vec_subs( t8, t3 );              /* t3 = t8 - t3 */        \
342   t8 = vec_subs( t1, t7 );              /* t8 = t1 - t7 */        \
343   t1 = vec_adds( t1, t7 );              /* t1 = t1 + t7 */        \
344                                                                   \
345   /* 3rd stage. */                                                \
346   t7 = vec_adds( t5, t2 );              /* t7 = t5 + t2 */        \
347   t2 = vec_subs( t5, t2 );              /* t2 = t5 - t2 */        \
348   t5 = vec_adds( t0, t4 );              /* t5 = t0 + t4 */        \
349   t0 = vec_subs( t0, t4 );              /* t0 = t0 - t4 */        \
350                                                                   \
351   t4 = vec_subs( t8, t3 );              /* t4 = t8 - t3 */        \
352   t3 = vec_adds( t8, t3 );              /* t3 = t8 + t3 */        \
353                                                                   \
354   /* 4th stage. */                                                \
355   vy[0] = vec_adds( t7, t1 );        /* y0 = t7 + t1 */           \
356   vy[7] = vec_subs( t7, t1 );        /* y7 = t7 - t1 */           \
357   vy[1] = vec_mradds( c4, t3, t5 );  /* y1 = (c4) * t3 + t5  */   \
358   vy[6] = vec_mradds( mc4, t3, t5 ); /* y6 = (-c4) * t3 + t5 */   \
359   vy[2] = vec_mradds( c4, t4, t0 );  /* y2 = (c4) * t4 + t0  */   \
360   vy[5] = vec_mradds( mc4, t4, t0 ); /* y5 = (-c4) * t4 + t0 */   \
361   vy[3] = vec_adds( t2, t6 );        /* y3 = t2 + t6 */           \
362   vy[4] = vec_subs( t2, t6 );        /* y4 = t2 - t6 */
363 
364 
365 /* Pre-Scaling matrix -- scaled by 1 */
366 static const vector signed short PreScale[8] = {
367     (vector signed short)VCONST(4095, 5681, 5351, 4816, 4095, 4816, 5351, 5681),
368     (vector signed short)VCONST(5681, 7880, 7422, 6680, 5681, 6680, 7422, 7880),
369     (vector signed short)VCONST(5351, 7422, 6992, 6292, 5351, 6292, 6992, 7422),
370     (vector signed short)VCONST(4816, 6680, 6292, 5663, 4816, 5663, 6292, 6680),
371     (vector signed short)VCONST(4095, 5681, 5351, 4816, 4095, 4816, 5351, 5681),
372     (vector signed short)VCONST(4816, 6680, 6292, 5663, 4816, 5663, 6292, 6680),
373     (vector signed short)VCONST(5351, 7422, 6992, 6292, 5351, 6292, 6992, 7422),
374     (vector signed short)VCONST(5681, 7880, 7422, 6680, 5681, 6680, 7422, 7880)
375 };
376 
377 
378 /***************************************************************
379  *
380  * Copyright:   (c) Copyright Motorola Inc. 1998
381  *
382  * Date:        April 17, 1998
383  *
384  * Function:    IDCT
385  *
386  * Description: Scaled Chen (III) algorithm for IDCT
387  *              Arithmetic is 16-bit fixed point.
388  *
389  * Inputs:      input - Pointer to input data (short), which
390  *                      must be between -2048 to +2047.
391  *                      It is assumed that the allocated array
392  *                      has been 128-bit aligned and contains
393  *                      8x8 short elements.
394  *
395  * Outputs:     output - Pointer to output area for the transfored
396  *                       data. The output values are between -255
397  *                       and 255 . It is assumed that a 128-bit
398  *                       aligned 8x8 array of short has been
399  *                       pre-allocated.
400  *
401  * Return:      None
402  *
403  ***************************************************************/
404 
405 #ifdef ORIGINAL_SOURCE
406 void IDCT(short *input, short *output) {
407 #else
408 void idct_altivec_old(short *block) {
409 #endif
410 
411   vector signed short t0, t1, t2, t3, t4, t5, t6, t7, t8, t9;
412   vector signed short a0, a1, a2, ma2, c4, mc4, zero;
413   vector signed short vx[8], vy[8];
414   vector signed short *vec_ptr;  /* used for conversion between
415                                     arrays of short and vector
416                                     signed short array.  */
417 
418 
419   /* Load the multiplication constants. */
420   c4   = vec_splat( SpecialConstants, 0 );  /* c4 = cos(4*pi/16)  */
421   a0   = vec_splat( SpecialConstants, 1 );  /* a0 = c6/c2         */
422   a1   = vec_splat( SpecialConstants, 2 );  /* a1 = c7/c1         */
423   a2   = vec_splat( SpecialConstants, 3 );  /* a2 = c5/c3         */
424   mc4  = vec_splat( SpecialConstants, 4 );  /* -c4                */
425   ma2  = vec_splat( SpecialConstants, 5 );  /* -a2                */
426   zero = vec_splat_s16(0);
427 
428   /* Load the rows of input data and Pre-Scale them. */
429 #ifdef ORIGINAL_SOURCE
430   vec_ptr = ( vector signed short * ) input;
431 #else
432   vec_ptr = ( vector signed short * ) block;
433 #endif
434   vx[0] = vec_mradds( vec_ptr[0], PreScale[0], zero );
435   vx[1] = vec_mradds( vec_ptr[1], PreScale[1], zero );
436   vx[2] = vec_mradds( vec_ptr[2], PreScale[2], zero );
437   vx[3] = vec_mradds( vec_ptr[3], PreScale[3], zero );
438   vx[4] = vec_mradds( vec_ptr[4], PreScale[4], zero );
439   vx[5] = vec_mradds( vec_ptr[5], PreScale[5], zero );
440   vx[6] = vec_mradds( vec_ptr[6], PreScale[6], zero );
441   vx[7] = vec_mradds( vec_ptr[7], PreScale[7], zero );
442 
443   /* Perform IDCT first on the 8 columns */
444   IDCT_Transform( vx, vy );
445 
446   /* Transpose matrix to work on rows */
447   Matrix_Transpose( vy, vx );
448 
449   /* Perform IDCT next on the 8 rows */
450   IDCT_Transform( vx, vy );
451 
452 #ifndef ORIGINAL_SOURCE
453   /* mpeg2enc requires the matrix transposed back */
454   Matrix_Transpose( vy, vx );
455 #endif
456 
457   /* Post-scale and store result. */
458 #ifdef ORIGINAL_SOURCE
459   vec_ptr = (vector signed short *) output;
460 #endif
461   vec_ptr[0] = vx[0];
462   vec_ptr[1] = vx[1];
463   vec_ptr[2] = vx[2];
464   vec_ptr[3] = vx[3];
465   vec_ptr[4] = vx[4];
466   vec_ptr[5] = vx[5];
467   vec_ptr[6] = vx[6];
468   vec_ptr[7] = vx[7];
469 }
470