/*
	dct64_altivec.c: Discrete Cosine Transform (DCT) for Altivec

	copyright ?-2006 by the mpg123 project - free software under the terms of the LGPL 2.1
	see COPYING and AUTHORS files in distribution or http://mpg123.org
	initially written by Michael Hipp
	altivec optimization by tmkk
*/

/*
 * Discrete Cosine Transform (DCT) for subband synthesis
 *
 * -funroll-loops (for gcc) will remove the loops for better performance
 * using loops in the source code enhances readability
 *
 *
 * TODO: write an optimized version for the down-sampling modes
 *       (in these modes the bands 16-31 (2:1) or 8-31 (4:1) are zero)
 */
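
/*
 * For orientation: the first butterfly pass that the vector code below
 * implements corresponds roughly to this scalar sketch (the temporary
 * array b[] is hypothetical and only serves as illustration):
 *
 *   real b[32];
 *   int i;
 *   for(i = 0; i < 16; i++)
 *   {
 *     b[i]    =  samples[i] + samples[31-i];
 *     b[31-i] = (samples[i] - samples[31-i]) * pnts[0][i];
 *   }
 *
 * The later passes repeat this add/subtract-and-scale pattern on blocks of
 * half the previous length, using the cosine tables pnts[1] .. pnts[4].
 */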

#include "mpg123lib_intern.h"

#ifndef __APPLE__
#include <altivec.h>
#endif

void dct64_altivec(real *out0,real *out1,real *samples)
{
  ALIGNED(16) real bufs[32];

	{
		register real *b1,*costab;

		vector unsigned char vinvert,vperm1,vperm2,vperm3,vperm4;
		vector float v1,v2,v3,v4,v5,v6,v7,v8;
		vector float vbs1,vbs2,vbs3,vbs4,vbs5,vbs6,vbs7,vbs8;
		vector float vbs9,vbs10,vbs11,vbs12,vbs13,vbs14,vbs15,vbs16;
		vector float vzero;
		b1 = samples;
		costab = pnts[0];

		vzero = vec_xor(vzero,vzero); /* all-zero vector (xor of a register with itself) */
		/* vinvert reverses the order of the four 32-bit words in a vector */
#ifdef __APPLE__
		vinvert = (vector unsigned char)(12,13,14,15,8,9,10,11,4,5,6,7,0,1,2,3);
#else
		vinvert = (vector unsigned char){12,13,14,15,8,9,10,11,4,5,6,7,0,1,2,3};
#endif
		vperm1 = vec_lvsl(0,b1); /* alignment permute for loading the possibly unaligned samples */
		vperm2 = vec_perm(vperm1,vperm1,vinvert); /* same, with the words reversed */

		v1 = vec_ld(0,b1);
		v2 = vec_ld(16,b1);
		v3 = vec_ld(112,b1);
		v4 = vec_ld(127,b1); /* 127 rather than 128: vec_ld aligns the address down, so this stays inside samples[] */
		v5 = vec_perm(v1,v2,vperm1); /* b1[0,1,2,3] */
		v6 = vec_perm(v3,v4,vperm2); /* b1[31,30,29,28] */

		vbs1 = vec_add(v5,v6);
		vbs8 = vec_sub(v5,v6);

		v1 = vec_ld(32,b1);
		v4 = vec_ld(96,b1);
		v5 = vec_perm(v2,v1,vperm1); /* b1[4,5,6,7] */
		v6 = vec_perm(v4,v3,vperm2); /* b1[27,26,25,24] */

		vbs2 = vec_add(v5,v6);
		vbs7 = vec_sub(v5,v6);

		v2 = vec_ld(48,b1);
		v3 = vec_ld(80,b1);
		v5 = vec_perm(v1,v2,vperm1); /* b1[8,9,10,11] */
		v6 = vec_perm(v3,v4,vperm2); /* b1[23,22,21,20] */

		vbs3 = vec_add(v5,v6);
		vbs6 = vec_sub(v5,v6);

		v1 = vec_ld(64,b1);
		v5 = vec_perm(v2,v1,vperm1); /* b1[12,13,14,15] */
		v6 = vec_perm(v1,v3,vperm2); /* b1[19,18,17,16] */

		vbs4 = vec_add(v5,v6);
		vbs5 = vec_sub(v5,v6);

		v1 = vec_ld(0,costab);
		vbs8 = vec_madd(vbs8,v1,vzero);
		v2 = vec_ld(16,costab);
		vbs7 = vec_madd(vbs7,v2,vzero);
		v3 = vec_ld(32,costab);
		vbs6 = vec_madd(vbs6,v3,vzero);
		v4 = vec_ld(48,costab);
		vbs5 = vec_madd(vbs5,v4,vzero);
		vbs6 = vec_perm(vbs6,vbs6,vinvert);
		vbs5 = vec_perm(vbs5,vbs5,vinvert);

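		/* second pass: 8-element butterflies within each 16-element half, scaled by pnts[1] */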
		costab = pnts[1];

		v1 = vec_perm(vbs4,vbs4,vinvert);
		vbs9 = vec_add(vbs1,v1);
		v3 = vec_sub(vbs1,v1);
		v5 = vec_ld(0,costab);
		v2 = vec_perm(vbs3,vbs3,vinvert);
		vbs10 = vec_add(vbs2,v2);
		v4 = vec_sub(vbs2,v2);
		v6 = vec_ld(16,costab);
		vbs12 = vec_madd(v3,v5,vzero);
		vbs11 = vec_madd(v4,v6,vzero);

		v7 = vec_sub(vbs7,vbs6);
		v8 = vec_sub(vbs8,vbs5);
		vbs13 = vec_add(vbs5,vbs8);
		vbs14 = vec_add(vbs6,vbs7);
		vbs15 = vec_madd(v7,v6,vzero);
		vbs16 = vec_madd(v8,v5,vzero);

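		/* third pass: 4-element butterflies within each block of 8, scaled by pnts[2] */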
		costab = pnts[2];

		v1 = vec_perm(vbs10,vbs10,vinvert);
		v5 = vec_perm(vbs14,vbs14,vinvert);
		vbs1 = vec_add(v1,vbs9);
		vbs5 = vec_add(v5,vbs13);
		v2 = vec_sub(vbs9,v1);
		v6 = vec_sub(vbs13,v5);
		v3 = vec_ld(0,costab);
		vbs11 = vec_perm(vbs11,vbs11,vinvert);
		vbs15 = vec_perm(vbs15,vbs15,vinvert);
		vbs3 = vec_add(vbs11,vbs12);
		vbs7 = vec_add(vbs15,vbs16);
		v4 = vec_sub(vbs12,vbs11);
		v7 = vec_sub(vbs16,vbs15);
		vbs2 = vec_madd(v2,v3,vzero);
		vbs4 = vec_madd(v4,v3,vzero);
		vbs6 = vec_madd(v6,v3,vzero);
		vbs8 = vec_madd(v7,v3,vzero);

		vbs2 = vec_perm(vbs2,vbs2,vinvert);
		vbs4 = vec_perm(vbs4,vbs4,vinvert);
		vbs6 = vec_perm(vbs6,vbs6,vinvert);
		vbs8 = vec_perm(vbs8,vbs8,vinvert);

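		/* fourth pass: butterflies within each block of 4, scaled by the two entries of pnts[3] */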
		costab = pnts[3];

#ifdef __APPLE__
		vperm1 = (vector unsigned char)(0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23);
		vperm2 = (vector unsigned char)(12,13,14,15,8,9,10,11,28,29,30,31,24,25,26,27);
		vperm3 = (vector unsigned char)(0,1,2,3,4,5,6,7,20,21,22,23,16,17,18,19);
#else
		vperm1 = (vector unsigned char){0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23};
		vperm2 = (vector unsigned char){12,13,14,15,8,9,10,11,28,29,30,31,24,25,26,27};
		vperm3 = (vector unsigned char){0,1,2,3,4,5,6,7,20,21,22,23,16,17,18,19};
#endif
		vperm4 = vec_add(vperm3,vec_splat_u8(8));

		v1 = vec_ld(0,costab);
		v2 = vec_splat(v1,0);
		v3 = vec_splat(v1,1);
		v1 = vec_mergeh(v2,v3);

		v2 = vec_perm(vbs1,vbs3,vperm1);
		v3 = vec_perm(vbs2,vbs4,vperm1);
		v4 = vec_perm(vbs1,vbs3,vperm2);
		v5 = vec_perm(vbs2,vbs4,vperm2);
		v6 = vec_sub(v2,v4);
		v7 = vec_sub(v3,v5);
		v2 = vec_add(v2,v4);
		v3 = vec_add(v3,v5);
		v4 = vec_madd(v6,v1,vzero);
		v5 = vec_nmsub(v7,v1,vzero);
		vbs9 = vec_perm(v2,v4,vperm3);
		vbs11 = vec_perm(v2,v4,vperm4);
		vbs10 = vec_perm(v3,v5,vperm3);
		vbs12 = vec_perm(v3,v5,vperm4);

		v2 = vec_perm(vbs5,vbs7,vperm1);
		v3 = vec_perm(vbs6,vbs8,vperm1);
		v4 = vec_perm(vbs5,vbs7,vperm2);
		v5 = vec_perm(vbs6,vbs8,vperm2);
		v6 = vec_sub(v2,v4);
		v7 = vec_sub(v3,v5);
		v2 = vec_add(v2,v4);
		v3 = vec_add(v3,v5);
		v4 = vec_madd(v6,v1,vzero);
		v5 = vec_nmsub(v7,v1,vzero);
		vbs13 = vec_perm(v2,v4,vperm3);
		vbs15 = vec_perm(v2,v4,vperm4);
		vbs14 = vec_perm(v3,v5,vperm3);
		vbs16 = vec_perm(v3,v5,vperm4);

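		/* fifth pass: butterflies on adjacent pairs; pnts[4][0] is applied with alternating sign */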
		costab = pnts[4];

		v1 = vec_lde(0,costab);
#ifdef __APPLE__
		v2 = (vector float)(1.0f,-1.0f,1.0f,-1.0f);
#else
		v2 = (vector float){1.0f,-1.0f,1.0f,-1.0f};
#endif
		v3 = vec_splat(v1,0);
		v1 = vec_madd(v2,v3,vzero);

		v2 = vec_mergeh(vbs9,vbs10);
		v3 = vec_mergel(vbs9,vbs10);
		v4 = vec_mergeh(vbs11,vbs12);
		v5 = vec_mergel(vbs11,vbs12);
		v6 = vec_mergeh(v2,v3);
		v7 = vec_mergel(v2,v3);
		v2 = vec_mergeh(v4,v5);
		v3 = vec_mergel(v4,v5);
		v4 = vec_sub(v6,v7);
		v5 = vec_sub(v2,v3);
		v6 = vec_add(v6,v7);
		v7 = vec_add(v2,v3);
		v2 = vec_madd(v4,v1,vzero);
		v3 = vec_madd(v5,v1,vzero);
		vbs1 = vec_mergeh(v6,v2);
		vbs2 = vec_mergel(v6,v2);
		vbs3 = vec_mergeh(v7,v3);
		vbs4 = vec_mergel(v7,v3);

		v2 = vec_mergeh(vbs13,vbs14);
		v3 = vec_mergel(vbs13,vbs14);
		v4 = vec_mergeh(vbs15,vbs16);
		v5 = vec_mergel(vbs15,vbs16);
		v6 = vec_mergeh(v2,v3);
		v7 = vec_mergel(v2,v3);
		v2 = vec_mergeh(v4,v5);
		v3 = vec_mergel(v4,v5);
		v4 = vec_sub(v6,v7);
		v5 = vec_sub(v2,v3);
		v6 = vec_add(v6,v7);
		v7 = vec_add(v2,v3);
		v2 = vec_madd(v4,v1,vzero);
		v3 = vec_madd(v5,v1,vzero);
		vbs5 = vec_mergeh(v6,v2);
		vbs6 = vec_mergel(v6,v2);
		vbs7 = vec_mergeh(v7,v3);
		vbs8 = vec_mergel(v7,v3);

		vec_st(vbs1,0,bufs);
		vec_st(vbs2,16,bufs);
		vec_st(vbs3,32,bufs);
		vec_st(vbs4,48,bufs);
		vec_st(vbs5,64,bufs);
		vec_st(vbs6,80,bufs);
		vec_st(vbs7,96,bufs);
		vec_st(vbs8,112,bufs);
	}

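 /* The remaining combination passes and the output reordering are done in scalar code, mirroring the plain C dct64. */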
 {
  register real *b1;
  register int i;

  for(b1=bufs,i=8;i;i--,b1+=4)
    b1[2] += b1[3];

  for(b1=bufs,i=4;i;i--,b1+=8)
  {
    b1[4] += b1[6];
    b1[6] += b1[5];
    b1[5] += b1[7];
  }

  for(b1=bufs,i=2;i;i--,b1+=16)
  {
    b1[8]  += b1[12];
    b1[12] += b1[10];
    b1[10] += b1[14];
    b1[14] += b1[9];
    b1[9]  += b1[13];
    b1[13] += b1[11];
    b1[11] += b1[15];
  }
 }


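  /* Write the results into the two output buffers with a stride of 0x10 (16). */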
  out0[0x10*16] = bufs[0];
  out0[0x10*15] = bufs[16+0]  + bufs[16+8];
  out0[0x10*14] = bufs[8];
  out0[0x10*13] = bufs[16+8]  + bufs[16+4];
  out0[0x10*12] = bufs[4];
  out0[0x10*11] = bufs[16+4]  + bufs[16+12];
  out0[0x10*10] = bufs[12];
  out0[0x10* 9] = bufs[16+12] + bufs[16+2];
  out0[0x10* 8] = bufs[2];
  out0[0x10* 7] = bufs[16+2]  + bufs[16+10];
  out0[0x10* 6] = bufs[10];
  out0[0x10* 5] = bufs[16+10] + bufs[16+6];
  out0[0x10* 4] = bufs[6];
  out0[0x10* 3] = bufs[16+6]  + bufs[16+14];
  out0[0x10* 2] = bufs[14];
  out0[0x10* 1] = bufs[16+14] + bufs[16+1];
  out0[0x10* 0] = bufs[1];

  out1[0x10* 0] = bufs[1];
  out1[0x10* 1] = bufs[16+1]  + bufs[16+9];
  out1[0x10* 2] = bufs[9];
  out1[0x10* 3] = bufs[16+9]  + bufs[16+5];
  out1[0x10* 4] = bufs[5];
  out1[0x10* 5] = bufs[16+5]  + bufs[16+13];
  out1[0x10* 6] = bufs[13];
  out1[0x10* 7] = bufs[16+13] + bufs[16+3];
  out1[0x10* 8] = bufs[3];
  out1[0x10* 9] = bufs[16+3]  + bufs[16+11];
  out1[0x10*10] = bufs[11];
  out1[0x10*11] = bufs[16+11] + bufs[16+7];
  out1[0x10*12] = bufs[7];
  out1[0x10*13] = bufs[16+7]  + bufs[16+15];
  out1[0x10*14] = bufs[15];
  out1[0x10*15] = bufs[16+15];

}