/*
	decode.c: decoding samples...

	copyright 1995-2009 by the mpg123 project - free software under the terms of the LGPL 2.1
	see COPYING and AUTHORS files in distribution or http://mpg123.org
	initially written by Michael Hipp
	altivec optimization by tmkk
*/

#include "mpg123lib_intern.h"

#ifndef __APPLE__
#include <altivec.h>
#endif

/* A macro for normal synth functions */
#define SYNTH_ALTIVEC(B0STEP) \
	v1 = vec_ld(0,window); \
	v2 = vec_ld(16,window); \
	v3 = vec_ld(32,window); \
	v4 = vec_ld(48,window); \
	v5 = vec_ld(64,window); \
	v1 = vec_perm(v1,v2,vperm1); \
	v6 = vec_ld(0,b0); \
	v2 = vec_perm(v2,v3,vperm1); \
	v7 = vec_ld(16,b0); \
	v3 = vec_perm(v3,v4,vperm1); \
	v8 = vec_ld(32,b0); \
	v4 = vec_perm(v4,v5,vperm1); \
	v9 = vec_ld(48,b0); \
	 \
	vsum = vec_madd(v1,v6,vzero); \
	vsum = vec_madd(v2,v7,vsum); \
	vsum = vec_madd(v3,v8,vsum); \
	vsum = vec_madd(v4,v9,vsum); \
	 \
	window += 32; \
	b0 += B0STEP; \
	 \
	v1 = vec_ld(0,window); \
	v2 = vec_ld(16,window); \
	v3 = vec_ld(32,window); \
	v4 = vec_ld(48,window); \
	v5 = vec_ld(64,window); \
	v1 = vec_perm(v1,v2,vperm1); \
	v6 = vec_ld(0,b0); \
	v2 = vec_perm(v2,v3,vperm1); \
	v7 = vec_ld(16,b0); \
	v3 = vec_perm(v3,v4,vperm1); \
	v8 = vec_ld(32,b0); \
	v4 = vec_perm(v4,v5,vperm1); \
	v9 = vec_ld(48,b0); \
	 \
	vsum2 = vec_madd(v1,v6,vzero); \
	vsum2 = vec_madd(v2,v7,vsum2); \
	vsum2 = vec_madd(v3,v8,vsum2); \
	vsum2 = vec_madd(v4,v9,vsum2); \
	 \
	window += 32; \
	b0 += B0STEP; \
	 \
	v1 = vec_ld(0,window); \
	v2 = vec_ld(16,window); \
	v3 = vec_ld(32,window); \
	v4 = vec_ld(48,window); \
	v5 = vec_ld(64,window); \
	v1 = vec_perm(v1,v2,vperm1); \
	v6 = vec_ld(0,b0); \
	v2 = vec_perm(v2,v3,vperm1); \
	v7 = vec_ld(16,b0); \
	v3 = vec_perm(v3,v4,vperm1); \
	v8 = vec_ld(32,b0); \
	v4 = vec_perm(v4,v5,vperm1); \
	v9 = vec_ld(48,b0); \
	 \
	vsum3 = vec_madd(v1,v6,vzero); \
	vsum3 = vec_madd(v2,v7,vsum3); \
	vsum3 = vec_madd(v3,v8,vsum3); \
	vsum3 = vec_madd(v4,v9,vsum3); \
	 \
	window += 32; \
	b0 += B0STEP; \
	 \
	v1 = vec_ld(0,window); \
	v2 = vec_ld(16,window); \
	v3 = vec_ld(32,window); \
	v4 = vec_ld(48,window); \
	v5 = vec_ld(64,window); \
	v1 = vec_perm(v1,v2,vperm1); \
	v6 = vec_ld(0,b0); \
	v2 = vec_perm(v2,v3,vperm1); \
	v7 = vec_ld(16,b0); \
	v3 = vec_perm(v3,v4,vperm1); \
	v8 = vec_ld(32,b0); \
	v4 = vec_perm(v4,v5,vperm1); \
	v9 = vec_ld(48,b0); \
	 \
	vsum4 = vec_madd(v1,v6,vzero); \
	vsum4 = vec_madd(v2,v7,vsum4); \
	vsum4 = vec_madd(v3,v8,vsum4); \
	vsum4 = vec_madd(v4,v9,vsum4); \
	 \
	window += 32; \
	b0 += B0STEP; \
	 \
	v1 = vec_mergeh(vsum,vsum3); \
	v2 = vec_mergeh(vsum2,vsum4); \
	v3 = vec_mergel(vsum,vsum3); \
	v4 = vec_mergel(vsum2,vsum4); \
	v5 = vec_mergeh(v1,v2); \
	v6 = vec_mergel(v1,v2); \
	v7 = vec_mergeh(v3,v4); \
	v8 = vec_mergel(v3,v4);

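/*
	A scalar sketch of what one SYNTH_ALTIVEC expansion computes (illustrative
	only, not part of the build; window/b0/B0STEP mean the same as in the
	macro above):

		int n, i;
		float partial[4][4];
		for(n = 0; n < 4; ++n)
		{
			for(i = 0; i < 4; ++i) partial[n][i] = 0.f;
			for(i = 0; i < 16; ++i) partial[n][i & 3] += window[i] * b0[i];
			window += 32;
			b0 += B0STEP;
		}

	vsum..vsum4 are the four partial[] rows, i.e. lane-wise pieces of four
	16-tap dot products between the synthesis window and the dct64 output.
	The vec_mergeh/vec_mergel ladder at the end transposes them so that
	v5..v8 each hold one tap group (i & 3 == 0..3) for all four output
	samples; the caller folds v5..v8 together with the window's per-tap sign
	pattern (sub/add in the forward pass, add-only in the backward pass) to
	finish four samples at once. The five window loads plus
	vec_perm(...,vperm1) form an unaligned load of 16 floats, since
	window = fr->decwin + 16 - bo1 is not 16-byte aligned in general.
*/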
/* A macro for stereo synth functions */
#define SYNTH_STEREO_ALTIVEC(B0STEP) \
	v1 = vec_ld(0,window); \
	v2 = vec_ld(16,window); \
	v3 = vec_ld(32,window); \
	v4 = vec_ld(48,window); \
	v5 = vec_ld(64,window); \
	v1 = vec_perm(v1,v2,vperm1); \
	v6 = vec_ld(0,b0l); \
	v10 = vec_ld(0,b0r); \
	v2 = vec_perm(v2,v3,vperm1); \
	v7 = vec_ld(16,b0l); \
	v11 = vec_ld(16,b0r); \
	v3 = vec_perm(v3,v4,vperm1); \
	v8 = vec_ld(32,b0l); \
	v12 = vec_ld(32,b0r); \
	v4 = vec_perm(v4,v5,vperm1); \
	v9 = vec_ld(48,b0l); \
	v13 = vec_ld(48,b0r); \
	 \
	vsum = vec_madd(v1,v6,vzero); \
	vsum5 = vec_madd(v1,v10,vzero); \
	vsum = vec_madd(v2,v7,vsum); \
	vsum5 = vec_madd(v2,v11,vsum5); \
	vsum = vec_madd(v3,v8,vsum); \
	vsum5 = vec_madd(v3,v12,vsum5); \
	vsum = vec_madd(v4,v9,vsum); \
	vsum5 = vec_madd(v4,v13,vsum5); \
	 \
	window += 32; \
	b0l += B0STEP; \
	b0r += B0STEP; \
	 \
	v1 = vec_ld(0,window); \
	v2 = vec_ld(16,window); \
	v3 = vec_ld(32,window); \
	v4 = vec_ld(48,window); \
	v5 = vec_ld(64,window); \
	v1 = vec_perm(v1,v2,vperm1); \
	v6 = vec_ld(0,b0l); \
	v10 = vec_ld(0,b0r); \
	v2 = vec_perm(v2,v3,vperm1); \
	v7 = vec_ld(16,b0l); \
	v11 = vec_ld(16,b0r); \
	v3 = vec_perm(v3,v4,vperm1); \
	v8 = vec_ld(32,b0l); \
	v12 = vec_ld(32,b0r); \
	v4 = vec_perm(v4,v5,vperm1); \
	v9 = vec_ld(48,b0l); \
	v13 = vec_ld(48,b0r); \
	 \
	vsum2 = vec_madd(v1,v6,vzero); \
	vsum6 = vec_madd(v1,v10,vzero); \
	vsum2 = vec_madd(v2,v7,vsum2); \
	vsum6 = vec_madd(v2,v11,vsum6); \
	vsum2 = vec_madd(v3,v8,vsum2); \
	vsum6 = vec_madd(v3,v12,vsum6); \
	vsum2 = vec_madd(v4,v9,vsum2); \
	vsum6 = vec_madd(v4,v13,vsum6); \
	 \
	window += 32; \
	b0l += B0STEP; \
	b0r += B0STEP; \
	 \
	v1 = vec_ld(0,window); \
	v2 = vec_ld(16,window); \
	v3 = vec_ld(32,window); \
	v4 = vec_ld(48,window); \
	v5 = vec_ld(64,window); \
	v1 = vec_perm(v1,v2,vperm1); \
	v6 = vec_ld(0,b0l); \
	v10 = vec_ld(0,b0r); \
	v2 = vec_perm(v2,v3,vperm1); \
	v7 = vec_ld(16,b0l); \
	v11 = vec_ld(16,b0r); \
	v3 = vec_perm(v3,v4,vperm1); \
	v8 = vec_ld(32,b0l); \
	v12 = vec_ld(32,b0r); \
	v4 = vec_perm(v4,v5,vperm1); \
	v9 = vec_ld(48,b0l); \
	v13 = vec_ld(48,b0r); \
	 \
	vsum3 = vec_madd(v1,v6,vzero); \
	vsum7 = vec_madd(v1,v10,vzero); \
	vsum3 = vec_madd(v2,v7,vsum3); \
	vsum7 = vec_madd(v2,v11,vsum7); \
	vsum3 = vec_madd(v3,v8,vsum3); \
	vsum7 = vec_madd(v3,v12,vsum7); \
	vsum3 = vec_madd(v4,v9,vsum3); \
	vsum7 = vec_madd(v4,v13,vsum7); \
	 \
	window += 32; \
	b0l += B0STEP; \
	b0r += B0STEP; \
	 \
	v1 = vec_ld(0,window); \
	v2 = vec_ld(16,window); \
	v3 = vec_ld(32,window); \
	v4 = vec_ld(48,window); \
	v5 = vec_ld(64,window); \
	v1 = vec_perm(v1,v2,vperm1); \
	v6 = vec_ld(0,b0l); \
	v10 = vec_ld(0,b0r); \
	v2 = vec_perm(v2,v3,vperm1); \
	v7 = vec_ld(16,b0l); \
	v11 = vec_ld(16,b0r); \
	v3 = vec_perm(v3,v4,vperm1); \
	v8 = vec_ld(32,b0l); \
	v12 = vec_ld(32,b0r); \
	v4 = vec_perm(v4,v5,vperm1); \
	v9 = vec_ld(48,b0l); \
	v13 = vec_ld(48,b0r); \
	 \
	vsum4 = vec_madd(v1,v6,vzero); \
	vsum8 = vec_madd(v1,v10,vzero); \
	vsum4 = vec_madd(v2,v7,vsum4); \
	vsum8 = vec_madd(v2,v11,vsum8); \
	vsum4 = vec_madd(v3,v8,vsum4); \
	vsum8 = vec_madd(v3,v12,vsum8); \
	vsum4 = vec_madd(v4,v9,vsum4); \
	vsum8 = vec_madd(v4,v13,vsum8); \
	 \
	window += 32; \
	b0l += B0STEP; \
	b0r += B0STEP; \
	 \
	v1 = vec_mergeh(vsum,vsum3); \
	v5 = vec_mergeh(vsum5,vsum7); \
	v2 = vec_mergeh(vsum2,vsum4); \
	v6 = vec_mergeh(vsum6,vsum8); \
	v3 = vec_mergel(vsum,vsum3); \
	v7 = vec_mergel(vsum5,vsum7); \
	v4 = vec_mergel(vsum2,vsum4); \
	v8 = vec_mergel(vsum6,vsum8); \
	vsum = vec_mergeh(v1,v2); \
	vsum5 = vec_mergeh(v5,v6); \
	vsum2 = vec_mergel(v1,v2); \
	vsum6 = vec_mergel(v5,v6); \
	vsum3 = vec_mergeh(v3,v4); \
	vsum7 = vec_mergeh(v7,v8); \
	vsum4 = vec_mergel(v3,v4); \
	vsum8 = vec_mergel(v7,v8);

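/*
	SYNTH_STEREO_ALTIVEC is the same four-round scheme run on both channels
	at once: the window loads are shared, vsum..vsum4 accumulate the left
	channel and vsum5..vsum8 the right, and the final merge ladder transposes
	both 4x4 blocks, so on exit each vsumN holds one tap group across the
	four output samples of its channel.
*/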
int synth_1to1_altivec(real *bandPtr,int channel,mpg123_handle *fr, int final)
{
	short *samples = (short *) (fr->buffer.data+fr->buffer.fill);

	real *b0, **buf;
	int clip;
	int bo1;
#ifndef NO_EQUALIZER
	if(fr->have_eq_settings) do_equalizer(bandPtr,channel,fr->equalizer);
#endif
	if(!channel)
	{
		fr->bo--;
		fr->bo &= 0xf;
		buf = fr->real_buffs[0];
	}
	else
	{
		samples++;
		buf = fr->real_buffs[1];
	}

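	/* fr->bo is a 4-bit rotating offset into the pair of dct64 scratch
	   buffers (real_buffs); it is decremented only for the left channel so
	   that both channels of a frame share the same offset. Its parity below
	   picks which buffer receives the fresh dct64 output and which supplies
	   the older half, the usual mpg123 ping-pong scheme. */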
	if(fr->bo & 0x1)
	{
		b0 = buf[0];
		bo1 = fr->bo;
		dct64_altivec(buf[1]+((fr->bo+1)&0xf),buf[0]+fr->bo,bandPtr);
	}
	else
	{
		b0 = buf[1];
		bo1 = fr->bo+1;
		dct64_altivec(buf[0]+fr->bo,buf[1]+fr->bo+1,bandPtr);
	}


	{
		register int j;
		real *window = fr->decwin + 16 - bo1;

		ALIGNED(16) int clip_tmp[4];
		vector float v1,v2,v3,v4,v5,v6,v7,v8,v9;
		vector unsigned char vperm1,vperm2,vperm3,vperm4;
		vector float vsum,vsum2,vsum3,vsum4,vmin,vmax,vzero;
		vector signed int vclip;
		vector signed short vsample1,vsample2;
		vector unsigned int vshift;
		vclip = vec_xor(vclip,vclip);
		vzero = vec_xor(vzero,vzero);
		vshift = vec_splat_u32(-1); /* 31 */
#ifdef __APPLE__
		vmax = (vector float)(32767.0f);
		vmin = (vector float)(-32768.0f);
		vperm4 = (vector unsigned char)(0,1,18,19,2,3,22,23,4,5,26,27,6,7,30,31);
#else
		vmax = (vector float){32767.0f,32767.0f,32767.0f,32767.0f};
		vmin = (vector float){-32768.0f,-32768.0f,-32768.0f,-32768.0f};
		vperm4 = (vector unsigned char){0,1,18,19,2,3,22,23,4,5,26,27,6,7,30,31};
#endif

		vperm1 = vec_lvsl(0,window);
		vperm2 = vec_lvsl(0,samples);
		vperm3 = vec_lvsr(0,samples);
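		/* samples is not 16-byte aligned in general (the right channel is
		   offset by one short), so the loop below uses the classic AltiVec
		   read-modify-write idiom: load the vectors covering the destination
		   (offsets 0 and 15), splice the new data in with vec_perm, and
		   store both back. vperm4 additionally interleaves the four freshly
		   packed samples with the other channel's existing values, since the
		   output buffer is interleaved stereo. */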
		for (j=4;j;j--)
		{
			SYNTH_ALTIVEC(16);

			vsum = vec_sub(v5,v6);
			v9 = vec_sub(v7,v8);
			vsum = vec_add(vsum,v9);

			v3 = vec_round(vsum);
			v3 = (vector float)vec_cts(v3,0);
			v1 = (vector float)vec_cmpgt(vsum,vmax);
			v2 = (vector float)vec_cmplt(vsum,vmin);
			vsample1 = vec_ld(0,samples);
			vsample2 = vec_ld(15,samples);
			v3 = (vector float)vec_packs((vector signed int)v3,(vector signed int)v3);
			v4 = (vector float)vec_perm(vsample1,vsample2,vperm2);
			v5 = (vector float)vec_perm(v3,v4,vperm4);
			v6 = (vector float)vec_perm(vsample2,vsample1,vperm2);
			v7 = (vector float)vec_perm(v5,v6,vperm3);
			v8 = (vector float)vec_perm(v6,v5,vperm3);
			vec_st((vector signed short)v7,15,samples);
			vec_st((vector signed short)v8,0,samples);
			samples += 8;

			v1 = (vector float)vec_sr((vector unsigned int)v1, vshift);
			v2 = (vector float)vec_sr((vector unsigned int)v2, vshift);
			v1 = (vector float)vec_add((vector unsigned int)v1,(vector unsigned int)v2);
			vclip = vec_sums((vector signed int)v1,vclip);
		}

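		/* Second pass: window keeps advancing while b0 walks backwards
		   (B0STEP = -16), and the tap groups are combined with additions
		   only, matching the symmetry of the synthesis window. The clip
		   accounting turns each vec_cmpgt/vec_cmplt result (an all-ones
		   mask per out-of-range lane) into 0/1 by shifting right 31 bits,
		   then lets vec_sums add the lane totals into vclip. */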
		for (j=4;j;j--)
		{
			SYNTH_ALTIVEC(-16);

			vsum = vec_add(v5,v6);
			v9 = vec_add(v7,v8);
			vsum = vec_add(vsum,v9);

			v3 = vec_round(vsum);
			v3 = (vector float)vec_cts(v3,0);
			v1 = (vector float)vec_cmpgt(vsum,vmax);
			v2 = (vector float)vec_cmplt(vsum,vmin);
			vsample1 = vec_ld(0,samples);
			vsample2 = vec_ld(15,samples);
			v3 = (vector float)vec_packs((vector signed int)v3,(vector signed int)v3);
			v4 = (vector float)vec_perm(vsample1,vsample2,vperm2);
			v5 = (vector float)vec_perm(v3,v4,vperm4);
			v6 = (vector float)vec_perm(vsample2,vsample1,vperm2);
			v7 = (vector float)vec_perm(v5,v6,vperm3);
			v8 = (vector float)vec_perm(v6,v5,vperm3);
			vec_st((vector signed short)v7,15,samples);
			vec_st((vector signed short)v8,0,samples);
			samples += 8;

			v1 = (vector float)vec_sr((vector unsigned int)v1, vshift);
			v2 = (vector float)vec_sr((vector unsigned int)v2, vshift);
			v1 = (vector float)vec_add((vector unsigned int)v1,(vector unsigned int)v2);
			vclip = vec_sums((vector signed int)v1,vclip);
		}

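		/* vec_sums accumulates into the last vector element, so the total
		   number of clipped samples ends up in clip_tmp[3]. */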
		vec_st(vclip,0,clip_tmp);
		clip = clip_tmp[3];
	}
	if(final) fr->buffer.fill += 128;

	return clip;
}

int synth_1to1_stereo_altivec(real *bandPtr_l, real *bandPtr_r, mpg123_handle *fr)
{
	short *samples = (short *) (fr->buffer.data+fr->buffer.fill);

	real *b0l, *b0r, **bufl, **bufr;
	int clip;
	int bo1;
#ifndef NO_EQUALIZER
	if(fr->have_eq_settings)
	{
		do_equalizer(bandPtr_l,0,fr->equalizer);
		do_equalizer(bandPtr_r,1,fr->equalizer);
	}
#endif
	fr->bo--;
	fr->bo &= 0xf;
	bufl = fr->real_buffs[0];
	bufr = fr->real_buffs[1];

	if(fr->bo & 0x1)
	{
		b0l = bufl[0];
		b0r = bufr[0];
		bo1 = fr->bo;
		dct64_altivec(bufl[1]+((fr->bo+1)&0xf),bufl[0]+fr->bo,bandPtr_l);
		dct64_altivec(bufr[1]+((fr->bo+1)&0xf),bufr[0]+fr->bo,bandPtr_r);
	}
	else
	{
		b0l = bufl[1];
		b0r = bufr[1];
		bo1 = fr->bo+1;
		dct64_altivec(bufl[0]+fr->bo,bufl[1]+fr->bo+1,bandPtr_l);
		dct64_altivec(bufr[0]+fr->bo,bufr[1]+fr->bo+1,bandPtr_r);
	}


	{
		register int j;
		real *window = fr->decwin + 16 - bo1;

		ALIGNED(16) int clip_tmp[4];
		vector float v1,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,v13;
		vector unsigned char vperm1,vperm2;
		vector float vsum,vsum2,vsum3,vsum4,vsum5,vsum6,vsum7,vsum8,vmin,vmax,vzero;
		vector signed int vclip;
		vector unsigned int vshift;
		vector signed short vprev;
		vclip = vec_xor(vclip,vclip);
		vzero = vec_xor(vzero,vzero);
		vshift = vec_splat_u32(-1); /* 31 */
#ifdef __APPLE__
		vmax = (vector float)(32767.0f);
		vmin = (vector float)(-32768.0f);
#else
		vmax = (vector float){32767.0f,32767.0f,32767.0f,32767.0f};
		vmin = (vector float){-32768.0f,-32768.0f,-32768.0f,-32768.0f};
#endif

		vperm1 = vec_lvsl(0,window);
		vperm2 = vec_lvsr(0,samples);
		vprev = vec_perm(vec_ld(0,samples),vec_ld(0,samples),vec_lvsl(0,samples));
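		/* Stereo flavour of the unaligned-output trick: vperm2 (vec_lvsr)
		   and vprev carry the tail of the previous result vector into the
		   next store, so every vec_st below hits an aligned address; the
		   leftover half vector is flushed by the fix-up store after the two
		   loops when samples is not 16-byte aligned. */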
		for (j=4;j;j--)
		{
			SYNTH_STEREO_ALTIVEC(16);

			vsum = vec_sub(vsum,vsum2);
			vsum2 = vec_sub(vsum5,vsum6);
			vsum3 = vec_sub(vsum3,vsum4);
			vsum4 = vec_sub(vsum7,vsum8);
			vsum = vec_add(vsum,vsum3);
			vsum2 = vec_add(vsum2,vsum4);

			v1 = vec_round(vsum);
			v2 = vec_round(vsum2);
			v1 = (vector float)vec_cts(v1,0);
			v2 = (vector float)vec_cts(v2,0);
			v3 = vec_mergeh(v1, v2);
			v4 = vec_mergel(v1, v2);
			v5 = (vector float)vec_packs((vector signed int)v3,(vector signed int)v4);
			v6 = (vector float)vec_perm(vprev,(vector signed short)v5,vperm2);
			vprev = (vector signed short)v5;
			v1 = (vector float)vec_cmpgt(vsum,vmax);
			v2 = (vector float)vec_cmplt(vsum,vmin);
			v3 = (vector float)vec_cmpgt(vsum2,vmax);
			v4 = (vector float)vec_cmplt(vsum2,vmin);
			vec_st((vector signed short)v6,0,samples);
			samples += 8;

			v1 = (vector float)vec_sr((vector unsigned int)v1, vshift);
			v2 = (vector float)vec_sr((vector unsigned int)v2, vshift);
			v3 = (vector float)vec_sr((vector unsigned int)v3, vshift);
			v4 = (vector float)vec_sr((vector unsigned int)v4, vshift);
			v1 = (vector float)vec_add((vector unsigned int)v1,(vector unsigned int)v2);
			v2 = (vector float)vec_add((vector unsigned int)v3,(vector unsigned int)v4);
			vclip = vec_sums((vector signed int)v1,vclip);
			vclip = vec_sums((vector signed int)v2,vclip);
		}

		for (j=4;j;j--)
		{
			SYNTH_STEREO_ALTIVEC(-16);

			vsum = vec_add(vsum,vsum2);
			vsum2 = vec_add(vsum5,vsum6);
			vsum3 = vec_add(vsum3,vsum4);
			vsum4 = vec_add(vsum7,vsum8);
			vsum = vec_add(vsum,vsum3);
			vsum2 = vec_add(vsum2,vsum4);

			v1 = vec_round(vsum);
			v2 = vec_round(vsum2);
			v1 = (vector float)vec_cts(v1,0);
			v2 = (vector float)vec_cts(v2,0);
			v3 = vec_mergeh(v1, v2);
			v4 = vec_mergel(v1, v2);
			v5 = (vector float)vec_packs((vector signed int)v3,(vector signed int)v4);
			v6 = (vector float)vec_perm(vprev,(vector signed short)v5,vperm2);
			vprev = (vector signed short)v5;
			v1 = (vector float)vec_cmpgt(vsum,vmax);
			v2 = (vector float)vec_cmplt(vsum,vmin);
			v3 = (vector float)vec_cmpgt(vsum2,vmax);
			v4 = (vector float)vec_cmplt(vsum2,vmin);
			vec_st((vector signed short)v6,0,samples);
			samples += 8;

			v1 = (vector float)vec_sr((vector unsigned int)v1, vshift);
			v2 = (vector float)vec_sr((vector unsigned int)v2, vshift);
			v3 = (vector float)vec_sr((vector unsigned int)v3, vshift);
			v4 = (vector float)vec_sr((vector unsigned int)v4, vshift);
			v1 = (vector float)vec_add((vector unsigned int)v1,(vector unsigned int)v2);
			v2 = (vector float)vec_add((vector unsigned int)v3,(vector unsigned int)v4);
			vclip = vec_sums((vector signed int)v1,vclip);
			vclip = vec_sums((vector signed int)v2,vclip);
		}

		if((size_t)samples & 0xf)
		{
			v1 = (vector float)vec_perm(vec_ld(0,samples),vec_ld(0,samples),vec_lvsl(0,samples));
			v2 = (vector float)vec_perm(vprev,(vector signed short)v1,vperm2);
			vec_st((vector signed short)v2,0,samples);
		}

		vec_st(vclip,0,clip_tmp);
		clip = clip_tmp[3];
	}
	fr->buffer.fill += 128;

	return clip;
}

int synth_1to1_real_altivec(real *bandPtr,int channel,mpg123_handle *fr, int final)
{
	real *samples = (real *) (fr->buffer.data+fr->buffer.fill);

	real *b0, **buf;
	int bo1;
#ifndef NO_EQUALIZER
	if(fr->have_eq_settings) do_equalizer(bandPtr,channel,fr->equalizer);
#endif
	if(!channel)
	{
		fr->bo--;
		fr->bo &= 0xf;
		buf = fr->real_buffs[0];
	}
	else
	{
		samples++;
		buf = fr->real_buffs[1];
	}

	if(fr->bo & 0x1)
	{
		b0 = buf[0];
		bo1 = fr->bo;
		dct64_altivec(buf[1]+((fr->bo+1)&0xf),buf[0]+fr->bo,bandPtr);
	}
	else
	{
		b0 = buf[1];
		bo1 = fr->bo+1;
		dct64_altivec(buf[0]+fr->bo,buf[1]+fr->bo+1,bandPtr);
	}


	{
		register int j;
		real *window = fr->decwin + 16 - bo1;

		vector float v1,v2,v3,v4,v5,v6,v7,v8,v9;
		vector unsigned char vperm1,vperm2,vperm3,vperm4,vperm5;
		vector float vsum,vsum2,vsum3,vsum4,vscale,vzero;
		vector float vsample1,vsample2,vsample3;
		vzero = vec_xor(vzero, vzero);
#ifdef __APPLE__
		vscale = (vector float)(1.0f/32768.0f);
		vperm4 = (vector unsigned char)(0,1,2,3,20,21,22,23,4,5,6,7,28,29,30,31);
		vperm5 = (vector unsigned char)(8,9,10,11,20,21,22,23,12,13,14,15,28,29,30,31);
#else
		vscale = (vector float){1.0f/32768.0f,1.0f/32768.0f,1.0f/32768.0f,1.0f/32768.0f};
		vperm4 = (vector unsigned char){0,1,2,3,20,21,22,23,4,5,6,7,28,29,30,31};
		vperm5 = (vector unsigned char){8,9,10,11,20,21,22,23,12,13,14,15,28,29,30,31};
#endif

		vperm1 = vec_lvsl(0,window);
		vperm2 = vec_lvsl(0,samples);
		vperm3 = vec_lvsr(0,samples);
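		/* Float output: each pass yields four floats, scaled by 1/32768 to
		   the usual +/-1.0 range, that must land in every other slot of the
		   interleaved stereo buffer. vperm4/vperm5 splice them into the
		   existing samples and vperm2/vperm3 handle the possibly unaligned
		   destination as in the 16-bit case, now spanning three vectors
		   (offsets 0, 16, 31) because the samples are twice as wide. */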
		for (j=4;j;j--)
		{
			SYNTH_ALTIVEC(16);

			vsum = vec_sub(v5,v6);
			v9 = vec_sub(v7,v8);
			vsum = vec_add(vsum,v9);
			vsum = vec_madd(vsum, vscale, vzero);

			vsample1 = vec_ld(0,samples);
			vsample2 = vec_ld(16,samples);
			vsample3 = vec_ld(31,samples);
			v1 = vec_perm(vsample1, vsample2, vperm2);
			v2 = vec_perm(vsample2, vsample3, vperm2);
			v1 = vec_perm(vsum, v1, vperm4);
			v2 = vec_perm(vsum, v2, vperm5);
			v3 = vec_perm(vsample3, vsample2, vperm2);
			v4 = vec_perm(vsample2, vsample1, vperm2);
			v5 = vec_perm(v2, v3, vperm3);
			v6 = vec_perm(v1, v2, vperm3);
			v7 = vec_perm(v4, v1, vperm3);
			vec_st(v5,31,samples);
			vec_st(v6,16,samples);
			vec_st(v7,0,samples);
			samples += 8;
		}

		for (j=4;j;j--)
		{
			SYNTH_ALTIVEC(-16);

			vsum = vec_add(v5,v6);
			v9 = vec_add(v7,v8);
			vsum = vec_add(vsum,v9);
			vsum = vec_madd(vsum, vscale, vzero);

			vsample1 = vec_ld(0,samples);
			vsample2 = vec_ld(16,samples);
			vsample3 = vec_ld(31,samples);
			v1 = vec_perm(vsample1, vsample2, vperm2);
			v2 = vec_perm(vsample2, vsample3, vperm2);
			v1 = vec_perm(vsum, v1, vperm4);
			v2 = vec_perm(vsum, v2, vperm5);
			v3 = vec_perm(vsample3, vsample2, vperm2);
			v4 = vec_perm(vsample2, vsample1, vperm2);
			v5 = vec_perm(v2, v3, vperm3);
			v6 = vec_perm(v1, v2, vperm3);
			v7 = vec_perm(v4, v1, vperm3);
			vec_st(v5,31,samples);
			vec_st(v6,16,samples);
			vec_st(v7,0,samples);
			samples += 8;
		}
	}
	if(final) fr->buffer.fill += 256;

	return 0;
}

int synth_1to1_fltst_altivec(real *bandPtr_l, real *bandPtr_r, mpg123_handle *fr)
{
	real *samples = (real *) (fr->buffer.data+fr->buffer.fill);

	real *b0l, *b0r, **bufl, **bufr;
	int bo1;
#ifndef NO_EQUALIZER
	if(fr->have_eq_settings)
	{
		do_equalizer(bandPtr_l,0,fr->equalizer);
		do_equalizer(bandPtr_r,1,fr->equalizer);
	}
#endif
	fr->bo--;
	fr->bo &= 0xf;
	bufl = fr->real_buffs[0];
	bufr = fr->real_buffs[1];

	if(fr->bo & 0x1)
	{
		b0l = bufl[0];
		b0r = bufr[0];
		bo1 = fr->bo;
		dct64_altivec(bufl[1]+((fr->bo+1)&0xf),bufl[0]+fr->bo,bandPtr_l);
		dct64_altivec(bufr[1]+((fr->bo+1)&0xf),bufr[0]+fr->bo,bandPtr_r);
	}
	else
	{
		b0l = bufl[1];
		b0r = bufr[1];
		bo1 = fr->bo+1;
		dct64_altivec(bufl[0]+fr->bo,bufl[1]+fr->bo+1,bandPtr_l);
		dct64_altivec(bufr[0]+fr->bo,bufr[1]+fr->bo+1,bandPtr_r);
	}


	{
		register int j;
		real *window = fr->decwin + 16 - bo1;

		vector float v1,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,v13;
		vector unsigned char vperm1,vperm2;
		vector float vsum,vsum2,vsum3,vsum4,vsum5,vsum6,vsum7,vsum8,vscale,vzero;
		vector float vprev;
		vzero = vec_xor(vzero,vzero);
#ifdef __APPLE__
		vscale = (vector float)(1.0f/32768.0f);
#else
		vscale = (vector float){1.0f/32768.0f,1.0f/32768.0f,1.0f/32768.0f,1.0f/32768.0f};
#endif

		vperm1 = vec_lvsl(0,window);
		vperm2 = vec_lvsr(0,samples);
		vprev = vec_perm(vec_ld(0,samples),vec_ld(0,samples),vec_lvsl(0,samples));
		for (j=4;j;j--)
		{
			SYNTH_STEREO_ALTIVEC(16);

			vsum = vec_sub(vsum,vsum2);
			vsum2 = vec_sub(vsum5,vsum6);
			vsum3 = vec_sub(vsum3,vsum4);
			vsum4 = vec_sub(vsum7,vsum8);
			vsum = vec_add(vsum,vsum3);
			vsum2 = vec_add(vsum2,vsum4);
			vsum = vec_madd(vsum, vscale, vzero);
			vsum2 = vec_madd(vsum2, vscale, vzero);

			v1 = vec_mergeh(vsum, vsum2);
			v2 = vec_mergel(vsum, vsum2);
			v3 = vec_perm(vprev,v1,vperm2);
			v4 = vec_perm(v1,v2,vperm2);
			vprev = v2;
			vec_st(v3,0,samples);
			vec_st(v4,16,samples);
			samples += 8;
		}

		for (j=4;j;j--)
		{
			SYNTH_STEREO_ALTIVEC(-16);

			vsum = vec_add(vsum,vsum2);
			vsum2 = vec_add(vsum5,vsum6);
			vsum3 = vec_add(vsum3,vsum4);
			vsum4 = vec_add(vsum7,vsum8);
			vsum = vec_add(vsum,vsum3);
			vsum2 = vec_add(vsum2,vsum4);
			vsum = vec_madd(vsum, vscale, vzero);
			vsum2 = vec_madd(vsum2, vscale, vzero);

			v1 = vec_mergeh(vsum, vsum2);
			v2 = vec_mergel(vsum, vsum2);
			v3 = vec_perm(vprev,v1,vperm2);
			v4 = vec_perm(v1,v2,vperm2);
			vprev = v2;
			vec_st(v3,0,samples);
			vec_st(v4,16,samples);
			samples += 8;
		}

		if((size_t)samples & 0xf)
		{
			v1 = (vector float)vec_perm(vec_ld(0,samples),vec_ld(0,samples),vec_lvsl(0,samples));
			v2 = (vector float)vec_perm(vprev,v1,vperm2);
			vec_st(v2,0,samples);
		}
	}
	fr->buffer.fill += 256;

	return 0;
}

int synth_1to1_s32_altivec(real *bandPtr,int channel,mpg123_handle *fr, int final)
{
	int32_t *samples = (int32_t *) (fr->buffer.data+fr->buffer.fill);

	real *b0, **buf;
	int clip;
	int bo1;
#ifndef NO_EQUALIZER
	if(fr->have_eq_settings) do_equalizer(bandPtr,channel,fr->equalizer);
#endif
	if(!channel)
	{
		fr->bo--;
		fr->bo &= 0xf;
		buf = fr->real_buffs[0];
	}
	else
	{
		samples++;
		buf = fr->real_buffs[1];
	}

	if(fr->bo & 0x1)
	{
		b0 = buf[0];
		bo1 = fr->bo;
		dct64_altivec(buf[1]+((fr->bo+1)&0xf),buf[0]+fr->bo,bandPtr);
	}
	else
	{
		b0 = buf[1];
		bo1 = fr->bo+1;
		dct64_altivec(buf[0]+fr->bo,buf[1]+fr->bo+1,bandPtr);
	}


	{
		register int j;
		real *window = fr->decwin + 16 - bo1;

		ALIGNED(16) int clip_tmp[4];
		vector float v1,v2,v3,v4,v5,v6,v7,v8,v9;
		vector unsigned char vperm1,vperm2,vperm3,vperm4,vperm5;
		vector float vsum,vsum2,vsum3,vsum4,vmax,vmin,vzero;
		vector signed int vsample1,vsample2,vsample3;
		vector unsigned int vshift;
		vector signed int vclip;
		vzero = vec_xor(vzero, vzero);
		vclip = vec_xor(vclip, vclip);
		vshift = vec_splat_u32(-1); /* 31 */
#ifdef __APPLE__
		vmax = (vector float)(32767.999f);
		vmin = (vector float)(-32768.0f);
		vperm4 = (vector unsigned char)(0,1,2,3,20,21,22,23,4,5,6,7,28,29,30,31);
		vperm5 = (vector unsigned char)(8,9,10,11,20,21,22,23,12,13,14,15,28,29,30,31);
#else
		vmax = (vector float){32767.999f,32767.999f,32767.999f,32767.999f};
		vmin = (vector float){-32768.0f,-32768.0f,-32768.0f,-32768.0f};
		vperm4 = (vector unsigned char){0,1,2,3,20,21,22,23,4,5,6,7,28,29,30,31};
		vperm5 = (vector unsigned char){8,9,10,11,20,21,22,23,12,13,14,15,28,29,30,31};
#endif

		vperm1 = vec_lvsl(0,window);
		vperm2 = vec_lvsl(0,samples);
		vperm3 = vec_lvsr(0,samples);
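		/* For 32-bit output, vec_cts(v,16) scales by 2^16 during the
		   float-to-int conversion, mapping the +/-32768.0 float range onto
		   the full signed 32-bit range. vmax is 32767.999f rather than
		   32767.0f because anything above it would overflow the scaled
		   conversion; vec_cts saturates, while the comparisons feed the
		   same shift-by-31 clip counter as in the 16-bit synth. */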
		for (j=4;j;j--)
		{
			SYNTH_ALTIVEC(16);

			vsum = vec_sub(v5,v6);
			v9 = vec_sub(v7,v8);
			v1 = vec_add(vsum,v9);
			vsum = (vector float)vec_cts(v1,16);
			v8 = (vector float)vec_cmpgt(v1,vmax);
			v9 = (vector float)vec_cmplt(v1,vmin);

			vsample1 = vec_ld(0,samples);
			vsample2 = vec_ld(16,samples);
			vsample3 = vec_ld(31,samples);
			v1 = (vector float)vec_perm(vsample1, vsample2, vperm2);
			v2 = (vector float)vec_perm(vsample2, vsample3, vperm2);
			v1 = vec_perm(vsum, v1, vperm4);
			v2 = vec_perm(vsum, v2, vperm5);
			v3 = (vector float)vec_perm(vsample3, vsample2, vperm2);
			v4 = (vector float)vec_perm(vsample2, vsample1, vperm2);
			v5 = vec_perm(v2, v3, vperm3);
			v6 = vec_perm(v1, v2, vperm3);
			v7 = vec_perm(v4, v1, vperm3);
			vec_st((vector signed int)v5,31,samples);
			vec_st((vector signed int)v6,16,samples);
			vec_st((vector signed int)v7,0,samples);
			samples += 8;

			v1 = (vector float)vec_sr((vector unsigned int)v8, vshift);
			v2 = (vector float)vec_sr((vector unsigned int)v9, vshift);
			v1 = (vector float)vec_add((vector unsigned int)v1,(vector unsigned int)v2);
			vclip = vec_sums((vector signed int)v1,vclip);
		}

		for (j=4;j;j--)
		{
			SYNTH_ALTIVEC(-16);

			vsum = vec_add(v5,v6);
			v9 = vec_add(v7,v8);
			v1 = vec_add(vsum,v9);
			vsum = (vector float)vec_cts(v1,16);
			v8 = (vector float)vec_cmpgt(v1,vmax);
			v9 = (vector float)vec_cmplt(v1,vmin);

			vsample1 = vec_ld(0,samples);
			vsample2 = vec_ld(16,samples);
			vsample3 = vec_ld(31,samples);
			v1 = (vector float)vec_perm(vsample1, vsample2, vperm2);
			v2 = (vector float)vec_perm(vsample2, vsample3, vperm2);
			v1 = vec_perm(vsum, v1, vperm4);
			v2 = vec_perm(vsum, v2, vperm5);
			v3 = (vector float)vec_perm(vsample3, vsample2, vperm2);
			v4 = (vector float)vec_perm(vsample2, vsample1, vperm2);
			v5 = vec_perm(v2, v3, vperm3);
			v6 = vec_perm(v1, v2, vperm3);
			v7 = vec_perm(v4, v1, vperm3);
			vec_st((vector signed int)v5,31,samples);
			vec_st((vector signed int)v6,16,samples);
			vec_st((vector signed int)v7,0,samples);
			samples += 8;

			v1 = (vector float)vec_sr((vector unsigned int)v8, vshift);
			v2 = (vector float)vec_sr((vector unsigned int)v9, vshift);
			v1 = (vector float)vec_add((vector unsigned int)v1,(vector unsigned int)v2);
			vclip = vec_sums((vector signed int)v1,vclip);
		}

		vec_st(vclip,0,clip_tmp);
		clip = clip_tmp[3];
	}
	if(final) fr->buffer.fill += 256;

	return clip;
}


int synth_1to1_s32_stereo_altivec(real *bandPtr_l, real *bandPtr_r, mpg123_handle *fr)
{
	int32_t *samples = (int32_t *) (fr->buffer.data+fr->buffer.fill);

	real *b0l, *b0r, **bufl, **bufr;
	int clip;
	int bo1;
#ifndef NO_EQUALIZER
	if(fr->have_eq_settings)
	{
		do_equalizer(bandPtr_l,0,fr->equalizer);
		do_equalizer(bandPtr_r,1,fr->equalizer);
	}
#endif
	fr->bo--;
	fr->bo &= 0xf;
	bufl = fr->real_buffs[0];
	bufr = fr->real_buffs[1];

	if(fr->bo & 0x1)
	{
		b0l = bufl[0];
		b0r = bufr[0];
		bo1 = fr->bo;
		dct64_altivec(bufl[1]+((fr->bo+1)&0xf),bufl[0]+fr->bo,bandPtr_l);
		dct64_altivec(bufr[1]+((fr->bo+1)&0xf),bufr[0]+fr->bo,bandPtr_r);
	}
	else
	{
		b0l = bufl[1];
		b0r = bufr[1];
		bo1 = fr->bo+1;
		dct64_altivec(bufl[0]+fr->bo,bufl[1]+fr->bo+1,bandPtr_l);
		dct64_altivec(bufr[0]+fr->bo,bufr[1]+fr->bo+1,bandPtr_r);
	}


	{
		register int j;
		real *window = fr->decwin + 16 - bo1;

		ALIGNED(16) int clip_tmp[4];
		vector float v1,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,v13;
		vector unsigned char vperm1,vperm2;
		vector float vsum,vsum2,vsum3,vsum4,vsum5,vsum6,vsum7,vsum8,vmax,vmin,vzero;
		vector float vprev;
		vector unsigned int vshift;
		vector signed int vclip;
		vzero = vec_xor(vzero, vzero);
		vclip = vec_xor(vclip, vclip);
		vshift = vec_splat_u32(-1); /* 31 */
#ifdef __APPLE__
		vmax = (vector float)(32767.999f);
		vmin = (vector float)(-32768.0f);
#else
		vmax = (vector float){32767.999f,32767.999f,32767.999f,32767.999f};
		vmin = (vector float){-32768.0f,-32768.0f,-32768.0f,-32768.0f};
#endif

		vperm1 = vec_lvsl(0,window);
		vperm2 = vec_lvsr(0,samples);
		vprev = (vector float)vec_perm(vec_ld(0,samples),vec_ld(0,samples),vec_lvsl(0,samples));
		for (j=4;j;j--)
		{
			SYNTH_STEREO_ALTIVEC(16);

			vsum = vec_sub(vsum,vsum2);
			vsum2 = vec_sub(vsum5,vsum6);
			vsum3 = vec_sub(vsum3,vsum4);
			vsum4 = vec_sub(vsum7,vsum8);
			v1 = vec_add(vsum,vsum3);
			v2 = vec_add(vsum2,vsum4);
			vsum = (vector float)vec_cts(v1,16);
			vsum2 = (vector float)vec_cts(v2,16);
			v5 = (vector float)vec_cmpgt(v1,vmax);
			v6 = (vector float)vec_cmplt(v1,vmin);
			v7 = (vector float)vec_cmpgt(v2,vmax);
			v8 = (vector float)vec_cmplt(v2,vmin);

			v1 = vec_mergeh(vsum, vsum2);
			v2 = vec_mergel(vsum, vsum2);
			v3 = vec_perm(vprev,v1,vperm2);
			v4 = vec_perm(v1,v2,vperm2);
			vprev = v2;
			vec_st((vector signed int)v3,0,samples);
			vec_st((vector signed int)v4,16,samples);
			samples += 8;

			v1 = (vector float)vec_sr((vector unsigned int)v5, vshift);
			v2 = (vector float)vec_sr((vector unsigned int)v6, vshift);
			v3 = (vector float)vec_sr((vector unsigned int)v7, vshift);
			v4 = (vector float)vec_sr((vector unsigned int)v8, vshift);
			v1 = (vector float)vec_add((vector unsigned int)v1,(vector unsigned int)v2);
			v2 = (vector float)vec_add((vector unsigned int)v3,(vector unsigned int)v4);
			vclip = vec_sums((vector signed int)v1,vclip);
			vclip = vec_sums((vector signed int)v2,vclip);
		}

		for (j=4;j;j--)
		{
			SYNTH_STEREO_ALTIVEC(-16);

			vsum = vec_add(vsum,vsum2);
			vsum2 = vec_add(vsum5,vsum6);
			vsum3 = vec_add(vsum3,vsum4);
			vsum4 = vec_add(vsum7,vsum8);
			v1 = vec_add(vsum,vsum3);
			v2 = vec_add(vsum2,vsum4);
			vsum = (vector float)vec_cts(v1,16);
			vsum2 = (vector float)vec_cts(v2,16);
			v5 = (vector float)vec_cmpgt(v1,vmax);
			v6 = (vector float)vec_cmplt(v1,vmin);
			v7 = (vector float)vec_cmpgt(v2,vmax);
			v8 = (vector float)vec_cmplt(v2,vmin);

			v1 = vec_mergeh(vsum, vsum2);
			v2 = vec_mergel(vsum, vsum2);
			v3 = vec_perm(vprev,v1,vperm2);
			v4 = vec_perm(v1,v2,vperm2);
			vprev = v2;
			vec_st((vector signed int)v3,0,samples);
			vec_st((vector signed int)v4,16,samples);
			samples += 8;

			v1 = (vector float)vec_sr((vector unsigned int)v5, vshift);
			v2 = (vector float)vec_sr((vector unsigned int)v6, vshift);
			v3 = (vector float)vec_sr((vector unsigned int)v7, vshift);
			v4 = (vector float)vec_sr((vector unsigned int)v8, vshift);
			v1 = (vector float)vec_add((vector unsigned int)v1,(vector unsigned int)v2);
			v2 = (vector float)vec_add((vector unsigned int)v3,(vector unsigned int)v4);
			vclip = vec_sums((vector signed int)v1,vclip);
			vclip = vec_sums((vector signed int)v2,vclip);
		}

		if((size_t)samples & 0xf)
		{
			v1 = (vector float)vec_perm(vec_ld(0,samples),vec_ld(0,samples),vec_lvsl(0,samples));
			v2 = (vector float)vec_perm(vprev,v1,vperm2);
			vec_st((vector signed int)v2,0,samples);
		}

		vec_st(vclip,0,clip_tmp);
		clip = clip_tmp[3];
	}
	fr->buffer.fill += 256;

	return clip;
}