1 /* this file is a part of amp software, (C) tomislav uzelac 1996,1997
2 */
3 
4 /* transform.c  imdct and polyphase(DCT) transforms
5  *
6  * Created by: tomislav uzelac  May 1996
7  * Karl Anders Oygard optimized this for speed, Mar 13 97
8  * Some optimisations based on ideas from Michael Hipp's mpg123 package
9  */
10 
11 /*
12  * Comments for this file:
13  *
14  * The polyphase algorithm is clearly the most cpu consuming part of mpeg 1
15  * layer 3 decoding.  Thus, there has been some effort to optimise this
16  * particular algorithm.  Currently, everything has been kept in straight C
17  * with no assembler optimisations, but in order to provide efficient paths
18  * for different architectures, alternative implementations of some
19  * critical sections has been done.  You may want to experiment with these,
20  * to see which suits your architecture better.
21  *
22  * Selection of the different implementations is done with the following
23  * defines:
24  *
25  *     HAS_AUTOINCREMENT
26  *
27  *         Define this if your architecture supports preincrementation of
28  *         pointers when referencing (applies to e.g. 68k)
29  *
30  * For those who are optimising amp, check out the Pentium rdtsc code
31  * (define PENTIUM_RDTSC).  This code uses the rdtsc counter for showing
32  * how many cycles are spent in different parts of the code.
33  */
34 
35 #include <math.h>
36 #include <sys/types.h>
37 #include <unistd.h>
38 #include <string.h>
39 
40 #include "audio.h"
41 #include "getdata.h"
42 #include "misc2.h"
43 
44 #define TRANSFORM
45 #include "transform.h"
46 
47 #define PI12      0.261799387f
48 #define PI36      0.087266462f
49 
50 #undef ARCH_i586
51 
imdct_init()52 void imdct_init()
53 {
54   int i;
55 
56   for(i=0;i<36;i++) /* 0 */
57     win[0][i] = (float) sin(PI36 *(i+0.5));
58   for(i=0;i<18;i++) /* 1 */
59     win[1][i] = (float) sin(PI36 *(i+0.5));
60   for(i=18;i<24;i++)
61     win[1][i] = 1.0f;
62   for(i=24;i<30;i++)
63     win[1][i] = (float) sin(PI12 *(i+0.5-18));
64   for(i=30;i<36;i++)
65     win[1][i] = 0.0f;
66   for(i=0;i<6;i++) /* 3 */
67     win[3][i] = 0.0f;
68   for(i=6;i<12;i++)
69     win[3][i] = (float) sin(PI12 * (i+ 0.5 - 6.0));
70   for(i=12;i<18;i++)
71     win[3][i] = 1.0f;
72   for(i=18;i<36;i++)
73     win[3][i] = (float) sin(PI36 * (i + 0.5));
74 }
75 
76 /* This uses Byeong Gi Lee's Fast Cosine Transform algorithm to decompose
77    the 36 point and 12 point IDCT's into 9 point and 3 point IDCT's,
78    respectively. Then the 9 point IDCT is computed by a modified version of
79    Mikko Tommila's IDCT algorithm, based on the WFTA. See his comments
80    before the first 9 point IDCT. The 3 point IDCT is already efficient to
81    implement. -- Jeff Tsay. */
82 /* I got the unrolled IDCT from Jeff Tsay; the code is presumably by
83    Francois-Raymond Boyer - I unrolled it a little further. tu */
84 
imdct(int win_type,int sb,int ch)85 void imdct(int win_type,int sb,int ch)
86 {
87 /*------------------------------------------------------------------*/
88 /*                                                                  */
89 /*    Function: Calculation of the inverse MDCT                     */
90 /*    In the case of short blocks the 3 output vectors are already  */
91 /*    overlapped and added in this modul.                           */
92 /*                                                                  */
93 /*    New layer3                                                    */
94 /*                                                                  */
95 /*------------------------------------------------------------------*/
96 
97        float tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmp9, tmp10, tmp11;
98 
99        register float  save;
100        float  pp1, pp2;
101        float   *win_bt;
102        int     i, p, ss;
103 		 float *in = xr[ch][sb];
104 		 float *s_p = s[ch][sb];
105 		 float *res_p = res[sb];
106 		 float out[36];
107 
108    if(win_type == 2){
109 		for(p=0;p<36;p+=9) {
110 			out[p]   = out[p+1] = out[p+2] = out[p+3] =
111 			out[p+4] = out[p+5] = out[p+6] = out[p+7] =
112 			out[p+8] = 0.0f;
113 		}
114 
115 	for(ss=0;ss<18;ss+=6) {
116 
117 	/*
118 	 *  12 point IMDCT
119 	 */
120 
121 	        /* Begin 12 point IDCT */
122 
123 	        /* Input aliasing for 12 pt IDCT */
124 		in[5+ss]+=in[4+ss];in[4+ss]+=in[3+ss];in[3+ss]+=in[2+ss];
125 		in[2+ss]+=in[1+ss];in[1+ss]+=in[0+ss];
126 
127 		/* Input aliasing on odd indices (for 6 point IDCT) */
128 		in[5+ss] += in[3+ss];  in[3+ss]  += in[1+ss];
129 
130 		/* 3 point IDCT on even indices */
131 
132 		pp2 = in[4+ss] * 0.500000000f;
133 		pp1 = in[2+ss] * 0.866025403f;
134 		save = in[0+ss] + pp2;
135 		tmp1 = in[0+ss] - in[4+ss];
136 		tmp0 = save + pp1;
137 		tmp2 = save - pp1;
138 
139 		/* End 3 point IDCT on even indices */
140 
141 		/* 3 point IDCT on odd indices (for 6 point IDCT) */
142 
143 		pp2 = in[5+ss] * 0.500000000f;
144 		pp1 = in[3+ss] * 0.866025403f;
145 		save = in[1+ss] + pp2;
146 		tmp4 = in[1+ss] - in[5+ss];
147 		tmp5 = save + pp1;
148 		tmp3 = save - pp1;
149 
150 		/* End 3 point IDCT on odd indices */
151 
152 		/* Twiddle factors on odd indices (for 6 point IDCT) */
153 
154 		tmp3 *= 1.931851653f;
155 		tmp4 *= 0.707106781f;
156 		tmp5 *= 0.517638090f;
157 
158 		/* Output butterflies on 2 3 point IDCT's (for 6 point IDCT) */
159 
160 		save = tmp0;
161 		tmp0 += tmp5;
162 		tmp5 = save - tmp5;
163 
164 		save = tmp1;
165 		tmp1 += tmp4;
166 		tmp4 = save - tmp4;
167 
168 		save = tmp2;
169 		tmp2 += tmp3;
170 		tmp3 = save - tmp3;
171 
172 		/* End 6 point IDCT */
173 
174 		/* Twiddle factors on indices (for 12 point IDCT) */
175 
176 		tmp0 *= 0.504314480f;
177 		tmp1 *= 0.541196100f;
178 		tmp2 *= 0.630236207f;
179 		tmp3 *= 0.821339815f;
180 		tmp4 *= 1.306562965f;
181 		tmp5 *= 3.830648788f;
182 
183 		/* End 12 point IDCT */
184 
185 		/* Shift to 12 point modified IDCT, multiply by window type 2 */
186 		tmp8  = tmp0 * -0.793353340f;
187 		tmp9  = tmp0 * -0.608761429f;
188 		tmp7  = tmp1 * -0.923879532f;
189 		tmp10 = tmp1 * -0.382683432f;
190 		tmp6  = tmp2 * -0.991444861f;
191 		tmp11 = tmp2 * -0.130526192f;
192 
193 		tmp0  = tmp3;
194 		tmp1  = tmp4 *  0.382683432f;
195 		tmp2  = tmp5 *  0.608761429f;
196 
197 		tmp3  = tmp5 * -0.793353340f;
198 		tmp4  = tmp4 * -0.923879532f;
199 		tmp5  = tmp0 * -0.991444861f;
200 
201 		tmp0 *= 0.130526192f;
202 
203 		out[ss + 6]  += tmp0;
204 		out[ss + 7]  += tmp1;
205 		out[ss + 8]  += tmp2;
206 		out[ss + 9]  += tmp3;
207 		out[ss + 10] += tmp4;
208 		out[ss + 11] += tmp5;
209 		out[ss + 12] += tmp6;
210 		out[ss + 13] += tmp7;
211 		out[ss + 14] += tmp8;
212 		out[ss + 15] += tmp9;
213 		out[ss + 16] += tmp10;
214 		out[ss + 17] += tmp11;
215 
216 	}
217 	if (sb&1) {
218 		for (i=0;i<18;i+=2) res_p[i]=out[i] + s_p[i];
219 		for (i=1;i<18;i+=2) res_p[i]=-out[i] - s_p[i];
220 	} else
221 		for (i=0;i<18;i++) res_p[i]=out[i] + s_p[i];
222 	for (i=18;i<36;i++) s_p[i-18]=out[i];
223 
224     } else {
225 /*
226  * 36 point IDCT ****************************************************************
227  */
228 	float tmp[18];
229 
230       /* input aliasing for 36 point IDCT */
231       in[17]+=in[16]; in[16]+=in[15]; in[15]+=in[14]; in[14]+=in[13];
232       in[13]+=in[12]; in[12]+=in[11]; in[11]+=in[10]; in[10]+=in[9];
233       in[9] +=in[8];  in[8] +=in[7];  in[7] +=in[6];  in[6] +=in[5];
234       in[5] +=in[4];  in[4] +=in[3];  in[3] +=in[2];  in[2] +=in[1];
235       in[1] +=in[0];
236 
237       /* 18 point IDCT for odd indices */
238 
239       /* input aliasing for 18 point IDCT */
240       in[17]+=in[15]; in[15]+=in[13]; in[13]+=in[11]; in[11]+=in[9];
241       in[9] +=in[7];  in[7] +=in[5];  in[5] +=in[3];  in[3] +=in[1];
242 
243 
244 {
245    float tmp0,tmp1,tmp2,tmp3,tmp4,tmp0_,tmp1_,tmp2_,tmp3_;
246    float tmp0o,tmp1o,tmp2o,tmp3o,tmp4o,tmp0_o,tmp1_o,tmp2_o,tmp3_o;
247 
248 /* Fast 9 Point Inverse Discrete Cosine Transform
249 //
250 // By  Francois-Raymond Boyer
251 //         mailto:boyerf@iro.umontreal.ca
252 //         http://www.iro.umontreal.ca/~boyerf
253 //
254 // The code has been optimized for Intel processors
255 //  (takes a lot of time to convert float to and from iternal FPU representation)
256 //
257 // It is a simple "factorization" of the IDCT matrix.
258 */
259    /* 9 point IDCT on even indices */
260    {
261    /* 5 points on odd indices (not realy an IDCT) */
262    float i0 = in[0]+in[0];
263    float i0p12 = i0 + in[12];
264 
265    tmp0 = i0p12 + in[4]*1.8793852415718f  + in[8]*1.532088886238f   + in[16]*0.34729635533386f;
266    tmp1 = i0    + in[4]                   - in[8] - in[12] - in[12] - in[16];
267    tmp2 = i0p12 - in[4]*0.34729635533386f - in[8]*1.8793852415718f  + in[16]*1.532088886238f;
268    tmp3 = i0p12 - in[4]*1.532088886238f   + in[8]*0.34729635533386f - in[16]*1.8793852415718f;
269    tmp4 = in[0] - in[4]                   + in[8] - in[12]          + in[16];
270    }
271    {
272    float i6_ = in[6]*1.732050808f;
273 
274    tmp0_ = in[2]*1.9696155060244f  + i6_ + in[10]*1.2855752193731f  + in[14]*0.68404028665134f;
275    tmp1_ = (in[2]                        - in[10]                   - in[14])*1.732050808f;
276    tmp2_ = in[2]*1.2855752193731f  - i6_ - in[10]*0.68404028665134f + in[14]*1.9696155060244f;
277    tmp3_ = in[2]*0.68404028665134f - i6_ + in[10]*1.9696155060244f  - in[14]*1.2855752193731f;
278    }
279 
280    /* 9 point IDCT on odd indices */
281    {
282    /* 5 points on odd indices (not realy an IDCT) */
283    float i0 = in[0+1]+in[0+1];
284    float i0p12 = i0 + in[12+1];
285 
286    tmp0o = i0p12   + in[4+1]*1.8793852415718f  + in[8+1]*1.532088886238f       + in[16+1]*0.34729635533386f;
287    tmp1o = i0      + in[4+1]                   - in[8+1] - in[12+1] - in[12+1] - in[16+1];
288    tmp2o = i0p12   - in[4+1]*0.34729635533386f - in[8+1]*1.8793852415718f      + in[16+1]*1.532088886238f;
289    tmp3o = i0p12   - in[4+1]*1.532088886238f   + in[8+1]*0.34729635533386f     - in[16+1]*1.8793852415718f;
290    tmp4o = (in[0+1] - in[4+1]                   + in[8+1] - in[12+1]            + in[16+1])*0.707106781f; /* Twiddled */
291    }
292    {
293    /* 4 points on even indices */
294    float i6_ = in[6+1]*1.732050808f;		/* Sqrt[3] */
295 
296    tmp0_o = in[2+1]*1.9696155060244f  + i6_ + in[10+1]*1.2855752193731f  + in[14+1]*0.68404028665134f;
297    tmp1_o = (in[2+1]                        - in[10+1]                   - in[14+1])*1.732050808f;
298    tmp2_o = in[2+1]*1.2855752193731f  - i6_ - in[10+1]*0.68404028665134f + in[14+1]*1.9696155060244f;
299    tmp3_o = in[2+1]*0.68404028665134f - i6_ + in[10+1]*1.9696155060244f  - in[14+1]*1.2855752193731f;
300    }
301 
302    /* Twiddle factors on odd indices
303    // and
304    // Butterflies on 9 point IDCT's
305    // and
306    // twiddle factors for 36 point IDCT
307    */
308    {
309    float e, o;
310    e = tmp0 + tmp0_; o = (tmp0o + tmp0_o)*0.501909918f; tmp[0] = (e + o)*(-0.500476342f*.5f);    tmp[17] = (e - o)*(-11.46279281f*.5f);
311    e = tmp1 + tmp1_; o = (tmp1o + tmp1_o)*0.517638090f; tmp[1] = (e + o)*(-0.504314480f*.5f);    tmp[16] = (e - o)*(-3.830648788f*.5f);
312    e = tmp2 + tmp2_; o = (tmp2o + tmp2_o)*0.551688959f; tmp[2] = (e + o)*(-0.512139757f*.5f);    tmp[15] = (e - o)*(-2.310113158f*.5f);
313    e = tmp3 + tmp3_; o = (tmp3o + tmp3_o)*0.610387294f; tmp[3] = (e + o)*(-0.524264562f*.5f);    tmp[14] = (e - o)*(-1.662754762f*.5f);
314                                                         tmp[4] = (tmp4 + tmp4o)*(-0.541196100f); tmp[13] = (tmp4 - tmp4o)*(-1.306562965f);
315    e = tmp3 - tmp3_; o = (tmp3o - tmp3_o)*0.871723397f; tmp[5] = (e + o)*(-0.563690973f*.5f);    tmp[12] = (e - o)*(-1.082840285f*.5f);
316    e = tmp2 - tmp2_; o = (tmp2o - tmp2_o)*1.183100792f; tmp[6] = (e + o)*(-0.592844523f*.5f);    tmp[11] = (e - o)*(-0.930579498f*.5f);
317    e = tmp1 - tmp1_; o = (tmp1o - tmp1_o)*1.931851653f; tmp[7] = (e + o)*(-0.630236207f*.5f);    tmp[10] = (e - o)*(-0.821339815f*.5f);
318    e = tmp0 - tmp0_; o = (tmp0o - tmp0_o)*5.736856623f; tmp[8] = (e + o)*(-0.678170852f*.5f);    tmp[9] =  (e - o)*(-0.740093616f*.5f);
319    }
320    }
321 	/* shift to modified IDCT */
322 	win_bt = win[win_type];
323 
324 	if (sb&1) {
325 		res_p[0] =  -tmp[9]  * win_bt[0] + s_p[0];
326 		res_p[1] =-(-tmp[10] * win_bt[1] + s_p[1]);
327 		res_p[2] =  -tmp[11] * win_bt[2] + s_p[2];
328 		res_p[3] =-(-tmp[12] * win_bt[3] + s_p[3]);
329 		res_p[4] =  -tmp[13] * win_bt[4] + s_p[4];
330 		res_p[5] =-(-tmp[14] * win_bt[5] + s_p[5]);
331 		res_p[6] =  -tmp[15] * win_bt[6] + s_p[6];
332 		res_p[7] =-(-tmp[16] * win_bt[7] + s_p[7]);
333 		res_p[8] =  -tmp[17] * win_bt[8] + s_p[8];
334 
335 		res_p[9] = -(tmp[17] * win_bt[9] + s_p[9]);
336 		res_p[10]=  tmp[16] * win_bt[10] + s_p[10];
337 		res_p[11]=-(tmp[15] * win_bt[11] + s_p[11]);
338 		res_p[12]=  tmp[14] * win_bt[12] + s_p[12];
339 		res_p[13]=-(tmp[13] * win_bt[13] + s_p[13]);
340 		res_p[14]=  tmp[12] * win_bt[14] + s_p[14];
341 		res_p[15]=-(tmp[11] * win_bt[15] + s_p[15]);
342 		res_p[16]=  tmp[10] * win_bt[16] + s_p[16];
343 		res_p[17]=-(tmp[9]  * win_bt[17] + s_p[17]);
344 	} else {
345 		res_p[0] = -tmp[9]  * win_bt[0] + s_p[0];
346 		res_p[1] = -tmp[10] * win_bt[1] + s_p[1];
347 		res_p[2] = -tmp[11] * win_bt[2] + s_p[2];
348 		res_p[3] = -tmp[12] * win_bt[3] + s_p[3];
349 		res_p[4] = -tmp[13] * win_bt[4] + s_p[4];
350 		res_p[5] = -tmp[14] * win_bt[5] + s_p[5];
351 		res_p[6] = -tmp[15] * win_bt[6] + s_p[6];
352 		res_p[7] = -tmp[16] * win_bt[7] + s_p[7];
353 		res_p[8] = -tmp[17] * win_bt[8] + s_p[8];
354 
355 		res_p[9] = tmp[17] * win_bt[9] + s_p[9];
356 		res_p[10]= tmp[16] * win_bt[10] + s_p[10];
357 		res_p[11]= tmp[15] * win_bt[11] + s_p[11];
358 		res_p[12]= tmp[14] * win_bt[12] + s_p[12];
359 		res_p[13]= tmp[13] * win_bt[13] + s_p[13];
360 		res_p[14]= tmp[12] * win_bt[14] + s_p[14];
361 		res_p[15]= tmp[11] * win_bt[15] + s_p[15];
362 		res_p[16]= tmp[10] * win_bt[16] + s_p[16];
363 		res_p[17]= tmp[9]  * win_bt[17] + s_p[17];
364 	}
365 
366 	s_p[0]= tmp[8]  * win_bt[18];
367 	s_p[1]= tmp[7]  * win_bt[19];
368 	s_p[2]= tmp[6]  * win_bt[20];
369 	s_p[3]= tmp[5]  * win_bt[21];
370 	s_p[4]= tmp[4]  * win_bt[22];
371 	s_p[5]= tmp[3]  * win_bt[23];
372 	s_p[6]= tmp[2]  * win_bt[24];
373 	s_p[7]= tmp[1]  * win_bt[25];
374 	s_p[8]= tmp[0]  * win_bt[26];
375 
376 	s_p[9]= tmp[0]  * win_bt[27];
377 	s_p[10]= tmp[1]  * win_bt[28];
378 	s_p[11]= tmp[2]  * win_bt[29];
379 	s_p[12]= tmp[3]  * win_bt[30];
380 	s_p[13]= tmp[4]  * win_bt[31];
381 	s_p[14]= tmp[5]  * win_bt[32];
382 	s_p[15]= tmp[6]  * win_bt[33];
383 	s_p[16]= tmp[7]  * win_bt[34];
384 	s_p[17]= tmp[8]  * win_bt[35];
385     }
386 }
387 
388 /* fast DCT according to Lee[84]
389  * reordering according to Konstantinides[94]
390  */
poly(const int ch,int f)391 void poly(const int ch,int f)
392 {
393 static float u[2][2][17][16]; /* no v[][], it's redundant */
394 static int u_start[2]={0,0}; /* first element of u[][] */
395 static int u_div[2]={0,0}; /* which part of u[][] is currently used */
396 int start = u_start[ch];
397 int div = u_div[ch];
398 float (*u_p)[16];
399 
400 #if defined(PENTIUM_RDTSC)
401 unsigned int cnt4, cnt3, cnt2, cnt1;
402 static int min_cycles = 99999999;
403 
404 	__asm__(".byte 0x0f,0x31" : "=a" (cnt1), "=d" (cnt4));
405 #endif
406 
407 	{
408 	float d16,d17,d18,d19,d20,d21,d22,d23,d24,d25,d26,d27,d28,d29,d30,d31;
409 	float d0,d1,d2,d3,d4,d5,d6,d7,d8,d9,d10,d11,d12,d13,d14,d15;
410 
411 	/* step 1: initial reordering and 1st (16 wide) butterflies
412 	*/
413 
414 	d0 = res[ 0][f]; d16=(d0  - res[31][f]) *  b1; d0 += res[31][f];
415 	d1 = res[ 1][f]; d17=(d1  - res[30][f]) *  b3; d1 += res[30][f];
416 	d3 = res[ 2][f]; d19=(d3  - res[29][f]) *  b5; d3 += res[29][f];
417 	d2 = res[ 3][f]; d18=(d2  - res[28][f]) *  b7; d2 += res[28][f];
418 	d6 = res[ 4][f]; d22=(d6  - res[27][f]) *  b9; d6 += res[27][f];
419 	d7 = res[ 5][f]; d23=(d7  - res[26][f]) * b11; d7 += res[26][f];
420 	d5 = res[ 6][f]; d21=(d5  - res[25][f]) * b13; d5 += res[25][f];
421 	d4 = res[ 7][f]; d20=(d4  - res[24][f]) * b15; d4 += res[24][f];
422 	d12= res[ 8][f]; d28=(d12 - res[23][f]) * b17; d12+= res[23][f];
423 	d13= res[ 9][f]; d29=(d13 - res[22][f]) * b19; d13+= res[22][f];
424 	d15= res[10][f]; d31=(d15 - res[21][f]) * b21; d15+= res[21][f];
425 	d14= res[11][f]; d30=(d14 - res[20][f]) * b23; d14+= res[20][f];
426 	d10= res[12][f]; d26=(d10 - res[19][f]) * b25; d10+= res[19][f];
427 	d11= res[13][f]; d27=(d11 - res[18][f]) * b27; d11+= res[18][f];
428 	d9 = res[14][f]; d25=(d9  - res[17][f]) * b29; d9 += res[17][f];
429 	d8 = res[15][f]; d24=(d8  - res[16][f]) * b31; d8 += res[16][f];
430 
431 	{
432 	float c0,c1,c2,c3,c4,c5,c6,c7,c8,c9,c10,c11,c12,c13,c14,c15;
433 
434 /* a test to see what can be done with memory separation
435  * first we process indexes 0-15
436 */
437 	c0 = d0 + d8 ; c8 = ( d0 - d8 ) *  b2;
438 	c1 = d1 + d9 ; c9 = ( d1 - d9 ) *  b6;
439 	c2 = d2 + d10; c10= ( d2 - d10) * b14;
440 	c3 = d3 + d11; c11= ( d3 - d11) * b10;
441 	c4 = d4 + d12; c12= ( d4 - d12) * b30;
442 	c5 = d5 + d13; c13= ( d5 - d13) * b26;
443 	c6 = d6 + d14; c14= ( d6 - d14) * b18;
444 	c7 = d7 + d15; c15= ( d7 - d15) * b22;
445 
446 	/* step 3: 4-wide butterflies
447 	*/
448 	d0 = c0 + c4 ; d4 = ( c0 - c4 ) *  b4;
449 	d1 = c1 + c5 ; d5 = ( c1 - c5 ) * b12;
450 	d2 = c2 + c6 ; d6 = ( c2 - c6 ) * b28;
451 	d3 = c3 + c7 ; d7 = ( c3 - c7 ) * b20;
452 
453 	d8 = c8 + c12; d12= ( c8 - c12) *  b4;
454 	d9 = c9 + c13; d13= ( c9 - c13) * b12;
455 	d10= c10+ c14; d14= (c10 - c14) * b28;
456 	d11= c11+ c15; d15= (c11 - c15) * b20;
457 
458 
459 	/* step 4: 2-wide butterflies
460 	*/
461 	{
462 	float rb8 = b8;
463 	float rb24 = b24;
464 
465 /**/	c0 = d0 + d2 ; c2 = ( d0 - d2 ) *  rb8;
466 	c1 = d1 + d3 ; c3 = ( d1 - d3 ) * rb24;
467 /**/	c4 = d4 + d6 ; c6 = ( d4 - d6 ) *  rb8;
468 	c5 = d5 + d7 ; c7 = ( d5 - d7 ) * rb24;
469 /**/	c8 = d8 + d10; c10= ( d8 - d10) *  rb8;
470 	c9 = d9 + d11; c11= ( d9 - d11) * rb24;
471 /**/	c12= d12+ d14; c14= (d12 - d14) *  rb8;
472 	c13= d13+ d15; c15= (d13 - d15) * rb24;
473 	}
474 
475 	/* step 5: 1-wide butterflies
476 	*/
477 	{
478 	float rb16 = b16;
479 
480 	/* this is a little 'hacked up'
481 	*/
482 	d0 = (-c0 -c1) * 2; d1 = ( c0 - c1 ) * rb16;
483 	d2 = c2 + c3; d3 = ( c2 - c3 ) * rb16;
484 	d3 -= d2;
485 
486 	d4 = c4 +c5; d5 = ( c4 - c5 ) * rb16;
487 	d5 += d4;
488 	d7 = -d5;
489 	d7 += ( c6 - c7 ) * rb16; d6 = +c6 +c7;
490 
491 	d8 = c8 + c9 ; d9 = ( c8 - c9 ) * rb16;
492 	d11= +d8 +d9;
493 	d11 +=(c10 - c11) * rb16; d10= c10+ c11;
494 
495 	d12 = c12+ c13; d13 = (c12 - c13) * rb16;
496 	d13 += -d8-d9+d12;
497 	d14 = c14+ c15; d15 = (c14 - c15) * rb16;
498 	d15-=d11;
499 	d14 += -d8 -d10;
500 	}
501 
502         /* step 6: final resolving & reordering
503          * the other 32 are stored for use with the next granule
504          */
505 
506         u_p = (float (*)[16]) &u[ch][div][0][start];
507 
508 /*16*/  u_p[ 0][0] =+d1 ;
509         u_p[ 2][0] = +d9 -d14;
510 /*20*/  u_p[ 4][0] = +d5 -d6;
511         u_p[ 6][0] = -d10 +d13;
512 /*24*/  u_p[ 8][0] =d3;
513         u_p[10][0] = -d8 -d9 +d11 -d13;
514 /*28*/  u_p[12][0] = +d7;
515         u_p[14][0] = +d15;
516 
517         /* the other 32 are stored for use with the next granule
518          */
519 
520         u_p = (float (*)[16]) &u[ch][!div][0][start];
521 
522 /*0*/   u_p[16][0] = d0;
523         u_p[14][0] = -(+d8 );
524 /*4*/   u_p[12][0] = -(+d4 );
525         u_p[10][0] = -(-d8 +d12 );
526 /*8*/   u_p[ 8][0] = -(+d2 );
527         u_p[ 6][0] = -(+d8 +d10 -d12 );
528 /*12*/  u_p[ 4][0] = -(-d4 +d6 );
529         u_p[ 2][0] = -d14;
530         u_p[ 0][0] = -d1;
531 	}
532 
533 	{
534 	float c0,c1,c2,c3,c4,c5,c6,c7,c8,c9,c10,c11,c12,c13,c14,c15;
535 
536 /* memory separation, second part
537 */
538 /* 2
539 */
540         c0=d16 + d24; c8= (d16 - d24) *  b2;
541         c1=d17 + d25; c9= (d17 - d25) *  b6;
542         c2=d18 + d26; c10= (d18 - d26) * b14;
543         c3=d19 + d27; c11= (d19 - d27) * b10;
544         c4=d20 + d28; c12= (d20 - d28) * b30;
545         c5=d21 + d29; c13= (d21 - d29) * b26;
546         c6=d22 + d30; c14= (d22 - d30) * b18;
547         c7=d23 + d31; c15= (d23 - d31) * b22;
548 
549 /* 3
550 */
551         d16= c0+ c4; d20= (c0 - c4) *  b4;
552         d17= c1+ c5; d21= (c1 - c5) * b12;
553         d18= c2+ c6; d22= (c2 - c6) * b28;
554         d19= c3+ c7; d23= (c3 - c7) * b20;
555 
556         d24= c8+ c12; d28= (c8 - c12) *  b4;
557         d25= c9+ c13; d29= (c9 - c13) * b12;
558         d26= c10+ c14; d30= (c10 - c14) * b28;
559         d27= c11+ c15; d31= (c11 - c15) * b20;
560 
561 /* 4
562 */
563         {
564         float rb8 = b8;
565         float rb24 = b24;
566 
567 /**/    c0= d16+ d18; c2= (d16 - d18) *  rb8;
568         c1= d17+ d19; c3= (d17 - d19) * rb24;
569 /**/    c4= d20+ d22; c6= (d20 - d22) *  rb8;
570         c5= d21+ d23; c7= (d21 - d23) * rb24;
571 /**/    c8= d24+ d26; c10= (d24 - d26) *  rb8;
572         c9= d25+ d27; c11= (d25 - d27) * rb24;
573 /**/    c12= d28+ d30; c14= (d28 - d30) *  rb8;
574         c13= d29+ d31; c15= (d29 - d31) * rb24;
575         }
576 
577 /* 5
578 */
579         {
580         float rb16 = b16;
581         d16= c0+ c1; d17= (c0 - c1) * rb16;
582         d18= c2+ c3; d19= (c2 - c3) * rb16;
583 
584         d20= c4+ c5; d21= (c4 - c5) * rb16;
585         d20+=d16; d21+=d17;
586         d22= c6+ c7; d23= (c6 - c7) * rb16;
587         d22+=d16; d22+=d18;
588         d23+=d16; d23+=d17; d23+=d19;
589 
590 
591         d24= c8+ c9; d25= (c8 - c9) * rb16;
592         d26= c10+ c11; d27= (c10 - c11) * rb16;
593         d26+=d24;
594         d27+=d24; d27+=d25;
595 
596         d28= c12+ c13; d29= (c12 - c13) * rb16;
597         d28-=d20; d29+=d28; d29-=d21;
598         d30= c14+ c15; d31= (c14 - c15) * rb16;
599         d30-=d22;
600         d31-=d23;
601 	}
602 
603 	/* step 6: final resolving & reordering
604 	 * the other 32 are stored for use with the next granule
605 	 */
606 
607 	u_p = (float (*)[16]) &u[ch][!div][0][start];
608 
609 	u_p[ 1][0] = -(+d30 );
610 	u_p[ 3][0] = -(+d22 -d26 );
611 	u_p[ 5][0] = -(-d18 -d20 +d26 );
612 	u_p[ 7][0] = -(+d18 -d28 );
613 	u_p[ 9][0] = -(+d28 );
614 	u_p[11][0] = -(+d20 -d24 );
615 	u_p[13][0] = -(-d16 +d24 );
616 	u_p[15][0] = -(+d16 );
617 
618 	/* the other 32 are stored for use with the next granule
619 	 */
620 
621 	u_p = (float (*)[16]) &u[ch][div][0][start];
622 
623 	u_p[15][0] = +d31;
624 	u_p[13][0] = +d23 -d27;
625 	u_p[11][0] = -d19 -d20 -d21 +d27;
626 	u_p[ 9][0] = +d19 -d29;
627 	u_p[ 7][0] = -d18 +d29;
628 	u_p[ 5][0] = +d18 +d20 +d21 -d25 -d26;
629 	u_p[ 3][0] = -d17 -d22 +d25 +d26;
630 	u_p[ 1][0] = +d17 -d30;
631 	}
632 	}
633 
634 #if defined(PENTIUM_RDTSC1)
635 	__asm__(".byte 0x0f,0x31" : "=a" (cnt3), "=d" (cnt4));
636 #endif
637 
638 	/* we're doing dewindowing and calculating final samples now
639 	 */
640 
641 #if defined(ARCH_i586)
642 	/* x86 assembler optimisations.  These optimisations are tuned
643            specifically for Intel Pentiums. */
644 
645     asm("movl $15,%%eax\n\t"\
646         "1:\n\t"\
647 	"flds (%0)\n\t"\
648 	"fmuls (%1)\n\t"\
649 	"flds 4(%0)\n\t"\
650 	"fmuls 4(%1)\n\t"\
651 	"flds 8(%0)\n\t"\
652 	"fmuls 8(%1)\n\t"\
653 	"fxch %%st(2)\n\t"\
654 	"faddp\n\t"\
655 	"flds 12(%0)\n\t"\
656 	"fmuls 12(%1)\n\t"\
657 	"fxch %%st(2)\n\t"\
658 	"faddp\n\t"\
659 	"flds 16(%0)\n\t"\
660 	"fmuls 16(%1)\n\t"\
661 	"fxch %%st(2)\n\t"\
662 	"faddp\n\t"\
663 	"flds 20(%0)\n\t"\
664 	"fmuls 20(%1)\n\t"\
665 	"fxch %%st(2)\n\t"\
666 	"faddp\n\t"\
667 	"flds 24(%0)\n\t"\
668 	"fmuls 24(%1)\n\t"\
669 	"fxch %%st(2)\n\t"\
670 	"faddp\n\t"\
671 	"flds 28(%0)\n\t"\
672 	"fmuls 28(%1)\n\t"\
673 	"fxch %%st(2)\n\t"\
674 	"faddp\n\t"\
675 	"flds 32(%0)\n\t"\
676 	"fmuls 32(%1)\n\t"\
677 	"fxch %%st(2)\n\t"\
678 	"faddp\n\t"\
679 	"flds 36(%0)\n\t"\
680 	"fmuls 36(%1)\n\t"\
681 	"fxch %%st(2)\n\t"\
682 	"faddp\n\t"\
683 	"flds 40(%0)\n\t"\
684 	"fmuls 40(%1)\n\t"\
685 	"fxch %%st(2)\n\t"\
686 	"faddp\n\t"\
687 	"flds 44(%0)\n\t"\
688 	"fmuls 44(%1)\n\t"\
689 	"fxch %%st(2)\n\t"\
690 	"faddp\n\t"\
691 	"flds 48(%0)\n\t"\
692 	"fmuls 48(%1)\n\t"\
693 	"fxch %%st(2)\n\t"\
694 	"faddp\n\t"\
695 	"flds 52(%0)\n\t"\
696 	"fmuls 52(%1)\n\t"\
697 	"fxch %%st(2)\n\t"\
698 	"faddp\n\t"\
699 	"flds 56(%0)\n\t"\
700 	"fmuls 56(%1)\n\t"\
701 	"fxch %%st(2)\n\t"\
702 	"faddp\n\t"\
703 	"flds 60(%0)\n\t"\
704 	"fmuls 60(%1)\n\t"\
705 	"fxch %%st(2)\n\t"\
706 	"faddp\n\t"\
707 	"addl $64,%0\n\t"\
708 	"addl $128,%1\n\t"\
709 	"subl $4,%%esp\n\t"\
710 	"faddp\n\t"\
711 	"fistpl (%%esp)\n\t"\
712 	"popl %%ecx\n\t"\
713 	"cmpl $32767,%%ecx\n\t"\
714 	"jle 2f\n\t"\
715 	"movw $32767,%%cx\n\t"\
716 	"jmp 3f\n\t"\
717 	"2: cmpl $-32768,%%ecx\n\t"\
718 	"jge 3f\n\t"\
719 	"movw $-32768,%%cx\n\t"\
720 	"3: movw %%cx,(%2)\n\t"\
721 	"addl %3,%2\n\t"\
722 	"decl %%eax\n\t"\
723 	"jns 1b\n\t"\
724 
725 	"testb $1,%4\n\t"\
726         "je 4f\n\t"
727 
728 	"flds (%0)\n\t"\
729 	"fmuls (%1)\n\t"\
730 	"flds 8(%0)\n\t"\
731 	"fmuls 8(%1)\n\t"\
732 	"flds 16(%0)\n\t"\
733 	"fmuls 16(%1)\n\t"\
734 	"fxch %%st(2)\n\t"\
735 	"faddp\n\t"\
736 	"flds 24(%0)\n\t"\
737 	"fmuls 24(%1)\n\t"\
738 	"fxch %%st(2)\n\t"\
739 	"faddp\n\t"\
740 	"flds 32(%0)\n\t"\
741 	"fmuls 32(%1)\n\t"\
742 	"fxch %%st(2)\n\t"\
743 	"faddp\n\t"\
744 	"flds 40(%0)\n\t"\
745 	"fmuls 40(%1)\n\t"\
746 	"fxch %%st(2)\n\t"\
747 	"faddp\n\t"\
748 	"flds 48(%0)\n\t"\
749 	"fmuls 48(%1)\n\t"\
750 	"fxch %%st(2)\n\t"\
751 	"faddp\n\t"\
752 	"flds 56(%0)\n\t"\
753 	"fmuls 56(%1)\n\t"\
754 	"fxch %%st(2)\n\t"\
755 	"faddp\n\t"\
756 	"subl $4,%%esp\n\t"\
757 	"subl $64,%0\n\t"\
758 	"subl $192,%1\n\t"\
759 	"faddp\n\t"\
760 	"fistpl (%%esp)\n\t"\
761 	"popl %%ecx\n\t"\
762 	"cmpl $32767,%%ecx\n\t"\
763 	"jle 2f\n\t"\
764 	"movw $32767,%%cx\n\t"\
765 	"jmp 3f\n\t"\
766 	"2: cmpl $-32768,%%ecx\n\t"\
767 	"jge 3f\n\t"\
768 	"movw $-32768,%%cx\n\t"\
769 	"3: movw %%cx,(%2)\n\t"\
770 
771 	"movl %5,%%ecx\n\t"\
772 	"sall $3,%%ecx\n\t"\
773 	"addl %%ecx,%1\n\t"\
774 	"addl %3,%2\n\t"\
775 	"movl $14,%%eax\n\t"\
776 
777 	"1:flds 4(%0)\n\t"\
778 	"fmuls 56(%1)\n\t"\
779 	"flds (%0)\n\t"\
780 	"fmuls 60(%1)\n\t"\
781 	"flds 12(%0)\n\t"\
782 	"fmuls 48(%1)\n\t"\
783 	"fxch %%st(2)\n\t"\
784 	"fsubp\n\t"\
785 	"flds 8(%0)\n\t"\
786 	"fmuls 52(%1)\n\t"\
787 	"fxch %%st(2)\n\t"\
788 	"faddp\n\t"\
789 	"flds 20(%0)\n\t"\
790 	"fmuls 40(%1)\n\t"\
791 	"fxch %%st(2)\n\t"\
792 	"fsubrp\n\t"\
793 	"flds 16(%0)\n\t"\
794 	"fmuls 44(%1)\n\t"\
795 	"fxch %%st(2)\n\t"\
796 	"faddp\n\t"\
797 	"flds 28(%0)\n\t"\
798 	"fmuls 32(%1)\n\t"\
799 	"fxch %%st(2)\n\t"\
800 	"fsubrp\n\t"\
801 	"flds 24(%0)\n\t"\
802 	"fmuls 36(%1)\n\t"\
803 	"fxch %%st(2)\n\t"\
804 	"faddp\n\t"\
805 	"flds 36(%0)\n\t"\
806 	"fmuls 24(%1)\n\t"\
807 	"fxch %%st(2)\n\t"\
808 	"fsubrp\n\t"\
809 	"flds 32(%0)\n\t"\
810 	"fmuls 28(%1)\n\t"\
811 	"fxch %%st(2)\n\t"\
812 	"faddp\n\t"\
813 	"flds 44(%0)\n\t"\
814 	"fmuls 16(%1)\n\t"\
815 	"fxch %%st(2)\n\t"\
816 	"fsubrp\n\t"\
817 	"flds 40(%0)\n\t"\
818 	"fmuls 20(%1)\n\t"\
819 	"fxch %%st(2)\n\t"\
820 	"faddp\n\t"\
821 	"flds 52(%0)\n\t"\
822 	"fmuls 8(%1)\n\t"\
823 	"fxch %%st(2)\n\t"\
824 	"fsubrp\n\t"\
825 	"flds 48(%0)\n\t"\
826 	"fmuls 12(%1)\n\t"\
827 	"fxch %%st(2)\n\t"\
828 	"faddp\n\t"\
829 	"flds 60(%0)\n\t"\
830 	"fmuls (%1)\n\t"\
831 	"fxch %%st(2)\n\t"\
832 	"fsubrp\n\t"\
833 	"flds 56(%0)\n\t"\
834 	"fmuls 4(%1)\n\t"\
835 	"fxch %%st(2)\n\t"\
836 	"faddp\n\t"\
837 	"subl $64,%0\n\t"\
838 	"subl $128,%1\n\t"\
839 	"subl $4,%%esp\n\t"\
840 	"fsubp\n\t"\
841 	"fistpl (%%esp)\n\t"\
842 	"popl %%ecx\n\t"\
843 	"cmpl $32767,%%ecx\n\t"\
844 	"jle 2f\n\t"\
845 	"movw $32767,%%cx\n\t"\
846 	"jmp 3f\n\t"\
847 	"2: cmpl $-32768,%%ecx\n\t"\
848 	"jge 3f\n\t"\
849 	"movw $-32768,%%cx\n\t"\
850 	"3: movw %%cx,(%2)\n\t"\
851 	"addl %3,%2\n\t"\
852 	"decl %%eax\n\t"\
853 	"jns 1b\n\t"\
854 	"jmp 5f\n\t"\
855 
856 	"4:flds 4(%0)\n\t"\
857 	"fmuls 4(%1)\n\t"\
858 	"flds 12(%0)\n\t"\
859 	"fmuls 12(%1)\n\t"\
860 	"flds 20(%0)\n\t"\
861 	"fmuls 20(%1)\n\t"\
862 	"fxch %%st(2)\n\t"\
863 	"faddp\n\t"\
864 	"flds 28(%0)\n\t"\
865 	"fmuls 28(%1)\n\t"\
866 	"fxch %%st(2)\n\t"\
867 	"faddp\n\t"\
868 	"flds 36(%0)\n\t"\
869 	"fmuls 36(%1)\n\t"\
870 	"fxch %%st(2)\n\t"\
871 	"faddp\n\t"\
872 	"flds 44(%0)\n\t"\
873 	"fmuls 44(%1)\n\t"\
874 	"fxch %%st(2)\n\t"\
875 	"faddp\n\t"\
876 	"flds 52(%0)\n\t"\
877 	"fmuls 52(%1)\n\t"\
878 	"fxch %%st(2)\n\t"\
879 	"faddp\n\t"\
880 	"flds 60(%0)\n\t"\
881 	"fmuls 60(%1)\n\t"\
882 	"fxch %%st(2)\n\t"\
883 	"faddp\n\t"\
884 	"subl $4,%%esp\n\t"\
885 	"subl $64,%0\n\t"\
886 	"subl $192,%1\n\t"\
887 	"faddp\n\t"\
888 	"fistpl (%%esp)\n\t"\
889 	"popl %%ecx\n\t"\
890 	"cmpl $32767,%%ecx\n\t"\
891 	"jle 2f\n\t"\
892 	"movw $32767,%%cx\n\t"\
893 	"jmp 3f\n\t"\
894 	"2: cmpl $-32768,%%ecx\n\t"\
895 	"jge 3f\n\t"\
896 	"movw $-32768,%%cx\n\t"\
897 	"3: movw %%cx,(%2)\n\t"\
898 
899 	"movl %5,%%ecx\n\t"\
900 	"sall $3,%%ecx\n\t"\
901 	"addl %%ecx,%1\n\t"\
902 	"addl %3,%2\n\t"\
903 
904 	"movl $14,%%eax\n\t"\
905 	"1:flds (%0)\n\t"\
906 	"fmuls 60(%1)\n\t"\
907 	"flds 4(%0)\n\t"\
908 	"fmuls 56(%1)\n\t"\
909 	"flds 8(%0)\n\t"\
910 	"fmuls 52(%1)\n\t"\
911 	"fxch %%st(2)\n\t"\
912 	"fsubp\n\t"\
913 	"flds 12(%0)\n\t"\
914 	"fmuls 48(%1)\n\t"\
915 	"fxch %%st(2)\n\t"\
916 	"faddp\n\t"\
917 	"flds 16(%0)\n\t"\
918 	"fmuls 44(%1)\n\t"\
919 	"fxch %%st(2)\n\t"\
920 	"fsubrp\n\t"\
921 	"flds 20(%0)\n\t"\
922 	"fmuls 40(%1)\n\t"\
923 	"fxch %%st(2)\n\t"\
924 	"faddp\n\t"\
925 	"flds 24(%0)\n\t"\
926 	"fmuls 36(%1)\n\t"\
927 	"fxch %%st(2)\n\t"\
928 	"fsubrp\n\t"\
929 	"flds 28(%0)\n\t"\
930 	"fmuls 32(%1)\n\t"\
931 	"fxch %%st(2)\n\t"\
932 	"faddp\n\t"\
933 	"flds 32(%0)\n\t"\
934 	"fmuls 28(%1)\n\t"\
935 	"fxch %%st(2)\n\t"\
936 	"fsubrp\n\t"\
937 	"flds 36(%0)\n\t"\
938 	"fmuls 24(%1)\n\t"\
939 	"fxch %%st(2)\n\t"\
940 	"faddp\n\t"\
941 	"flds 40(%0)\n\t"\
942 	"fmuls 20(%1)\n\t"\
943 	"fxch %%st(2)\n\t"\
944 	"fsubrp\n\t"\
945 	"flds 44(%0)\n\t"\
946 	"fmuls 16(%1)\n\t"\
947 	"fxch %%st(2)\n\t"\
948 	"faddp\n\t"\
949 	"flds 48(%0)\n\t"\
950 	"fmuls 12(%1)\n\t"\
951 	"fxch %%st(2)\n\t"\
952 	"fsubrp\n\t"\
953 	"flds 52(%0)\n\t"\
954 	"fmuls 8(%1)\n\t"\
955 	"fxch %%st(2)\n\t"\
956 	"faddp\n\t"\
957 	"flds 56(%0)\n\t"\
958 	"fmuls 4(%1)\n\t"\
959 	"fxch %%st(2)\n\t"\
960 	"fsubrp\n\t"\
961 	"flds 60(%0)\n\t"\
962 	"fmuls (%1)\n\t"\
963 	"fxch %%st(2)\n\t"\
964 	"faddp\n\t"\
965 	"subl $64,%0\n\t"\
966 	"subl $128,%1\n\t"\
967 	"subl $4,%%esp\n\t"\
968 	"fsubp\n\t"\
969 	"fistpl (%%esp)\n\t"\
970 	"popl %%ecx\n\t"\
971 	"cmpl $32767,%%ecx\n\t"\
972 	"jle 2f\n\t"\
973 	"movw $32767,%%cx\n\t"\
974 	"jmp 3f\n\t"\
975 	"2: cmpl $-32768,%%ecx\n\t"\
976 	"jge 3f\n\t"\
977 	"movw $-32768,%%cx\n\t"\
978 	"3: movw %%cx,(%2)\n\t"\
979 	"addl %3,%2\n\t"\
980 	"decl %%eax\n\t"\
981 	"jns 1b\n\t"\
982 
983 	"5:"\
984 	    : : "b" (u[ch][div]), "d" (t_dewindow[0] + 16 - start), "S" (&sample_buffer[f>>(2-nch)][nch==2?0:(f&1?16:0)][ch]), "m" (sizeof(short) * nch), "m" (div), "m" (start)\
985 	    : "eax", "ecx", "memory");
986 #else
987 	{
988 	  short *samples = (&sample_buffer[f>>(2-nch)][nch==2?0:(f&1?16:0)][ch]);
989 	  int out, j;
990 
991 #define PUT_SAMPLE(out)			\
992 		if (out > 32767)	\
993 		  *samples = 32767;	\
994 		else			\
995 		  if (out < -32768)	\
996 		    *samples = -32768;	\
997 		  else			\
998 		    *samples = out;	\
999 					\
1000 		samples += nch;
1001 
1002 #if defined(SUPERHACK)
1003 	  /* These is a simple implementation which should be nicer to the
1004              cache; computation of samples are done in one pass rather than
1005              two.  However, for various reasons which I do not have time to
1006              investigate, it runs quite a lot slower than two pass
1007              computations.  If you have time, you are welcome to look into
1008              it. */
1009 
1010           {
1011             float (*u_ptr)[16] = u[ch][div];
1012 	    const float *dewindow2 = t_dewindow[0] + start;
1013 
1014 	    {
1015 	      float outf1, outf2, outf3, outf4;
1016 
1017 	      outf1  = u_ptr[0][ 0] * dewindow[0x0];
1018 	      outf2  = u_ptr[0][ 1] * dewindow[0x1];
1019 	      outf3  = u_ptr[0][ 2] * dewindow[0x2];
1020 	      outf4  = u_ptr[0][ 3] * dewindow[0x3];
1021 	      outf1 += u_ptr[0][ 4] * dewindow[0x4];
1022 	      outf2 += u_ptr[0][ 5] * dewindow[0x5];
1023 	      outf3 += u_ptr[0][ 6] * dewindow[0x6];
1024 	      outf4 += u_ptr[0][ 7] * dewindow[0x7];
1025 	      outf1 += u_ptr[0][ 8] * dewindow[0x8];
1026 	      outf2 += u_ptr[0][ 9] * dewindow[0x9];
1027 	      outf3 += u_ptr[0][10] * dewindow[0xa];
1028 	      outf4 += u_ptr[0][11] * dewindow[0xb];
1029 	      outf1 += u_ptr[0][12] * dewindow[0xc];
1030 	      outf2 += u_ptr[0][13] * dewindow[0xd];
1031 	      outf3 += u_ptr[0][14] * dewindow[0xe];
1032 	      outf4 += u_ptr[0][15] * dewindow[0xf];
1033 
1034 	      out = outf1 + outf2 + outf3 + outf4;
1035 
1036 	      dewindow += 32;
1037 	      dewindow2 += 32;
1038 	      u_ptr++;
1039 
1040 	      if (out > 32767)
1041 		samples[0] = 32767;
1042 	      else
1043 		if (out < -32768)
1044 		  samples[0] = -32768;
1045 		else
1046 		  samples[0] = out;
1047 	    }
1048 
1049             if (div & 0x1) {
1050               for (j = 1; j < 16; ++j) {
1051                 float outf1, outf2, outf3, outf4;
1052 
1053                 outf1  = u_ptr[0][ 0] * dewindow[0x0];
1054                 outf3  = u_ptr[0][ 0] * dewindow2[0xf];
1055                 outf2  = u_ptr[0][ 1] * dewindow[0x1];
1056                 outf4  = u_ptr[0][ 1] * dewindow2[0xe];
1057                 outf1 += u_ptr[0][ 2] * dewindow[0x2];
1058                 outf3 += u_ptr[0][ 2] * dewindow2[0xd];
1059                 outf2 += u_ptr[0][ 3] * dewindow[0x3];
1060                 outf4 += u_ptr[0][ 3] * dewindow2[0xc];
1061                 outf1 += u_ptr[0][ 4] * dewindow[0x4];
1062                 outf3 += u_ptr[0][ 4] * dewindow2[0xb];
1063                 outf2 += u_ptr[0][ 5] * dewindow[0x5];
1064                 outf4 += u_ptr[0][ 5] * dewindow2[0xa];
1065                 outf1 += u_ptr[0][ 6] * dewindow[0x6];
1066                 outf3 += u_ptr[0][ 6] * dewindow2[0x9];
1067                 outf2 += u_ptr[0][ 7] * dewindow[0x7];
1068                 outf4 += u_ptr[0][ 7] * dewindow2[0x8];
1069                 outf1 += u_ptr[0][ 8] * dewindow[0x8];
1070                 outf3 += u_ptr[0][ 8] * dewindow2[0x7];
1071                 outf2 += u_ptr[0][ 9] * dewindow[0x9];
1072                 outf4 += u_ptr[0][ 9] * dewindow2[0x6];
1073                 outf1 += u_ptr[0][10] * dewindow[0xa];
1074                 outf3 += u_ptr[0][10] * dewindow2[0x5];
1075                 outf2 += u_ptr[0][11] * dewindow[0xb];
1076                 outf4 += u_ptr[0][11] * dewindow2[0x4];
1077                 outf1 += u_ptr[0][12] * dewindow[0xc];
1078                 outf3 += u_ptr[0][12] * dewindow2[0x3];
1079                 outf2 += u_ptr[0][13] * dewindow[0xd];
1080                 outf4 += u_ptr[0][13] * dewindow2[0x2];
1081                 outf1 += u_ptr[0][14] * dewindow[0xe];
1082                 outf3 += u_ptr[0][14] * dewindow2[0x1];
1083                 outf2 += u_ptr[0][15] * dewindow[0xf];
1084                 outf4 += u_ptr[0][15] * dewindow2[0x0];
1085 
1086                 dewindow += 32;
1087                 dewindow2 += 32;
1088                 u_ptr++;
1089 
1090                 out = outf1 + outf2;
1091 
1092                 if (out > 32767)
1093                   samples[j * 2] = 32767;
1094                 else
1095                   if (out < -32768)
1096                     samples[j * 2] = -32768;
1097                   else
1098                     samples[j * 2] = out;
1099 
1100                 out = outf4 - outf3;
1101 
1102                 if (out > 32767)
1103                   samples[64 - (j * 2)] = 32767;
1104                 else
1105                   if (out < -32768)
1106                     samples[64 - (j * 2)] = -32768;
1107                   else
1108                     samples[64 - (j * 2)] = out;
1109               }
1110 
1111               {
1112                 float outf2, outf4;
1113 
1114                 outf2  = u_ptr[0][ 0] * dewindow[0x0];
1115                 outf4  = u_ptr[0][ 2] * dewindow[0x2];
1116                 outf2 += u_ptr[0][ 4] * dewindow[0x4];
1117                 outf4 += u_ptr[0][ 6] * dewindow[0x6];
1118                 outf2 += u_ptr[0][ 8] * dewindow[0x8];
1119                 outf4 += u_ptr[0][10] * dewindow[0xa];
1120                 outf2 += u_ptr[0][12] * dewindow[0xc];
1121                 outf4 += u_ptr[0][14] * dewindow[0xe];
1122 
1123                 out = outf2 + outf4;
1124 
1125                 if (out > 32767)
1126                   samples[16 * 2] = 32767;
1127                 else
1128                   if (out < -32768)
1129                     samples[16 * 2] = -32768;
1130                   else
1131                     samples[16 * 2] = out;
1132               }
1133             } else {
1134               for (j = 1; j < 16; ++j) {
1135                 float outf1, outf2, outf3, outf4;
1136 
1137                 outf1  = u_ptr[0][ 0] * dewindow[0x0];
1138                 outf3  = u_ptr[0][ 0] * dewindow2[0xf];
1139                 outf2  = u_ptr[0][ 1] * dewindow[0x1];
1140                 outf4  = u_ptr[0][ 1] * dewindow2[0xe];
1141                 outf1 += u_ptr[0][ 2] * dewindow[0x2];
1142                 outf3 += u_ptr[0][ 2] * dewindow2[0xd];
1143                 outf2 += u_ptr[0][ 3] * dewindow[0x3];
1144                 outf4 += u_ptr[0][ 3] * dewindow2[0xc];
1145                 outf1 += u_ptr[0][ 4] * dewindow[0x4];
1146                 outf3 += u_ptr[0][ 4] * dewindow2[0xb];
1147                 outf2 += u_ptr[0][ 5] * dewindow[0x5];
1148                 outf4 += u_ptr[0][ 5] * dewindow2[0xa];
1149                 outf1 += u_ptr[0][ 6] * dewindow[0x6];
1150                 outf3 += u_ptr[0][ 6] * dewindow2[0x9];
1151                 outf2 += u_ptr[0][ 7] * dewindow[0x7];
1152                 outf4 += u_ptr[0][ 7] * dewindow2[0x8];
1153                 outf1 += u_ptr[0][ 8] * dewindow[0x8];
1154                 outf3 += u_ptr[0][ 8] * dewindow2[0x7];
1155                 outf2 += u_ptr[0][ 9] * dewindow[0x9];
1156                 outf4 += u_ptr[0][ 9] * dewindow2[0x6];
1157                 outf1 += u_ptr[0][10] * dewindow[0xa];
1158                 outf3 += u_ptr[0][10] * dewindow2[0x5];
1159                 outf2 += u_ptr[0][11] * dewindow[0xb];
1160                 outf4 += u_ptr[0][11] * dewindow2[0x4];
1161                 outf1 += u_ptr[0][12] * dewindow[0xc];
1162                 outf3 += u_ptr[0][12] * dewindow2[0x3];
1163                 outf2 += u_ptr[0][13] * dewindow[0xd];
1164                 outf4 += u_ptr[0][13] * dewindow2[0x2];
1165                 outf1 += u_ptr[0][14] * dewindow[0xe];
1166                 outf3 += u_ptr[0][14] * dewindow2[0x1];
1167                 outf2 += u_ptr[0][15] * dewindow[0xf];
1168                 outf4 += u_ptr[0][15] * dewindow2[0x0];
1169 
1170                 dewindow += 32;
1171                 dewindow2 += 32;
1172                 u_ptr++;
1173 
1174                 out = outf1 + outf2;
1175 
1176                 if (out > 32767)
1177                   samples[j * 2] = 32767;
1178                 else
1179                   if (out < -32768)
1180                     samples[j * 2] = -32768;
1181                   else
1182                     samples[j * 2] = out;
1183 
1184                 out = outf3 - outf4;
1185 
1186                 if (out > 32767)
1187                   samples[64 - (j * 2)] = 32767;
1188                 else
1189                   if (out < -32768)
1190                     samples[64 - (j * 2)] = -32768;
1191                   else
1192                     samples[64 - (j * 2)] = out;
1193               }
1194 
1195               {
1196                 float outf2, outf4;
1197 
1198                 outf2  = u_ptr[0][ 1] * dewindow[0x1];
1199                 outf4  = u_ptr[0][ 3] * dewindow[0x3];
1200                 outf2 += u_ptr[0][ 5] * dewindow[0x5];
1201                 outf4 += u_ptr[0][ 7] * dewindow[0x7];
1202                 outf2 += u_ptr[0][ 9] * dewindow[0x9];
1203                 outf4 += u_ptr[0][11] * dewindow[0xb];
1204                 outf2 += u_ptr[0][13] * dewindow[0xd];
1205                 outf4 += u_ptr[0][15] * dewindow[0xf];
1206 
1207                 out = outf2 + outf4;
1208 
1209                 if (out > 32767)
1210                   samples[16 * 2] = 32767;
1211                 else
1212                   if (out < -32768)
1213                     samples[16 * 2] = -32768;
1214                   else
1215                     samples[16 * 2] = out;
1216               }
1217             }
1218           }
1219 #elif defined(HAS_AUTOINCREMENT)
1220 	  const float *dewindow = t_dewindow[0] + 15 - start;
1221 
1222 	  /* This is tuned specifically for architectures with
1223              autoincrement and -decrement. */
1224 
1225 	  {
1226 	    float *u_ptr = (float*) u[ch][div];
1227 
1228 	    u_ptr--;
1229 
1230 	    for (j = 0; j < 16; ++j) {
1231 	      float outf1, outf2, outf3, outf4;
1232 
1233 	      outf1  = *++u_ptr * *++dewindow;
1234 	      outf2  = *++u_ptr * *++dewindow;
1235 	      outf3  = *++u_ptr * *++dewindow;
1236 	      outf4  = *++u_ptr * *++dewindow;
1237 	      outf1 += *++u_ptr * *++dewindow;
1238 	      outf2 += *++u_ptr * *++dewindow;
1239 	      outf3 += *++u_ptr * *++dewindow;
1240 	      outf4 += *++u_ptr * *++dewindow;
1241 	      outf1 += *++u_ptr * *++dewindow;
1242 	      outf2 += *++u_ptr * *++dewindow;
1243 	      outf3 += *++u_ptr * *++dewindow;
1244 	      outf4 += *++u_ptr * *++dewindow;
1245 	      outf1 += *++u_ptr * *++dewindow;
1246 	      outf2 += *++u_ptr * *++dewindow;
1247 	      outf3 += *++u_ptr * *++dewindow;
1248 	      outf4 += *++u_ptr * *++dewindow;
1249 
1250 	      out = outf1 + outf2 + outf3 + outf4;
1251 
1252 	      dewindow += 16;
1253 
1254 	      PUT_SAMPLE(out)
1255 	    }
1256 
1257 	    if (div & 0x1) {
1258 	      {
1259 		float outf2, outf4;
1260 
1261 		outf2  = u_ptr[ 1] * dewindow[0x1];
1262 		outf4  = u_ptr[ 3] * dewindow[0x3];
1263 		outf2 += u_ptr[ 5] * dewindow[0x5];
1264 		outf4 += u_ptr[ 7] * dewindow[0x7];
1265 		outf2 += u_ptr[ 9] * dewindow[0x9];
1266 		outf4 += u_ptr[11] * dewindow[0xb];
1267 		outf2 += u_ptr[13] * dewindow[0xd];
1268 		outf4 += u_ptr[15] * dewindow[0xf];
1269 
1270 		out = outf2 + outf4;
1271 
1272 		PUT_SAMPLE(out)
1273 	      }
1274 
1275 	      dewindow -= 31;
1276 	      dewindow += start;
1277 	      dewindow += start;
1278 	      u_ptr -= 16;
1279 
1280 	      for (; j < 31; ++j) {
1281 		float outf1, outf2, outf3, outf4;
1282 
1283 		outf1  = *++u_ptr * *--dewindow;
1284 		outf2  = *++u_ptr * *--dewindow;
1285 		outf3  = *++u_ptr * *--dewindow;
1286 		outf4  = *++u_ptr * *--dewindow;
1287 		outf1 += *++u_ptr * *--dewindow;
1288 		outf2 += *++u_ptr * *--dewindow;
1289 		outf3 += *++u_ptr * *--dewindow;
1290 		outf4 += *++u_ptr * *--dewindow;
1291 		outf1 += *++u_ptr * *--dewindow;
1292 		outf2 += *++u_ptr * *--dewindow;
1293 		outf3 += *++u_ptr * *--dewindow;
1294 		outf4 += *++u_ptr * *--dewindow;
1295 		outf1 += *++u_ptr * *--dewindow;
1296 		outf2 += *++u_ptr * *--dewindow;
1297 		outf3 += *++u_ptr * *--dewindow;
1298 		outf4 += *++u_ptr * *--dewindow;
1299 
1300 		out = outf2 - outf1 + outf4 - outf3;
1301 
1302 		dewindow -= 16;
1303 		u_ptr -= 32;
1304 
1305 		PUT_SAMPLE(out)
1306 	      }
1307 	    } else {
1308 	      {
1309 		float outf2, outf4;
1310 
1311 		outf2  = u_ptr[ 2] * dewindow[ 0x2];
1312 		outf4  = u_ptr[ 4] * dewindow[ 0x4];
1313 		outf2 += u_ptr[ 6] * dewindow[ 0x6];
1314 		outf4 += u_ptr[ 8] * dewindow[ 0x8];
1315 		outf2 += u_ptr[10] * dewindow[ 0xa];
1316 		outf4 += u_ptr[12] * dewindow[ 0xc];
1317 		outf2 += u_ptr[14] * dewindow[ 0xe];
1318 		outf4 += u_ptr[16] * dewindow[0x10];
1319 
1320 		out = outf2 + outf4;
1321 
1322 		PUT_SAMPLE(out)
1323 	      }
1324 
1325 	      dewindow -= 31;
1326 	      dewindow += start;
1327 	      dewindow += start;
1328 	      u_ptr -= 16;
1329 
1330 	      for (; j < 31; ++j) {
1331 		float outf1, outf2, outf3, outf4;
1332 
1333 		outf1  = *++u_ptr * *--dewindow;
1334 		outf2  = *++u_ptr * *--dewindow;
1335 		outf3  = *++u_ptr * *--dewindow;
1336 		outf4  = *++u_ptr * *--dewindow;
1337 		outf1 += *++u_ptr * *--dewindow;
1338 		outf2 += *++u_ptr * *--dewindow;
1339 		outf3 += *++u_ptr * *--dewindow;
1340 		outf4 += *++u_ptr * *--dewindow;
1341 		outf1 += *++u_ptr * *--dewindow;
1342 		outf2 += *++u_ptr * *--dewindow;
1343 		outf3 += *++u_ptr * *--dewindow;
1344 		outf4 += *++u_ptr * *--dewindow;
1345 		outf1 += *++u_ptr * *--dewindow;
1346 		outf2 += *++u_ptr * *--dewindow;
1347 		outf3 += *++u_ptr * *--dewindow;
1348 		outf4 += *++u_ptr * *--dewindow;
1349 
1350 		out = outf1 - outf2 + outf3 - outf4;
1351 
1352 		dewindow -= 16;
1353 		u_ptr -= 32;
1354 
1355 		PUT_SAMPLE(out)
1356 	      }
1357 	    }
1358 	  }
1359 #else
1360 	  const float *dewindow = t_dewindow[0] + 16 - start;
1361 
1362 	  /* These optimisations are tuned specifically for architectures
1363              without autoincrement and -decrement. */
1364 
1365 	  {
1366 	    float (*u_ptr)[16] = u[ch][div];
1367 
1368 	    for (j = 0; j < 16; ++j) {
1369 	      float outf1, outf2, outf3, outf4;
1370 
1371 	      outf1  = u_ptr[0][ 0] * dewindow[0x0];
1372 	      outf2  = u_ptr[0][ 1] * dewindow[0x1];
1373 	      outf3  = u_ptr[0][ 2] * dewindow[0x2];
1374 	      outf4  = u_ptr[0][ 3] * dewindow[0x3];
1375 	      outf1 += u_ptr[0][ 4] * dewindow[0x4];
1376 	      outf2 += u_ptr[0][ 5] * dewindow[0x5];
1377 	      outf3 += u_ptr[0][ 6] * dewindow[0x6];
1378 	      outf4 += u_ptr[0][ 7] * dewindow[0x7];
1379 	      outf1 += u_ptr[0][ 8] * dewindow[0x8];
1380 	      outf2 += u_ptr[0][ 9] * dewindow[0x9];
1381 	      outf3 += u_ptr[0][10] * dewindow[0xa];
1382 	      outf4 += u_ptr[0][11] * dewindow[0xb];
1383 	      outf1 += u_ptr[0][12] * dewindow[0xc];
1384 	      outf2 += u_ptr[0][13] * dewindow[0xd];
1385 	      outf3 += u_ptr[0][14] * dewindow[0xe];
1386 	      outf4 += u_ptr[0][15] * dewindow[0xf];
1387 
1388 	      out = outf1 + outf2 + outf3 + outf4;
1389 
1390 	      dewindow += 32;
1391 	      u_ptr++;
1392 
1393 	      PUT_SAMPLE(out)
1394 	    }
1395 
1396 	    if (div & 0x1) {
1397 	      {
1398 		float outf2, outf4;
1399 
1400 		outf2  = u_ptr[0][ 0] * dewindow[0x0];
1401 		outf4  = u_ptr[0][ 2] * dewindow[0x2];
1402 		outf2 += u_ptr[0][ 4] * dewindow[0x4];
1403 		outf4 += u_ptr[0][ 6] * dewindow[0x6];
1404 		outf2 += u_ptr[0][ 8] * dewindow[0x8];
1405 		outf4 += u_ptr[0][10] * dewindow[0xa];
1406 		outf2 += u_ptr[0][12] * dewindow[0xc];
1407 		outf4 += u_ptr[0][14] * dewindow[0xe];
1408 
1409 		out = outf2 + outf4;
1410 
1411 		PUT_SAMPLE(out)
1412 	      }
1413 
1414 	      dewindow -= 48;
1415 	      dewindow += start;
1416 	      dewindow += start;
1417 
1418 	      for (; j < 31; ++j) {
1419 		float outf1, outf2, outf3, outf4;
1420 
1421 		--u_ptr;
1422 
1423 		outf1  = u_ptr[0][ 0] * dewindow[0xf];
1424 		outf2  = u_ptr[0][ 1] * dewindow[0xe];
1425 		outf3  = u_ptr[0][ 2] * dewindow[0xd];
1426 		outf4  = u_ptr[0][ 3] * dewindow[0xc];
1427 		outf1 += u_ptr[0][ 4] * dewindow[0xb];
1428 		outf2 += u_ptr[0][ 5] * dewindow[0xa];
1429 		outf3 += u_ptr[0][ 6] * dewindow[0x9];
1430 		outf4 += u_ptr[0][ 7] * dewindow[0x8];
1431 		outf1 += u_ptr[0][ 8] * dewindow[0x7];
1432 		outf2 += u_ptr[0][ 9] * dewindow[0x6];
1433 		outf3 += u_ptr[0][10] * dewindow[0x5];
1434 		outf4 += u_ptr[0][11] * dewindow[0x4];
1435 		outf1 += u_ptr[0][12] * dewindow[0x3];
1436 		outf2 += u_ptr[0][13] * dewindow[0x2];
1437 		outf3 += u_ptr[0][14] * dewindow[0x1];
1438 		outf4 += u_ptr[0][15] * dewindow[0x0];
1439 
1440 		out = -outf1 + outf2 - outf3 + outf4;
1441 
1442 		dewindow -= 32;
1443 
1444 		PUT_SAMPLE(out)
1445 	      }
1446 	    } else {
1447 	      {
1448 		float outf2, outf4;
1449 
1450 		outf2  = u_ptr[0][ 1] * dewindow[0x1];
1451 		outf4  = u_ptr[0][ 3] * dewindow[0x3];
1452 		outf2 += u_ptr[0][ 5] * dewindow[0x5];
1453 		outf4 += u_ptr[0][ 7] * dewindow[0x7];
1454 		outf2 += u_ptr[0][ 9] * dewindow[0x9];
1455 		outf4 += u_ptr[0][11] * dewindow[0xb];
1456 		outf2 += u_ptr[0][13] * dewindow[0xd];
1457 		outf4 += u_ptr[0][15] * dewindow[0xf];
1458 
1459 		out = outf2 + outf4;
1460 
1461 		PUT_SAMPLE(out)
1462 	      }
1463 
1464 	      dewindow -= 48;
1465 	      dewindow += start;
1466 	      dewindow += start;
1467 
1468 	      for (; j < 31; ++j) {
1469 		float outf1, outf2, outf3, outf4;
1470 
1471 		--u_ptr;
1472 
1473 		outf1  = u_ptr[0][ 0] * dewindow[0xf];
1474 		outf2  = u_ptr[0][ 1] * dewindow[0xe];
1475 		outf3  = u_ptr[0][ 2] * dewindow[0xd];
1476 		outf4  = u_ptr[0][ 3] * dewindow[0xc];
1477 		outf1 += u_ptr[0][ 4] * dewindow[0xb];
1478 		outf2 += u_ptr[0][ 5] * dewindow[0xa];
1479 		outf3 += u_ptr[0][ 6] * dewindow[0x9];
1480 		outf4 += u_ptr[0][ 7] * dewindow[0x8];
1481 		outf1 += u_ptr[0][ 8] * dewindow[0x7];
1482 		outf2 += u_ptr[0][ 9] * dewindow[0x6];
1483 		outf3 += u_ptr[0][10] * dewindow[0x5];
1484 		outf4 += u_ptr[0][11] * dewindow[0x4];
1485 		outf1 += u_ptr[0][12] * dewindow[0x3];
1486 		outf2 += u_ptr[0][13] * dewindow[0x2];
1487 		outf3 += u_ptr[0][14] * dewindow[0x1];
1488 		outf4 += u_ptr[0][15] * dewindow[0x0];
1489 
1490 		out = outf1 - outf2 + outf3 - outf4;
1491 
1492 		dewindow -= 32;
1493 
1494 		PUT_SAMPLE(out)
1495 	      }
1496 	    }
1497 	  }
1498 #endif
1499 	}
1500 #endif
1501 
1502 	--u_start[ch];
1503 	u_start[ch] &= 0xf;
1504 	u_div[ch]=u_div[ch] ? 0 : 1;
1505 
1506 #if defined(PENTIUM_RDTSC)
1507 	__asm__(".byte 0x0f,0x31" : "=a" (cnt2), "=d" (cnt4));
1508 
1509 	if (cnt2-cnt1 < min_cycles) {
1510 	  min_cycles = cnt2-cnt1;
1511 	  /*printf("%d, %d cycles, %d\n", cnt3-cnt1, min_cycles, start);*/
1512 	}
1513 #endif
1514 }
1515 
premultiply()1516 void premultiply()
1517 {
1518   int i,t;
1519 
1520   for (i = 0; i < 17; ++i)
1521     for (t = 0; t < 32; ++t)
1522       t_dewindow[i][t] *= 16383.5f;
1523 }
1524