1 /* this file is a part of amp software, (C) tomislav uzelac 1996,1997
2 */
3
4 /* transform.c imdct and polyphase(DCT) transforms
5 *
6 * Created by: tomislav uzelac May 1996
7 * Karl Anders Oygard optimized this for speed, Mar 13 97
8 * Some optimisations based on ideas from Michael Hipp's mpg123 package
9 */
10
11 /*
12 * Comments for this file:
13 *
14 * The polyphase algorithm is clearly the most cpu consuming part of mpeg 1
15 * layer 3 decoding. Thus, there has been some effort to optimise this
16 * particular algorithm. Currently, everything has been kept in straight C
17 * with no assembler optimisations, but in order to provide efficient paths
18 * for different architectures, alternative implementations of some
19 * critical sections has been done. You may want to experiment with these,
20 * to see which suits your architecture better.
21 *
22 * Selection of the different implementations is done with the following
23 * defines:
24 *
25 * HAS_AUTOINCREMENT
26 *
27 * Define this if your architecture supports preincrementation of
28 * pointers when referencing (applies to e.g. 68k)
29 *
30 * For those who are optimising amp, check out the Pentium rdtsc code
31 * (define PENTIUM_RDTSC). This code uses the rdtsc counter for showing
32 * how many cycles are spent in different parts of the code.
33 */
34
35 #include <math.h>
36 #include <sys/types.h>
37 #include <unistd.h>
38 #include <string.h>
39
40 #include "audio.h"
41 #include "getdata.h"
42 #include "misc2.h"
43
44 #define TRANSFORM
45 #include "transform.h"
46
47 #define PI12 0.261799387f
48 #define PI36 0.087266462f
49
50 #undef ARCH_i586
51
imdct_init()52 void imdct_init()
53 {
54 int i;
55
56 for(i=0;i<36;i++) /* 0 */
57 win[0][i] = (float) sin(PI36 *(i+0.5));
58 for(i=0;i<18;i++) /* 1 */
59 win[1][i] = (float) sin(PI36 *(i+0.5));
60 for(i=18;i<24;i++)
61 win[1][i] = 1.0f;
62 for(i=24;i<30;i++)
63 win[1][i] = (float) sin(PI12 *(i+0.5-18));
64 for(i=30;i<36;i++)
65 win[1][i] = 0.0f;
66 for(i=0;i<6;i++) /* 3 */
67 win[3][i] = 0.0f;
68 for(i=6;i<12;i++)
69 win[3][i] = (float) sin(PI12 * (i+ 0.5 - 6.0));
70 for(i=12;i<18;i++)
71 win[3][i] = 1.0f;
72 for(i=18;i<36;i++)
73 win[3][i] = (float) sin(PI36 * (i + 0.5));
74 }
75
76 /* This uses Byeong Gi Lee's Fast Cosine Transform algorithm to decompose
77 the 36 point and 12 point IDCT's into 9 point and 3 point IDCT's,
78 respectively. Then the 9 point IDCT is computed by a modified version of
79 Mikko Tommila's IDCT algorithm, based on the WFTA. See his comments
80 before the first 9 point IDCT. The 3 point IDCT is already efficient to
81 implement. -- Jeff Tsay. */
82 /* I got the unrolled IDCT from Jeff Tsay; the code is presumably by
83 Francois-Raymond Boyer - I unrolled it a little further. tu */
84
imdct(int win_type,int sb,int ch)85 void imdct(int win_type,int sb,int ch)
86 {
87 /*------------------------------------------------------------------*/
88 /* */
89 /* Function: Calculation of the inverse MDCT */
90 /* In the case of short blocks the 3 output vectors are already */
91 /* overlapped and added in this modul. */
92 /* */
93 /* New layer3 */
94 /* */
95 /*------------------------------------------------------------------*/
96
97 float tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmp9, tmp10, tmp11;
98
99 register float save;
100 float pp1, pp2;
101 float *win_bt;
102 int i, p, ss;
103 float *in = xr[ch][sb];
104 float *s_p = s[ch][sb];
105 float *res_p = res[sb];
106 float out[36];
107
108 if(win_type == 2){
109 for(p=0;p<36;p+=9) {
110 out[p] = out[p+1] = out[p+2] = out[p+3] =
111 out[p+4] = out[p+5] = out[p+6] = out[p+7] =
112 out[p+8] = 0.0f;
113 }
114
115 for(ss=0;ss<18;ss+=6) {
116
117 /*
118 * 12 point IMDCT
119 */
120
121 /* Begin 12 point IDCT */
122
123 /* Input aliasing for 12 pt IDCT */
124 in[5+ss]+=in[4+ss];in[4+ss]+=in[3+ss];in[3+ss]+=in[2+ss];
125 in[2+ss]+=in[1+ss];in[1+ss]+=in[0+ss];
126
127 /* Input aliasing on odd indices (for 6 point IDCT) */
128 in[5+ss] += in[3+ss]; in[3+ss] += in[1+ss];
129
130 /* 3 point IDCT on even indices */
131
132 pp2 = in[4+ss] * 0.500000000f;
133 pp1 = in[2+ss] * 0.866025403f;
134 save = in[0+ss] + pp2;
135 tmp1 = in[0+ss] - in[4+ss];
136 tmp0 = save + pp1;
137 tmp2 = save - pp1;
138
139 /* End 3 point IDCT on even indices */
140
141 /* 3 point IDCT on odd indices (for 6 point IDCT) */
142
143 pp2 = in[5+ss] * 0.500000000f;
144 pp1 = in[3+ss] * 0.866025403f;
145 save = in[1+ss] + pp2;
146 tmp4 = in[1+ss] - in[5+ss];
147 tmp5 = save + pp1;
148 tmp3 = save - pp1;
149
150 /* End 3 point IDCT on odd indices */
151
152 /* Twiddle factors on odd indices (for 6 point IDCT) */
153
154 tmp3 *= 1.931851653f;
155 tmp4 *= 0.707106781f;
156 tmp5 *= 0.517638090f;
157
158 /* Output butterflies on 2 3 point IDCT's (for 6 point IDCT) */
159
160 save = tmp0;
161 tmp0 += tmp5;
162 tmp5 = save - tmp5;
163
164 save = tmp1;
165 tmp1 += tmp4;
166 tmp4 = save - tmp4;
167
168 save = tmp2;
169 tmp2 += tmp3;
170 tmp3 = save - tmp3;
171
172 /* End 6 point IDCT */
173
174 /* Twiddle factors on indices (for 12 point IDCT) */
175
176 tmp0 *= 0.504314480f;
177 tmp1 *= 0.541196100f;
178 tmp2 *= 0.630236207f;
179 tmp3 *= 0.821339815f;
180 tmp4 *= 1.306562965f;
181 tmp5 *= 3.830648788f;
182
183 /* End 12 point IDCT */
184
185 /* Shift to 12 point modified IDCT, multiply by window type 2 */
186 tmp8 = tmp0 * -0.793353340f;
187 tmp9 = tmp0 * -0.608761429f;
188 tmp7 = tmp1 * -0.923879532f;
189 tmp10 = tmp1 * -0.382683432f;
190 tmp6 = tmp2 * -0.991444861f;
191 tmp11 = tmp2 * -0.130526192f;
192
193 tmp0 = tmp3;
194 tmp1 = tmp4 * 0.382683432f;
195 tmp2 = tmp5 * 0.608761429f;
196
197 tmp3 = tmp5 * -0.793353340f;
198 tmp4 = tmp4 * -0.923879532f;
199 tmp5 = tmp0 * -0.991444861f;
200
201 tmp0 *= 0.130526192f;
202
203 out[ss + 6] += tmp0;
204 out[ss + 7] += tmp1;
205 out[ss + 8] += tmp2;
206 out[ss + 9] += tmp3;
207 out[ss + 10] += tmp4;
208 out[ss + 11] += tmp5;
209 out[ss + 12] += tmp6;
210 out[ss + 13] += tmp7;
211 out[ss + 14] += tmp8;
212 out[ss + 15] += tmp9;
213 out[ss + 16] += tmp10;
214 out[ss + 17] += tmp11;
215
216 }
217 if (sb&1) {
218 for (i=0;i<18;i+=2) res_p[i]=out[i] + s_p[i];
219 for (i=1;i<18;i+=2) res_p[i]=-out[i] - s_p[i];
220 } else
221 for (i=0;i<18;i++) res_p[i]=out[i] + s_p[i];
222 for (i=18;i<36;i++) s_p[i-18]=out[i];
223
224 } else {
225 /*
226 * 36 point IDCT ****************************************************************
227 */
228 float tmp[18];
229
230 /* input aliasing for 36 point IDCT */
231 in[17]+=in[16]; in[16]+=in[15]; in[15]+=in[14]; in[14]+=in[13];
232 in[13]+=in[12]; in[12]+=in[11]; in[11]+=in[10]; in[10]+=in[9];
233 in[9] +=in[8]; in[8] +=in[7]; in[7] +=in[6]; in[6] +=in[5];
234 in[5] +=in[4]; in[4] +=in[3]; in[3] +=in[2]; in[2] +=in[1];
235 in[1] +=in[0];
236
237 /* 18 point IDCT for odd indices */
238
239 /* input aliasing for 18 point IDCT */
240 in[17]+=in[15]; in[15]+=in[13]; in[13]+=in[11]; in[11]+=in[9];
241 in[9] +=in[7]; in[7] +=in[5]; in[5] +=in[3]; in[3] +=in[1];
242
243
244 {
245 float tmp0,tmp1,tmp2,tmp3,tmp4,tmp0_,tmp1_,tmp2_,tmp3_;
246 float tmp0o,tmp1o,tmp2o,tmp3o,tmp4o,tmp0_o,tmp1_o,tmp2_o,tmp3_o;
247
248 /* Fast 9 Point Inverse Discrete Cosine Transform
249 //
250 // By Francois-Raymond Boyer
251 // mailto:boyerf@iro.umontreal.ca
252 // http://www.iro.umontreal.ca/~boyerf
253 //
254 // The code has been optimized for Intel processors
255 // (takes a lot of time to convert float to and from iternal FPU representation)
256 //
257 // It is a simple "factorization" of the IDCT matrix.
258 */
259 /* 9 point IDCT on even indices */
260 {
261 /* 5 points on odd indices (not realy an IDCT) */
262 float i0 = in[0]+in[0];
263 float i0p12 = i0 + in[12];
264
265 tmp0 = i0p12 + in[4]*1.8793852415718f + in[8]*1.532088886238f + in[16]*0.34729635533386f;
266 tmp1 = i0 + in[4] - in[8] - in[12] - in[12] - in[16];
267 tmp2 = i0p12 - in[4]*0.34729635533386f - in[8]*1.8793852415718f + in[16]*1.532088886238f;
268 tmp3 = i0p12 - in[4]*1.532088886238f + in[8]*0.34729635533386f - in[16]*1.8793852415718f;
269 tmp4 = in[0] - in[4] + in[8] - in[12] + in[16];
270 }
271 {
272 float i6_ = in[6]*1.732050808f;
273
274 tmp0_ = in[2]*1.9696155060244f + i6_ + in[10]*1.2855752193731f + in[14]*0.68404028665134f;
275 tmp1_ = (in[2] - in[10] - in[14])*1.732050808f;
276 tmp2_ = in[2]*1.2855752193731f - i6_ - in[10]*0.68404028665134f + in[14]*1.9696155060244f;
277 tmp3_ = in[2]*0.68404028665134f - i6_ + in[10]*1.9696155060244f - in[14]*1.2855752193731f;
278 }
279
280 /* 9 point IDCT on odd indices */
281 {
282 /* 5 points on odd indices (not realy an IDCT) */
283 float i0 = in[0+1]+in[0+1];
284 float i0p12 = i0 + in[12+1];
285
286 tmp0o = i0p12 + in[4+1]*1.8793852415718f + in[8+1]*1.532088886238f + in[16+1]*0.34729635533386f;
287 tmp1o = i0 + in[4+1] - in[8+1] - in[12+1] - in[12+1] - in[16+1];
288 tmp2o = i0p12 - in[4+1]*0.34729635533386f - in[8+1]*1.8793852415718f + in[16+1]*1.532088886238f;
289 tmp3o = i0p12 - in[4+1]*1.532088886238f + in[8+1]*0.34729635533386f - in[16+1]*1.8793852415718f;
290 tmp4o = (in[0+1] - in[4+1] + in[8+1] - in[12+1] + in[16+1])*0.707106781f; /* Twiddled */
291 }
292 {
293 /* 4 points on even indices */
294 float i6_ = in[6+1]*1.732050808f; /* Sqrt[3] */
295
296 tmp0_o = in[2+1]*1.9696155060244f + i6_ + in[10+1]*1.2855752193731f + in[14+1]*0.68404028665134f;
297 tmp1_o = (in[2+1] - in[10+1] - in[14+1])*1.732050808f;
298 tmp2_o = in[2+1]*1.2855752193731f - i6_ - in[10+1]*0.68404028665134f + in[14+1]*1.9696155060244f;
299 tmp3_o = in[2+1]*0.68404028665134f - i6_ + in[10+1]*1.9696155060244f - in[14+1]*1.2855752193731f;
300 }
301
302 /* Twiddle factors on odd indices
303 // and
304 // Butterflies on 9 point IDCT's
305 // and
306 // twiddle factors for 36 point IDCT
307 */
308 {
309 float e, o;
310 e = tmp0 + tmp0_; o = (tmp0o + tmp0_o)*0.501909918f; tmp[0] = (e + o)*(-0.500476342f*.5f); tmp[17] = (e - o)*(-11.46279281f*.5f);
311 e = tmp1 + tmp1_; o = (tmp1o + tmp1_o)*0.517638090f; tmp[1] = (e + o)*(-0.504314480f*.5f); tmp[16] = (e - o)*(-3.830648788f*.5f);
312 e = tmp2 + tmp2_; o = (tmp2o + tmp2_o)*0.551688959f; tmp[2] = (e + o)*(-0.512139757f*.5f); tmp[15] = (e - o)*(-2.310113158f*.5f);
313 e = tmp3 + tmp3_; o = (tmp3o + tmp3_o)*0.610387294f; tmp[3] = (e + o)*(-0.524264562f*.5f); tmp[14] = (e - o)*(-1.662754762f*.5f);
314 tmp[4] = (tmp4 + tmp4o)*(-0.541196100f); tmp[13] = (tmp4 - tmp4o)*(-1.306562965f);
315 e = tmp3 - tmp3_; o = (tmp3o - tmp3_o)*0.871723397f; tmp[5] = (e + o)*(-0.563690973f*.5f); tmp[12] = (e - o)*(-1.082840285f*.5f);
316 e = tmp2 - tmp2_; o = (tmp2o - tmp2_o)*1.183100792f; tmp[6] = (e + o)*(-0.592844523f*.5f); tmp[11] = (e - o)*(-0.930579498f*.5f);
317 e = tmp1 - tmp1_; o = (tmp1o - tmp1_o)*1.931851653f; tmp[7] = (e + o)*(-0.630236207f*.5f); tmp[10] = (e - o)*(-0.821339815f*.5f);
318 e = tmp0 - tmp0_; o = (tmp0o - tmp0_o)*5.736856623f; tmp[8] = (e + o)*(-0.678170852f*.5f); tmp[9] = (e - o)*(-0.740093616f*.5f);
319 }
320 }
321 /* shift to modified IDCT */
322 win_bt = win[win_type];
323
324 if (sb&1) {
325 res_p[0] = -tmp[9] * win_bt[0] + s_p[0];
326 res_p[1] =-(-tmp[10] * win_bt[1] + s_p[1]);
327 res_p[2] = -tmp[11] * win_bt[2] + s_p[2];
328 res_p[3] =-(-tmp[12] * win_bt[3] + s_p[3]);
329 res_p[4] = -tmp[13] * win_bt[4] + s_p[4];
330 res_p[5] =-(-tmp[14] * win_bt[5] + s_p[5]);
331 res_p[6] = -tmp[15] * win_bt[6] + s_p[6];
332 res_p[7] =-(-tmp[16] * win_bt[7] + s_p[7]);
333 res_p[8] = -tmp[17] * win_bt[8] + s_p[8];
334
335 res_p[9] = -(tmp[17] * win_bt[9] + s_p[9]);
336 res_p[10]= tmp[16] * win_bt[10] + s_p[10];
337 res_p[11]=-(tmp[15] * win_bt[11] + s_p[11]);
338 res_p[12]= tmp[14] * win_bt[12] + s_p[12];
339 res_p[13]=-(tmp[13] * win_bt[13] + s_p[13]);
340 res_p[14]= tmp[12] * win_bt[14] + s_p[14];
341 res_p[15]=-(tmp[11] * win_bt[15] + s_p[15]);
342 res_p[16]= tmp[10] * win_bt[16] + s_p[16];
343 res_p[17]=-(tmp[9] * win_bt[17] + s_p[17]);
344 } else {
345 res_p[0] = -tmp[9] * win_bt[0] + s_p[0];
346 res_p[1] = -tmp[10] * win_bt[1] + s_p[1];
347 res_p[2] = -tmp[11] * win_bt[2] + s_p[2];
348 res_p[3] = -tmp[12] * win_bt[3] + s_p[3];
349 res_p[4] = -tmp[13] * win_bt[4] + s_p[4];
350 res_p[5] = -tmp[14] * win_bt[5] + s_p[5];
351 res_p[6] = -tmp[15] * win_bt[6] + s_p[6];
352 res_p[7] = -tmp[16] * win_bt[7] + s_p[7];
353 res_p[8] = -tmp[17] * win_bt[8] + s_p[8];
354
355 res_p[9] = tmp[17] * win_bt[9] + s_p[9];
356 res_p[10]= tmp[16] * win_bt[10] + s_p[10];
357 res_p[11]= tmp[15] * win_bt[11] + s_p[11];
358 res_p[12]= tmp[14] * win_bt[12] + s_p[12];
359 res_p[13]= tmp[13] * win_bt[13] + s_p[13];
360 res_p[14]= tmp[12] * win_bt[14] + s_p[14];
361 res_p[15]= tmp[11] * win_bt[15] + s_p[15];
362 res_p[16]= tmp[10] * win_bt[16] + s_p[16];
363 res_p[17]= tmp[9] * win_bt[17] + s_p[17];
364 }
365
366 s_p[0]= tmp[8] * win_bt[18];
367 s_p[1]= tmp[7] * win_bt[19];
368 s_p[2]= tmp[6] * win_bt[20];
369 s_p[3]= tmp[5] * win_bt[21];
370 s_p[4]= tmp[4] * win_bt[22];
371 s_p[5]= tmp[3] * win_bt[23];
372 s_p[6]= tmp[2] * win_bt[24];
373 s_p[7]= tmp[1] * win_bt[25];
374 s_p[8]= tmp[0] * win_bt[26];
375
376 s_p[9]= tmp[0] * win_bt[27];
377 s_p[10]= tmp[1] * win_bt[28];
378 s_p[11]= tmp[2] * win_bt[29];
379 s_p[12]= tmp[3] * win_bt[30];
380 s_p[13]= tmp[4] * win_bt[31];
381 s_p[14]= tmp[5] * win_bt[32];
382 s_p[15]= tmp[6] * win_bt[33];
383 s_p[16]= tmp[7] * win_bt[34];
384 s_p[17]= tmp[8] * win_bt[35];
385 }
386 }
387
388 /* fast DCT according to Lee[84]
389 * reordering according to Konstantinides[94]
390 */
poly(const int ch,int f)391 void poly(const int ch,int f)
392 {
393 static float u[2][2][17][16]; /* no v[][], it's redundant */
394 static int u_start[2]={0,0}; /* first element of u[][] */
395 static int u_div[2]={0,0}; /* which part of u[][] is currently used */
396 int start = u_start[ch];
397 int div = u_div[ch];
398 float (*u_p)[16];
399
400 #if defined(PENTIUM_RDTSC)
401 unsigned int cnt4, cnt3, cnt2, cnt1;
402 static int min_cycles = 99999999;
403
404 __asm__(".byte 0x0f,0x31" : "=a" (cnt1), "=d" (cnt4));
405 #endif
406
407 {
408 float d16,d17,d18,d19,d20,d21,d22,d23,d24,d25,d26,d27,d28,d29,d30,d31;
409 float d0,d1,d2,d3,d4,d5,d6,d7,d8,d9,d10,d11,d12,d13,d14,d15;
410
411 /* step 1: initial reordering and 1st (16 wide) butterflies
412 */
413
414 d0 = res[ 0][f]; d16=(d0 - res[31][f]) * b1; d0 += res[31][f];
415 d1 = res[ 1][f]; d17=(d1 - res[30][f]) * b3; d1 += res[30][f];
416 d3 = res[ 2][f]; d19=(d3 - res[29][f]) * b5; d3 += res[29][f];
417 d2 = res[ 3][f]; d18=(d2 - res[28][f]) * b7; d2 += res[28][f];
418 d6 = res[ 4][f]; d22=(d6 - res[27][f]) * b9; d6 += res[27][f];
419 d7 = res[ 5][f]; d23=(d7 - res[26][f]) * b11; d7 += res[26][f];
420 d5 = res[ 6][f]; d21=(d5 - res[25][f]) * b13; d5 += res[25][f];
421 d4 = res[ 7][f]; d20=(d4 - res[24][f]) * b15; d4 += res[24][f];
422 d12= res[ 8][f]; d28=(d12 - res[23][f]) * b17; d12+= res[23][f];
423 d13= res[ 9][f]; d29=(d13 - res[22][f]) * b19; d13+= res[22][f];
424 d15= res[10][f]; d31=(d15 - res[21][f]) * b21; d15+= res[21][f];
425 d14= res[11][f]; d30=(d14 - res[20][f]) * b23; d14+= res[20][f];
426 d10= res[12][f]; d26=(d10 - res[19][f]) * b25; d10+= res[19][f];
427 d11= res[13][f]; d27=(d11 - res[18][f]) * b27; d11+= res[18][f];
428 d9 = res[14][f]; d25=(d9 - res[17][f]) * b29; d9 += res[17][f];
429 d8 = res[15][f]; d24=(d8 - res[16][f]) * b31; d8 += res[16][f];
430
431 {
432 float c0,c1,c2,c3,c4,c5,c6,c7,c8,c9,c10,c11,c12,c13,c14,c15;
433
434 /* a test to see what can be done with memory separation
435 * first we process indexes 0-15
436 */
437 c0 = d0 + d8 ; c8 = ( d0 - d8 ) * b2;
438 c1 = d1 + d9 ; c9 = ( d1 - d9 ) * b6;
439 c2 = d2 + d10; c10= ( d2 - d10) * b14;
440 c3 = d3 + d11; c11= ( d3 - d11) * b10;
441 c4 = d4 + d12; c12= ( d4 - d12) * b30;
442 c5 = d5 + d13; c13= ( d5 - d13) * b26;
443 c6 = d6 + d14; c14= ( d6 - d14) * b18;
444 c7 = d7 + d15; c15= ( d7 - d15) * b22;
445
446 /* step 3: 4-wide butterflies
447 */
448 d0 = c0 + c4 ; d4 = ( c0 - c4 ) * b4;
449 d1 = c1 + c5 ; d5 = ( c1 - c5 ) * b12;
450 d2 = c2 + c6 ; d6 = ( c2 - c6 ) * b28;
451 d3 = c3 + c7 ; d7 = ( c3 - c7 ) * b20;
452
453 d8 = c8 + c12; d12= ( c8 - c12) * b4;
454 d9 = c9 + c13; d13= ( c9 - c13) * b12;
455 d10= c10+ c14; d14= (c10 - c14) * b28;
456 d11= c11+ c15; d15= (c11 - c15) * b20;
457
458
459 /* step 4: 2-wide butterflies
460 */
461 {
462 float rb8 = b8;
463 float rb24 = b24;
464
465 /**/ c0 = d0 + d2 ; c2 = ( d0 - d2 ) * rb8;
466 c1 = d1 + d3 ; c3 = ( d1 - d3 ) * rb24;
467 /**/ c4 = d4 + d6 ; c6 = ( d4 - d6 ) * rb8;
468 c5 = d5 + d7 ; c7 = ( d5 - d7 ) * rb24;
469 /**/ c8 = d8 + d10; c10= ( d8 - d10) * rb8;
470 c9 = d9 + d11; c11= ( d9 - d11) * rb24;
471 /**/ c12= d12+ d14; c14= (d12 - d14) * rb8;
472 c13= d13+ d15; c15= (d13 - d15) * rb24;
473 }
474
475 /* step 5: 1-wide butterflies
476 */
477 {
478 float rb16 = b16;
479
480 /* this is a little 'hacked up'
481 */
482 d0 = (-c0 -c1) * 2; d1 = ( c0 - c1 ) * rb16;
483 d2 = c2 + c3; d3 = ( c2 - c3 ) * rb16;
484 d3 -= d2;
485
486 d4 = c4 +c5; d5 = ( c4 - c5 ) * rb16;
487 d5 += d4;
488 d7 = -d5;
489 d7 += ( c6 - c7 ) * rb16; d6 = +c6 +c7;
490
491 d8 = c8 + c9 ; d9 = ( c8 - c9 ) * rb16;
492 d11= +d8 +d9;
493 d11 +=(c10 - c11) * rb16; d10= c10+ c11;
494
495 d12 = c12+ c13; d13 = (c12 - c13) * rb16;
496 d13 += -d8-d9+d12;
497 d14 = c14+ c15; d15 = (c14 - c15) * rb16;
498 d15-=d11;
499 d14 += -d8 -d10;
500 }
501
502 /* step 6: final resolving & reordering
503 * the other 32 are stored for use with the next granule
504 */
505
506 u_p = (float (*)[16]) &u[ch][div][0][start];
507
508 /*16*/ u_p[ 0][0] =+d1 ;
509 u_p[ 2][0] = +d9 -d14;
510 /*20*/ u_p[ 4][0] = +d5 -d6;
511 u_p[ 6][0] = -d10 +d13;
512 /*24*/ u_p[ 8][0] =d3;
513 u_p[10][0] = -d8 -d9 +d11 -d13;
514 /*28*/ u_p[12][0] = +d7;
515 u_p[14][0] = +d15;
516
517 /* the other 32 are stored for use with the next granule
518 */
519
520 u_p = (float (*)[16]) &u[ch][!div][0][start];
521
522 /*0*/ u_p[16][0] = d0;
523 u_p[14][0] = -(+d8 );
524 /*4*/ u_p[12][0] = -(+d4 );
525 u_p[10][0] = -(-d8 +d12 );
526 /*8*/ u_p[ 8][0] = -(+d2 );
527 u_p[ 6][0] = -(+d8 +d10 -d12 );
528 /*12*/ u_p[ 4][0] = -(-d4 +d6 );
529 u_p[ 2][0] = -d14;
530 u_p[ 0][0] = -d1;
531 }
532
533 {
534 float c0,c1,c2,c3,c4,c5,c6,c7,c8,c9,c10,c11,c12,c13,c14,c15;
535
536 /* memory separation, second part
537 */
538 /* 2
539 */
540 c0=d16 + d24; c8= (d16 - d24) * b2;
541 c1=d17 + d25; c9= (d17 - d25) * b6;
542 c2=d18 + d26; c10= (d18 - d26) * b14;
543 c3=d19 + d27; c11= (d19 - d27) * b10;
544 c4=d20 + d28; c12= (d20 - d28) * b30;
545 c5=d21 + d29; c13= (d21 - d29) * b26;
546 c6=d22 + d30; c14= (d22 - d30) * b18;
547 c7=d23 + d31; c15= (d23 - d31) * b22;
548
549 /* 3
550 */
551 d16= c0+ c4; d20= (c0 - c4) * b4;
552 d17= c1+ c5; d21= (c1 - c5) * b12;
553 d18= c2+ c6; d22= (c2 - c6) * b28;
554 d19= c3+ c7; d23= (c3 - c7) * b20;
555
556 d24= c8+ c12; d28= (c8 - c12) * b4;
557 d25= c9+ c13; d29= (c9 - c13) * b12;
558 d26= c10+ c14; d30= (c10 - c14) * b28;
559 d27= c11+ c15; d31= (c11 - c15) * b20;
560
561 /* 4
562 */
563 {
564 float rb8 = b8;
565 float rb24 = b24;
566
567 /**/ c0= d16+ d18; c2= (d16 - d18) * rb8;
568 c1= d17+ d19; c3= (d17 - d19) * rb24;
569 /**/ c4= d20+ d22; c6= (d20 - d22) * rb8;
570 c5= d21+ d23; c7= (d21 - d23) * rb24;
571 /**/ c8= d24+ d26; c10= (d24 - d26) * rb8;
572 c9= d25+ d27; c11= (d25 - d27) * rb24;
573 /**/ c12= d28+ d30; c14= (d28 - d30) * rb8;
574 c13= d29+ d31; c15= (d29 - d31) * rb24;
575 }
576
577 /* 5
578 */
579 {
580 float rb16 = b16;
581 d16= c0+ c1; d17= (c0 - c1) * rb16;
582 d18= c2+ c3; d19= (c2 - c3) * rb16;
583
584 d20= c4+ c5; d21= (c4 - c5) * rb16;
585 d20+=d16; d21+=d17;
586 d22= c6+ c7; d23= (c6 - c7) * rb16;
587 d22+=d16; d22+=d18;
588 d23+=d16; d23+=d17; d23+=d19;
589
590
591 d24= c8+ c9; d25= (c8 - c9) * rb16;
592 d26= c10+ c11; d27= (c10 - c11) * rb16;
593 d26+=d24;
594 d27+=d24; d27+=d25;
595
596 d28= c12+ c13; d29= (c12 - c13) * rb16;
597 d28-=d20; d29+=d28; d29-=d21;
598 d30= c14+ c15; d31= (c14 - c15) * rb16;
599 d30-=d22;
600 d31-=d23;
601 }
602
603 /* step 6: final resolving & reordering
604 * the other 32 are stored for use with the next granule
605 */
606
607 u_p = (float (*)[16]) &u[ch][!div][0][start];
608
609 u_p[ 1][0] = -(+d30 );
610 u_p[ 3][0] = -(+d22 -d26 );
611 u_p[ 5][0] = -(-d18 -d20 +d26 );
612 u_p[ 7][0] = -(+d18 -d28 );
613 u_p[ 9][0] = -(+d28 );
614 u_p[11][0] = -(+d20 -d24 );
615 u_p[13][0] = -(-d16 +d24 );
616 u_p[15][0] = -(+d16 );
617
618 /* the other 32 are stored for use with the next granule
619 */
620
621 u_p = (float (*)[16]) &u[ch][div][0][start];
622
623 u_p[15][0] = +d31;
624 u_p[13][0] = +d23 -d27;
625 u_p[11][0] = -d19 -d20 -d21 +d27;
626 u_p[ 9][0] = +d19 -d29;
627 u_p[ 7][0] = -d18 +d29;
628 u_p[ 5][0] = +d18 +d20 +d21 -d25 -d26;
629 u_p[ 3][0] = -d17 -d22 +d25 +d26;
630 u_p[ 1][0] = +d17 -d30;
631 }
632 }
633
634 #if defined(PENTIUM_RDTSC1)
635 __asm__(".byte 0x0f,0x31" : "=a" (cnt3), "=d" (cnt4));
636 #endif
637
638 /* we're doing dewindowing and calculating final samples now
639 */
640
641 #if defined(ARCH_i586)
642 /* x86 assembler optimisations. These optimisations are tuned
643 specifically for Intel Pentiums. */
644
645 asm("movl $15,%%eax\n\t"\
646 "1:\n\t"\
647 "flds (%0)\n\t"\
648 "fmuls (%1)\n\t"\
649 "flds 4(%0)\n\t"\
650 "fmuls 4(%1)\n\t"\
651 "flds 8(%0)\n\t"\
652 "fmuls 8(%1)\n\t"\
653 "fxch %%st(2)\n\t"\
654 "faddp\n\t"\
655 "flds 12(%0)\n\t"\
656 "fmuls 12(%1)\n\t"\
657 "fxch %%st(2)\n\t"\
658 "faddp\n\t"\
659 "flds 16(%0)\n\t"\
660 "fmuls 16(%1)\n\t"\
661 "fxch %%st(2)\n\t"\
662 "faddp\n\t"\
663 "flds 20(%0)\n\t"\
664 "fmuls 20(%1)\n\t"\
665 "fxch %%st(2)\n\t"\
666 "faddp\n\t"\
667 "flds 24(%0)\n\t"\
668 "fmuls 24(%1)\n\t"\
669 "fxch %%st(2)\n\t"\
670 "faddp\n\t"\
671 "flds 28(%0)\n\t"\
672 "fmuls 28(%1)\n\t"\
673 "fxch %%st(2)\n\t"\
674 "faddp\n\t"\
675 "flds 32(%0)\n\t"\
676 "fmuls 32(%1)\n\t"\
677 "fxch %%st(2)\n\t"\
678 "faddp\n\t"\
679 "flds 36(%0)\n\t"\
680 "fmuls 36(%1)\n\t"\
681 "fxch %%st(2)\n\t"\
682 "faddp\n\t"\
683 "flds 40(%0)\n\t"\
684 "fmuls 40(%1)\n\t"\
685 "fxch %%st(2)\n\t"\
686 "faddp\n\t"\
687 "flds 44(%0)\n\t"\
688 "fmuls 44(%1)\n\t"\
689 "fxch %%st(2)\n\t"\
690 "faddp\n\t"\
691 "flds 48(%0)\n\t"\
692 "fmuls 48(%1)\n\t"\
693 "fxch %%st(2)\n\t"\
694 "faddp\n\t"\
695 "flds 52(%0)\n\t"\
696 "fmuls 52(%1)\n\t"\
697 "fxch %%st(2)\n\t"\
698 "faddp\n\t"\
699 "flds 56(%0)\n\t"\
700 "fmuls 56(%1)\n\t"\
701 "fxch %%st(2)\n\t"\
702 "faddp\n\t"\
703 "flds 60(%0)\n\t"\
704 "fmuls 60(%1)\n\t"\
705 "fxch %%st(2)\n\t"\
706 "faddp\n\t"\
707 "addl $64,%0\n\t"\
708 "addl $128,%1\n\t"\
709 "subl $4,%%esp\n\t"\
710 "faddp\n\t"\
711 "fistpl (%%esp)\n\t"\
712 "popl %%ecx\n\t"\
713 "cmpl $32767,%%ecx\n\t"\
714 "jle 2f\n\t"\
715 "movw $32767,%%cx\n\t"\
716 "jmp 3f\n\t"\
717 "2: cmpl $-32768,%%ecx\n\t"\
718 "jge 3f\n\t"\
719 "movw $-32768,%%cx\n\t"\
720 "3: movw %%cx,(%2)\n\t"\
721 "addl %3,%2\n\t"\
722 "decl %%eax\n\t"\
723 "jns 1b\n\t"\
724
725 "testb $1,%4\n\t"\
726 "je 4f\n\t"
727
728 "flds (%0)\n\t"\
729 "fmuls (%1)\n\t"\
730 "flds 8(%0)\n\t"\
731 "fmuls 8(%1)\n\t"\
732 "flds 16(%0)\n\t"\
733 "fmuls 16(%1)\n\t"\
734 "fxch %%st(2)\n\t"\
735 "faddp\n\t"\
736 "flds 24(%0)\n\t"\
737 "fmuls 24(%1)\n\t"\
738 "fxch %%st(2)\n\t"\
739 "faddp\n\t"\
740 "flds 32(%0)\n\t"\
741 "fmuls 32(%1)\n\t"\
742 "fxch %%st(2)\n\t"\
743 "faddp\n\t"\
744 "flds 40(%0)\n\t"\
745 "fmuls 40(%1)\n\t"\
746 "fxch %%st(2)\n\t"\
747 "faddp\n\t"\
748 "flds 48(%0)\n\t"\
749 "fmuls 48(%1)\n\t"\
750 "fxch %%st(2)\n\t"\
751 "faddp\n\t"\
752 "flds 56(%0)\n\t"\
753 "fmuls 56(%1)\n\t"\
754 "fxch %%st(2)\n\t"\
755 "faddp\n\t"\
756 "subl $4,%%esp\n\t"\
757 "subl $64,%0\n\t"\
758 "subl $192,%1\n\t"\
759 "faddp\n\t"\
760 "fistpl (%%esp)\n\t"\
761 "popl %%ecx\n\t"\
762 "cmpl $32767,%%ecx\n\t"\
763 "jle 2f\n\t"\
764 "movw $32767,%%cx\n\t"\
765 "jmp 3f\n\t"\
766 "2: cmpl $-32768,%%ecx\n\t"\
767 "jge 3f\n\t"\
768 "movw $-32768,%%cx\n\t"\
769 "3: movw %%cx,(%2)\n\t"\
770
771 "movl %5,%%ecx\n\t"\
772 "sall $3,%%ecx\n\t"\
773 "addl %%ecx,%1\n\t"\
774 "addl %3,%2\n\t"\
775 "movl $14,%%eax\n\t"\
776
777 "1:flds 4(%0)\n\t"\
778 "fmuls 56(%1)\n\t"\
779 "flds (%0)\n\t"\
780 "fmuls 60(%1)\n\t"\
781 "flds 12(%0)\n\t"\
782 "fmuls 48(%1)\n\t"\
783 "fxch %%st(2)\n\t"\
784 "fsubp\n\t"\
785 "flds 8(%0)\n\t"\
786 "fmuls 52(%1)\n\t"\
787 "fxch %%st(2)\n\t"\
788 "faddp\n\t"\
789 "flds 20(%0)\n\t"\
790 "fmuls 40(%1)\n\t"\
791 "fxch %%st(2)\n\t"\
792 "fsubrp\n\t"\
793 "flds 16(%0)\n\t"\
794 "fmuls 44(%1)\n\t"\
795 "fxch %%st(2)\n\t"\
796 "faddp\n\t"\
797 "flds 28(%0)\n\t"\
798 "fmuls 32(%1)\n\t"\
799 "fxch %%st(2)\n\t"\
800 "fsubrp\n\t"\
801 "flds 24(%0)\n\t"\
802 "fmuls 36(%1)\n\t"\
803 "fxch %%st(2)\n\t"\
804 "faddp\n\t"\
805 "flds 36(%0)\n\t"\
806 "fmuls 24(%1)\n\t"\
807 "fxch %%st(2)\n\t"\
808 "fsubrp\n\t"\
809 "flds 32(%0)\n\t"\
810 "fmuls 28(%1)\n\t"\
811 "fxch %%st(2)\n\t"\
812 "faddp\n\t"\
813 "flds 44(%0)\n\t"\
814 "fmuls 16(%1)\n\t"\
815 "fxch %%st(2)\n\t"\
816 "fsubrp\n\t"\
817 "flds 40(%0)\n\t"\
818 "fmuls 20(%1)\n\t"\
819 "fxch %%st(2)\n\t"\
820 "faddp\n\t"\
821 "flds 52(%0)\n\t"\
822 "fmuls 8(%1)\n\t"\
823 "fxch %%st(2)\n\t"\
824 "fsubrp\n\t"\
825 "flds 48(%0)\n\t"\
826 "fmuls 12(%1)\n\t"\
827 "fxch %%st(2)\n\t"\
828 "faddp\n\t"\
829 "flds 60(%0)\n\t"\
830 "fmuls (%1)\n\t"\
831 "fxch %%st(2)\n\t"\
832 "fsubrp\n\t"\
833 "flds 56(%0)\n\t"\
834 "fmuls 4(%1)\n\t"\
835 "fxch %%st(2)\n\t"\
836 "faddp\n\t"\
837 "subl $64,%0\n\t"\
838 "subl $128,%1\n\t"\
839 "subl $4,%%esp\n\t"\
840 "fsubp\n\t"\
841 "fistpl (%%esp)\n\t"\
842 "popl %%ecx\n\t"\
843 "cmpl $32767,%%ecx\n\t"\
844 "jle 2f\n\t"\
845 "movw $32767,%%cx\n\t"\
846 "jmp 3f\n\t"\
847 "2: cmpl $-32768,%%ecx\n\t"\
848 "jge 3f\n\t"\
849 "movw $-32768,%%cx\n\t"\
850 "3: movw %%cx,(%2)\n\t"\
851 "addl %3,%2\n\t"\
852 "decl %%eax\n\t"\
853 "jns 1b\n\t"\
854 "jmp 5f\n\t"\
855
856 "4:flds 4(%0)\n\t"\
857 "fmuls 4(%1)\n\t"\
858 "flds 12(%0)\n\t"\
859 "fmuls 12(%1)\n\t"\
860 "flds 20(%0)\n\t"\
861 "fmuls 20(%1)\n\t"\
862 "fxch %%st(2)\n\t"\
863 "faddp\n\t"\
864 "flds 28(%0)\n\t"\
865 "fmuls 28(%1)\n\t"\
866 "fxch %%st(2)\n\t"\
867 "faddp\n\t"\
868 "flds 36(%0)\n\t"\
869 "fmuls 36(%1)\n\t"\
870 "fxch %%st(2)\n\t"\
871 "faddp\n\t"\
872 "flds 44(%0)\n\t"\
873 "fmuls 44(%1)\n\t"\
874 "fxch %%st(2)\n\t"\
875 "faddp\n\t"\
876 "flds 52(%0)\n\t"\
877 "fmuls 52(%1)\n\t"\
878 "fxch %%st(2)\n\t"\
879 "faddp\n\t"\
880 "flds 60(%0)\n\t"\
881 "fmuls 60(%1)\n\t"\
882 "fxch %%st(2)\n\t"\
883 "faddp\n\t"\
884 "subl $4,%%esp\n\t"\
885 "subl $64,%0\n\t"\
886 "subl $192,%1\n\t"\
887 "faddp\n\t"\
888 "fistpl (%%esp)\n\t"\
889 "popl %%ecx\n\t"\
890 "cmpl $32767,%%ecx\n\t"\
891 "jle 2f\n\t"\
892 "movw $32767,%%cx\n\t"\
893 "jmp 3f\n\t"\
894 "2: cmpl $-32768,%%ecx\n\t"\
895 "jge 3f\n\t"\
896 "movw $-32768,%%cx\n\t"\
897 "3: movw %%cx,(%2)\n\t"\
898
899 "movl %5,%%ecx\n\t"\
900 "sall $3,%%ecx\n\t"\
901 "addl %%ecx,%1\n\t"\
902 "addl %3,%2\n\t"\
903
904 "movl $14,%%eax\n\t"\
905 "1:flds (%0)\n\t"\
906 "fmuls 60(%1)\n\t"\
907 "flds 4(%0)\n\t"\
908 "fmuls 56(%1)\n\t"\
909 "flds 8(%0)\n\t"\
910 "fmuls 52(%1)\n\t"\
911 "fxch %%st(2)\n\t"\
912 "fsubp\n\t"\
913 "flds 12(%0)\n\t"\
914 "fmuls 48(%1)\n\t"\
915 "fxch %%st(2)\n\t"\
916 "faddp\n\t"\
917 "flds 16(%0)\n\t"\
918 "fmuls 44(%1)\n\t"\
919 "fxch %%st(2)\n\t"\
920 "fsubrp\n\t"\
921 "flds 20(%0)\n\t"\
922 "fmuls 40(%1)\n\t"\
923 "fxch %%st(2)\n\t"\
924 "faddp\n\t"\
925 "flds 24(%0)\n\t"\
926 "fmuls 36(%1)\n\t"\
927 "fxch %%st(2)\n\t"\
928 "fsubrp\n\t"\
929 "flds 28(%0)\n\t"\
930 "fmuls 32(%1)\n\t"\
931 "fxch %%st(2)\n\t"\
932 "faddp\n\t"\
933 "flds 32(%0)\n\t"\
934 "fmuls 28(%1)\n\t"\
935 "fxch %%st(2)\n\t"\
936 "fsubrp\n\t"\
937 "flds 36(%0)\n\t"\
938 "fmuls 24(%1)\n\t"\
939 "fxch %%st(2)\n\t"\
940 "faddp\n\t"\
941 "flds 40(%0)\n\t"\
942 "fmuls 20(%1)\n\t"\
943 "fxch %%st(2)\n\t"\
944 "fsubrp\n\t"\
945 "flds 44(%0)\n\t"\
946 "fmuls 16(%1)\n\t"\
947 "fxch %%st(2)\n\t"\
948 "faddp\n\t"\
949 "flds 48(%0)\n\t"\
950 "fmuls 12(%1)\n\t"\
951 "fxch %%st(2)\n\t"\
952 "fsubrp\n\t"\
953 "flds 52(%0)\n\t"\
954 "fmuls 8(%1)\n\t"\
955 "fxch %%st(2)\n\t"\
956 "faddp\n\t"\
957 "flds 56(%0)\n\t"\
958 "fmuls 4(%1)\n\t"\
959 "fxch %%st(2)\n\t"\
960 "fsubrp\n\t"\
961 "flds 60(%0)\n\t"\
962 "fmuls (%1)\n\t"\
963 "fxch %%st(2)\n\t"\
964 "faddp\n\t"\
965 "subl $64,%0\n\t"\
966 "subl $128,%1\n\t"\
967 "subl $4,%%esp\n\t"\
968 "fsubp\n\t"\
969 "fistpl (%%esp)\n\t"\
970 "popl %%ecx\n\t"\
971 "cmpl $32767,%%ecx\n\t"\
972 "jle 2f\n\t"\
973 "movw $32767,%%cx\n\t"\
974 "jmp 3f\n\t"\
975 "2: cmpl $-32768,%%ecx\n\t"\
976 "jge 3f\n\t"\
977 "movw $-32768,%%cx\n\t"\
978 "3: movw %%cx,(%2)\n\t"\
979 "addl %3,%2\n\t"\
980 "decl %%eax\n\t"\
981 "jns 1b\n\t"\
982
983 "5:"\
984 : : "b" (u[ch][div]), "d" (t_dewindow[0] + 16 - start), "S" (&sample_buffer[f>>(2-nch)][nch==2?0:(f&1?16:0)][ch]), "m" (sizeof(short) * nch), "m" (div), "m" (start)\
985 : "eax", "ecx", "memory");
986 #else
987 {
988 short *samples = (&sample_buffer[f>>(2-nch)][nch==2?0:(f&1?16:0)][ch]);
989 int out, j;
990
991 #define PUT_SAMPLE(out) \
992 if (out > 32767) \
993 *samples = 32767; \
994 else \
995 if (out < -32768) \
996 *samples = -32768; \
997 else \
998 *samples = out; \
999 \
1000 samples += nch;
1001
1002 #if defined(SUPERHACK)
1003 /* These is a simple implementation which should be nicer to the
1004 cache; computation of samples are done in one pass rather than
1005 two. However, for various reasons which I do not have time to
1006 investigate, it runs quite a lot slower than two pass
1007 computations. If you have time, you are welcome to look into
1008 it. */
1009
1010 {
1011 float (*u_ptr)[16] = u[ch][div];
1012 const float *dewindow2 = t_dewindow[0] + start;
1013
1014 {
1015 float outf1, outf2, outf3, outf4;
1016
1017 outf1 = u_ptr[0][ 0] * dewindow[0x0];
1018 outf2 = u_ptr[0][ 1] * dewindow[0x1];
1019 outf3 = u_ptr[0][ 2] * dewindow[0x2];
1020 outf4 = u_ptr[0][ 3] * dewindow[0x3];
1021 outf1 += u_ptr[0][ 4] * dewindow[0x4];
1022 outf2 += u_ptr[0][ 5] * dewindow[0x5];
1023 outf3 += u_ptr[0][ 6] * dewindow[0x6];
1024 outf4 += u_ptr[0][ 7] * dewindow[0x7];
1025 outf1 += u_ptr[0][ 8] * dewindow[0x8];
1026 outf2 += u_ptr[0][ 9] * dewindow[0x9];
1027 outf3 += u_ptr[0][10] * dewindow[0xa];
1028 outf4 += u_ptr[0][11] * dewindow[0xb];
1029 outf1 += u_ptr[0][12] * dewindow[0xc];
1030 outf2 += u_ptr[0][13] * dewindow[0xd];
1031 outf3 += u_ptr[0][14] * dewindow[0xe];
1032 outf4 += u_ptr[0][15] * dewindow[0xf];
1033
1034 out = outf1 + outf2 + outf3 + outf4;
1035
1036 dewindow += 32;
1037 dewindow2 += 32;
1038 u_ptr++;
1039
1040 if (out > 32767)
1041 samples[0] = 32767;
1042 else
1043 if (out < -32768)
1044 samples[0] = -32768;
1045 else
1046 samples[0] = out;
1047 }
1048
1049 if (div & 0x1) {
1050 for (j = 1; j < 16; ++j) {
1051 float outf1, outf2, outf3, outf4;
1052
1053 outf1 = u_ptr[0][ 0] * dewindow[0x0];
1054 outf3 = u_ptr[0][ 0] * dewindow2[0xf];
1055 outf2 = u_ptr[0][ 1] * dewindow[0x1];
1056 outf4 = u_ptr[0][ 1] * dewindow2[0xe];
1057 outf1 += u_ptr[0][ 2] * dewindow[0x2];
1058 outf3 += u_ptr[0][ 2] * dewindow2[0xd];
1059 outf2 += u_ptr[0][ 3] * dewindow[0x3];
1060 outf4 += u_ptr[0][ 3] * dewindow2[0xc];
1061 outf1 += u_ptr[0][ 4] * dewindow[0x4];
1062 outf3 += u_ptr[0][ 4] * dewindow2[0xb];
1063 outf2 += u_ptr[0][ 5] * dewindow[0x5];
1064 outf4 += u_ptr[0][ 5] * dewindow2[0xa];
1065 outf1 += u_ptr[0][ 6] * dewindow[0x6];
1066 outf3 += u_ptr[0][ 6] * dewindow2[0x9];
1067 outf2 += u_ptr[0][ 7] * dewindow[0x7];
1068 outf4 += u_ptr[0][ 7] * dewindow2[0x8];
1069 outf1 += u_ptr[0][ 8] * dewindow[0x8];
1070 outf3 += u_ptr[0][ 8] * dewindow2[0x7];
1071 outf2 += u_ptr[0][ 9] * dewindow[0x9];
1072 outf4 += u_ptr[0][ 9] * dewindow2[0x6];
1073 outf1 += u_ptr[0][10] * dewindow[0xa];
1074 outf3 += u_ptr[0][10] * dewindow2[0x5];
1075 outf2 += u_ptr[0][11] * dewindow[0xb];
1076 outf4 += u_ptr[0][11] * dewindow2[0x4];
1077 outf1 += u_ptr[0][12] * dewindow[0xc];
1078 outf3 += u_ptr[0][12] * dewindow2[0x3];
1079 outf2 += u_ptr[0][13] * dewindow[0xd];
1080 outf4 += u_ptr[0][13] * dewindow2[0x2];
1081 outf1 += u_ptr[0][14] * dewindow[0xe];
1082 outf3 += u_ptr[0][14] * dewindow2[0x1];
1083 outf2 += u_ptr[0][15] * dewindow[0xf];
1084 outf4 += u_ptr[0][15] * dewindow2[0x0];
1085
1086 dewindow += 32;
1087 dewindow2 += 32;
1088 u_ptr++;
1089
1090 out = outf1 + outf2;
1091
1092 if (out > 32767)
1093 samples[j * 2] = 32767;
1094 else
1095 if (out < -32768)
1096 samples[j * 2] = -32768;
1097 else
1098 samples[j * 2] = out;
1099
1100 out = outf4 - outf3;
1101
1102 if (out > 32767)
1103 samples[64 - (j * 2)] = 32767;
1104 else
1105 if (out < -32768)
1106 samples[64 - (j * 2)] = -32768;
1107 else
1108 samples[64 - (j * 2)] = out;
1109 }
1110
1111 {
1112 float outf2, outf4;
1113
1114 outf2 = u_ptr[0][ 0] * dewindow[0x0];
1115 outf4 = u_ptr[0][ 2] * dewindow[0x2];
1116 outf2 += u_ptr[0][ 4] * dewindow[0x4];
1117 outf4 += u_ptr[0][ 6] * dewindow[0x6];
1118 outf2 += u_ptr[0][ 8] * dewindow[0x8];
1119 outf4 += u_ptr[0][10] * dewindow[0xa];
1120 outf2 += u_ptr[0][12] * dewindow[0xc];
1121 outf4 += u_ptr[0][14] * dewindow[0xe];
1122
1123 out = outf2 + outf4;
1124
1125 if (out > 32767)
1126 samples[16 * 2] = 32767;
1127 else
1128 if (out < -32768)
1129 samples[16 * 2] = -32768;
1130 else
1131 samples[16 * 2] = out;
1132 }
1133 } else {
1134 for (j = 1; j < 16; ++j) {
1135 float outf1, outf2, outf3, outf4;
1136
1137 outf1 = u_ptr[0][ 0] * dewindow[0x0];
1138 outf3 = u_ptr[0][ 0] * dewindow2[0xf];
1139 outf2 = u_ptr[0][ 1] * dewindow[0x1];
1140 outf4 = u_ptr[0][ 1] * dewindow2[0xe];
1141 outf1 += u_ptr[0][ 2] * dewindow[0x2];
1142 outf3 += u_ptr[0][ 2] * dewindow2[0xd];
1143 outf2 += u_ptr[0][ 3] * dewindow[0x3];
1144 outf4 += u_ptr[0][ 3] * dewindow2[0xc];
1145 outf1 += u_ptr[0][ 4] * dewindow[0x4];
1146 outf3 += u_ptr[0][ 4] * dewindow2[0xb];
1147 outf2 += u_ptr[0][ 5] * dewindow[0x5];
1148 outf4 += u_ptr[0][ 5] * dewindow2[0xa];
1149 outf1 += u_ptr[0][ 6] * dewindow[0x6];
1150 outf3 += u_ptr[0][ 6] * dewindow2[0x9];
1151 outf2 += u_ptr[0][ 7] * dewindow[0x7];
1152 outf4 += u_ptr[0][ 7] * dewindow2[0x8];
1153 outf1 += u_ptr[0][ 8] * dewindow[0x8];
1154 outf3 += u_ptr[0][ 8] * dewindow2[0x7];
1155 outf2 += u_ptr[0][ 9] * dewindow[0x9];
1156 outf4 += u_ptr[0][ 9] * dewindow2[0x6];
1157 outf1 += u_ptr[0][10] * dewindow[0xa];
1158 outf3 += u_ptr[0][10] * dewindow2[0x5];
1159 outf2 += u_ptr[0][11] * dewindow[0xb];
1160 outf4 += u_ptr[0][11] * dewindow2[0x4];
1161 outf1 += u_ptr[0][12] * dewindow[0xc];
1162 outf3 += u_ptr[0][12] * dewindow2[0x3];
1163 outf2 += u_ptr[0][13] * dewindow[0xd];
1164 outf4 += u_ptr[0][13] * dewindow2[0x2];
1165 outf1 += u_ptr[0][14] * dewindow[0xe];
1166 outf3 += u_ptr[0][14] * dewindow2[0x1];
1167 outf2 += u_ptr[0][15] * dewindow[0xf];
1168 outf4 += u_ptr[0][15] * dewindow2[0x0];
1169
1170 dewindow += 32;
1171 dewindow2 += 32;
1172 u_ptr++;
1173
1174 out = outf1 + outf2;
1175
1176 if (out > 32767)
1177 samples[j * 2] = 32767;
1178 else
1179 if (out < -32768)
1180 samples[j * 2] = -32768;
1181 else
1182 samples[j * 2] = out;
1183
1184 out = outf3 - outf4;
1185
1186 if (out > 32767)
1187 samples[64 - (j * 2)] = 32767;
1188 else
1189 if (out < -32768)
1190 samples[64 - (j * 2)] = -32768;
1191 else
1192 samples[64 - (j * 2)] = out;
1193 }
1194
1195 {
1196 float outf2, outf4;
1197
1198 outf2 = u_ptr[0][ 1] * dewindow[0x1];
1199 outf4 = u_ptr[0][ 3] * dewindow[0x3];
1200 outf2 += u_ptr[0][ 5] * dewindow[0x5];
1201 outf4 += u_ptr[0][ 7] * dewindow[0x7];
1202 outf2 += u_ptr[0][ 9] * dewindow[0x9];
1203 outf4 += u_ptr[0][11] * dewindow[0xb];
1204 outf2 += u_ptr[0][13] * dewindow[0xd];
1205 outf4 += u_ptr[0][15] * dewindow[0xf];
1206
1207 out = outf2 + outf4;
1208
1209 if (out > 32767)
1210 samples[16 * 2] = 32767;
1211 else
1212 if (out < -32768)
1213 samples[16 * 2] = -32768;
1214 else
1215 samples[16 * 2] = out;
1216 }
1217 }
1218 }
1219 #elif defined(HAS_AUTOINCREMENT)
1220 const float *dewindow = t_dewindow[0] + 15 - start;
1221
1222 /* This is tuned specifically for architectures with
1223 autoincrement and -decrement. */
1224
1225 {
1226 float *u_ptr = (float*) u[ch][div];
1227
1228 u_ptr--;
1229
1230 for (j = 0; j < 16; ++j) {
1231 float outf1, outf2, outf3, outf4;
1232
1233 outf1 = *++u_ptr * *++dewindow;
1234 outf2 = *++u_ptr * *++dewindow;
1235 outf3 = *++u_ptr * *++dewindow;
1236 outf4 = *++u_ptr * *++dewindow;
1237 outf1 += *++u_ptr * *++dewindow;
1238 outf2 += *++u_ptr * *++dewindow;
1239 outf3 += *++u_ptr * *++dewindow;
1240 outf4 += *++u_ptr * *++dewindow;
1241 outf1 += *++u_ptr * *++dewindow;
1242 outf2 += *++u_ptr * *++dewindow;
1243 outf3 += *++u_ptr * *++dewindow;
1244 outf4 += *++u_ptr * *++dewindow;
1245 outf1 += *++u_ptr * *++dewindow;
1246 outf2 += *++u_ptr * *++dewindow;
1247 outf3 += *++u_ptr * *++dewindow;
1248 outf4 += *++u_ptr * *++dewindow;
1249
1250 out = outf1 + outf2 + outf3 + outf4;
1251
1252 dewindow += 16;
1253
1254 PUT_SAMPLE(out)
1255 }
1256
1257 if (div & 0x1) {
1258 {
1259 float outf2, outf4;
1260
1261 outf2 = u_ptr[ 1] * dewindow[0x1];
1262 outf4 = u_ptr[ 3] * dewindow[0x3];
1263 outf2 += u_ptr[ 5] * dewindow[0x5];
1264 outf4 += u_ptr[ 7] * dewindow[0x7];
1265 outf2 += u_ptr[ 9] * dewindow[0x9];
1266 outf4 += u_ptr[11] * dewindow[0xb];
1267 outf2 += u_ptr[13] * dewindow[0xd];
1268 outf4 += u_ptr[15] * dewindow[0xf];
1269
1270 out = outf2 + outf4;
1271
1272 PUT_SAMPLE(out)
1273 }
1274
1275 dewindow -= 31;
1276 dewindow += start;
1277 dewindow += start;
1278 u_ptr -= 16;
1279
1280 for (; j < 31; ++j) {
1281 float outf1, outf2, outf3, outf4;
1282
1283 outf1 = *++u_ptr * *--dewindow;
1284 outf2 = *++u_ptr * *--dewindow;
1285 outf3 = *++u_ptr * *--dewindow;
1286 outf4 = *++u_ptr * *--dewindow;
1287 outf1 += *++u_ptr * *--dewindow;
1288 outf2 += *++u_ptr * *--dewindow;
1289 outf3 += *++u_ptr * *--dewindow;
1290 outf4 += *++u_ptr * *--dewindow;
1291 outf1 += *++u_ptr * *--dewindow;
1292 outf2 += *++u_ptr * *--dewindow;
1293 outf3 += *++u_ptr * *--dewindow;
1294 outf4 += *++u_ptr * *--dewindow;
1295 outf1 += *++u_ptr * *--dewindow;
1296 outf2 += *++u_ptr * *--dewindow;
1297 outf3 += *++u_ptr * *--dewindow;
1298 outf4 += *++u_ptr * *--dewindow;
1299
1300 out = outf2 - outf1 + outf4 - outf3;
1301
1302 dewindow -= 16;
1303 u_ptr -= 32;
1304
1305 PUT_SAMPLE(out)
1306 }
1307 } else {
1308 {
1309 float outf2, outf4;
1310
1311 outf2 = u_ptr[ 2] * dewindow[ 0x2];
1312 outf4 = u_ptr[ 4] * dewindow[ 0x4];
1313 outf2 += u_ptr[ 6] * dewindow[ 0x6];
1314 outf4 += u_ptr[ 8] * dewindow[ 0x8];
1315 outf2 += u_ptr[10] * dewindow[ 0xa];
1316 outf4 += u_ptr[12] * dewindow[ 0xc];
1317 outf2 += u_ptr[14] * dewindow[ 0xe];
1318 outf4 += u_ptr[16] * dewindow[0x10];
1319
1320 out = outf2 + outf4;
1321
1322 PUT_SAMPLE(out)
1323 }
1324
1325 dewindow -= 31;
1326 dewindow += start;
1327 dewindow += start;
1328 u_ptr -= 16;
1329
1330 for (; j < 31; ++j) {
1331 float outf1, outf2, outf3, outf4;
1332
1333 outf1 = *++u_ptr * *--dewindow;
1334 outf2 = *++u_ptr * *--dewindow;
1335 outf3 = *++u_ptr * *--dewindow;
1336 outf4 = *++u_ptr * *--dewindow;
1337 outf1 += *++u_ptr * *--dewindow;
1338 outf2 += *++u_ptr * *--dewindow;
1339 outf3 += *++u_ptr * *--dewindow;
1340 outf4 += *++u_ptr * *--dewindow;
1341 outf1 += *++u_ptr * *--dewindow;
1342 outf2 += *++u_ptr * *--dewindow;
1343 outf3 += *++u_ptr * *--dewindow;
1344 outf4 += *++u_ptr * *--dewindow;
1345 outf1 += *++u_ptr * *--dewindow;
1346 outf2 += *++u_ptr * *--dewindow;
1347 outf3 += *++u_ptr * *--dewindow;
1348 outf4 += *++u_ptr * *--dewindow;
1349
1350 out = outf1 - outf2 + outf3 - outf4;
1351
1352 dewindow -= 16;
1353 u_ptr -= 32;
1354
1355 PUT_SAMPLE(out)
1356 }
1357 }
1358 }
1359 #else
1360 const float *dewindow = t_dewindow[0] + 16 - start;
1361
1362 /* These optimisations are tuned specifically for architectures
1363 without autoincrement and -decrement. */
1364
1365 {
1366 float (*u_ptr)[16] = u[ch][div];
1367
1368 for (j = 0; j < 16; ++j) {
1369 float outf1, outf2, outf3, outf4;
1370
1371 outf1 = u_ptr[0][ 0] * dewindow[0x0];
1372 outf2 = u_ptr[0][ 1] * dewindow[0x1];
1373 outf3 = u_ptr[0][ 2] * dewindow[0x2];
1374 outf4 = u_ptr[0][ 3] * dewindow[0x3];
1375 outf1 += u_ptr[0][ 4] * dewindow[0x4];
1376 outf2 += u_ptr[0][ 5] * dewindow[0x5];
1377 outf3 += u_ptr[0][ 6] * dewindow[0x6];
1378 outf4 += u_ptr[0][ 7] * dewindow[0x7];
1379 outf1 += u_ptr[0][ 8] * dewindow[0x8];
1380 outf2 += u_ptr[0][ 9] * dewindow[0x9];
1381 outf3 += u_ptr[0][10] * dewindow[0xa];
1382 outf4 += u_ptr[0][11] * dewindow[0xb];
1383 outf1 += u_ptr[0][12] * dewindow[0xc];
1384 outf2 += u_ptr[0][13] * dewindow[0xd];
1385 outf3 += u_ptr[0][14] * dewindow[0xe];
1386 outf4 += u_ptr[0][15] * dewindow[0xf];
1387
1388 out = outf1 + outf2 + outf3 + outf4;
1389
1390 dewindow += 32;
1391 u_ptr++;
1392
1393 PUT_SAMPLE(out)
1394 }
1395
1396 if (div & 0x1) {
1397 {
1398 float outf2, outf4;
1399
1400 outf2 = u_ptr[0][ 0] * dewindow[0x0];
1401 outf4 = u_ptr[0][ 2] * dewindow[0x2];
1402 outf2 += u_ptr[0][ 4] * dewindow[0x4];
1403 outf4 += u_ptr[0][ 6] * dewindow[0x6];
1404 outf2 += u_ptr[0][ 8] * dewindow[0x8];
1405 outf4 += u_ptr[0][10] * dewindow[0xa];
1406 outf2 += u_ptr[0][12] * dewindow[0xc];
1407 outf4 += u_ptr[0][14] * dewindow[0xe];
1408
1409 out = outf2 + outf4;
1410
1411 PUT_SAMPLE(out)
1412 }
1413
1414 dewindow -= 48;
1415 dewindow += start;
1416 dewindow += start;
1417
1418 for (; j < 31; ++j) {
1419 float outf1, outf2, outf3, outf4;
1420
1421 --u_ptr;
1422
1423 outf1 = u_ptr[0][ 0] * dewindow[0xf];
1424 outf2 = u_ptr[0][ 1] * dewindow[0xe];
1425 outf3 = u_ptr[0][ 2] * dewindow[0xd];
1426 outf4 = u_ptr[0][ 3] * dewindow[0xc];
1427 outf1 += u_ptr[0][ 4] * dewindow[0xb];
1428 outf2 += u_ptr[0][ 5] * dewindow[0xa];
1429 outf3 += u_ptr[0][ 6] * dewindow[0x9];
1430 outf4 += u_ptr[0][ 7] * dewindow[0x8];
1431 outf1 += u_ptr[0][ 8] * dewindow[0x7];
1432 outf2 += u_ptr[0][ 9] * dewindow[0x6];
1433 outf3 += u_ptr[0][10] * dewindow[0x5];
1434 outf4 += u_ptr[0][11] * dewindow[0x4];
1435 outf1 += u_ptr[0][12] * dewindow[0x3];
1436 outf2 += u_ptr[0][13] * dewindow[0x2];
1437 outf3 += u_ptr[0][14] * dewindow[0x1];
1438 outf4 += u_ptr[0][15] * dewindow[0x0];
1439
1440 out = -outf1 + outf2 - outf3 + outf4;
1441
1442 dewindow -= 32;
1443
1444 PUT_SAMPLE(out)
1445 }
1446 } else {
1447 {
1448 float outf2, outf4;
1449
1450 outf2 = u_ptr[0][ 1] * dewindow[0x1];
1451 outf4 = u_ptr[0][ 3] * dewindow[0x3];
1452 outf2 += u_ptr[0][ 5] * dewindow[0x5];
1453 outf4 += u_ptr[0][ 7] * dewindow[0x7];
1454 outf2 += u_ptr[0][ 9] * dewindow[0x9];
1455 outf4 += u_ptr[0][11] * dewindow[0xb];
1456 outf2 += u_ptr[0][13] * dewindow[0xd];
1457 outf4 += u_ptr[0][15] * dewindow[0xf];
1458
1459 out = outf2 + outf4;
1460
1461 PUT_SAMPLE(out)
1462 }
1463
1464 dewindow -= 48;
1465 dewindow += start;
1466 dewindow += start;
1467
1468 for (; j < 31; ++j) {
1469 float outf1, outf2, outf3, outf4;
1470
1471 --u_ptr;
1472
1473 outf1 = u_ptr[0][ 0] * dewindow[0xf];
1474 outf2 = u_ptr[0][ 1] * dewindow[0xe];
1475 outf3 = u_ptr[0][ 2] * dewindow[0xd];
1476 outf4 = u_ptr[0][ 3] * dewindow[0xc];
1477 outf1 += u_ptr[0][ 4] * dewindow[0xb];
1478 outf2 += u_ptr[0][ 5] * dewindow[0xa];
1479 outf3 += u_ptr[0][ 6] * dewindow[0x9];
1480 outf4 += u_ptr[0][ 7] * dewindow[0x8];
1481 outf1 += u_ptr[0][ 8] * dewindow[0x7];
1482 outf2 += u_ptr[0][ 9] * dewindow[0x6];
1483 outf3 += u_ptr[0][10] * dewindow[0x5];
1484 outf4 += u_ptr[0][11] * dewindow[0x4];
1485 outf1 += u_ptr[0][12] * dewindow[0x3];
1486 outf2 += u_ptr[0][13] * dewindow[0x2];
1487 outf3 += u_ptr[0][14] * dewindow[0x1];
1488 outf4 += u_ptr[0][15] * dewindow[0x0];
1489
1490 out = outf1 - outf2 + outf3 - outf4;
1491
1492 dewindow -= 32;
1493
1494 PUT_SAMPLE(out)
1495 }
1496 }
1497 }
1498 #endif
1499 }
1500 #endif
1501
1502 --u_start[ch];
1503 u_start[ch] &= 0xf;
1504 u_div[ch]=u_div[ch] ? 0 : 1;
1505
1506 #if defined(PENTIUM_RDTSC)
1507 __asm__(".byte 0x0f,0x31" : "=a" (cnt2), "=d" (cnt4));
1508
1509 if (cnt2-cnt1 < min_cycles) {
1510 min_cycles = cnt2-cnt1;
1511 /*printf("%d, %d cycles, %d\n", cnt3-cnt1, min_cycles, start);*/
1512 }
1513 #endif
1514 }
1515
premultiply()1516 void premultiply()
1517 {
1518 int i,t;
1519
1520 for (i = 0; i < 17; ++i)
1521 for (t = 0; t < 32; ++t)
1522 t_dewindow[i][t] *= 16383.5f;
1523 }
1524