1 /*
2 fftlib.c:
3
4 Copyright 2005 John Green Istvan Varga Victor Lazzarini
5
6 FFT library
7 based on public domain code by John Green <green_jt@vsdec.npt.nuwc.navy.mil>
8 original version is available at
9 http://hyperarchive.lcs.mit.edu/
10 /HyperArchive/Archive/dev/src/ffts-for-risc-2-c.hqx
11 ported to Csound by Istvan Varga, 2005
12
13 This file is part of Csound.
14
15 The Csound Library is free software; you can redistribute it
16 and/or modify it under the terms of the GNU Lesser General Public
17 License as published by the Free Software Foundation; either
18 version 2.1 of the License, or (at your option) any later version.
19
20 Csound is distributed in the hope that it will be useful,
21 but WITHOUT ANY WARRANTY; without even the implied warranty of
22 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23 GNU Lesser General Public License for more details.
24
25 You should have received a copy of the GNU Lesser General Public
26 License along with Csound; if not, write to the Free Software
27 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
28 02110-1301 USA
29 */
30
31 #include <stdlib.h>
32 #include <math.h>
33 #include "csoundCore.h"
34 #include "csound.h"
35 #include "fftlib.h"
36 #include "pffft.h"
37
38
39
40 #define POW2(m) ((uint32) (1 << (m))) /* integer power of 2 for m<32 */
41
42 /* fft's with M bigger than this bust primary cache */
43 #define MCACHE (11 - (sizeof(MYFLT) / 8))
44
45 /* some math constants to 40 decimal places */
46 //#define MYPI 3.141592653589793238462643383279502884197 /* pi */
47 //#define MYROOT2 1.414213562373095048801688724209698078569 /* sqrt(2) */
48 #define MYCOSPID8 0.9238795325112867561281831893967882868224 /* cos(pi/8) */
49 #define MYSINPID8 0.3826834323650897717284599840303988667613 /* sin(pi/8) */
50
51
52 /*****************************************************
53 * routines to initialize tables used by fft routines *
54 *****************************************************/
55
fftCosInit(int32_t M,MYFLT * Utbl)56 static void fftCosInit(int32_t M, MYFLT *Utbl)
57 {
58 /* Compute Utbl, the cosine table for ffts */
59 /* of size (pow(2,M)/4 +1) */
60 /* INPUTS */
61 /* M = log2 of fft size */
62 /* OUTPUTS */
63 /* *Utbl = cosine table */
64 uint32_t fftN = POW2(M);
65 uint32_t i1;
66
67 Utbl[0] = FL(1.0);
68 for (i1 = 1; i1 < fftN/4; i1++)
69 Utbl[i1] = COS((FL(2.0) * PI_F * (MYFLT)i1) / (MYFLT)fftN);
70 Utbl[fftN/4] = FL(0.0);
71 }
72
fftBRInit(int32_t M,int16 * BRLow)73 static void fftBRInit(int32_t M, int16 *BRLow)
74 {
75 /* Compute BRLow, the bit reversed table for ffts */
76 /* of size pow(2,M/2 -1) */
77 /* INPUTS */
78 /* M = log2 of fft size */
79 /* OUTPUTS */
80 /* *BRLow = bit reversed counter table */
81 int32_t Mroot_1 = M / 2 - 1;
82 int32_t Nroot_1 = POW2(Mroot_1);
83 int32_t i1;
84 int32_t bitsum;
85 int32_t bitmask;
86 int32_t bit;
87
88 for (i1 = 0; i1 < Nroot_1; i1++) {
89 bitsum = 0;
90 bitmask = 1;
91 for (bit = 1; bit <= Mroot_1; bitmask <<= 1, bit++)
92 if (i1 & bitmask)
93 bitsum = bitsum + (Nroot_1 >> bit);
94 BRLow[i1] = bitsum;
95 }
96 }
97
98 /*****************
99 * parts of ffts1 *
100 *****************/
101
bitrevR2(MYFLT * ioptr,int32_t M,int16 * BRLow)102 static void bitrevR2(MYFLT *ioptr, int32_t M, int16 *BRLow)
103 {
104 /*** bit reverse and first radix 2 stage of forward or inverse fft ***/
105 MYFLT f0r;
106 MYFLT f0i;
107 MYFLT f1r;
108 MYFLT f1i;
109 MYFLT f2r;
110 MYFLT f2i;
111 MYFLT f3r;
112 MYFLT f3i;
113 MYFLT f4r;
114 MYFLT f4i;
115 MYFLT f5r;
116 MYFLT f5i;
117 MYFLT f6r;
118 MYFLT f6i;
119 MYFLT f7r;
120 MYFLT f7i;
121 MYFLT t0r;
122 MYFLT t0i;
123 MYFLT t1r;
124 MYFLT t1i;
125 MYFLT *p0r;
126 MYFLT *p1r;
127 MYFLT *IOP;
128 MYFLT *iolimit;
129 int32_t Colstart;
130 int32_t iCol;
131 uint32_t posA;
132 uint32_t posAi;
133 uint32_t posB;
134 uint32_t posBi;
135
136 const uint32_t Nrems2 = POW2((M + 3) / 2);
137 const uint32_t Nroot_1_ColInc = POW2(M) - Nrems2;
138 const uint32_t Nroot_1 = POW2(M / 2 - 1) - 1;
139 const uint32_t ColstartShift = (M + 1) / 2 + 1;
140
141 posA = POW2(M); /* 1/2 of POW2(M) complexes */
142 posAi = posA + 1;
143 posB = posA + 2;
144 posBi = posB + 1;
145
146 iolimit = ioptr + Nrems2;
147 for (; ioptr < iolimit; ioptr += POW2(M / 2 + 1)) {
148 for (Colstart = Nroot_1; Colstart >= 0; Colstart--) {
149 iCol = Nroot_1;
150 p0r = ioptr + Nroot_1_ColInc + BRLow[Colstart] * 4;
151 IOP = ioptr + (Colstart << ColstartShift);
152 p1r = IOP + BRLow[iCol] * 4;
153 f0r = *(p0r);
154 f0i = *(p0r + 1);
155 f1r = *(p0r + posA);
156 f1i = *(p0r + posAi);
157 for (; iCol > Colstart;) {
158 f2r = *(p0r + 2);
159 f2i = *(p0r + (2 + 1));
160 f3r = *(p0r + posB);
161 f3i = *(p0r + posBi);
162 f4r = *(p1r);
163 f4i = *(p1r + 1);
164 f5r = *(p1r + posA);
165 f5i = *(p1r + posAi);
166 f6r = *(p1r + 2);
167 f6i = *(p1r + (2 + 1));
168 f7r = *(p1r + posB);
169 f7i = *(p1r + posBi);
170
171 t0r = f0r + f1r;
172 t0i = f0i + f1i;
173 f1r = f0r - f1r;
174 f1i = f0i - f1i;
175 t1r = f2r + f3r;
176 t1i = f2i + f3i;
177 f3r = f2r - f3r;
178 f3i = f2i - f3i;
179 f0r = f4r + f5r;
180 f0i = f4i + f5i;
181 f5r = f4r - f5r;
182 f5i = f4i - f5i;
183 f2r = f6r + f7r;
184 f2i = f6i + f7i;
185 f7r = f6r - f7r;
186 f7i = f6i - f7i;
187
188 *(p1r) = t0r;
189 *(p1r + 1) = t0i;
190 *(p1r + 2) = f1r;
191 *(p1r + (2 + 1)) = f1i;
192 *(p1r + posA) = t1r;
193 *(p1r + posAi) = t1i;
194 *(p1r + posB) = f3r;
195 *(p1r + posBi) = f3i;
196 *(p0r) = f0r;
197 *(p0r + 1) = f0i;
198 *(p0r + 2) = f5r;
199 *(p0r + (2 + 1)) = f5i;
200 *(p0r + posA) = f2r;
201 *(p0r + posAi) = f2i;
202 *(p0r + posB) = f7r;
203 *(p0r + posBi) = f7i;
204
205 p0r -= Nrems2;
206 f0r = *(p0r);
207 f0i = *(p0r + 1);
208 f1r = *(p0r + posA);
209 f1i = *(p0r + posAi);
210 iCol -= 1;
211 p1r = IOP + BRLow[iCol] * 4;
212 }
213 f2r = *(p0r + 2);
214 f2i = *(p0r + (2 + 1));
215 f3r = *(p0r + posB);
216 f3i = *(p0r + posBi);
217
218 t0r = f0r + f1r;
219 t0i = f0i + f1i;
220 f1r = f0r - f1r;
221 f1i = f0i - f1i;
222 t1r = f2r + f3r;
223 t1i = f2i + f3i;
224 f3r = f2r - f3r;
225 f3i = f2i - f3i;
226
227 *(p0r) = t0r;
228 *(p0r + 1) = t0i;
229 *(p0r + 2) = f1r;
230 *(p0r + (2 + 1)) = f1i;
231 *(p0r + posA) = t1r;
232 *(p0r + posAi) = t1i;
233 *(p0r + posB) = f3r;
234 *(p0r + posBi) = f3i;
235 }
236 }
237 }
238
fft2pt(MYFLT * ioptr)239 static void fft2pt(MYFLT *ioptr)
240 {
241 /*** RADIX 2 fft ***/
242 MYFLT f0r, f0i, f1r, f1i;
243 MYFLT t0r, t0i;
244
245 /* bit reversed load */
246 f0r = ioptr[0];
247 f0i = ioptr[1];
248 f1r = ioptr[2];
249 f1i = ioptr[3];
250
251 /* Butterflys */
252 /*
253 f0 - - t0
254 f1 - 1 - f1
255 */
256
257 t0r = f0r + f1r;
258 t0i = f0i + f1i;
259 f1r = f0r - f1r;
260 f1i = f0i - f1i;
261
262 /* store result */
263 ioptr[0] = t0r;
264 ioptr[1] = t0i;
265 ioptr[2] = f1r;
266 ioptr[3] = f1i;
267 }
268
fft4pt(MYFLT * ioptr)269 static void fft4pt(MYFLT *ioptr)
270 {
271 /*** RADIX 4 fft ***/
272 MYFLT f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i;
273 MYFLT t0r, t0i, t1r, t1i;
274
275 /* bit reversed load */
276 f0r = ioptr[0];
277 f0i = ioptr[1];
278 f1r = ioptr[4];
279 f1i = ioptr[5];
280 f2r = ioptr[2];
281 f2i = ioptr[3];
282 f3r = ioptr[6];
283 f3i = ioptr[7];
284
285 /* Butterflys */
286 /*
287 f0 - - t0 - - f0
288 f1 - 1 - f1 - - f1
289 f2 - - f2 - 1 - f2
290 f3 - 1 - t1 - -i - f3
291 */
292
293 t0r = f0r + f1r;
294 t0i = f0i + f1i;
295 f1r = f0r - f1r;
296 f1i = f0i - f1i;
297
298 t1r = f2r - f3r;
299 t1i = f2i - f3i;
300 f2r = f2r + f3r;
301 f2i = f2i + f3i;
302
303 f0r = t0r + f2r;
304 f0i = t0i + f2i;
305 f2r = t0r - f2r;
306 f2i = t0i - f2i;
307
308 f3r = f1r - t1i;
309 f3i = f1i + t1r;
310 f1r = f1r + t1i;
311 f1i = f1i - t1r;
312
313 /* store result */
314 ioptr[0] = f0r;
315 ioptr[1] = f0i;
316 ioptr[2] = f1r;
317 ioptr[3] = f1i;
318 ioptr[4] = f2r;
319 ioptr[5] = f2i;
320 ioptr[6] = f3r;
321 ioptr[7] = f3i;
322 }
323
fft8pt(MYFLT * ioptr)324 static void fft8pt(MYFLT *ioptr)
325 {
326 /*** RADIX 8 fft ***/
327 MYFLT w0r = (MYFLT)(1.0 / ROOT2); /* cos(pi/4) */
328 MYFLT f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i;
329 MYFLT f4r, f4i, f5r, f5i, f6r, f6i, f7r, f7i;
330 MYFLT t0r, t0i, t1r, t1i;
331 const MYFLT Two = FL(2.0);
332
333 /* bit reversed load */
334 f0r = ioptr[0];
335 f0i = ioptr[1];
336 f1r = ioptr[8];
337 f1i = ioptr[9];
338 f2r = ioptr[4];
339 f2i = ioptr[5];
340 f3r = ioptr[12];
341 f3i = ioptr[13];
342 f4r = ioptr[2];
343 f4i = ioptr[3];
344 f5r = ioptr[10];
345 f5i = ioptr[11];
346 f6r = ioptr[6];
347 f6i = ioptr[7];
348 f7r = ioptr[14];
349 f7i = ioptr[15];
350 /* Butterflys */
351 /*
352 f0 - - t0 - - f0 - - f0
353 f1 - 1 - f1 - - f1 - - f1
354 f2 - - f2 - 1 - f2 - - f2
355 f3 - 1 - t1 - -i - f3 - - f3
356 f4 - - t0 - - f4 - 1 - t0
357 f5 - 1 - f5 - - f5 - w3 - f4
358 f6 - - f6 - 1 - f6 - -i - t1
359 f7 - 1 - t1 - -i - f7 - iw3- f6
360 */
361
362 t0r = f0r + f1r;
363 t0i = f0i + f1i;
364 f1r = f0r - f1r;
365 f1i = f0i - f1i;
366
367 t1r = f2r - f3r;
368 t1i = f2i - f3i;
369 f2r = f2r + f3r;
370 f2i = f2i + f3i;
371
372 f0r = t0r + f2r;
373 f0i = t0i + f2i;
374 f2r = t0r - f2r;
375 f2i = t0i - f2i;
376
377 f3r = f1r - t1i;
378 f3i = f1i + t1r;
379 f1r = f1r + t1i;
380 f1i = f1i - t1r;
381
382 t0r = f4r + f5r;
383 t0i = f4i + f5i;
384 f5r = f4r - f5r;
385 f5i = f4i - f5i;
386
387 t1r = f6r - f7r;
388 t1i = f6i - f7i;
389 f6r = f6r + f7r;
390 f6i = f6i + f7i;
391
392 f4r = t0r + f6r;
393 f4i = t0i + f6i;
394 f6r = t0r - f6r;
395 f6i = t0i - f6i;
396
397 f7r = f5r - t1i;
398 f7i = f5i + t1r;
399 f5r = f5r + t1i;
400 f5i = f5i - t1r;
401
402 t0r = f0r - f4r;
403 t0i = f0i - f4i;
404 f0r = f0r + f4r;
405 f0i = f0i + f4i;
406
407 t1r = f2r - f6i;
408 t1i = f2i + f6r;
409 f2r = f2r + f6i;
410 f2i = f2i - f6r;
411
412 f4r = f1r - f5r * w0r - f5i * w0r;
413 f4i = f1i + f5r * w0r - f5i * w0r;
414 f1r = f1r * Two - f4r;
415 f1i = f1i * Two - f4i;
416
417 f6r = f3r + f7r * w0r - f7i * w0r;
418 f6i = f3i + f7r * w0r + f7i * w0r;
419 f3r = f3r * Two - f6r;
420 f3i = f3i * Two - f6i;
421
422 /* store result */
423 ioptr[0] = f0r;
424 ioptr[1] = f0i;
425 ioptr[2] = f1r;
426 ioptr[3] = f1i;
427 ioptr[4] = f2r;
428 ioptr[5] = f2i;
429 ioptr[6] = f3r;
430 ioptr[7] = f3i;
431 ioptr[8] = t0r;
432 ioptr[9] = t0i;
433 ioptr[10] = f4r;
434 ioptr[11] = f4i;
435 ioptr[12] = t1r;
436 ioptr[13] = t1i;
437 ioptr[14] = f6r;
438 ioptr[15] = f6i;
439 }
440
bfR2(MYFLT * ioptr,int32_t M,int32_t NDiffU)441 static void bfR2(MYFLT *ioptr, int32_t M, int32_t NDiffU)
442 {
443 /*** 2nd radix 2 stage ***/
444 uint32_t pos;
445 uint32_t posi;
446 uint32_t pinc;
447 uint32_t pnext;
448 uint32_t NSameU;
449 uint32_t SameUCnt;
450
451 MYFLT *pstrt;
452 MYFLT *p0r, *p1r, *p2r, *p3r;
453
454 MYFLT f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i;
455 MYFLT f4r, f4i, f5r, f5i, f6r, f6i, f7r, f7i;
456
457 pinc = NDiffU * 2; /* 2 floats per complex */
458 pnext = pinc * 4;
459 pos = 2;
460 posi = pos + 1;
461 NSameU = POW2(M) / 4 / NDiffU; /* 4 Us at a time */
462 pstrt = ioptr;
463 p0r = pstrt;
464 p1r = pstrt + pinc;
465 p2r = p1r + pinc;
466 p3r = p2r + pinc;
467
468 /* Butterflys */
469 /*
470 f0 - - f4
471 f1 - 1 - f5
472 f2 - - f6
473 f3 - 1 - f7
474 */
475 /* Butterflys */
476 /*
477 f0 - - f4
478 f1 - 1 - f5
479 f2 - - f6
480 f3 - 1 - f7
481 */
482
483 for (SameUCnt = NSameU; SameUCnt > 0; SameUCnt--) {
484
485 f0r = *p0r;
486 f1r = *p1r;
487 f0i = *(p0r + 1);
488 f1i = *(p1r + 1);
489 f2r = *p2r;
490 f3r = *p3r;
491 f2i = *(p2r + 1);
492 f3i = *(p3r + 1);
493
494 f4r = f0r + f1r;
495 f4i = f0i + f1i;
496 f5r = f0r - f1r;
497 f5i = f0i - f1i;
498
499 f6r = f2r + f3r;
500 f6i = f2i + f3i;
501 f7r = f2r - f3r;
502 f7i = f2i - f3i;
503
504 *p0r = f4r;
505 *(p0r + 1) = f4i;
506 *p1r = f5r;
507 *(p1r + 1) = f5i;
508 *p2r = f6r;
509 *(p2r + 1) = f6i;
510 *p3r = f7r;
511 *(p3r + 1) = f7i;
512
513 f0r = *(p0r + pos);
514 f1i = *(p1r + posi);
515 f0i = *(p0r + posi);
516 f1r = *(p1r + pos);
517 f2r = *(p2r + pos);
518 f3i = *(p3r + posi);
519 f2i = *(p2r + posi);
520 f3r = *(p3r + pos);
521
522 f4r = f0r + f1i;
523 f4i = f0i - f1r;
524 f5r = f0r - f1i;
525 f5i = f0i + f1r;
526
527 f6r = f2r + f3i;
528 f6i = f2i - f3r;
529 f7r = f2r - f3i;
530 f7i = f2i + f3r;
531
532 *(p0r + pos) = f4r;
533 *(p0r + posi) = f4i;
534 *(p1r + pos) = f5r;
535 *(p1r + posi) = f5i;
536 *(p2r + pos) = f6r;
537 *(p2r + posi) = f6i;
538 *(p3r + pos) = f7r;
539 *(p3r + posi) = f7i;
540
541 p0r += pnext;
542 p1r += pnext;
543 p2r += pnext;
544 p3r += pnext;
545 }
546 }
547
bfR4(MYFLT * ioptr,int32_t M,int32_t NDiffU)548 static void bfR4(MYFLT *ioptr, int32_t M, int32_t NDiffU)
549 {
550 /*** 1 radix 4 stage ***/
551 uint32_t pos;
552 uint32_t posi;
553 uint32_t pinc;
554 uint32_t pnext;
555 uint32_t pnexti;
556 uint32_t NSameU;
557 uint32_t SameUCnt;
558
559 MYFLT *pstrt;
560 MYFLT *p0r, *p1r, *p2r, *p3r;
561
562 MYFLT w1r = FL(1.0) / FL(ROOT2); /* cos(pi/4) */
563 MYFLT f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i;
564 MYFLT f4r, f4i, f5r, f5i, f6r, f6i, f7r, f7i;
565 MYFLT t1r, t1i;
566 const MYFLT Two = FL(2.0);
567
568 pinc = NDiffU * 2; /* 2 floats per complex */
569 pnext = pinc * 4;
570 pnexti = pnext + 1;
571 pos = 2;
572 posi = pos + 1;
573 NSameU = POW2(M) / 4 / NDiffU; /* 4 pts per butterfly */
574 pstrt = ioptr;
575 p0r = pstrt;
576 p1r = pstrt + pinc;
577 p2r = p1r + pinc;
578 p3r = p2r + pinc;
579
580 /* Butterflys */
581 /*
582 f0 - - f0 - - f4
583 f1 - 1 - f5 - - f5
584 f2 - - f6 - 1 - f6
585 f3 - 1 - f3 - -i - f7
586 */
587 /* Butterflys */
588 /*
589 f0 - - f4 - - f4
590 f1 - -i - t1 - - f5
591 f2 - - f2 - w1 - f6
592 f3 - -i - f7 - iw1- f7
593 */
594
595 f0r = *p0r;
596 f1r = *p1r;
597 f2r = *p2r;
598 f3r = *p3r;
599 f0i = *(p0r + 1);
600 f1i = *(p1r + 1);
601 f2i = *(p2r + 1);
602 f3i = *(p3r + 1);
603
604 f5r = f0r - f1r;
605 f5i = f0i - f1i;
606 f0r = f0r + f1r;
607 f0i = f0i + f1i;
608
609 f6r = f2r + f3r;
610 f6i = f2i + f3i;
611 f3r = f2r - f3r;
612 f3i = f2i - f3i;
613
614 for (SameUCnt = NSameU - 1; SameUCnt > 0; SameUCnt--) {
615
616 f7r = f5r - f3i;
617 f7i = f5i + f3r;
618 f5r = f5r + f3i;
619 f5i = f5i - f3r;
620
621 f4r = f0r + f6r;
622 f4i = f0i + f6i;
623 f6r = f0r - f6r;
624 f6i = f0i - f6i;
625
626 f2r = *(p2r + pos);
627 f2i = *(p2r + posi);
628 f1r = *(p1r + pos);
629 f1i = *(p1r + posi);
630 f3i = *(p3r + posi);
631 f0r = *(p0r + pos);
632 f3r = *(p3r + pos);
633 f0i = *(p0r + posi);
634
635 *p3r = f7r;
636 *p0r = f4r;
637 *(p3r + 1) = f7i;
638 *(p0r + 1) = f4i;
639 *p1r = f5r;
640 *p2r = f6r;
641 *(p1r + 1) = f5i;
642 *(p2r + 1) = f6i;
643
644 f7r = f2r - f3i;
645 f7i = f2i + f3r;
646 f2r = f2r + f3i;
647 f2i = f2i - f3r;
648
649 f4r = f0r + f1i;
650 f4i = f0i - f1r;
651 t1r = f0r - f1i;
652 t1i = f0i + f1r;
653
654 f5r = t1r - f7r * w1r + f7i * w1r;
655 f5i = t1i - f7r * w1r - f7i * w1r;
656 f7r = t1r * Two - f5r;
657 f7i = t1i * Two - f5i;
658
659 f6r = f4r - f2r * w1r - f2i * w1r;
660 f6i = f4i + f2r * w1r - f2i * w1r;
661 f4r = f4r * Two - f6r;
662 f4i = f4i * Two - f6i;
663
664 f3r = *(p3r + pnext);
665 f0r = *(p0r + pnext);
666 f3i = *(p3r + pnexti);
667 f0i = *(p0r + pnexti);
668 f2r = *(p2r + pnext);
669 f2i = *(p2r + pnexti);
670 f1r = *(p1r + pnext);
671 f1i = *(p1r + pnexti);
672
673 *(p2r + pos) = f6r;
674 *(p1r + pos) = f5r;
675 *(p2r + posi) = f6i;
676 *(p1r + posi) = f5i;
677 *(p3r + pos) = f7r;
678 *(p0r + pos) = f4r;
679 *(p3r + posi) = f7i;
680 *(p0r + posi) = f4i;
681
682 f6r = f2r + f3r;
683 f6i = f2i + f3i;
684 f3r = f2r - f3r;
685 f3i = f2i - f3i;
686
687 f5r = f0r - f1r;
688 f5i = f0i - f1i;
689 f0r = f0r + f1r;
690 f0i = f0i + f1i;
691
692 p3r += pnext;
693 p0r += pnext;
694 p1r += pnext;
695 p2r += pnext;
696 }
697 f7r = f5r - f3i;
698 f7i = f5i + f3r;
699 f5r = f5r + f3i;
700 f5i = f5i - f3r;
701
702 f4r = f0r + f6r;
703 f4i = f0i + f6i;
704 f6r = f0r - f6r;
705 f6i = f0i - f6i;
706
707 f2r = *(p2r + pos);
708 f2i = *(p2r + posi);
709 f1r = *(p1r + pos);
710 f1i = *(p1r + posi);
711 f3i = *(p3r + posi);
712 f0r = *(p0r + pos);
713 f3r = *(p3r + pos);
714 f0i = *(p0r + posi);
715
716 *p3r = f7r;
717 *p0r = f4r;
718 *(p3r + 1) = f7i;
719 *(p0r + 1) = f4i;
720 *p1r = f5r;
721 *p2r = f6r;
722 *(p1r + 1) = f5i;
723 *(p2r + 1) = f6i;
724
725 f7r = f2r - f3i;
726 f7i = f2i + f3r;
727 f2r = f2r + f3i;
728 f2i = f2i - f3r;
729
730 f4r = f0r + f1i;
731 f4i = f0i - f1r;
732 t1r = f0r - f1i;
733 t1i = f0i + f1r;
734
735 f5r = t1r - f7r * w1r + f7i * w1r;
736 f5i = t1i - f7r * w1r - f7i * w1r;
737 f7r = t1r * Two - f5r;
738 f7i = t1i * Two - f5i;
739
740 f6r = f4r - f2r * w1r - f2i * w1r;
741 f6i = f4i + f2r * w1r - f2i * w1r;
742 f4r = f4r * Two - f6r;
743 f4i = f4i * Two - f6i;
744
745 *(p2r + pos) = f6r;
746 *(p1r + pos) = f5r;
747 *(p2r + posi) = f6i;
748 *(p1r + posi) = f5i;
749 *(p3r + pos) = f7r;
750 *(p0r + pos) = f4r;
751 *(p3r + posi) = f7i;
752 *(p0r + posi) = f4i;
753 }
754
bfstages(MYFLT * ioptr,int32_t M,MYFLT * Utbl,int32_t Ustride,int32_t NDiffU,int32_t StageCnt)755 static void bfstages(MYFLT *ioptr, int32_t M, MYFLT *Utbl, int32_t Ustride,
756 int32_t NDiffU, int32_t StageCnt)
757 {
758 /*** RADIX 8 Stages ***/
759 uint32_t pos;
760 uint32_t posi;
761 uint32_t pinc;
762 uint32_t pnext;
763 uint32_t NSameU;
764 int32_t Uinc;
765 int32_t Uinc2;
766 int32_t Uinc4;
767 uint32_t DiffUCnt;
768 uint32_t SameUCnt;
769 uint32_t U2toU3;
770
771 MYFLT *pstrt;
772 MYFLT *p0r, *p1r, *p2r, *p3r;
773 MYFLT *u0r, *u0i, *u1r, *u1i, *u2r, *u2i;
774
775 MYFLT w0r, w0i, w1r, w1i, w2r, w2i, w3r, w3i;
776 MYFLT f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i;
777 MYFLT f4r, f4i, f5r, f5i, f6r, f6i, f7r, f7i;
778 MYFLT t0r, t0i, t1r, t1i;
779 const MYFLT Two = FL(2.0);
780
781 pinc = NDiffU * 2; /* 2 floats per complex */
782 pnext = pinc * 8;
783 pos = pinc * 4;
784 posi = pos + 1;
785 NSameU = POW2(M) / 8 / NDiffU; /* 8 pts per butterfly */
786 Uinc = (int32_t) NSameU * Ustride;
787 Uinc2 = Uinc * 2;
788 Uinc4 = Uinc * 4;
789 U2toU3 = (POW2(M) / 8) * Ustride;
790 for (; StageCnt > 0; StageCnt--) {
791
792 u0r = &Utbl[0];
793 u0i = &Utbl[POW2(M - 2) * Ustride];
794 u1r = u0r;
795 u1i = u0i;
796 u2r = u0r;
797 u2i = u0i;
798
799 w0r = *u0r;
800 w0i = *u0i;
801 w1r = *u1r;
802 w1i = *u1i;
803 w2r = *u2r;
804 w2i = *u2i;
805 w3r = *(u2r + U2toU3);
806 w3i = *(u2i - U2toU3);
807
808 pstrt = ioptr;
809
810 p0r = pstrt;
811 p1r = pstrt + pinc;
812 p2r = p1r + pinc;
813 p3r = p2r + pinc;
814
815 /* Butterflys */
816 /*
817 f0 - - t0 - - f0 - - f0
818 f1 - w0- f1 - - f1 - - f1
819 f2 - - f2 - w1- f2 - - f4
820 f3 - w0- t1 - iw1- f3 - - f5
821
822 f4 - - t0 - - f4 - w2- t0
823 f5 - w0- f5 - - f5 - w3- t1
824 f6 - - f6 - w1- f6 - iw2- f6
825 f7 - w0- t1 - iw1- f7 - iw3- f7
826 */
827
828 for (DiffUCnt = NDiffU; DiffUCnt > 0; DiffUCnt--) {
829 f0r = *p0r;
830 f0i = *(p0r + 1);
831 f1r = *p1r;
832 f1i = *(p1r + 1);
833 for (SameUCnt = NSameU - 1; SameUCnt > 0; SameUCnt--) {
834 f2r = *p2r;
835 f2i = *(p2r + 1);
836 f3r = *p3r;
837 f3i = *(p3r + 1);
838
839 t0r = f0r + f1r * w0r + f1i * w0i;
840 t0i = f0i - f1r * w0i + f1i * w0r;
841 f1r = f0r * Two - t0r;
842 f1i = f0i * Two - t0i;
843
844 f4r = *(p0r + pos);
845 f4i = *(p0r + posi);
846 f5r = *(p1r + pos);
847 f5i = *(p1r + posi);
848
849 f6r = *(p2r + pos);
850 f6i = *(p2r + posi);
851 f7r = *(p3r + pos);
852 f7i = *(p3r + posi);
853
854 t1r = f2r - f3r * w0r - f3i * w0i;
855 t1i = f2i + f3r * w0i - f3i * w0r;
856 f2r = f2r * Two - t1r;
857 f2i = f2i * Two - t1i;
858
859 f0r = t0r + f2r * w1r + f2i * w1i;
860 f0i = t0i - f2r * w1i + f2i * w1r;
861 f2r = t0r * Two - f0r;
862 f2i = t0i * Two - f0i;
863
864 f3r = f1r + t1r * w1i - t1i * w1r;
865 f3i = f1i + t1r * w1r + t1i * w1i;
866 f1r = f1r * Two - f3r;
867 f1i = f1i * Two - f3i;
868
869 t0r = f4r + f5r * w0r + f5i * w0i;
870 t0i = f4i - f5r * w0i + f5i * w0r;
871 f5r = f4r * Two - t0r;
872 f5i = f4i * Two - t0i;
873
874 t1r = f6r - f7r * w0r - f7i * w0i;
875 t1i = f6i + f7r * w0i - f7i * w0r;
876 f6r = f6r * Two - t1r;
877 f6i = f6i * Two - t1i;
878
879 f4r = t0r + f6r * w1r + f6i * w1i;
880 f4i = t0i - f6r * w1i + f6i * w1r;
881 f6r = t0r * Two - f4r;
882 f6i = t0i * Two - f4i;
883
884 f7r = f5r + t1r * w1i - t1i * w1r;
885 f7i = f5i + t1r * w1r + t1i * w1i;
886 f5r = f5r * Two - f7r;
887 f5i = f5i * Two - f7i;
888
889 t0r = f0r - f4r * w2r - f4i * w2i;
890 t0i = f0i + f4r * w2i - f4i * w2r;
891 f0r = f0r * Two - t0r;
892 f0i = f0i * Two - t0i;
893
894 t1r = f1r - f5r * w3r - f5i * w3i;
895 t1i = f1i + f5r * w3i - f5i * w3r;
896 f1r = f1r * Two - t1r;
897 f1i = f1i * Two - t1i;
898
899 *(p0r + pos) = t0r;
900 *(p1r + pos) = t1r;
901 *(p0r + posi) = t0i;
902 *(p1r + posi) = t1i;
903 *p0r = f0r;
904 *p1r = f1r;
905 *(p0r + 1) = f0i;
906 *(p1r + 1) = f1i;
907
908 p0r += pnext;
909 f0r = *p0r;
910 f0i = *(p0r + 1);
911
912 p1r += pnext;
913
914 f1r = *p1r;
915 f1i = *(p1r + 1);
916
917 f4r = f2r - f6r * w2i + f6i * w2r;
918 f4i = f2i - f6r * w2r - f6i * w2i;
919 f6r = f2r * Two - f4r;
920 f6i = f2i * Two - f4i;
921
922 f5r = f3r - f7r * w3i + f7i * w3r;
923 f5i = f3i - f7r * w3r - f7i * w3i;
924 f7r = f3r * Two - f5r;
925 f7i = f3i * Two - f5i;
926
927 *p2r = f4r;
928 *p3r = f5r;
929 *(p2r + 1) = f4i;
930 *(p3r + 1) = f5i;
931 *(p2r + pos) = f6r;
932 *(p3r + pos) = f7r;
933 *(p2r + posi) = f6i;
934 *(p3r + posi) = f7i;
935
936 p2r += pnext;
937 p3r += pnext;
938 }
939
940 f2r = *p2r;
941 f2i = *(p2r + 1);
942 f3r = *p3r;
943 f3i = *(p3r + 1);
944
945 t0r = f0r + f1r * w0r + f1i * w0i;
946 t0i = f0i - f1r * w0i + f1i * w0r;
947 f1r = f0r * Two - t0r;
948 f1i = f0i * Two - t0i;
949
950 f4r = *(p0r + pos);
951 f4i = *(p0r + posi);
952 f5r = *(p1r + pos);
953 f5i = *(p1r + posi);
954
955 f6r = *(p2r + pos);
956 f6i = *(p2r + posi);
957 f7r = *(p3r + pos);
958 f7i = *(p3r + posi);
959
960 t1r = f2r - f3r * w0r - f3i * w0i;
961 t1i = f2i + f3r * w0i - f3i * w0r;
962 f2r = f2r * Two - t1r;
963 f2i = f2i * Two - t1i;
964
965 f0r = t0r + f2r * w1r + f2i * w1i;
966 f0i = t0i - f2r * w1i + f2i * w1r;
967 f2r = t0r * Two - f0r;
968 f2i = t0i * Two - f0i;
969
970 f3r = f1r + t1r * w1i - t1i * w1r;
971 f3i = f1i + t1r * w1r + t1i * w1i;
972 f1r = f1r * Two - f3r;
973 f1i = f1i * Two - f3i;
974
975 if ((int32_t) DiffUCnt == NDiffU / 2)
976 Uinc4 = -Uinc4;
977
978 u0r += Uinc4;
979 u0i -= Uinc4;
980 u1r += Uinc2;
981 u1i -= Uinc2;
982 u2r += Uinc;
983 u2i -= Uinc;
984
985 pstrt += 2;
986
987 t0r = f4r + f5r * w0r + f5i * w0i;
988 t0i = f4i - f5r * w0i + f5i * w0r;
989 f5r = f4r * Two - t0r;
990 f5i = f4i * Two - t0i;
991
992 t1r = f6r - f7r * w0r - f7i * w0i;
993 t1i = f6i + f7r * w0i - f7i * w0r;
994 f6r = f6r * Two - t1r;
995 f6i = f6i * Two - t1i;
996
997 f4r = t0r + f6r * w1r + f6i * w1i;
998 f4i = t0i - f6r * w1i + f6i * w1r;
999 f6r = t0r * Two - f4r;
1000 f6i = t0i * Two - f4i;
1001
1002 f7r = f5r + t1r * w1i - t1i * w1r;
1003 f7i = f5i + t1r * w1r + t1i * w1i;
1004 f5r = f5r * Two - f7r;
1005 f5i = f5i * Two - f7i;
1006
1007 w0r = *u0r;
1008 w0i = *u0i;
1009 w1r = *u1r;
1010 w1i = *u1i;
1011
1012 if ((int32_t) DiffUCnt <= NDiffU / 2)
1013 w0r = -w0r;
1014
1015 t0r = f0r - f4r * w2r - f4i * w2i;
1016 t0i = f0i + f4r * w2i - f4i * w2r;
1017 f0r = f0r * Two - t0r;
1018 f0i = f0i * Two - t0i;
1019
1020 f4r = f2r - f6r * w2i + f6i * w2r;
1021 f4i = f2i - f6r * w2r - f6i * w2i;
1022 f6r = f2r * Two - f4r;
1023 f6i = f2i * Two - f4i;
1024
1025 *(p0r + pos) = t0r;
1026 *p2r = f4r;
1027 *(p0r + posi) = t0i;
1028 *(p2r + 1) = f4i;
1029 w2r = *u2r;
1030 w2i = *u2i;
1031 *p0r = f0r;
1032 *(p2r + pos) = f6r;
1033 *(p0r + 1) = f0i;
1034 *(p2r + posi) = f6i;
1035
1036 p0r = pstrt;
1037 p2r = pstrt + pinc + pinc;
1038
1039 t1r = f1r - f5r * w3r - f5i * w3i;
1040 t1i = f1i + f5r * w3i - f5i * w3r;
1041 f1r = f1r * Two - t1r;
1042 f1i = f1i * Two - t1i;
1043
1044 f5r = f3r - f7r * w3i + f7i * w3r;
1045 f5i = f3i - f7r * w3r - f7i * w3i;
1046 f7r = f3r * Two - f5r;
1047 f7i = f3i * Two - f5i;
1048
1049 *(p1r + pos) = t1r;
1050 *p3r = f5r;
1051 *(p1r + posi) = t1i;
1052 *(p3r + 1) = f5i;
1053 w3r = *(u2r + U2toU3);
1054 w3i = *(u2i - U2toU3);
1055 *p1r = f1r;
1056 *(p3r + pos) = f7r;
1057 *(p1r + 1) = f1i;
1058 *(p3r + posi) = f7i;
1059
1060 p1r = pstrt + pinc;
1061 p3r = p2r + pinc;
1062 }
1063 NSameU /= 8;
1064 Uinc /= 8;
1065 Uinc2 /= 8;
1066 Uinc4 = Uinc * 4;
1067 NDiffU *= 8;
1068 pinc *= 8;
1069 pnext *= 8;
1070 pos *= 8;
1071 posi = pos + 1;
1072 }
1073 }
1074
fftrecurs(MYFLT * ioptr,int32_t M,MYFLT * Utbl,int32_t Ustride,int32_t NDiffU,int32_t StageCnt)1075 static void fftrecurs(MYFLT *ioptr, int32_t M, MYFLT *Utbl, int32_t Ustride,
1076 int32_t NDiffU, int32_t StageCnt)
1077 {
1078 /* recursive bfstages calls to maximize on chip cache efficiency */
1079 int32_t i1;
1080
1081 if (M <= (int32_t) MCACHE) /* fits on chip ? */
1082 bfstages(ioptr, M, Utbl, Ustride, NDiffU, StageCnt); /* RADIX 8 Stages */
1083 else {
1084 for (i1 = 0; i1 < 8; i1++) {
1085 fftrecurs(&ioptr[i1 * POW2(M - 3) * 2], M - 3, Utbl, 8 * Ustride,
1086 NDiffU, StageCnt - 1); /* RADIX 8 Stages */
1087 }
1088 bfstages(ioptr, M, Utbl, Ustride, POW2(M - 3), 1); /* RADIX 8 Stage */
1089 }
1090 }
1091
ffts1(MYFLT * ioptr,int32_t M,MYFLT * Utbl,int16 * BRLow)1092 static void ffts1(MYFLT *ioptr, int32_t M, MYFLT *Utbl, int16 *BRLow)
1093 {
1094 /* Compute in-place complex fft on the rows of the input array */
1095 /* INPUTS */
1096 /* *ioptr = input data array */
1097 /* M = log2 of fft size (ex M=10 for 1024 point fft) */
1098 /* *Utbl = cosine table */
1099 /* *BRLow = bit reversed counter table */
1100 /* OUTPUTS */
1101 /* *ioptr = output data array */
1102
1103 int32_t StageCnt;
1104 int32_t NDiffU;
1105
1106 switch (M) {
1107 case 0:
1108 break;
1109 case 1:
1110 fft2pt(ioptr); /* a 2 pt fft */
1111 break;
1112 case 2:
1113 fft4pt(ioptr); /* a 4 pt fft */
1114 break;
1115 case 3:
1116 fft8pt(ioptr); /* an 8 pt fft */
1117 break;
1118 default:
1119 bitrevR2(ioptr, M, BRLow); /* bit reverse and first radix 2 stage */
1120 StageCnt = (M - 1) / 3; /* number of radix 8 stages */
1121 NDiffU = 2; /* one radix 2 stage already complete */
1122 if ((M - 1 - (StageCnt * 3)) == 1) {
1123 bfR2(ioptr, M, NDiffU); /* 1 radix 2 stage */
1124 NDiffU *= 2;
1125 }
1126 if ((M - 1 - (StageCnt * 3)) == 2) {
1127 bfR4(ioptr, M, NDiffU); /* 1 radix 4 stage */
1128 NDiffU *= 4;
1129 }
1130 if (M <= (int32_t) MCACHE)
1131 bfstages(ioptr, M, Utbl, 1, NDiffU, StageCnt); /* RADIX 8 Stages */
1132 else
1133 fftrecurs(ioptr, M, Utbl, 1, NDiffU, StageCnt); /* RADIX 8 Stages */
1134 }
1135 }
1136
1137 /******************
1138 * parts of iffts1 *
1139 ******************/
1140
scbitrevR2(MYFLT * ioptr,int32_t M,int16 * BRLow,MYFLT scale)1141 static void scbitrevR2(MYFLT *ioptr, int32_t M, int16 *BRLow, MYFLT scale)
1142 {
1143 /*** scaled bit reverse and first radix 2 stage forward or inverse fft ***/
1144 MYFLT f0r;
1145 MYFLT f0i;
1146 MYFLT f1r;
1147 MYFLT f1i;
1148 MYFLT f2r;
1149 MYFLT f2i;
1150 MYFLT f3r;
1151 MYFLT f3i;
1152 MYFLT f4r;
1153 MYFLT f4i;
1154 MYFLT f5r;
1155 MYFLT f5i;
1156 MYFLT f6r;
1157 MYFLT f6i;
1158 MYFLT f7r;
1159 MYFLT f7i;
1160 MYFLT t0r;
1161 MYFLT t0i;
1162 MYFLT t1r;
1163 MYFLT t1i;
1164 MYFLT *p0r;
1165 MYFLT *p1r;
1166 MYFLT *IOP;
1167 MYFLT *iolimit;
1168 int32_t Colstart;
1169 int32_t iCol;
1170 uint32_t posA;
1171 uint32_t posAi;
1172 uint32_t posB;
1173 uint32_t posBi;
1174
1175 const uint32_t Nrems2 = POW2((M + 3) / 2);
1176 const uint32_t Nroot_1_ColInc = POW2(M) - Nrems2;
1177 const uint32_t Nroot_1 = POW2(M / 2 - 1) - 1;
1178 const uint32_t ColstartShift = (M + 1) / 2 + 1;
1179
1180 posA = POW2(M); /* 1/2 of POW2(M) complexes */
1181 posAi = posA + 1;
1182 posB = posA + 2;
1183 posBi = posB + 1;
1184
1185 iolimit = ioptr + Nrems2;
1186 for (; ioptr < iolimit; ioptr += POW2(M / 2 + 1)) {
1187 for (Colstart = Nroot_1; Colstart >= 0; Colstart--) {
1188 iCol = Nroot_1;
1189 p0r = ioptr + Nroot_1_ColInc + BRLow[Colstart] * 4;
1190 IOP = ioptr + (Colstart << ColstartShift);
1191 p1r = IOP + BRLow[iCol] * 4;
1192 f0r = *(p0r);
1193 f0i = *(p0r + 1);
1194 f1r = *(p0r + posA);
1195 f1i = *(p0r + posAi);
1196 for (; iCol > Colstart;) {
1197 f2r = *(p0r + 2);
1198 f2i = *(p0r + (2 + 1));
1199 f3r = *(p0r + posB);
1200 f3i = *(p0r + posBi);
1201 f4r = *(p1r);
1202 f4i = *(p1r + 1);
1203 f5r = *(p1r + posA);
1204 f5i = *(p1r + posAi);
1205 f6r = *(p1r + 2);
1206 f6i = *(p1r + (2 + 1));
1207 f7r = *(p1r + posB);
1208 f7i = *(p1r + posBi);
1209
1210 t0r = f0r + f1r;
1211 t0i = f0i + f1i;
1212 f1r = f0r - f1r;
1213 f1i = f0i - f1i;
1214 t1r = f2r + f3r;
1215 t1i = f2i + f3i;
1216 f3r = f2r - f3r;
1217 f3i = f2i - f3i;
1218 f0r = f4r + f5r;
1219 f0i = f4i + f5i;
1220 f5r = f4r - f5r;
1221 f5i = f4i - f5i;
1222 f2r = f6r + f7r;
1223 f2i = f6i + f7i;
1224 f7r = f6r - f7r;
1225 f7i = f6i - f7i;
1226
1227 *(p1r) = scale * t0r;
1228 *(p1r + 1) = scale * t0i;
1229 *(p1r + 2) = scale * f1r;
1230 *(p1r + (2 + 1)) = scale * f1i;
1231 *(p1r + posA) = scale * t1r;
1232 *(p1r + posAi) = scale * t1i;
1233 *(p1r + posB) = scale * f3r;
1234 *(p1r + posBi) = scale * f3i;
1235 *(p0r) = scale * f0r;
1236 *(p0r + 1) = scale * f0i;
1237 *(p0r + 2) = scale * f5r;
1238 *(p0r + (2 + 1)) = scale * f5i;
1239 *(p0r + posA) = scale * f2r;
1240 *(p0r + posAi) = scale * f2i;
1241 *(p0r + posB) = scale * f7r;
1242 *(p0r + posBi) = scale * f7i;
1243
1244 p0r -= Nrems2;
1245 f0r = *(p0r);
1246 f0i = *(p0r + 1);
1247 f1r = *(p0r + posA);
1248 f1i = *(p0r + posAi);
1249 iCol -= 1;
1250 p1r = IOP + BRLow[iCol] * 4;
1251 }
1252 f2r = *(p0r + 2);
1253 f2i = *(p0r + (2 + 1));
1254 f3r = *(p0r + posB);
1255 f3i = *(p0r + posBi);
1256
1257 t0r = f0r + f1r;
1258 t0i = f0i + f1i;
1259 f1r = f0r - f1r;
1260 f1i = f0i - f1i;
1261 t1r = f2r + f3r;
1262 t1i = f2i + f3i;
1263 f3r = f2r - f3r;
1264 f3i = f2i - f3i;
1265
1266 *(p0r) = scale * t0r;
1267 *(p0r + 1) = scale * t0i;
1268 *(p0r + 2) = scale * f1r;
1269 *(p0r + (2 + 1)) = scale * f1i;
1270 *(p0r + posA) = scale * t1r;
1271 *(p0r + posAi) = scale * t1i;
1272 *(p0r + posB) = scale * f3r;
1273 *(p0r + posBi) = scale * f3i;
1274 }
1275 }
1276 }
1277
ifft2pt(MYFLT * ioptr,MYFLT scale)1278 static void ifft2pt(MYFLT *ioptr, MYFLT scale)
1279 {
1280 /*** RADIX 2 ifft ***/
1281 MYFLT f0r, f0i, f1r, f1i;
1282 MYFLT t0r, t0i;
1283
1284 /* bit reversed load */
1285 f0r = ioptr[0];
1286 f0i = ioptr[1];
1287 f1r = ioptr[2];
1288 f1i = ioptr[3];
1289
1290 /* Butterflys */
1291 /*
1292 f0 - - t0
1293 f1 - 1 - f1
1294 */
1295
1296 t0r = f0r + f1r;
1297 t0i = f0i + f1i;
1298 f1r = f0r - f1r;
1299 f1i = f0i - f1i;
1300
1301 /* store result */
1302 ioptr[0] = scale * t0r;
1303 ioptr[1] = scale * t0i;
1304 ioptr[2] = scale * f1r;
1305 ioptr[3] = scale * f1i;
1306 }
1307
ifft4pt(MYFLT * ioptr,MYFLT scale)1308 static void ifft4pt(MYFLT *ioptr, MYFLT scale)
1309 {
1310 /*** RADIX 4 ifft ***/
1311 MYFLT f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i;
1312 MYFLT t0r, t0i, t1r, t1i;
1313
1314 /* bit reversed load */
1315 f0r = ioptr[0];
1316 f0i = ioptr[1];
1317 f1r = ioptr[4];
1318 f1i = ioptr[5];
1319 f2r = ioptr[2];
1320 f2i = ioptr[3];
1321 f3r = ioptr[6];
1322 f3i = ioptr[7];
1323
1324 /* Butterflys */
1325 /*
1326 f0 - - t0 - - f0
1327 f1 - 1 - f1 - - f1
1328 f2 - - f2 - 1 - f2
1329 f3 - 1 - t1 - i - f3
1330 */
1331
1332 t0r = f0r + f1r;
1333 t0i = f0i + f1i;
1334 f1r = f0r - f1r;
1335 f1i = f0i - f1i;
1336
1337 t1r = f2r - f3r;
1338 t1i = f2i - f3i;
1339 f2r = f2r + f3r;
1340 f2i = f2i + f3i;
1341
1342 f0r = t0r + f2r;
1343 f0i = t0i + f2i;
1344 f2r = t0r - f2r;
1345 f2i = t0i - f2i;
1346
1347 f3r = f1r + t1i;
1348 f3i = f1i - t1r;
1349 f1r = f1r - t1i;
1350 f1i = f1i + t1r;
1351
1352 /* store result */
1353 ioptr[0] = scale * f0r;
1354 ioptr[1] = scale * f0i;
1355 ioptr[2] = scale * f1r;
1356 ioptr[3] = scale * f1i;
1357 ioptr[4] = scale * f2r;
1358 ioptr[5] = scale * f2i;
1359 ioptr[6] = scale * f3r;
1360 ioptr[7] = scale * f3i;
1361 }
1362
ifft8pt(MYFLT * ioptr,MYFLT scale)1363 static void ifft8pt(MYFLT *ioptr, MYFLT scale)
1364 {
1365 /*** RADIX 8 ifft ***/
1366 MYFLT w0r = FL(1.0) / FL(ROOT2); /* cos(pi/4) */
1367 MYFLT f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i;
1368 MYFLT f4r, f4i, f5r, f5i, f6r, f6i, f7r, f7i;
1369 MYFLT t0r, t0i, t1r, t1i;
1370 const MYFLT Two = FL(2.0);
1371
1372 /* bit reversed load */
1373 f0r = ioptr[0];
1374 f0i = ioptr[1];
1375 f1r = ioptr[8];
1376 f1i = ioptr[9];
1377 f2r = ioptr[4];
1378 f2i = ioptr[5];
1379 f3r = ioptr[12];
1380 f3i = ioptr[13];
1381 f4r = ioptr[2];
1382 f4i = ioptr[3];
1383 f5r = ioptr[10];
1384 f5i = ioptr[11];
1385 f6r = ioptr[6];
1386 f6i = ioptr[7];
1387 f7r = ioptr[14];
1388 f7i = ioptr[15];
1389
1390 /* Butterflys */
1391 /*
1392 f0 - - t0 - - f0 - - f0
1393 f1 - 1 - f1 - - f1 - - f1
1394 f2 - - f2 - 1 - f2 - - f2
1395 f3 - 1 - t1 - i - f3 - - f3
1396 f4 - - t0 - - f4 - 1 - t0
1397 f5 - 1 - f5 - - f5 - w3 - f4
1398 f6 - - f6 - 1 - f6 - i - t1
1399 f7 - 1 - t1 - i - f7 - iw3- f6
1400 */
1401
1402 t0r = f0r + f1r;
1403 t0i = f0i + f1i;
1404 f1r = f0r - f1r;
1405 f1i = f0i - f1i;
1406
1407 t1r = f2r - f3r;
1408 t1i = f2i - f3i;
1409 f2r = f2r + f3r;
1410 f2i = f2i + f3i;
1411
1412 f0r = t0r + f2r;
1413 f0i = t0i + f2i;
1414 f2r = t0r - f2r;
1415 f2i = t0i - f2i;
1416
1417 f3r = f1r + t1i;
1418 f3i = f1i - t1r;
1419 f1r = f1r - t1i;
1420 f1i = f1i + t1r;
1421
1422 t0r = f4r + f5r;
1423 t0i = f4i + f5i;
1424 f5r = f4r - f5r;
1425 f5i = f4i - f5i;
1426
1427 t1r = f6r - f7r;
1428 t1i = f6i - f7i;
1429 f6r = f6r + f7r;
1430 f6i = f6i + f7i;
1431
1432 f4r = t0r + f6r;
1433 f4i = t0i + f6i;
1434 f6r = t0r - f6r;
1435 f6i = t0i - f6i;
1436
1437 f7r = f5r + t1i;
1438 f7i = f5i - t1r;
1439 f5r = f5r - t1i;
1440 f5i = f5i + t1r;
1441
1442 t0r = f0r - f4r;
1443 t0i = f0i - f4i;
1444 f0r = f0r + f4r;
1445 f0i = f0i + f4i;
1446
1447 t1r = f2r + f6i;
1448 t1i = f2i - f6r;
1449 f2r = f2r - f6i;
1450 f2i = f2i + f6r;
1451
1452 f4r = f1r - f5r * w0r + f5i * w0r;
1453 f4i = f1i - f5r * w0r - f5i * w0r;
1454 f1r = f1r * Two - f4r;
1455 f1i = f1i * Two - f4i;
1456
1457 f6r = f3r + f7r * w0r + f7i * w0r;
1458 f6i = f3i - f7r * w0r + f7i * w0r;
1459 f3r = f3r * Two - f6r;
1460 f3i = f3i * Two - f6i;
1461
1462 /* store result */
1463 ioptr[0] = scale * f0r;
1464 ioptr[1] = scale * f0i;
1465 ioptr[2] = scale * f1r;
1466 ioptr[3] = scale * f1i;
1467 ioptr[4] = scale * f2r;
1468 ioptr[5] = scale * f2i;
1469 ioptr[6] = scale * f3r;
1470 ioptr[7] = scale * f3i;
1471 ioptr[8] = scale * t0r;
1472 ioptr[9] = scale * t0i;
1473 ioptr[10] = scale * f4r;
1474 ioptr[11] = scale * f4i;
1475 ioptr[12] = scale * t1r;
1476 ioptr[13] = scale * t1i;
1477 ioptr[14] = scale * f6r;
1478 ioptr[15] = scale * f6i;
1479 }
1480
ibfR2(MYFLT * ioptr,int32_t M,int32_t NDiffU)1481 static void ibfR2(MYFLT *ioptr, int32_t M, int32_t NDiffU)
1482 {
1483 /*** 2nd radix 2 stage ***/
1484 uint32_t pos;
1485 uint32_t posi;
1486 uint32_t pinc;
1487 uint32_t pnext;
1488 uint32_t NSameU;
1489 uint32_t SameUCnt;
1490
1491 MYFLT *pstrt;
1492 MYFLT *p0r, *p1r, *p2r, *p3r;
1493
1494 MYFLT f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i;
1495 MYFLT f4r, f4i, f5r, f5i, f6r, f6i, f7r, f7i;
1496
1497 pinc = NDiffU * 2; /* 2 floats per complex */
1498 pnext = pinc * 4;
1499 pos = 2;
1500 posi = pos + 1;
1501 NSameU = POW2(M) / 4 / NDiffU; /* 4 Us at a time */
1502 pstrt = ioptr;
1503 p0r = pstrt;
1504 p1r = pstrt + pinc;
1505 p2r = p1r + pinc;
1506 p3r = p2r + pinc;
1507
1508 /* Butterflys */
1509 /*
1510 f0 - - f4
1511 f1 - 1 - f5
1512 f2 - - f6
1513 f3 - 1 - f7
1514 */
1515 /* Butterflys */
1516 /*
1517 f0 - - f4
1518 f1 - 1 - f5
1519 f2 - - f6
1520 f3 - 1 - f7
1521 */
1522
1523 for (SameUCnt = NSameU; SameUCnt > 0; SameUCnt--) {
1524
1525 f0r = *p0r;
1526 f1r = *p1r;
1527 f0i = *(p0r + 1);
1528 f1i = *(p1r + 1);
1529 f2r = *p2r;
1530 f3r = *p3r;
1531 f2i = *(p2r + 1);
1532 f3i = *(p3r + 1);
1533
1534 f4r = f0r + f1r;
1535 f4i = f0i + f1i;
1536 f5r = f0r - f1r;
1537 f5i = f0i - f1i;
1538
1539 f6r = f2r + f3r;
1540 f6i = f2i + f3i;
1541 f7r = f2r - f3r;
1542 f7i = f2i - f3i;
1543
1544 *p0r = f4r;
1545 *(p0r + 1) = f4i;
1546 *p1r = f5r;
1547 *(p1r + 1) = f5i;
1548 *p2r = f6r;
1549 *(p2r + 1) = f6i;
1550 *p3r = f7r;
1551 *(p3r + 1) = f7i;
1552
1553 f0r = *(p0r + pos);
1554 f1i = *(p1r + posi);
1555 f0i = *(p0r + posi);
1556 f1r = *(p1r + pos);
1557 f2r = *(p2r + pos);
1558 f3i = *(p3r + posi);
1559 f2i = *(p2r + posi);
1560 f3r = *(p3r + pos);
1561
1562 f4r = f0r - f1i;
1563 f4i = f0i + f1r;
1564 f5r = f0r + f1i;
1565 f5i = f0i - f1r;
1566
1567 f6r = f2r - f3i;
1568 f6i = f2i + f3r;
1569 f7r = f2r + f3i;
1570 f7i = f2i - f3r;
1571
1572 *(p0r + pos) = f4r;
1573 *(p0r + posi) = f4i;
1574 *(p1r + pos) = f5r;
1575 *(p1r + posi) = f5i;
1576 *(p2r + pos) = f6r;
1577 *(p2r + posi) = f6i;
1578 *(p3r + pos) = f7r;
1579 *(p3r + posi) = f7i;
1580
1581 p0r += pnext;
1582 p1r += pnext;
1583 p2r += pnext;
1584 p3r += pnext;
1585 }
1586 }
1587
ibfR4(MYFLT * ioptr,int32_t M,int32_t NDiffU)1588 static void ibfR4(MYFLT *ioptr, int32_t M, int32_t NDiffU)
1589 {
1590 /*** 1 radix 4 stage ***/
1591 uint32_t pos;
1592 uint32_t posi;
1593 uint32_t pinc;
1594 uint32_t pnext;
1595 uint32_t pnexti;
1596 uint32_t NSameU;
1597 uint32_t SameUCnt;
1598
1599 MYFLT *pstrt;
1600 MYFLT *p0r, *p1r, *p2r, *p3r;
1601
1602 MYFLT w1r = FL(1.0) / FL(ROOT2); /* cos(pi/4) */
1603 MYFLT f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i;
1604 MYFLT f4r, f4i, f5r, f5i, f6r, f6i, f7r, f7i;
1605 MYFLT t1r, t1i;
1606 const MYFLT Two = FL(2.0);
1607
1608 pinc = NDiffU * 2; /* 2 floats per complex */
1609 pnext = pinc * 4;
1610 pnexti = pnext + 1;
1611 pos = 2;
1612 posi = pos + 1;
1613 NSameU = POW2(M) / 4 / NDiffU; /* 4 pts per butterfly */
1614 pstrt = ioptr;
1615 p0r = pstrt;
1616 p1r = pstrt + pinc;
1617 p2r = p1r + pinc;
1618 p3r = p2r + pinc;
1619
1620 /* Butterflys */
1621 /*
1622 f0 - - f0 - - f4
1623 f1 - 1 - f5 - - f5
1624 f2 - - f6 - 1 - f6
1625 f3 - 1 - f3 - -i - f7
1626 */
1627 /* Butterflys */
1628 /*
1629 f0 - - f4 - - f4
1630 f1 - -i - t1 - - f5
1631 f2 - - f2 - w1 - f6
1632 f3 - -i - f7 - iw1- f7
1633 */
1634
1635 f0r = *p0r;
1636 f1r = *p1r;
1637 f2r = *p2r;
1638 f3r = *p3r;
1639 f0i = *(p0r + 1);
1640 f1i = *(p1r + 1);
1641 f2i = *(p2r + 1);
1642 f3i = *(p3r + 1);
1643
1644 f5r = f0r - f1r;
1645 f5i = f0i - f1i;
1646 f0r = f0r + f1r;
1647 f0i = f0i + f1i;
1648
1649 f6r = f2r + f3r;
1650 f6i = f2i + f3i;
1651 f3r = f2r - f3r;
1652 f3i = f2i - f3i;
1653
1654 for (SameUCnt = NSameU - 1; SameUCnt > 0; SameUCnt--) {
1655
1656 f7r = f5r + f3i;
1657 f7i = f5i - f3r;
1658 f5r = f5r - f3i;
1659 f5i = f5i + f3r;
1660
1661 f4r = f0r + f6r;
1662 f4i = f0i + f6i;
1663 f6r = f0r - f6r;
1664 f6i = f0i - f6i;
1665
1666 f2r = *(p2r + pos);
1667 f2i = *(p2r + posi);
1668 f1r = *(p1r + pos);
1669 f1i = *(p1r + posi);
1670 f3i = *(p3r + posi);
1671 f0r = *(p0r + pos);
1672 f3r = *(p3r + pos);
1673 f0i = *(p0r + posi);
1674
1675 *p3r = f7r;
1676 *p0r = f4r;
1677 *(p3r + 1) = f7i;
1678 *(p0r + 1) = f4i;
1679 *p1r = f5r;
1680 *p2r = f6r;
1681 *(p1r + 1) = f5i;
1682 *(p2r + 1) = f6i;
1683
1684 f7r = f2r + f3i;
1685 f7i = f2i - f3r;
1686 f2r = f2r - f3i;
1687 f2i = f2i + f3r;
1688
1689 f4r = f0r - f1i;
1690 f4i = f0i + f1r;
1691 t1r = f0r + f1i;
1692 t1i = f0i - f1r;
1693
1694 f5r = t1r - f7r * w1r - f7i * w1r;
1695 f5i = t1i + f7r * w1r - f7i * w1r;
1696 f7r = t1r * Two - f5r;
1697 f7i = t1i * Two - f5i;
1698
1699 f6r = f4r - f2r * w1r + f2i * w1r;
1700 f6i = f4i - f2r * w1r - f2i * w1r;
1701 f4r = f4r * Two - f6r;
1702 f4i = f4i * Two - f6i;
1703
1704 f3r = *(p3r + pnext);
1705 f0r = *(p0r + pnext);
1706 f3i = *(p3r + pnexti);
1707 f0i = *(p0r + pnexti);
1708 f2r = *(p2r + pnext);
1709 f2i = *(p2r + pnexti);
1710 f1r = *(p1r + pnext);
1711 f1i = *(p1r + pnexti);
1712
1713 *(p2r + pos) = f6r;
1714 *(p1r + pos) = f5r;
1715 *(p2r + posi) = f6i;
1716 *(p1r + posi) = f5i;
1717 *(p3r + pos) = f7r;
1718 *(p0r + pos) = f4r;
1719 *(p3r + posi) = f7i;
1720 *(p0r + posi) = f4i;
1721
1722 f6r = f2r + f3r;
1723 f6i = f2i + f3i;
1724 f3r = f2r - f3r;
1725 f3i = f2i - f3i;
1726
1727 f5r = f0r - f1r;
1728 f5i = f0i - f1i;
1729 f0r = f0r + f1r;
1730 f0i = f0i + f1i;
1731
1732 p3r += pnext;
1733 p0r += pnext;
1734 p1r += pnext;
1735 p2r += pnext;
1736 }
1737 f7r = f5r + f3i;
1738 f7i = f5i - f3r;
1739 f5r = f5r - f3i;
1740 f5i = f5i + f3r;
1741
1742 f4r = f0r + f6r;
1743 f4i = f0i + f6i;
1744 f6r = f0r - f6r;
1745 f6i = f0i - f6i;
1746
1747 f2r = *(p2r + pos);
1748 f2i = *(p2r + posi);
1749 f1r = *(p1r + pos);
1750 f1i = *(p1r + posi);
1751 f3i = *(p3r + posi);
1752 f0r = *(p0r + pos);
1753 f3r = *(p3r + pos);
1754 f0i = *(p0r + posi);
1755
1756 *p3r = f7r;
1757 *p0r = f4r;
1758 *(p3r + 1) = f7i;
1759 *(p0r + 1) = f4i;
1760 *p1r = f5r;
1761 *p2r = f6r;
1762 *(p1r + 1) = f5i;
1763 *(p2r + 1) = f6i;
1764
1765 f7r = f2r + f3i;
1766 f7i = f2i - f3r;
1767 f2r = f2r - f3i;
1768 f2i = f2i + f3r;
1769
1770 f4r = f0r - f1i;
1771 f4i = f0i + f1r;
1772 t1r = f0r + f1i;
1773 t1i = f0i - f1r;
1774
1775 f5r = t1r - f7r * w1r - f7i * w1r;
1776 f5i = t1i + f7r * w1r - f7i * w1r;
1777 f7r = t1r * Two - f5r;
1778 f7i = t1i * Two - f5i;
1779
1780 f6r = f4r - f2r * w1r + f2i * w1r;
1781 f6i = f4i - f2r * w1r - f2i * w1r;
1782 f4r = f4r * Two - f6r;
1783 f4i = f4i * Two - f6i;
1784
1785 *(p2r + pos) = f6r;
1786 *(p1r + pos) = f5r;
1787 *(p2r + posi) = f6i;
1788 *(p1r + posi) = f5i;
1789 *(p3r + pos) = f7r;
1790 *(p0r + pos) = f4r;
1791 *(p3r + posi) = f7i;
1792 *(p0r + posi) = f4i;
1793 }
1794
ibfstages(MYFLT * ioptr,int32_t M,MYFLT * Utbl,int32_t Ustride,int32_t NDiffU,int32_t StageCnt)1795 static void ibfstages(MYFLT *ioptr, int32_t M, MYFLT *Utbl, int32_t Ustride,
1796 int32_t NDiffU, int32_t StageCnt)
1797 {
1798 /*** RADIX 8 Stages ***/
1799 uint32_t pos;
1800 uint32_t posi;
1801 uint32_t pinc;
1802 uint32_t pnext;
1803 uint32_t NSameU;
1804 int32_t Uinc;
1805 int32_t Uinc2;
1806 int32_t Uinc4;
1807 uint32_t DiffUCnt;
1808 uint32_t SameUCnt;
1809 uint32_t U2toU3;
1810
1811 MYFLT *pstrt;
1812 MYFLT *p0r, *p1r, *p2r, *p3r;
1813 MYFLT *u0r, *u0i, *u1r, *u1i, *u2r, *u2i;
1814
1815 MYFLT w0r, w0i, w1r, w1i, w2r, w2i, w3r, w3i;
1816 MYFLT f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i;
1817 MYFLT f4r, f4i, f5r, f5i, f6r, f6i, f7r, f7i;
1818 MYFLT t0r, t0i, t1r, t1i;
1819 const MYFLT Two = FL(2.0);
1820
1821 pinc = NDiffU * 2; /* 2 floats per complex */
1822 pnext = pinc * 8;
1823 pos = pinc * 4;
1824 posi = pos + 1;
1825 NSameU = POW2(M) / 8 / NDiffU; /* 8 pts per butterfly */
1826 Uinc = (int32_t) NSameU * Ustride;
1827 Uinc2 = Uinc * 2;
1828 Uinc4 = Uinc * 4;
1829 U2toU3 = (POW2(M) / 8) * Ustride;
1830 for (; StageCnt > 0; StageCnt--) {
1831
1832 u0r = &Utbl[0];
1833 u0i = &Utbl[POW2(M - 2) * Ustride];
1834 u1r = u0r;
1835 u1i = u0i;
1836 u2r = u0r;
1837 u2i = u0i;
1838
1839 w0r = *u0r;
1840 w0i = *u0i;
1841 w1r = *u1r;
1842 w1i = *u1i;
1843 w2r = *u2r;
1844 w2i = *u2i;
1845 w3r = *(u2r + U2toU3);
1846 w3i = *(u2i - U2toU3);
1847
1848 pstrt = ioptr;
1849
1850 p0r = pstrt;
1851 p1r = pstrt + pinc;
1852 p2r = p1r + pinc;
1853 p3r = p2r + pinc;
1854
1855 /* Butterflys */
1856 /*
1857 f0 - - t0 - - f0 - - f0
1858 f1 - w0- f1 - - f1 - - f1
1859 f2 - - f2 - w1- f2 - - f4
1860 f3 - w0- t1 - iw1- f3 - - f5
1861
1862 f4 - - t0 - - f4 - w2- t0
1863 f5 - w0- f5 - - f5 - w3- t1
1864 f6 - - f6 - w1- f6 - iw2- f6
1865 f7 - w0- t1 - iw1- f7 - iw3- f7
1866 */
1867
1868 for (DiffUCnt = NDiffU; DiffUCnt > 0; DiffUCnt--) {
1869 f0r = *p0r;
1870 f0i = *(p0r + 1);
1871 f1r = *p1r;
1872 f1i = *(p1r + 1);
1873 for (SameUCnt = NSameU - 1; SameUCnt > 0; SameUCnt--) {
1874 f2r = *p2r;
1875 f2i = *(p2r + 1);
1876 f3r = *p3r;
1877 f3i = *(p3r + 1);
1878
1879 t0r = f0r + f1r * w0r - f1i * w0i;
1880 t0i = f0i + f1r * w0i + f1i * w0r;
1881 f1r = f0r * Two - t0r;
1882 f1i = f0i * Two - t0i;
1883
1884 f4r = *(p0r + pos);
1885 f4i = *(p0r + posi);
1886 f5r = *(p1r + pos);
1887 f5i = *(p1r + posi);
1888
1889 f6r = *(p2r + pos);
1890 f6i = *(p2r + posi);
1891 f7r = *(p3r + pos);
1892 f7i = *(p3r + posi);
1893
1894 t1r = f2r - f3r * w0r + f3i * w0i;
1895 t1i = f2i - f3r * w0i - f3i * w0r;
1896 f2r = f2r * Two - t1r;
1897 f2i = f2i * Two - t1i;
1898
1899 f0r = t0r + f2r * w1r - f2i * w1i;
1900 f0i = t0i + f2r * w1i + f2i * w1r;
1901 f2r = t0r * Two - f0r;
1902 f2i = t0i * Two - f0i;
1903
1904 f3r = f1r + t1r * w1i + t1i * w1r;
1905 f3i = f1i - t1r * w1r + t1i * w1i;
1906 f1r = f1r * Two - f3r;
1907 f1i = f1i * Two - f3i;
1908
1909 t0r = f4r + f5r * w0r - f5i * w0i;
1910 t0i = f4i + f5r * w0i + f5i * w0r;
1911 f5r = f4r * Two - t0r;
1912 f5i = f4i * Two - t0i;
1913
1914 t1r = f6r - f7r * w0r + f7i * w0i;
1915 t1i = f6i - f7r * w0i - f7i * w0r;
1916 f6r = f6r * Two - t1r;
1917 f6i = f6i * Two - t1i;
1918
1919 f4r = t0r + f6r * w1r - f6i * w1i;
1920 f4i = t0i + f6r * w1i + f6i * w1r;
1921 f6r = t0r * Two - f4r;
1922 f6i = t0i * Two - f4i;
1923
1924 f7r = f5r + t1r * w1i + t1i * w1r;
1925 f7i = f5i - t1r * w1r + t1i * w1i;
1926 f5r = f5r * Two - f7r;
1927 f5i = f5i * Two - f7i;
1928
1929 t0r = f0r - f4r * w2r + f4i * w2i;
1930 t0i = f0i - f4r * w2i - f4i * w2r;
1931 f0r = f0r * Two - t0r;
1932 f0i = f0i * Two - t0i;
1933
1934 t1r = f1r - f5r * w3r + f5i * w3i;
1935 t1i = f1i - f5r * w3i - f5i * w3r;
1936 f1r = f1r * Two - t1r;
1937 f1i = f1i * Two - t1i;
1938
1939 *(p0r + pos) = t0r;
1940 *(p0r + posi) = t0i;
1941 *p0r = f0r;
1942 *(p0r + 1) = f0i;
1943
1944 p0r += pnext;
1945 f0r = *p0r;
1946 f0i = *(p0r + 1);
1947
1948 *(p1r + pos) = t1r;
1949 *(p1r + posi) = t1i;
1950 *p1r = f1r;
1951 *(p1r + 1) = f1i;
1952
1953 p1r += pnext;
1954
1955 f1r = *p1r;
1956 f1i = *(p1r + 1);
1957
1958 f4r = f2r - f6r * w2i - f6i * w2r;
1959 f4i = f2i + f6r * w2r - f6i * w2i;
1960 f6r = f2r * Two - f4r;
1961 f6i = f2i * Two - f4i;
1962
1963 f5r = f3r - f7r * w3i - f7i * w3r;
1964 f5i = f3i + f7r * w3r - f7i * w3i;
1965 f7r = f3r * Two - f5r;
1966 f7i = f3i * Two - f5i;
1967
1968 *p2r = f4r;
1969 *(p2r + 1) = f4i;
1970 *(p2r + pos) = f6r;
1971 *(p2r + posi) = f6i;
1972
1973 p2r += pnext;
1974
1975 *p3r = f5r;
1976 *(p3r + 1) = f5i;
1977 *(p3r + pos) = f7r;
1978 *(p3r + posi) = f7i;
1979
1980 p3r += pnext;
1981 }
1982
1983 f2r = *p2r;
1984 f2i = *(p2r + 1);
1985 f3r = *p3r;
1986 f3i = *(p3r + 1);
1987
1988 t0r = f0r + f1r * w0r - f1i * w0i;
1989 t0i = f0i + f1r * w0i + f1i * w0r;
1990 f1r = f0r * Two - t0r;
1991 f1i = f0i * Two - t0i;
1992
1993 f4r = *(p0r + pos);
1994 f4i = *(p0r + posi);
1995 f5r = *(p1r + pos);
1996 f5i = *(p1r + posi);
1997
1998 f6r = *(p2r + pos);
1999 f6i = *(p2r + posi);
2000 f7r = *(p3r + pos);
2001 f7i = *(p3r + posi);
2002
2003 t1r = f2r - f3r * w0r + f3i * w0i;
2004 t1i = f2i - f3r * w0i - f3i * w0r;
2005 f2r = f2r * Two - t1r;
2006 f2i = f2i * Two - t1i;
2007
2008 f0r = t0r + f2r * w1r - f2i * w1i;
2009 f0i = t0i + f2r * w1i + f2i * w1r;
2010 f2r = t0r * Two - f0r;
2011 f2i = t0i * Two - f0i;
2012
2013 f3r = f1r + t1r * w1i + t1i * w1r;
2014 f3i = f1i - t1r * w1r + t1i * w1i;
2015 f1r = f1r * Two - f3r;
2016 f1i = f1i * Two - f3i;
2017
2018 if ((int32_t) DiffUCnt == NDiffU / 2)
2019 Uinc4 = -Uinc4;
2020
2021 u0r += Uinc4;
2022 u0i -= Uinc4;
2023 u1r += Uinc2;
2024 u1i -= Uinc2;
2025 u2r += Uinc;
2026 u2i -= Uinc;
2027
2028 pstrt += 2;
2029
2030 t0r = f4r + f5r * w0r - f5i * w0i;
2031 t0i = f4i + f5r * w0i + f5i * w0r;
2032 f5r = f4r * Two - t0r;
2033 f5i = f4i * Two - t0i;
2034
2035 t1r = f6r - f7r * w0r + f7i * w0i;
2036 t1i = f6i - f7r * w0i - f7i * w0r;
2037 f6r = f6r * Two - t1r;
2038 f6i = f6i * Two - t1i;
2039
2040 f4r = t0r + f6r * w1r - f6i * w1i;
2041 f4i = t0i + f6r * w1i + f6i * w1r;
2042 f6r = t0r * Two - f4r;
2043 f6i = t0i * Two - f4i;
2044
2045 f7r = f5r + t1r * w1i + t1i * w1r;
2046 f7i = f5i - t1r * w1r + t1i * w1i;
2047 f5r = f5r * Two - f7r;
2048 f5i = f5i * Two - f7i;
2049
2050 w0r = *u0r;
2051 w0i = *u0i;
2052 w1r = *u1r;
2053 w1i = *u1i;
2054
2055 if ((int32_t) DiffUCnt <= NDiffU / 2)
2056 w0r = -w0r;
2057
2058 t0r = f0r - f4r * w2r + f4i * w2i;
2059 t0i = f0i - f4r * w2i - f4i * w2r;
2060 f0r = f0r * Two - t0r;
2061 f0i = f0i * Two - t0i;
2062
2063 f4r = f2r - f6r * w2i - f6i * w2r;
2064 f4i = f2i + f6r * w2r - f6i * w2i;
2065 f6r = f2r * Two - f4r;
2066 f6i = f2i * Two - f4i;
2067
2068 *(p0r + pos) = t0r;
2069 *p2r = f4r;
2070 *(p0r + posi) = t0i;
2071 *(p2r + 1) = f4i;
2072 w2r = *u2r;
2073 w2i = *u2i;
2074 *p0r = f0r;
2075 *(p2r + pos) = f6r;
2076 *(p0r + 1) = f0i;
2077 *(p2r + posi) = f6i;
2078
2079 p0r = pstrt;
2080 p2r = pstrt + pinc + pinc;
2081
2082 t1r = f1r - f5r * w3r + f5i * w3i;
2083 t1i = f1i - f5r * w3i - f5i * w3r;
2084 f1r = f1r * Two - t1r;
2085 f1i = f1i * Two - t1i;
2086
2087 f5r = f3r - f7r * w3i - f7i * w3r;
2088 f5i = f3i + f7r * w3r - f7i * w3i;
2089 f7r = f3r * Two - f5r;
2090 f7i = f3i * Two - f5i;
2091
2092 *(p1r + pos) = t1r;
2093 *p3r = f5r;
2094 *(p1r + posi) = t1i;
2095 *(p3r + 1) = f5i;
2096 w3r = *(u2r + U2toU3);
2097 w3i = *(u2i - U2toU3);
2098 *p1r = f1r;
2099 *(p3r + pos) = f7r;
2100 *(p1r + 1) = f1i;
2101 *(p3r + posi) = f7i;
2102
2103 p1r = pstrt + pinc;
2104 p3r = p2r + pinc;
2105 }
2106 NSameU /= 8;
2107 Uinc /= 8;
2108 Uinc2 /= 8;
2109 Uinc4 = Uinc * 4;
2110 NDiffU *= 8;
2111 pinc *= 8;
2112 pnext *= 8;
2113 pos *= 8;
2114 posi = pos + 1;
2115 }
2116 }
2117
ifftrecurs(MYFLT * ioptr,int32_t M,MYFLT * Utbl,int32_t Ustride,int32_t NDiffU,int32_t StageCnt)2118 static void ifftrecurs(MYFLT *ioptr, int32_t M, MYFLT *Utbl, int32_t Ustride,
2119 int32_t NDiffU, int32_t StageCnt)
2120 {
2121 /* recursive bfstages calls to maximize on chip cache efficiency */
2122 int32_t i1;
2123
2124 if (M <= (int32_t) MCACHE)
2125 ibfstages(ioptr, M, Utbl, Ustride, NDiffU, StageCnt); /* RADIX 8 Stages */
2126 else {
2127 for (i1 = 0; i1 < 8; i1++) {
2128 ifftrecurs(&ioptr[i1 * POW2(M - 3) * 2], M - 3, Utbl, 8 * Ustride,
2129 NDiffU, StageCnt - 1); /* RADIX 8 Stages */
2130 }
2131 ibfstages(ioptr, M, Utbl, Ustride, POW2(M - 3), 1); /* RADIX 8 Stage */
2132 }
2133 }
2134
iffts1(MYFLT * ioptr,int32_t M,MYFLT * Utbl,int16 * BRLow)2135 static void iffts1(MYFLT *ioptr, int32_t M, MYFLT *Utbl, int16 *BRLow)
2136 {
2137 /* Compute in-place inverse complex fft on the rows of the input array */
2138 /* INPUTS */
2139 /* *ioptr = input data array */
2140 /* M = log2 of fft size */
2141 /* *Utbl = cosine table */
2142 /* *BRLow = bit reversed counter table */
2143 /* OUTPUTS */
2144 /* *ioptr = output data array */
2145
2146 int32_t StageCnt;
2147 int32_t NDiffU;
2148 const MYFLT scale = FL(1.0) / POW2(M);
2149
2150 switch (M) {
2151 case 0:
2152 break;
2153 case 1:
2154 ifft2pt(ioptr, scale); /* a 2 pt fft */
2155 break;
2156 case 2:
2157 ifft4pt(ioptr, scale); /* a 4 pt fft */
2158 break;
2159 case 3:
2160 ifft8pt(ioptr, scale); /* an 8 pt fft */
2161 break;
2162 default:
2163 /* bit reverse and first radix 2 stage */
2164 scbitrevR2(ioptr, M, BRLow, scale);
2165 StageCnt = (M - 1) / 3; /* number of radix 8 stages */
2166 NDiffU = 2; /* one radix 2 stage already complete */
2167 if ((M - 1 - (StageCnt * 3)) == 1) {
2168 ibfR2(ioptr, M, NDiffU); /* 1 radix 2 stage */
2169 NDiffU *= 2;
2170 }
2171 if ((M - 1 - (StageCnt * 3)) == 2) {
2172 ibfR4(ioptr, M, NDiffU); /* 1 radix 4 stage */
2173 NDiffU *= 4;
2174 }
2175 if (M <= (int32_t) MCACHE)
2176 ibfstages(ioptr, M, Utbl, 1, NDiffU, StageCnt); /* RADIX 8 Stages */
2177 else
2178 ifftrecurs(ioptr, M, Utbl, 1, NDiffU, StageCnt); /* RADIX 8 Stages */
2179 }
2180 }
2181
2182 /******************
2183 * parts of rffts1 *
2184 ******************/
2185
rfft1pt(MYFLT * ioptr)2186 static void rfft1pt(MYFLT *ioptr)
2187 {
2188 /*** RADIX 2 rfft ***/
2189 MYFLT f0r, f0i;
2190 MYFLT t0r, t0i;
2191
2192 /* bit reversed load */
2193 f0r = ioptr[0];
2194 f0i = ioptr[1];
2195
2196 /* finish rfft */
2197 t0r = f0r + f0i;
2198 t0i = f0r - f0i;
2199
2200 /* store result */
2201 ioptr[0] = t0r;
2202 ioptr[1] = t0i;
2203 }
2204
rfft2pt(MYFLT * ioptr)2205 static void rfft2pt(MYFLT *ioptr)
2206 {
2207 /*** RADIX 4 rfft ***/
2208 MYFLT f0r, f0i, f1r, f1i;
2209 MYFLT t0r, t0i;
2210
2211 /* bit reversed load */
2212 f0r = ioptr[0];
2213 f0i = ioptr[1];
2214 f1r = ioptr[2];
2215 f1i = ioptr[3];
2216
2217 /* Butterflys */
2218 /*
2219 f0 - - t0
2220 f1 - 1 - f1
2221 */
2222
2223 t0r = f0r + f1r;
2224 t0i = f0i + f1i;
2225 f1r = f0r - f1r;
2226 f1i = f1i - f0i;
2227 /* finish rfft */
2228 f0r = t0r + t0i;
2229 f0i = t0r - t0i;
2230
2231 /* store result */
2232 ioptr[0] = f0r;
2233 ioptr[1] = f0i;
2234 ioptr[2] = f1r;
2235 ioptr[3] = f1i;
2236 }
2237
rfft4pt(MYFLT * ioptr)2238 static void rfft4pt(MYFLT *ioptr)
2239 {
2240 /*** RADIX 8 rfft ***/
2241 MYFLT f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i;
2242 MYFLT t0r, t0i, t1r, t1i;
2243 MYFLT w0r = 1.0 / ROOT2; /* cos(pi/4) */
2244 const MYFLT Two = FL(2.0);
2245 const MYFLT scale = FL(0.5);
2246
2247 /* bit reversed load */
2248 f0r = ioptr[0];
2249 f0i = ioptr[1];
2250 f1r = ioptr[4];
2251 f1i = ioptr[5];
2252 f2r = ioptr[2];
2253 f2i = ioptr[3];
2254 f3r = ioptr[6];
2255 f3i = ioptr[7];
2256
2257 /* Butterflys */
2258 /*
2259 f0 - - t0 - - f0
2260 f1 - 1 - f1 - - f1
2261 f2 - - f2 - 1 - f2
2262 f3 - 1 - t1 - -i - f3
2263 */
2264
2265 t0r = f0r + f1r;
2266 t0i = f0i + f1i;
2267 f1r = f0r - f1r;
2268 f1i = f0i - f1i;
2269
2270 t1r = f2r - f3r;
2271 t1i = f2i - f3i;
2272 f2r = f2r + f3r;
2273 f2i = f2i + f3i;
2274
2275 f0r = t0r + f2r;
2276 f0i = t0i + f2i;
2277 f2r = t0r - f2r;
2278 f2i = f2i - t0i; /* neg for rfft */
2279
2280 f3r = f1r - t1i;
2281 f3i = f1i + t1r;
2282 f1r = f1r + t1i;
2283 f1i = f1i - t1r;
2284
2285 /* finish rfft */
2286 t0r = f0r + f0i; /* compute Re(x[0]) */
2287 t0i = f0r - f0i; /* compute Re(x[N/2]) */
2288
2289 t1r = f1r + f3r;
2290 t1i = f1i - f3i;
2291 f0r = f1i + f3i;
2292 f0i = f3r - f1r;
2293
2294 f1r = t1r + w0r * f0r + w0r * f0i;
2295 f1i = t1i - w0r * f0r + w0r * f0i;
2296 f3r = Two * t1r - f1r;
2297 f3i = f1i - Two * t1i;
2298
2299 /* store result */
2300 ioptr[4] = f2r;
2301 ioptr[5] = f2i;
2302 ioptr[0] = t0r;
2303 ioptr[1] = t0i;
2304
2305 ioptr[2] = scale * f1r;
2306 ioptr[3] = scale * f1i;
2307 ioptr[6] = scale * f3r;
2308 ioptr[7] = scale * f3i;
2309 }
2310
rfft8pt(MYFLT * ioptr)2311 static void rfft8pt(MYFLT *ioptr)
2312 {
2313 /*** RADIX 16 rfft ***/
2314 MYFLT w0r = 1.0 / ROOT2; /* cos(pi/4) */
2315 MYFLT w1r = MYCOSPID8; /* cos(pi/8) */
2316 MYFLT w1i = MYSINPID8; /* sin(pi/8) */
2317 MYFLT f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i;
2318 MYFLT f4r, f4i, f5r, f5i, f6r, f6i, f7r, f7i;
2319 MYFLT t0r, t0i, t1r, t1i;
2320 const MYFLT Two = FL(2.0);
2321 const MYFLT scale = FL(0.5);
2322
2323 /* bit reversed load */
2324 f0r = ioptr[0];
2325 f0i = ioptr[1];
2326 f1r = ioptr[8];
2327 f1i = ioptr[9];
2328 f2r = ioptr[4];
2329 f2i = ioptr[5];
2330 f3r = ioptr[12];
2331 f3i = ioptr[13];
2332 f4r = ioptr[2];
2333 f4i = ioptr[3];
2334 f5r = ioptr[10];
2335 f5i = ioptr[11];
2336 f6r = ioptr[6];
2337 f6i = ioptr[7];
2338 f7r = ioptr[14];
2339 f7i = ioptr[15];
2340 /* Butterflys */
2341 /*
2342 f0 - - t0 - - f0 - - f0
2343 f1 - 1 - f1 - - f1 - - f1
2344 f2 - - f2 - 1 - f2 - - f2
2345 f3 - 1 - t1 - -i - f3 - - f3
2346 f4 - - t0 - - f4 - 1 - t0
2347 f5 - 1 - f5 - - f5 - w3 - f4
2348 f6 - - f6 - 1 - f6 - -i - t1
2349 f7 - 1 - t1 - -i - f7 - iw3- f6
2350 */
2351
2352 t0r = f0r + f1r;
2353 t0i = f0i + f1i;
2354 f1r = f0r - f1r;
2355 f1i = f0i - f1i;
2356
2357 t1r = f2r - f3r;
2358 t1i = f2i - f3i;
2359 f2r = f2r + f3r;
2360 f2i = f2i + f3i;
2361
2362 f0r = t0r + f2r;
2363 f0i = t0i + f2i;
2364 f2r = t0r - f2r;
2365 f2i = t0i - f2i;
2366
2367 f3r = f1r - t1i;
2368 f3i = f1i + t1r;
2369 f1r = f1r + t1i;
2370 f1i = f1i - t1r;
2371
2372 t0r = f4r + f5r;
2373 t0i = f4i + f5i;
2374 f5r = f4r - f5r;
2375 f5i = f4i - f5i;
2376
2377 t1r = f6r - f7r;
2378 t1i = f6i - f7i;
2379 f6r = f6r + f7r;
2380 f6i = f6i + f7i;
2381
2382 f4r = t0r + f6r;
2383 f4i = t0i + f6i;
2384 f6r = t0r - f6r;
2385 f6i = t0i - f6i;
2386
2387 f7r = f5r - t1i;
2388 f7i = f5i + t1r;
2389 f5r = f5r + t1i;
2390 f5i = f5i - t1r;
2391
2392 t0r = f0r - f4r;
2393 t0i = f4i - f0i; /* neg for rfft */
2394 f0r = f0r + f4r;
2395 f0i = f0i + f4i;
2396
2397 t1r = f2r - f6i;
2398 t1i = f2i + f6r;
2399 f2r = f2r + f6i;
2400 f2i = f2i - f6r;
2401
2402 f4r = f1r - f5r * w0r - f5i * w0r;
2403 f4i = f1i + f5r * w0r - f5i * w0r;
2404 f1r = f1r * Two - f4r;
2405 f1i = f1i * Two - f4i;
2406
2407 f6r = f3r + f7r * w0r - f7i * w0r;
2408 f6i = f3i + f7r * w0r + f7i * w0r;
2409 f3r = f3r * Two - f6r;
2410 f3i = f3i * Two - f6i;
2411
2412 /* finish rfft */
2413 f5r = f0r + f0i; /* compute Re(x[0]) */
2414 f5i = f0r - f0i; /* compute Re(x[N/2]) */
2415
2416 f0r = f2r + t1r;
2417 f0i = f2i - t1i;
2418 f7r = f2i + t1i;
2419 f7i = t1r - f2r;
2420
2421 f2r = f0r + w0r * f7r + w0r * f7i;
2422 f2i = f0i - w0r * f7r + w0r * f7i;
2423 t1r = Two * f0r - f2r;
2424 t1i = f2i - Two * f0i;
2425
2426 f0r = f1r + f6r;
2427 f0i = f1i - f6i;
2428 f7r = f1i + f6i;
2429 f7i = f6r - f1r;
2430
2431 f1r = f0r + w1r * f7r + w1i * f7i;
2432 f1i = f0i - w1i * f7r + w1r * f7i;
2433 f6r = Two * f0r - f1r;
2434 f6i = f1i - Two * f0i;
2435
2436 f0r = f3r + f4r;
2437 f0i = f3i - f4i;
2438 f7r = f3i + f4i;
2439 f7i = f4r - f3r;
2440
2441 f3r = f0r + w1i * f7r + w1r * f7i;
2442 f3i = f0i - w1r * f7r + w1i * f7i;
2443 f4r = Two * f0r - f3r;
2444 f4i = f3i - Two * f0i;
2445
2446 /* store result */
2447 ioptr[8] = t0r;
2448 ioptr[9] = t0i;
2449 ioptr[0] = f5r;
2450 ioptr[1] = f5i;
2451
2452 ioptr[4] = scale * f2r;
2453 ioptr[5] = scale * f2i;
2454 ioptr[12] = scale * t1r;
2455 ioptr[13] = scale * t1i;
2456
2457 ioptr[2] = scale * f1r;
2458 ioptr[3] = scale * f1i;
2459 ioptr[6] = scale * f3r;
2460 ioptr[7] = scale * f3i;
2461 ioptr[10] = scale * f4r;
2462 ioptr[11] = scale * f4i;
2463 ioptr[14] = scale * f6r;
2464 ioptr[15] = scale * f6i;
2465 }
2466
frstage(MYFLT * ioptr,int32_t M,MYFLT * Utbl)2467 static void frstage(MYFLT *ioptr, int32_t M, MYFLT *Utbl)
2468 {
2469 /* Finish RFFT */
2470
2471 uint32_t pos;
2472 uint32_t posi;
2473 uint32_t diffUcnt;
2474
2475 MYFLT *p0r, *p1r;
2476 MYFLT *u0r, *u0i;
2477
2478 MYFLT w0r, w0i;
2479 MYFLT f0r, f0i, f1r, f1i, f4r, f4i, f5r, f5i;
2480 MYFLT t0r, t0i, t1r, t1i;
2481 const MYFLT Two = FL(2.0);
2482
2483 pos = POW2(M - 1);
2484 posi = pos + 1;
2485
2486 p0r = ioptr;
2487 p1r = ioptr + pos / 2;
2488
2489 u0r = Utbl + POW2(M - 3);
2490
2491 w0r = *u0r, f0r = *(p0r);
2492 f0i = *(p0r + 1);
2493 f4r = *(p0r + pos);
2494 f4i = *(p0r + posi);
2495 f1r = *(p1r);
2496 f1i = *(p1r + 1);
2497 f5r = *(p1r + pos);
2498 f5i = *(p1r + posi);
2499
2500 t0r = Two * f0r + Two * f0i; /* compute Re(x[0]) */
2501 t0i = Two * f0r - Two * f0i; /* compute Re(x[N/2]) */
2502 t1r = f4r + f4r;
2503 t1i = -f4i - f4i;
2504
2505 f0r = f1r + f5r;
2506 f0i = f1i - f5i;
2507 f4r = f1i + f5i;
2508 f4i = f5r - f1r;
2509
2510 f1r = f0r + w0r * f4r + w0r * f4i;
2511 f1i = f0i - w0r * f4r + w0r * f4i;
2512 f5r = Two * f0r - f1r;
2513 f5i = f1i - Two * f0i;
2514
2515 *(p0r) = t0r;
2516 *(p0r + 1) = t0i;
2517 *(p0r + pos) = t1r;
2518 *(p0r + posi) = t1i;
2519 *(p1r) = f1r;
2520 *(p1r + 1) = f1i;
2521 *(p1r + pos) = f5r;
2522 *(p1r + posi) = f5i;
2523
2524 u0r = Utbl + 1;
2525 u0i = Utbl + (POW2(M - 2) - 1);
2526
2527 w0r = *u0r, w0i = *u0i;
2528
2529 p0r = (ioptr + 2);
2530 p1r = (ioptr + (POW2(M - 2) - 1) * 2);
2531
2532 /* Butterflys */
2533 /*
2534 f0 - t0 - - f0
2535 f5 - t1 - w0 - f5
2536
2537 f1 - t0 - - f1
2538 f4 - t1 -iw0 - f4
2539 */
2540
2541 for (diffUcnt = POW2(M - 3) - 1; diffUcnt > 0; diffUcnt--) {
2542
2543 f0r = *(p0r);
2544 f0i = *(p0r + 1);
2545 f5r = *(p1r + pos);
2546 f5i = *(p1r + posi);
2547 f1r = *(p1r);
2548 f1i = *(p1r + 1);
2549 f4r = *(p0r + pos);
2550 f4i = *(p0r + posi);
2551
2552 t0r = f0r + f5r;
2553 t0i = f0i - f5i;
2554 t1r = f0i + f5i;
2555 t1i = f5r - f0r;
2556
2557 f0r = t0r + w0r * t1r + w0i * t1i;
2558 f0i = t0i - w0i * t1r + w0r * t1i;
2559 f5r = Two * t0r - f0r;
2560 f5i = f0i - Two * t0i;
2561
2562 t0r = f1r + f4r;
2563 t0i = f1i - f4i;
2564 t1r = f1i + f4i;
2565 t1i = f4r - f1r;
2566
2567 f1r = t0r + w0i * t1r + w0r * t1i;
2568 f1i = t0i - w0r * t1r + w0i * t1i;
2569 f4r = Two * t0r - f1r;
2570 f4i = f1i - Two * t0i;
2571
2572 *(p0r) = f0r;
2573 *(p0r + 1) = f0i;
2574 *(p1r + pos) = f5r;
2575 *(p1r + posi) = f5i;
2576
2577 w0r = *++u0r;
2578 w0i = *--u0i;
2579
2580 *(p1r) = f1r;
2581 *(p1r + 1) = f1i;
2582 *(p0r + pos) = f4r;
2583 *(p0r + posi) = f4i;
2584
2585 p0r += 2;
2586 p1r -= 2;
2587 }
2588 }
2589
rffts1(MYFLT * ioptr,int32_t M,MYFLT * Utbl,int16 * BRLow)2590 static void rffts1(MYFLT *ioptr, int32_t M, MYFLT *Utbl, int16 *BRLow)
2591 {
2592 /* Compute in-place real fft on the rows of the input array */
2593 /* The result is the complex spectra of the positive frequencies */
2594 /* except the location for the first complex number contains the real */
2595 /* values for DC and Nyquest */
2596 /* INPUTS */
2597 /* *ioptr = real input data array */
2598 /* M = log2 of fft size */
2599 /* *Utbl = cosine table */
2600 /* *BRLow = bit reversed counter table */
2601 /* OUTPUTS */
2602 /* *ioptr = output data array in the following order */
2603 /* Re(x[0]), Re(x[N/2]), Re(x[1]), Im(x[1]), Re(x[2]), Im(x[2]), */
2604 /* ... Re(x[N/2-1]), Im(x[N/2-1]). */
2605
2606 MYFLT scale;
2607 int32_t StageCnt;
2608 int32_t NDiffU;
2609
2610 M = M - 1;
2611 switch (M) {
2612 case -1:
2613 break;
2614 case 0:
2615 rfft1pt(ioptr); /* a 2 pt fft */
2616 break;
2617 case 1:
2618 rfft2pt(ioptr); /* a 4 pt fft */
2619 break;
2620 case 2:
2621 rfft4pt(ioptr); /* an 8 pt fft */
2622 break;
2623 case 3:
2624 rfft8pt(ioptr); /* a 16 pt fft */
2625 break;
2626 default:
2627 scale = 0.5;
2628 /* bit reverse and first radix 2 stage */
2629 scbitrevR2(ioptr, M, BRLow, scale);
2630 StageCnt = (M - 1) / 3; /* number of radix 8 stages */
2631 NDiffU = 2; /* one radix 2 stage already complete */
2632 if ((M - 1 - (StageCnt * 3)) == 1) {
2633 bfR2(ioptr, M, NDiffU); /* 1 radix 2 stage */
2634 NDiffU *= 2;
2635 }
2636 if ((M - 1 - (StageCnt * 3)) == 2) {
2637 bfR4(ioptr, M, NDiffU); /* 1 radix 4 stage */
2638 NDiffU *= 4;
2639 }
2640 if (M <= (int32_t) MCACHE)
2641 bfstages(ioptr, M, Utbl, 2, NDiffU, StageCnt); /* RADIX 8 Stages */
2642 else
2643 fftrecurs(ioptr, M, Utbl, 2, NDiffU, StageCnt); /* RADIX 8 Stages */
2644 frstage(ioptr, M + 1, Utbl);
2645 }
2646 }
2647
2648 /*******************
2649 * parts of riffts1 *
2650 *******************/
2651
rifft1pt(MYFLT * ioptr,MYFLT scale)2652 static void rifft1pt(MYFLT *ioptr, MYFLT scale)
2653 {
2654 /*** RADIX 2 rifft ***/
2655 MYFLT f0r, f0i;
2656 MYFLT t0r, t0i;
2657
2658 /* bit reversed load */
2659 f0r = ioptr[0];
2660 f0i = ioptr[1];
2661
2662 /* finish rfft */
2663 t0r = f0r + f0i;
2664 t0i = f0r - f0i;
2665
2666 /* store result */
2667 ioptr[0] = scale * t0r;
2668 ioptr[1] = scale * t0i;
2669 }
2670
rifft2pt(MYFLT * ioptr,MYFLT scale)2671 static void rifft2pt(MYFLT *ioptr, MYFLT scale)
2672 {
2673 /*** RADIX 4 rifft ***/
2674 MYFLT f0r, f0i, f1r, f1i;
2675 MYFLT t0r, t0i;
2676 const MYFLT Two = FL(2.0);
2677
2678 /* bit reversed load */
2679 t0r = ioptr[0];
2680 t0i = ioptr[1];
2681 f1r = Two * ioptr[2];
2682 f1i = Two * ioptr[3];
2683
2684 /* start rifft */
2685 f0r = t0r + t0i;
2686 f0i = t0r - t0i;
2687 /* Butterflys */
2688 /*
2689 f0 - - t0
2690 f1 - 1 - f1
2691 */
2692
2693 t0r = f0r + f1r;
2694 t0i = f0i - f1i;
2695 f1r = f0r - f1r;
2696 f1i = f0i + f1i;
2697
2698 /* store result */
2699 ioptr[0] = scale * t0r;
2700 ioptr[1] = scale * t0i;
2701 ioptr[2] = scale * f1r;
2702 ioptr[3] = scale * f1i;
2703 }
2704
rifft4pt(MYFLT * ioptr,MYFLT scale)2705 static void rifft4pt(MYFLT *ioptr, MYFLT scale)
2706 {
2707 /*** RADIX 8 rifft ***/
2708 MYFLT f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i;
2709 MYFLT t0r, t0i, t1r, t1i;
2710 MYFLT w0r = 1.0 / ROOT2; /* cos(pi/4) */
2711 const MYFLT Two = FL(2.0);
2712
2713 /* bit reversed load */
2714 t0r = ioptr[0];
2715 t0i = ioptr[1];
2716 f2r = ioptr[2];
2717 f2i = ioptr[3];
2718 f1r = Two * ioptr[4];
2719 f1i = Two * ioptr[5];
2720 f3r = ioptr[6];
2721 f3i = ioptr[7];
2722
2723 /* start rfft */
2724 f0r = t0r + t0i; /* compute Re(x[0]) */
2725 f0i = t0r - t0i; /* compute Re(x[N/2]) */
2726
2727 t1r = f2r + f3r;
2728 t1i = f2i - f3i;
2729 t0r = f2r - f3r;
2730 t0i = f2i + f3i;
2731
2732 f2r = t1r - w0r * t0r - w0r * t0i;
2733 f2i = t1i + w0r * t0r - w0r * t0i;
2734 f3r = Two * t1r - f2r;
2735 f3i = f2i - Two * t1i;
2736
2737 /* Butterflys */
2738 /*
2739 f0 - - t0 - - f0
2740 f1 - 1 - f1 - - f1
2741 f2 - - f2 - 1 - f2
2742 f3 - 1 - t1 - i - f3
2743 */
2744
2745 t0r = f0r + f1r;
2746 t0i = f0i - f1i;
2747 f1r = f0r - f1r;
2748 f1i = f0i + f1i;
2749
2750 t1r = f2r - f3r;
2751 t1i = f2i - f3i;
2752 f2r = f2r + f3r;
2753 f2i = f2i + f3i;
2754
2755 f0r = t0r + f2r;
2756 f0i = t0i + f2i;
2757 f2r = t0r - f2r;
2758 f2i = t0i - f2i;
2759
2760 f3r = f1r + t1i;
2761 f3i = f1i - t1r;
2762 f1r = f1r - t1i;
2763 f1i = f1i + t1r;
2764
2765 /* store result */
2766 ioptr[0] = scale * f0r;
2767 ioptr[1] = scale * f0i;
2768 ioptr[2] = scale * f1r;
2769 ioptr[3] = scale * f1i;
2770 ioptr[4] = scale * f2r;
2771 ioptr[5] = scale * f2i;
2772 ioptr[6] = scale * f3r;
2773 ioptr[7] = scale * f3i;
2774 }
2775
rifft8pt(MYFLT * ioptr,MYFLT scale)2776 static void rifft8pt(MYFLT *ioptr, MYFLT scale)
2777 {
2778 /*** RADIX 16 rifft ***/
2779 MYFLT w0r = (MYFLT) (1.0 / ROOT2); /* cos(pi/4) */
2780 MYFLT w1r = MYCOSPID8; /* cos(pi/8) */
2781 MYFLT w1i = MYSINPID8; /* sin(pi/8) */
2782 MYFLT f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i;
2783 MYFLT f4r, f4i, f5r, f5i, f6r, f6i, f7r, f7i;
2784 MYFLT t0r, t0i, t1r, t1i;
2785 const MYFLT Two = FL(2.0);
2786
2787 /* bit reversed load */
2788 t0r = ioptr[0];
2789 t0i = ioptr[1];
2790 f4r = ioptr[2];
2791 f4i = ioptr[3];
2792 f2r = ioptr[4];
2793 f2i = ioptr[5];
2794 f6r = ioptr[6];
2795 f6i = ioptr[7];
2796 f1r = Two * ioptr[8];
2797 f1i = Two * ioptr[9];
2798 f5r = ioptr[10];
2799 f5i = ioptr[11];
2800 f3r = ioptr[12];
2801 f3i = ioptr[13];
2802 f7r = ioptr[14];
2803 f7i = ioptr[15];
2804
2805 /* start rfft */
2806 f0r = t0r + t0i; /* compute Re(x[0]) */
2807 f0i = t0r - t0i; /* compute Re(x[N/2]) */
2808
2809 t0r = f2r + f3r;
2810 t0i = f2i - f3i;
2811 t1r = f2r - f3r;
2812 t1i = f2i + f3i;
2813
2814 f2r = t0r - w0r * t1r - w0r * t1i;
2815 f2i = t0i + w0r * t1r - w0r * t1i;
2816 f3r = Two * t0r - f2r;
2817 f3i = f2i - Two * t0i;
2818
2819 t0r = f4r + f7r;
2820 t0i = f4i - f7i;
2821 t1r = f4r - f7r;
2822 t1i = f4i + f7i;
2823
2824 f4r = t0r - w1i * t1r - w1r * t1i;
2825 f4i = t0i + w1r * t1r - w1i * t1i;
2826 f7r = Two * t0r - f4r;
2827 f7i = f4i - Two * t0i;
2828
2829 t0r = f6r + f5r;
2830 t0i = f6i - f5i;
2831 t1r = f6r - f5r;
2832 t1i = f6i + f5i;
2833
2834 f6r = t0r - w1r * t1r - w1i * t1i;
2835 f6i = t0i + w1i * t1r - w1r * t1i;
2836 f5r = Two * t0r - f6r;
2837 f5i = f6i - Two * t0i;
2838
2839 /* Butterflys */
2840 /*
2841 f0 - - t0 - - f0 - - f0
2842 f1* - 1 - f1 - - f1 - - f1
2843 f2 - - f2 - 1 - f2 - - f2
2844 f3 - 1 - t1 - i - f3 - - f3
2845 f4 - - t0 - - f4 - 1 - t0
2846 f5 - 1 - f5 - - f5 - w3 - f4
2847 f6 - - f6 - 1 - f6 - i - t1
2848 f7 - 1 - t1 - i - f7 - iw3- f6
2849 */
2850
2851 t0r = f0r + f1r;
2852 t0i = f0i - f1i;
2853 f1r = f0r - f1r;
2854 f1i = f0i + f1i;
2855
2856 t1r = f2r - f3r;
2857 t1i = f2i - f3i;
2858 f2r = f2r + f3r;
2859 f2i = f2i + f3i;
2860
2861 f0r = t0r + f2r;
2862 f0i = t0i + f2i;
2863 f2r = t0r - f2r;
2864 f2i = t0i - f2i;
2865
2866 f3r = f1r + t1i;
2867 f3i = f1i - t1r;
2868 f1r = f1r - t1i;
2869 f1i = f1i + t1r;
2870
2871 t0r = f4r + f5r;
2872 t0i = f4i + f5i;
2873 f5r = f4r - f5r;
2874 f5i = f4i - f5i;
2875
2876 t1r = f6r - f7r;
2877 t1i = f6i - f7i;
2878 f6r = f6r + f7r;
2879 f6i = f6i + f7i;
2880
2881 f4r = t0r + f6r;
2882 f4i = t0i + f6i;
2883 f6r = t0r - f6r;
2884 f6i = t0i - f6i;
2885
2886 f7r = f5r + t1i;
2887 f7i = f5i - t1r;
2888 f5r = f5r - t1i;
2889 f5i = f5i + t1r;
2890
2891 t0r = f0r - f4r;
2892 t0i = f0i - f4i;
2893 f0r = f0r + f4r;
2894 f0i = f0i + f4i;
2895
2896 t1r = f2r + f6i;
2897 t1i = f2i - f6r;
2898 f2r = f2r - f6i;
2899 f2i = f2i + f6r;
2900
2901 f4r = f1r - f5r * w0r + f5i * w0r;
2902 f4i = f1i - f5r * w0r - f5i * w0r;
2903 f1r = f1r * Two - f4r;
2904 f1i = f1i * Two - f4i;
2905
2906 f6r = f3r + f7r * w0r + f7i * w0r;
2907 f6i = f3i - f7r * w0r + f7i * w0r;
2908 f3r = f3r * Two - f6r;
2909 f3i = f3i * Two - f6i;
2910
2911 /* store result */
2912 ioptr[0] = scale * f0r;
2913 ioptr[1] = scale * f0i;
2914 ioptr[2] = scale * f1r;
2915 ioptr[3] = scale * f1i;
2916 ioptr[4] = scale * f2r;
2917 ioptr[5] = scale * f2i;
2918 ioptr[6] = scale * f3r;
2919 ioptr[7] = scale * f3i;
2920 ioptr[8] = scale * t0r;
2921 ioptr[9] = scale * t0i;
2922 ioptr[10] = scale * f4r;
2923 ioptr[11] = scale * f4i;
2924 ioptr[12] = scale * t1r;
2925 ioptr[13] = scale * t1i;
2926 ioptr[14] = scale * f6r;
2927 ioptr[15] = scale * f6i;
2928 }
2929
ifrstage(MYFLT * ioptr,int32_t M,MYFLT * Utbl)2930 static void ifrstage(MYFLT *ioptr, int32_t M, MYFLT *Utbl)
2931 {
2932 /* Start RIFFT */
2933
2934 uint32_t pos;
2935 uint32_t posi;
2936 uint32_t diffUcnt;
2937
2938 MYFLT *p0r, *p1r;
2939 MYFLT *u0r, *u0i;
2940
2941 MYFLT w0r, w0i;
2942 MYFLT f0r, f0i, f1r, f1i, f4r, f4i, f5r, f5i;
2943 MYFLT t0r, t0i, t1r, t1i;
2944 const MYFLT Two = FL(2.0);
2945
2946 pos = POW2(M - 1);
2947 posi = pos + 1;
2948
2949 p0r = ioptr;
2950 p1r = ioptr + pos / 2;
2951
2952 u0r = Utbl + POW2(M - 3);
2953
2954 w0r = *u0r, f0r = *(p0r);
2955 f0i = *(p0r + 1);
2956 f4r = *(p0r + pos);
2957 f4i = *(p0r + posi);
2958 f1r = *(p1r);
2959 f1i = *(p1r + 1);
2960 f5r = *(p1r + pos);
2961 f5i = *(p1r + posi);
2962
2963 t0r = f0r + f0i;
2964 t0i = f0r - f0i;
2965 t1r = f4r + f4r;
2966 t1i = -f4i - f4i;
2967
2968 f0r = f1r + f5r;
2969 f0i = f1i - f5i;
2970 f4r = f1r - f5r;
2971 f4i = f1i + f5i;
2972
2973 f1r = f0r - w0r * f4r - w0r * f4i;
2974 f1i = f0i + w0r * f4r - w0r * f4i;
2975 f5r = Two * f0r - f1r;
2976 f5i = f1i - Two * f0i;
2977
2978 *(p0r) = t0r;
2979 *(p0r + 1) = t0i;
2980 *(p0r + pos) = t1r;
2981 *(p0r + posi) = t1i;
2982 *(p1r) = f1r;
2983 *(p1r + 1) = f1i;
2984 *(p1r + pos) = f5r;
2985 *(p1r + posi) = f5i;
2986
2987 u0r = Utbl + 1;
2988 u0i = Utbl + (POW2(M - 2) - 1);
2989
2990 w0r = *u0r, w0i = *u0i;
2991
2992 p0r = (ioptr + 2);
2993 p1r = (ioptr + (POW2(M - 2) - 1) * 2);
2994
2995 /* Butterflys */
2996 /*
2997 f0 - t0 - f0
2998 f1 - t1 -w0- f1
2999
3000 f2 - t0 - f2
3001 f3 - t1 -iw0- f3
3002 */
3003
3004 for (diffUcnt = POW2(M - 3) - 1; diffUcnt > 0; diffUcnt--) {
3005
3006 f0r = *(p0r);
3007 f0i = *(p0r + 1);
3008 f5r = *(p1r + pos);
3009 f5i = *(p1r + posi);
3010 f1r = *(p1r);
3011 f1i = *(p1r + 1);
3012 f4r = *(p0r + pos);
3013 f4i = *(p0r + posi);
3014
3015 t0r = f0r + f5r;
3016 t0i = f0i - f5i;
3017 t1r = f0r - f5r;
3018 t1i = f0i + f5i;
3019
3020 f0r = t0r - w0i * t1r - w0r * t1i;
3021 f0i = t0i + w0r * t1r - w0i * t1i;
3022 f5r = Two * t0r - f0r;
3023 f5i = f0i - Two * t0i;
3024
3025 t0r = f1r + f4r;
3026 t0i = f1i - f4i;
3027 t1r = f1r - f4r;
3028 t1i = f1i + f4i;
3029
3030 f1r = t0r - w0r * t1r - w0i * t1i;
3031 f1i = t0i + w0i * t1r - w0r * t1i;
3032 f4r = Two * t0r - f1r;
3033 f4i = f1i - Two * t0i;
3034
3035 *(p0r) = f0r;
3036 *(p0r + 1) = f0i;
3037 *(p1r + pos) = f5r;
3038 *(p1r + posi) = f5i;
3039
3040 w0r = *++u0r;
3041 w0i = *--u0i;
3042
3043 *(p1r) = f1r;
3044 *(p1r + 1) = f1i;
3045 *(p0r + pos) = f4r;
3046 *(p0r + posi) = f4i;
3047
3048 p0r += 2;
3049 p1r -= 2;
3050 }
3051 }
3052
riffts1(MYFLT * ioptr,int32_t M,MYFLT * Utbl,int16 * BRLow)3053 static void riffts1(MYFLT *ioptr, int32_t M, MYFLT *Utbl, int16 *BRLow)
3054 {
3055 /* Compute in-place real ifft on the rows of the input array */
3056 /* data order as from rffts1 */
3057 /* INPUTS */
3058 /* *ioptr = input data array in the following order */
3059 /* M = log2 of fft size */
3060 /* Re(x[0]), Re(x[N/2]), Re(x[1]), Im(x[1]), */
3061 /* Re(x[2]), Im(x[2]), ... Re(x[N/2-1]), Im(x[N/2-1]). */
3062 /* *Utbl = cosine table */
3063 /* *BRLow = bit reversed counter table */
3064 /* OUTPUTS */
3065 /* *ioptr = real output data array */
3066
3067 MYFLT scale;
3068 int32_t StageCnt;
3069 int32_t NDiffU;
3070
3071 scale = (MYFLT)(1.0 / (double)((int32_t)POW2(M)));
3072 M = M - 1;
3073 switch (M) {
3074 case -1:
3075 break;
3076 case 0:
3077 rifft1pt(ioptr, scale); /* a 2 pt fft */
3078 break;
3079 case 1:
3080 rifft2pt(ioptr, scale); /* a 4 pt fft */
3081 break;
3082 case 2:
3083 rifft4pt(ioptr, scale); /* an 8 pt fft */
3084 break;
3085 case 3:
3086 rifft8pt(ioptr, scale); /* a 16 pt fft */
3087 break;
3088 default:
3089 ifrstage(ioptr, M + 1, Utbl);
3090 /* bit reverse and first radix 2 stage */
3091 scbitrevR2(ioptr, M, BRLow, scale);
3092 StageCnt = (M - 1) / 3; /* number of radix 8 stages */
3093 NDiffU = 2; /* one radix 2 stage already complete */
3094 if ((M - 1 - (StageCnt * 3)) == 1) {
3095 ibfR2(ioptr, M, NDiffU); /* 1 radix 2 stage */
3096 NDiffU *= 2;
3097 }
3098 if ((M - 1 - (StageCnt * 3)) == 2) {
3099 ibfR4(ioptr, M, NDiffU); /* 1 radix 4 stage */
3100 NDiffU *= 4;
3101 }
3102 if (M <= (int32_t) MCACHE)
3103 ibfstages(ioptr, M, Utbl, 2, NDiffU, StageCnt); /* RADIX 8 Stages */
3104 else
3105 ifftrecurs(ioptr, M, Utbl, 2, NDiffU, StageCnt); /* RADIX 8 Stages */
3106 }
3107 }
3108
3109
fftInit(CSOUND * csound,int32_t M)3110 static void fftInit(CSOUND *csound, int32_t M)
3111 {
3112 /* malloc and init cosine and bit reversed tables for a given size */
3113 /* fft, ifft, rfft, rifft */
3114 /* INPUTS */
3115 /* M = log2 of fft size (ex M=10 for 1024 point fft) */
3116 /* OUTPUTS */
3117 /* private cosine and bit reversed tables */
3118
3119 MYFLT **UtblArray;
3120 int16 **BRLowArray;
3121 int32_t i;
3122
3123 if (!csound->FFT_max_size) {
3124 if (csound->FFT_table_1 == NULL)
3125 csound->FFT_table_1 = csound->Malloc(csound, sizeof(MYFLT*) * 32);
3126 if (csound->FFT_table_2 == NULL)
3127 csound->FFT_table_2 = csound->Malloc(csound, sizeof(int16*) * 32);
3128 for (i = 0; i < 32; i++) {
3129 ((MYFLT**) csound->FFT_table_1)[i] = (MYFLT*) NULL;
3130 ((int16**) csound->FFT_table_2)[i] = (int16*) NULL;
3131 }
3132 }
3133 UtblArray = (MYFLT**) csound->FFT_table_1;
3134 BRLowArray = (int16**) csound->FFT_table_2;
3135
3136 /*** I did NOT test cases with M>27 ***/
3137 /* init cos table */
3138 UtblArray[M] = (MYFLT*) csound->Malloc(csound,
3139 (POW2(M) / 4 + 1) * sizeof(MYFLT));
3140 fftCosInit(M, UtblArray[M]);
3141 if (M > 1) {
3142 /* init bit reversed table for cmplx FFT */
3143 if (BRLowArray[M / 2] == NULL) {
3144 BRLowArray[M / 2] =
3145 (int16*) csound->Malloc(csound, POW2(M / 2 - 1) * sizeof(int16));
3146 fftBRInit(M, BRLowArray[M / 2]);
3147 }
3148 }
3149 if (M > 2) {
3150 /* init bit reversed table for real FFT */
3151 if (BRLowArray[(M - 1) / 2] == 0) {
3152 BRLowArray[(M - 1) / 2] =
3153 (int16*) csound->Malloc(csound,
3154 POW2((M - 1) / 2 - 1) * sizeof(int16));
3155 fftBRInit(M - 1, BRLowArray[(M - 1) / 2]);
3156 }
3157 }
3158
3159 csound->FFT_max_size |= (1 << M);
3160 }
3161
3162
ConvertFFTSize(CSOUND * csound,int32_t N)3163 static inline int32_t ConvertFFTSize(CSOUND *csound, int32_t N)
3164 {
3165 if (N <= 0)
3166 return (-N);
3167 switch (N) {
3168 case 0x00000001: return 0;
3169 case 0x00000002: return 1;
3170 case 0x00000004: return 2;
3171 case 0x00000008: return 3;
3172 case 0x00000010: return 4;
3173 case 0x00000020: return 5;
3174 case 0x00000040: return 6;
3175 case 0x00000080: return 7;
3176 case 0x00000100: return 8;
3177 case 0x00000200: return 9;
3178 case 0x00000400: return 10;
3179 case 0x00000800: return 11;
3180 case 0x00001000: return 12;
3181 case 0x00002000: return 13;
3182 case 0x00004000: return 14;
3183 case 0x00008000: return 15;
3184 case 0x00010000: return 16;
3185 case 0x00020000: return 17;
3186 case 0x00040000: return 18;
3187 case 0x00080000: return 19;
3188 case 0x00100000: return 20;
3189 case 0x00200000: return 21;
3190 case 0x00400000: return 22;
3191 case 0x00800000: return 23;
3192 case 0x01000000: return 24;
3193 case 0x02000000: return 25;
3194 case 0x04000000: return 26;
3195 case 0x08000000: return 27;
3196 case 0x10000000: return 28;
3197 }
3198 csound->Warning(csound, Str(" *** fftlib.c: internal error: "
3199 "invalid FFT size: %d"), N);
3200 return 0;
3201 }
3202
getTablePointers(CSOUND * p,MYFLT ** ct,int16 ** bt,int32_t cn,int32_t bn)3203 static inline void getTablePointers(CSOUND *p, MYFLT **ct, int16 **bt,
3204 int32_t cn, int32_t bn)
3205 {
3206 if (!(p->FFT_max_size & (1 << cn)))
3207 fftInit(p, cn);
3208 *ct = ((MYFLT**) p->FFT_table_1)[cn];
3209 *bt = ((int16**) p->FFT_table_2)[bn];
3210 }
3211
3212
3213 /**
3214 * Returns the amplitude scale that should be applied to the result of
3215 * an inverse complex FFT with a length of 'FFTsize' samples.
3216 */
csoundGetInverseComplexFFTScale(CSOUND * csound,int32_t FFTsize)3217 MYFLT csoundGetInverseComplexFFTScale(CSOUND *csound, int32_t FFTsize)
3218 {
3219 IGN(FFTsize);
3220 IGN(csound);
3221 return FL(1.0);
3222 }
3223
3224 /**
3225 * Returns the amplitude scale that should be applied to the result of
3226 * an inverse real FFT with a length of 'FFTsize' samples.
3227 */
csoundGetInverseRealFFTScale(CSOUND * csound,int32_t FFTsize)3228 MYFLT csoundGetInverseRealFFTScale(CSOUND *csound, int32_t FFTsize)
3229 {
3230 IGN(FFTsize);
3231 IGN(csound);
3232 return FL(1.0);
3233 }
3234
3235 /**
3236 * Compute in-place complex FFT
3237 * FFTsize: FFT length in samples
3238 * buf: array of FFTsize*2 MYFLT values,
3239 * in interleaved real/imaginary format
3240 */
csoundComplexFFT(CSOUND * csound,MYFLT * buf,int32_t FFTsize)3241 void csoundComplexFFT(CSOUND *csound, MYFLT *buf, int32_t FFTsize)
3242 {
3243 MYFLT *Utbl;
3244 int16 *BRLow;
3245 int32_t M;
3246
3247 M = ConvertFFTSize(csound, FFTsize);
3248 getTablePointers(csound, &Utbl, &BRLow, M, M / 2);
3249 ffts1(buf, M, Utbl, BRLow);
3250 }
3251
3252 /**
3253 * Compute in-place inverse complex FFT
3254 * FFTsize: FFT length in samples
3255 * buf: array of FFTsize*2 MYFLT values,
3256 * in interleaved real/imaginary format
3257 * Output should be scaled by the return value of
3258 * csoundGetInverseComplexFFTScale(csound, FFTsize).
3259 */
3260
csoundInverseComplexFFT(CSOUND * csound,MYFLT * buf,int32_t FFTsize)3261 void csoundInverseComplexFFT(CSOUND *csound, MYFLT *buf, int32_t FFTsize)
3262 {
3263 MYFLT *Utbl;
3264 int16 *BRLow;
3265 int32_t M;
3266
3267 M = ConvertFFTSize(csound, FFTsize);
3268 getTablePointers(csound, &Utbl, &BRLow, M, M / 2);
3269 iffts1(buf, M, Utbl, BRLow);
3270 }
3271
3272 /**
3273 * Compute in-place real FFT
3274 * FFTsize: FFT length in samples
3275 * buf: array of FFTsize MYFLT values; output is in interleaved
3276 * real/imaginary format, except for buf[1] which is the real
3277 * part for the Nyquist frequency
3278 */
3279
csoundRealFFT(CSOUND * csound,MYFLT * buf,int32_t FFTsize)3280 void csoundRealFFT(CSOUND *csound, MYFLT *buf, int32_t FFTsize)
3281 {
3282
3283 MYFLT *Utbl;
3284 int16 *BRLow;
3285 int32_t M;
3286 M = ConvertFFTSize(csound, FFTsize);
3287 getTablePointers(csound, &Utbl, &BRLow, M, (M - 1) / 2);
3288 rffts1(buf, M, Utbl, BRLow);
3289 }
3290
3291 /**
3292 * Compute in-place inverse real FFT
3293 * FFTsize: FFT length in samples
3294 * buf: array of FFTsize MYFLT values; input is expected to be in
3295 * interleaved real/imaginary format, except for buf[1] which
3296 * is the real part for the Nyquist frequency
3297 * Output should be scaled by the return value of
3298 * csoundGetInverseRealFFTScale(csound, FFTsize).
3299 */
3300
csoundInverseRealFFT(CSOUND * csound,MYFLT * buf,int32_t FFTsize)3301 void csoundInverseRealFFT(CSOUND *csound, MYFLT *buf, int32_t FFTsize)
3302 {
3303 MYFLT *Utbl;
3304 int16 *BRLow;
3305 int32_t M;
3306 M = ConvertFFTSize(csound, FFTsize);
3307 getTablePointers(csound, &Utbl, &BRLow, M, (M - 1) / 2);
3308 riffts1(buf, M, Utbl, BRLow);
3309 }
3310
3311 /**
3312 * Multiply two arrays (buf1 and buf2) of complex data in the format
3313 * returned by csoundRealFFT(), and leave the result in outbuf, which
3314 * may be the same as either buf1 or buf2.
3315 * An amplitude scale of 'scaleFac' is also applied.
3316 * The arrays should contain 'FFTsize' MYFLT values.
3317 */
3318
csoundRealFFTMult(CSOUND * csound,MYFLT * outbuf,MYFLT * buf1,MYFLT * buf2,int32_t FFTsize,MYFLT scaleFac)3319 void csoundRealFFTMult(CSOUND *csound, MYFLT *outbuf,
3320 MYFLT *buf1, MYFLT *buf2, int32_t FFTsize, MYFLT scaleFac)
3321 {
3322 MYFLT re, im;
3323 int32_t i;
3324 IGN(csound);
3325
3326 if (scaleFac != FL(1.0)) {
3327 outbuf[0] = buf1[0] * buf2[0] * scaleFac;
3328 if (FFTsize < 2)
3329 return;
3330 outbuf[1] = buf1[1] * buf2[1] * scaleFac;
3331 for (i = 2; i < FFTsize; ) {
3332 re = ((buf1[i] * buf2[i]) - (buf1[i + 1] * buf2[i + 1])) * scaleFac;
3333 im = ((buf1[i] * buf2[i + 1]) + (buf2[i] * buf1[i + 1])) * scaleFac;
3334 outbuf[i++] = re;
3335 outbuf[i++] = im;
3336 }
3337 }
3338 else {
3339 outbuf[0] = buf1[0] * buf2[0];
3340 if (FFTsize < 2)
3341 return;
3342 outbuf[1] = buf1[1] * buf2[1];
3343 for (i = 2; i < FFTsize; ) {
3344 re = (buf1[i] * buf2[i]) - (buf1[i + 1] * buf2[i + 1]);
3345 im = (buf1[i] * buf2[i + 1]) + (buf2[i] * buf1[i + 1]);
3346 outbuf[i++] = re;
3347 outbuf[i++] = im;
3348 }
3349 }
3350 }
3351
3352
3353
3354 /*
3355 New FFT interface
3356 VL, 2016
3357 */
3358 static
pffft_execute(CSOUND_FFT_SETUP * setup,MYFLT * sig)3359 void pffft_execute(CSOUND_FFT_SETUP *setup,
3360 MYFLT *sig) {
3361 int32_t i, N = setup->N;
3362 float s, *buf;
3363 buf = (float *) setup->buffer;
3364 for(i=0;i<N;i++)
3365 buf[i] = sig[i];
3366 pffft_transform_ordered((PFFFT_Setup *)
3367 setup->setup,
3368 buf,buf,NULL,setup->d);
3369 s = (setup->d == PFFFT_BACKWARD ?
3370 (MYFLT) setup->N : FL(1.0));
3371 for(i=0;i<N;i++)
3372 sig[i] = buf[i]/s;
3373 }
3374
3375 #if defined(__MACH__)
3376 /* vDSP FFT implementation */
3377 #include <Accelerate/Accelerate.h>
3378 static
vDSP_execute(CSOUND_FFT_SETUP * setup,MYFLT * sig)3379 void vDSP_execute(CSOUND_FFT_SETUP *setup,
3380 MYFLT *sig){
3381 #ifdef USE_DOUBLE
3382 DSPDoubleSplitComplex tmp;
3383 #else
3384 DSPSplitComplex tmp;
3385 #endif
3386 int32_t i,j;
3387 MYFLT s;
3388 int32_t N = setup->N;
3389 tmp.realp = &setup->buffer[0];
3390 tmp.imagp = &setup->buffer[N>>1];
3391 for(i=j=0;i<N;i+=2,j++){
3392 tmp.realp[j] = sig[i];
3393 tmp.imagp[j] = sig[i+1];
3394 }
3395 #ifdef USE_DOUBLE
3396 vDSP_fft_zripD((FFTSetupD) setup->setup,
3397 &tmp, 1,
3398 setup->M,setup->d);
3399 #else
3400 vDSP_fft_zrip((FFTSetup) setup->setup,
3401 &tmp, 1,
3402 setup->M,setup->d);
3403 #endif
3404 s = (setup->d == -1 ? (MYFLT)(N) : FL(2.0));
3405 for(i=j=0;i<N;i+=2,j++){
3406 sig[i] = tmp.realp[j]/s;
3407 sig[i+1] = tmp.imagp[j]/s;
3408 }
3409 }
3410 #endif
3411
3412 #define ALIGN_BYTES 64
3413 static
align_alloc(CSOUND * csound,size_t nb_bytes)3414 void *align_alloc(CSOUND *csound, size_t nb_bytes){
3415 void *p, *p0 = csound->Malloc(csound, nb_bytes + ALIGN_BYTES);
3416 if(!p0) return (void *) 0;
3417 p = (void *) (((size_t) p0 + ALIGN_BYTES)
3418 & (~((size_t) (ALIGN_BYTES-1))));
3419 *((void **) p - 1) = p0;
3420 return p;
3421 }
3422
setupDispose(CSOUND * csound,void * pp)3423 int32_t setupDispose(CSOUND *csound, void *pp){
3424 IGN(csound);
3425 CSOUND_FFT_SETUP *setup =(CSOUND_FFT_SETUP *) pp;
3426 switch(setup->lib){
3427 #if defined(__MACH__)
3428 case VDSP_LIB:
3429 #ifdef USE_DOUBLE
3430 vDSP_destroy_fftsetupD((FFTSetupD)
3431 #else
3432 vDSP_destroy_fftsetup((FFTSetup)
3433 #endif
3434 setup->setup);
3435 break;
3436 #endif
3437 case PFFT_LIB:
3438 pffft_destroy_setup((PFFFT_Setup *)setup->setup);
3439 break;
3440 }
3441 return OK;
3442 }
3443
isPowTwo(int32_t N)3444 int32_t isPowTwo(int32_t N) {
3445 return (N != 0) ? !(N & (N - 1)) : 0;
3446 }
3447
csoundRealFFT2Setup(CSOUND * csound,int32_t FFTsize,int32_t d)3448 void *csoundRealFFT2Setup(CSOUND *csound,
3449 int32_t FFTsize,
3450 int32_t d){
3451 CSOUND_FFT_SETUP *setup;
3452 int32_t lib = csound->oparms->fft_lib;
3453 if(lib == PFFT_LIB && FFTsize <= 16){
3454 csound->Warning(csound,
3455 "FFTsize %d \n"
3456 "Cannot use PFFT with sizes <= 16\n"
3457 "--defaulting to FFTLIB",
3458 FFTsize);
3459 lib = 0;
3460 }
3461 setup = (CSOUND_FFT_SETUP *)
3462 csound->Calloc(csound, sizeof(CSOUND_FFT_SETUP));
3463 setup->N = FFTsize;
3464 setup->p2 = isPowTwo(FFTsize);
3465 switch(lib){
3466 #if defined(__MACH__)
3467 case VDSP_LIB:
3468 setup->M = ConvertFFTSize(csound, FFTsize);
3469 setup->setup = (void *)
3470 #ifdef USE_DOUBLE
3471 vDSP_create_fftsetupD(setup->M,kFFTRadix2);
3472 #else
3473 vDSP_create_fftsetup(setup->M,kFFTRadix2);
3474 #endif
3475 setup->d = (d == FFT_FWD ?
3476 kFFTDirection_Forward :
3477 kFFTDirection_Inverse);
3478 setup->lib = lib;
3479 break;
3480 #endif
3481 case PFFT_LIB:
3482 setup->setup = (void *)
3483 pffft_new_setup(FFTsize,PFFFT_REAL);
3484 setup->d = (d == FFT_FWD ?
3485 PFFFT_FORWARD :
3486 PFFFT_BACKWARD);
3487 setup->lib = lib;
3488 break;
3489 default:
3490 setup->lib = 0;
3491 setup->d = d;
3492 return (void *) setup;
3493 }
3494 setup->buffer = (MYFLT *) align_alloc(csound, sizeof(MYFLT)*FFTsize);
3495 csound->RegisterResetCallback(csound, (void*) setup,
3496 (int32_t (*)(CSOUND *, void *))
3497 setupDispose);
3498 return (void *) setup;
3499 }
3500
csoundRealFFT2(CSOUND * csound,void * p,MYFLT * sig)3501 void csoundRealFFT2(CSOUND *csound,
3502 void *p, MYFLT *sig){
3503 CSOUND_FFT_SETUP *setup =
3504 (CSOUND_FFT_SETUP *) p;
3505 switch(setup->lib) {
3506 #if defined(__MACH__)
3507 case VDSP_LIB:
3508 vDSP_execute(setup,sig);
3509 break;
3510 #endif
3511 case PFFT_LIB:
3512 pffft_execute(setup,sig);
3513 break;
3514 default:
3515 (setup->d == FFT_FWD ?
3516 csoundRealFFT(csound,
3517 sig,setup->N) :
3518 csoundInverseRealFFT(csound,
3519 sig,setup->N));
3520 setup->lib = 0;
3521 }
3522 }
3523
3524
csoundDCTSetup(CSOUND * csound,int32_t FFTsize,int32_t d)3525 void *csoundDCTSetup(CSOUND *csound,
3526 int32_t FFTsize, int32_t d){
3527 CSOUND_FFT_SETUP *setup;
3528 setup = (CSOUND_FFT_SETUP *)
3529 csoundRealFFT2Setup(csound,
3530 FFTsize*4,d);
3531 if(setup->lib == 0){
3532 setup->buffer = (MYFLT *)
3533 csound->Calloc(csound, sizeof(MYFLT)*setup->N);
3534 }
3535 return setup;
3536 }
3537
3538
pffft_DCT_execute(CSOUND * csound,void * p,MYFLT * sig)3539 void pffft_DCT_execute(CSOUND *csound,
3540 void *p, MYFLT *sig){
3541 IGN(csound);
3542 CSOUND_FFT_SETUP *setup =
3543 (CSOUND_FFT_SETUP *) p;
3544 int32_t i,j, N= setup->N;
3545 float *buffer = (float *)setup->buffer;
3546 if(setup->d == FFT_FWD){
3547 for(i=j = 0; i < N/2; i+=2, j++){
3548 buffer[i] = FL(0.0);
3549 buffer[i+1] = sig[j];
3550 }
3551 for(i=N/2,j=N/4-1; i < N; i+=2, j--){
3552 buffer[i] = FL(0.0);
3553 buffer[i+1] = sig[j];
3554 }
3555 pffft_transform_ordered((PFFFT_Setup *)
3556 setup->setup,
3557 buffer,buffer,
3558 NULL,PFFFT_FORWARD);
3559 for(i=j=0; i < N/2; i+=2, j++){
3560 sig[j] = buffer[i];
3561 }
3562 } else {
3563 buffer[0] = sig[0];
3564 buffer[1] = -sig[0];
3565 for(i=2,j=1; i < N/2; i+=2, j++){
3566 buffer[i] = sig[j];
3567 buffer[i+1] = FL(0.0);
3568 }
3569 buffer[N/2] = buffer[N/2+1] = FL(0.0);
3570 for(i=N/2+2,j=N/4-1; i < N; i+=2, j--){
3571 buffer[i] = -sig[j];
3572 buffer[i+1] = FL(0.0);
3573 }
3574 pffft_transform_ordered((PFFFT_Setup *)
3575 setup->setup,
3576 buffer,buffer,
3577 NULL,PFFFT_BACKWARD);
3578 for(i=j=0; i < N/2; i+=2, j++){
3579 sig[j] = buffer[i+1]/N;
3580 }
3581 }
3582 }
3583
3584 #if defined(__MACH__)
vDSP_DCT_execute(CSOUND * csound,void * p,MYFLT * sig)3585 void vDSP_DCT_execute(CSOUND *csound,
3586 void *p, MYFLT *sig){
3587 IGN(csound);
3588 CSOUND_FFT_SETUP *setup =
3589 (CSOUND_FFT_SETUP *) p;
3590 int32_t i,j, N= setup->N;
3591 #ifdef USE_DOUBLE
3592 DSPDoubleSplitComplex tmp;
3593 #else
3594 DSPSplitComplex tmp;
3595 #endif
3596 tmp.realp = &setup->buffer[0];
3597 tmp.imagp = &setup->buffer[N>>1];
3598 if(setup->d == kFFTDirection_Forward){
3599 for(j=0;j<N/4;j++){
3600 tmp.realp[j] = FL(0.0);
3601 tmp.imagp[j] = sig[j];
3602 }
3603 for(i=N/4,j=N/4-1;i<N/2;i++,j--){
3604 tmp.realp[i] = FL(0.0);
3605 tmp.imagp[i] = sig[j];
3606 }
3607 #ifdef USE_DOUBLE
3608 vDSP_fft_zripD((FFTSetupD) setup->setup,
3609 #else
3610 vDSP_fft_zrip((FFTSetup) setup->setup,
3611 #endif
3612 &tmp, 1,
3613 setup->M,setup->d);
3614
3615 for(j=0; j < N/4; j++){
3616 sig[j] = tmp.realp[j]/FL(2.0);
3617 }
3618 } else {
3619 tmp.realp[0] = sig[0];
3620 tmp.imagp[0] = -sig[0];
3621 for(j=1; j < N/4; j++){
3622 tmp.realp[j] = sig[j];
3623 tmp.imagp[j] = FL(0.0);
3624 }
3625 tmp.realp[N/4] = tmp.imagp[N/4] = FL(0.0);
3626 for(i=N/4+1,j=N/4-1; i < N/2; i++, j--){
3627 tmp.realp[i] = -sig[j];
3628 tmp.imagp[i] = FL(0.0);
3629 }
3630 #ifdef USE_DOUBLE
3631 vDSP_fft_zripD((FFTSetupD) setup->setup,
3632 #else
3633 vDSP_fft_zrip((FFTSetup) setup->setup,
3634 #endif
3635 &tmp, 1,
3636 setup->M,setup->d);
3637 for(j=0; j < N/4; j++){
3638 sig[j] = tmp.imagp[j]/N;
3639 }
3640 }
3641 }
3642 #endif
3643
DCT_execute(CSOUND * csound,void * p,MYFLT * sig)3644 void DCT_execute(CSOUND *csound,
3645 void *p, MYFLT *sig){
3646 CSOUND_FFT_SETUP *setup =
3647 (CSOUND_FFT_SETUP *) p;
3648 int32_t i,j, N= setup->N;
3649 MYFLT *buffer = setup->buffer;
3650 if(setup->d == FFT_FWD){
3651 for(i=j = 0; i < N/2; i+=2, j++){
3652 buffer[i] = FL(0.0);
3653 buffer[i+1] = sig[j];
3654 }
3655 for(i=N/2,j=N/4-1; i < N; i+=2, j--){
3656 buffer[i] = FL(0.0);
3657 buffer[i+1] = sig[j];
3658 }
3659 csoundRealFFT(csound,buffer,N);
3660 for(i=j=0; i < N/2; i+=2, j++){
3661 sig[j] = buffer[i];
3662 }
3663 } else {
3664 buffer[0] = sig[0];
3665 buffer[1] = -sig[0];
3666 for(i=2,j=1; i < N/2; i+=2, j++){
3667 buffer[i] = sig[j];
3668 buffer[i+1] = FL(0.0);
3669 }
3670 buffer[N/2] = buffer[N/2+1] = FL(0.0);
3671 for(i=N/2+2,j=N/4-1; i < N; i+=2, j--){
3672 buffer[i] = -sig[j];
3673 buffer[i+1] = FL(0.0);
3674 }
3675 csoundInverseRealFFT(csound,buffer,N);
3676 for(i=j=0; i < N/2; i+=2, j++){
3677 sig[j] = buffer[i+1];
3678 }
3679 }
3680 }
3681
csoundDCT(CSOUND * csound,void * p,MYFLT * sig)3682 void csoundDCT(CSOUND *csound,
3683 void *p, MYFLT *sig){
3684 CSOUND_FFT_SETUP *setup =
3685 (CSOUND_FFT_SETUP *) p;
3686 switch(setup->lib) {
3687 #if defined(__MACH__)
3688 case VDSP_LIB:
3689 vDSP_DCT_execute(csound,setup,sig);
3690 break;
3691 #endif
3692 case PFFT_LIB:
3693 pffft_DCT_execute(csound,setup,sig);
3694 break;
3695 default:
3696 DCT_execute(csound,setup,sig);
3697 setup->lib = 0;
3698 }
3699 }
3700
3701 /* =======--====================*/
3702 #if 0
3703 #ifdef HAVE_VECLIB
3704 /* vDSP FFT interface */
3705
3706 #define ALIGN64 64
3707 static
3708 void *vDSP_alloc(CSOUND *csound, size_t nb_bytes){
3709 void *p, *p0 = csound->Malloc(csound, nb_bytes + ALIGN64);
3710 if(!p0) return (void *) 0;
3711 p = (void *) (((size_t) p0 + ALIGN64)
3712 & (~((size_t) (ALIGN64-1))));
3713 *((void **) p - 1) = p0;
3714 return p;
3715 }
3716
3717 static
3718 void vDSP_free(CSOUND *csound, void *p) {
3719 if(p)csound->Free(csound, *((void **) p - 1));
3720 }
3721
3722 /* reset vdsp */
3723 int32_t vDSP_reset(CSOUND *csound, void *pp){
3724 IGN(pp);
3725 #ifdef USE_DOUBLE
3726 vDSP_destroy_fftsetupD(csound->vdsp_setup);
3727 #else
3728 vDSP_destroy_fftsetup(csound->vdsp_setup);
3729 #endif
3730 csound->vdsp_setup = NULL;
3731 vDSP_free(csound, csound->vdsp_buffer);
3732 return OK;
3733 }
3734
3735 /* create setup if setup does not exist */
3736 static
3737 void vDSP_setup(CSOUND *csound, int32_t FFTsize){
3738 int32_t M = ConvertFFTSize(csound, FFTsize);
3739 if(csound->FFT_max_size < FFTsize){
3740 #ifdef USE_DOUBLE
3741 if(csound->vdsp_setup != NULL)
3742 vDSP_destroy_fftsetupD(csound->vdsp_setup);
3743 csound->vdsp_setup =
3744 vDSP_create_fftsetupD(M,kFFTRadix2);
3745 #else
3746 if(csound->vdsp_setup != NULL)
3747 vDSP_destroy_fftsetup(csound->vdsp_setup);
3748 csound->vdsp_setup =
3749 vDSP_create_fftsetup(M,kFFTRadix2);
3750 #endif
3751 if(csound->vdsp_buffer != NULL)
3752 vDSP_free(csound, csound->vdsp_buffer);
3753 csound->vdsp_buffer = vDSP_alloc(csound,
3754 FFTsize*(sizeof(MYFLT)));
3755 if(csound->FFT_max_size == 0)
3756 csound->RegisterResetCallback(csound, (void*) NULL,
3757 (int32_t (*)(CSOUND *, void *))
3758 vDSP_reset);
3759 csound->FFT_max_size = FFTsize;
3760 }
3761 }
3762
3763 static
3764 void vDSP_RealFFT(CSOUND *csound,int32_t FFTsize,MYFLT *sig,FFTDirection d){
3765 #ifdef USE_DOUBLE
3766 DSPDoubleSplitComplex tmp;
3767 #else
3768 DSPSplitComplex tmp;
3769 #endif
3770 int32_t i,j;
3771 MYFLT s;
3772 vDSP_setup(csound, FFTsize);
3773 tmp.realp = &csound->vdsp_buffer[0];
3774 tmp.imagp = &csound->vdsp_buffer[FFTsize>>1];
3775 for(i=j=0;i<FFTsize;i+=2,j++){
3776 tmp.realp[j] = sig[i];
3777 tmp.imagp[j] = sig[i+1];
3778 }
3779 #ifdef USE_DOUBLE
3780 vDSP_fft_zripD(csound->vdsp_setup, &tmp, 1,
3781 ConvertFFTSize(csound, FFTsize),
3782 d);
3783 #else
3784 vDSP_fft_zrip(csound->vdsp_setup, &tmp, 1,
3785 ConvertFFTSize(csound, FFTsize),
3786 d);
3787 #endif
3788 s = (d == -1 ? (MYFLT)(FFTsize) : FL(2.0));
3789 for(i=j=0;i<FFTsize;i+=2,j++){
3790 sig[i] = tmp.realp[j]/s;
3791 sig[i+1] = tmp.imagp[j]/s;
3792 }
3793
3794 }
3795
3796 /* these functions are available from OSX 10.9 */
3797 int32_t vDSP_reset_New(CSOUND *csound, void *pp){
3798 IGN(pp);
3799 #ifdef USE_DOUBLE
3800 vDSP_DFT_DestroySetupD(csound->vdsp_setup_fwd);
3801 vDSP_DFT_DestroySetupD(csound->vdsp_setup_inv);
3802 #else
3803 vDSP_DFT_DestroySetup(csound->vdsp_setup_fwd);
3804 vDSP_DFT_DestroySetup(csound->vdsp_setup_inv);
3805 #endif
3806 vDSP_free(csound, csound->vdsp_buffer);
3807 return OK;
3808 }
3809
3810 /* create setup if setup does not exist */
3811 static
3812 #ifdef USE_DOUBLE
3813 vDSP_DFT_SetupD
3814 #else
3815 vDSP_DFT_Setup
3816 #endif
3817 vDSP_setup_New(CSOUND *csound, int32_t FFTsize, int32_t d){
3818 if(csound->vdsp_setup_fwd == NULL ||
3819 csound->vdsp_setup_inv == NULL){
3820 #ifdef USE_DOUBLE
3821 if(d == 1)
3822 csound->vdsp_setup_fwd =
3823 vDSP_DFT_zrop_CreateSetupD(csound->vdsp_setup_fwd,
3824 FFTsize, d);
3825 else
3826 csound->vdsp_setup_inv =
3827 vDSP_DFT_zrop_CreateSetupD(csound->vdsp_setup_inv,
3828 FFTsize, d);
3829 #else
3830 if(d == 1)
3831 csound->vdsp_setup_fwd =
3832 vDSP_DFT_zrop_CreateSetup(csound->vdsp_setup_fwd,
3833 FFTsize, d);
3834 else
3835 csound->vdsp_setup_inv =
3836 vDSP_DFT_zrop_CreateSetup(csound->vdsp_setup_inv,
3837 FFTsize, d);
3838 #endif
3839 }
3840 if(csound->FFT_max_size < FFTsize) {
3841 vDSP_free(csound, csound->vdsp_buffer);
3842 csound->vdsp_buffer = vDSP_alloc(csound,
3843 FFTsize*(sizeof(MYFLT)));
3844 }
3845
3846 if(csound->FFT_max_size == 0)
3847 csound->RegisterResetCallback(csound, (void*) NULL,
3848 (int32_t (*)(CSOUND *, void *))
3849 vDSP_reset_New);
3850 csound->FFT_max_size = FFTsize;
3851 return d == 1 ? csound->vdsp_setup_fwd :
3852 csound->vdsp_setup_inv;
3853 }
3854
3855 void vDSP_RealFFT_New(CSOUND *csound,int32_t FFTsize,MYFLT *sig,
3856 FFTDirection d){
3857 int32_t i,j;
3858 MYFLT s;
3859 #ifdef USE_DOUBLE
3860 vDSP_DFT_SetupD setup;
3861 #else
3862 vDSP_DFT_Setup setup;
3863 #endif
3864 setup = vDSP_setup_New(csound, FFTsize, d);
3865 MYFLT *inr = &csound->vdsp_buffer[0];
3866 MYFLT *ini = &csound->vdsp_buffer[FFTsize>>1];
3867
3868 for(i=j=0;i<FFTsize;i+=2,j++){
3869 inr[j] = sig[i];
3870 ini[j] = sig[i+1];
3871 }
3872 #ifdef USE_DOUBLE
3873 vDSP_DFT_ExecuteD(setup,inr,ini,inr,ini);
3874 #else
3875 vDSP_DFT_Execute(setup,inr,ini,inr,ini);
3876 #endif
3877 s = (d == -1 ? (MYFLT)(FFTsize) : FL(2.0));
3878 for(i=j=0;i<FFTsize;i+=2,j++){
3879 sig[i] = inr[j]/s;
3880 sig[i+1] = ini[j]/s;
3881 }
3882 }
3883
3884 #else /* HAVE_VECLIB */
3885 int32_t pffft_reset(CSOUND *csound, void *pp){
3886 IGN(pp);
3887 int32_t i;
3888 for(i=0; i < 32; i++)
3889 if(csound->setup[i]) pffft_destroy_setup(csound->setup[i]);
3890 pffft_aligned_free(csound->vdsp_buffer);
3891 return OK;
3892 }
3893
3894 /* create setup if setup does not exist */
3895 static
3896 void pffft_setup(CSOUND *csound, int32_t FFTsize, int32_t M){
3897 if(csound->FFT_max_size != FFTsize){
3898 if(csound->FFT_max_size == 0)
3899 csound->RegisterResetCallback(csound, (void*) NULL,
3900 (int32_t (*)(CSOUND *, void *)) pffft_reset);
3901 if(csound->setup[M] == NULL)
3902 csound->setup[M] = pffft_new_setup(FFTsize,PFFFT_REAL);
3903 if(csound->FFT_max_size < FFTsize) {
3904 pffft_aligned_free(csound->vdsp_buffer);
3905 csound->vdsp_buffer = (MYFLT *) pffft_aligned_malloc(FFTsize*(sizeof(float)));
3906 memset(csound->vdsp_buffer,0, FFTsize*(sizeof(float)));
3907 csound->FFT_max_size = FFTsize;
3908 }
3909 }
3910 }
3911
3912 static
3913 void pffft_RealFFT(CSOUND *csound,
3914 int32_t FFTsize,MYFLT *sig,
3915 int32_t d){
3916 int32_t i;
3917 float s, *buf;
3918 int32_t M = ConvertFFTSize(csound, FFTsize);
3919 pffft_setup(csound, FFTsize, M);
3920 buf = (float *)csound->vdsp_buffer;
3921 for(i=0;i<FFTsize;i++)
3922 buf[i] = sig[i];
3923 pffft_transform_ordered(csound->setup[M],
3924 buf,buf,NULL,d);
3925 s = (d == PFFFT_BACKWARD ? (MYFLT)FFTsize : FL(1.0));
3926 for(i=0;i<FFTsize;i++)
3927 sig[i] = buf[i]/s;
3928 }
3929 #endif
3930 #endif
3931
3932