1 /*
2   fftlib.c:
3 
4   Copyright 2005 John Green Istvan Varga Victor Lazzarini
5 
6   FFT library
7   based on public domain code by John Green <green_jt@vsdec.npt.nuwc.navy.mil>
8   original version is available at
9   http://hyperarchive.lcs.mit.edu/
10   /HyperArchive/Archive/dev/src/ffts-for-risc-2-c.hqx
11   ported to Csound by Istvan Varga, 2005
12 
13   This file is part of Csound.
14 
15   The Csound Library is free software; you can redistribute it
16   and/or modify it under the terms of the GNU Lesser General Public
17   License as published by the Free Software Foundation; either
18   version 2.1 of the License, or (at your option) any later version.
19 
20   Csound is distributed in the hope that it will be useful,
21   but WITHOUT ANY WARRANTY; without even the implied warranty of
22   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
23   GNU Lesser General Public License for more details.
24 
25   You should have received a copy of the GNU Lesser General Public
26   License along with Csound; if not, write to the Free Software
27   Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
28   02110-1301 USA
29 */
30 
31 #include <stdlib.h>
32 #include <math.h>
33 #include "csoundCore.h"
34 #include "csound.h"
35 #include "fftlib.h"
36 #include "pffft.h"
37 
38 
39 
40 #define POW2(m) ((uint32) (1 << (m)))       /* integer power of 2 for m<32 */
41 
42 /* fft's with M bigger than this bust primary cache */
43 #define MCACHE  (11 - (sizeof(MYFLT) / 8))
44 
45 /* some math constants to 40 decimal places */
46 //#define MYPI      3.141592653589793238462643383279502884197   /* pi         */
47 //#define MYROOT2   1.414213562373095048801688724209698078569   /* sqrt(2)    */
48 #define MYCOSPID8 0.9238795325112867561281831893967882868224  /* cos(pi/8)  */
49 #define MYSINPID8 0.3826834323650897717284599840303988667613  /* sin(pi/8)  */
50 
51 
52 /*****************************************************
53  * routines to initialize tables used by fft routines *
54  *****************************************************/
55 
fftCosInit(int32_t M,MYFLT * Utbl)56 static void fftCosInit(int32_t M, MYFLT *Utbl)
57 {
58   /* Compute Utbl, the cosine table for ffts  */
59   /* of size (pow(2,M)/4 +1)                  */
60   /* INPUTS                                   */
61   /*   M = log2 of fft size                   */
62   /* OUTPUTS                                  */
63   /*   *Utbl = cosine table                   */
64   uint32_t fftN = POW2(M);
65   uint32_t i1;
66 
67   Utbl[0] = FL(1.0);
68   for (i1 = 1; i1 < fftN/4; i1++)
69     Utbl[i1] = COS((FL(2.0) * PI_F * (MYFLT)i1) / (MYFLT)fftN);
70   Utbl[fftN/4] = FL(0.0);
71 }
72 
fftBRInit(int32_t M,int16 * BRLow)73 static void fftBRInit(int32_t M, int16 *BRLow)
74 {
75   /* Compute BRLow, the bit reversed table for ffts */
76   /* of size pow(2,M/2 -1)                          */
77   /* INPUTS                                         */
78   /*   M = log2 of fft size                         */
79   /* OUTPUTS                                        */
80   /*   *BRLow = bit reversed counter table          */
81   int32_t Mroot_1 = M / 2 - 1;
82   int32_t Nroot_1 = POW2(Mroot_1);
83   int32_t i1;
84   int32_t bitsum;
85   int32_t bitmask;
86   int32_t bit;
87 
88   for (i1 = 0; i1 < Nroot_1; i1++) {
89     bitsum = 0;
90     bitmask = 1;
91     for (bit = 1; bit <= Mroot_1; bitmask <<= 1, bit++)
92       if (i1 & bitmask)
93         bitsum = bitsum + (Nroot_1 >> bit);
94     BRLow[i1] = bitsum;
95   }
96 }
97 
98 /*****************
99  * parts of ffts1 *
100  *****************/
101 
bitrevR2(MYFLT * ioptr,int32_t M,int16 * BRLow)102 static void bitrevR2(MYFLT *ioptr, int32_t M, int16 *BRLow)
103 {
104   /*** bit reverse and first radix 2 stage of forward or inverse fft ***/
105   MYFLT f0r;
106   MYFLT f0i;
107   MYFLT f1r;
108   MYFLT f1i;
109   MYFLT f2r;
110   MYFLT f2i;
111   MYFLT f3r;
112   MYFLT f3i;
113   MYFLT f4r;
114   MYFLT f4i;
115   MYFLT f5r;
116   MYFLT f5i;
117   MYFLT f6r;
118   MYFLT f6i;
119   MYFLT f7r;
120   MYFLT f7i;
121   MYFLT t0r;
122   MYFLT t0i;
123   MYFLT t1r;
124   MYFLT t1i;
125   MYFLT *p0r;
126   MYFLT *p1r;
127   MYFLT *IOP;
128   MYFLT *iolimit;
129   int32_t Colstart;
130   int32_t iCol;
131   uint32_t posA;
132   uint32_t posAi;
133   uint32_t posB;
134   uint32_t posBi;
135 
136   const uint32_t Nrems2 = POW2((M + 3) / 2);
137   const uint32_t Nroot_1_ColInc = POW2(M) - Nrems2;
138   const uint32_t Nroot_1 = POW2(M / 2 - 1) - 1;
139   const uint32_t ColstartShift = (M + 1) / 2 + 1;
140 
141   posA = POW2(M);               /* 1/2 of POW2(M) complexes */
142   posAi = posA + 1;
143   posB = posA + 2;
144   posBi = posB + 1;
145 
146   iolimit = ioptr + Nrems2;
147   for (; ioptr < iolimit; ioptr += POW2(M / 2 + 1)) {
148     for (Colstart = Nroot_1; Colstart >= 0; Colstart--) {
149       iCol = Nroot_1;
150       p0r = ioptr + Nroot_1_ColInc + BRLow[Colstart] * 4;
151       IOP = ioptr + (Colstart << ColstartShift);
152       p1r = IOP + BRLow[iCol] * 4;
153       f0r = *(p0r);
154       f0i = *(p0r + 1);
155       f1r = *(p0r + posA);
156       f1i = *(p0r + posAi);
157       for (; iCol > Colstart;) {
158         f2r = *(p0r + 2);
159         f2i = *(p0r + (2 + 1));
160         f3r = *(p0r + posB);
161         f3i = *(p0r + posBi);
162         f4r = *(p1r);
163         f4i = *(p1r + 1);
164         f5r = *(p1r + posA);
165         f5i = *(p1r + posAi);
166         f6r = *(p1r + 2);
167         f6i = *(p1r + (2 + 1));
168         f7r = *(p1r + posB);
169         f7i = *(p1r + posBi);
170 
171         t0r = f0r + f1r;
172         t0i = f0i + f1i;
173         f1r = f0r - f1r;
174         f1i = f0i - f1i;
175         t1r = f2r + f3r;
176         t1i = f2i + f3i;
177         f3r = f2r - f3r;
178         f3i = f2i - f3i;
179         f0r = f4r + f5r;
180         f0i = f4i + f5i;
181         f5r = f4r - f5r;
182         f5i = f4i - f5i;
183         f2r = f6r + f7r;
184         f2i = f6i + f7i;
185         f7r = f6r - f7r;
186         f7i = f6i - f7i;
187 
188         *(p1r) = t0r;
189         *(p1r + 1) = t0i;
190         *(p1r + 2) = f1r;
191         *(p1r + (2 + 1)) = f1i;
192         *(p1r + posA) = t1r;
193         *(p1r + posAi) = t1i;
194         *(p1r + posB) = f3r;
195         *(p1r + posBi) = f3i;
196         *(p0r) = f0r;
197         *(p0r + 1) = f0i;
198         *(p0r + 2) = f5r;
199         *(p0r + (2 + 1)) = f5i;
200         *(p0r + posA) = f2r;
201         *(p0r + posAi) = f2i;
202         *(p0r + posB) = f7r;
203         *(p0r + posBi) = f7i;
204 
205         p0r -= Nrems2;
206         f0r = *(p0r);
207         f0i = *(p0r + 1);
208         f1r = *(p0r + posA);
209         f1i = *(p0r + posAi);
210         iCol -= 1;
211         p1r = IOP + BRLow[iCol] * 4;
212       }
213       f2r = *(p0r + 2);
214       f2i = *(p0r + (2 + 1));
215       f3r = *(p0r + posB);
216       f3i = *(p0r + posBi);
217 
218       t0r = f0r + f1r;
219       t0i = f0i + f1i;
220       f1r = f0r - f1r;
221       f1i = f0i - f1i;
222       t1r = f2r + f3r;
223       t1i = f2i + f3i;
224       f3r = f2r - f3r;
225       f3i = f2i - f3i;
226 
227       *(p0r) = t0r;
228       *(p0r + 1) = t0i;
229       *(p0r + 2) = f1r;
230       *(p0r + (2 + 1)) = f1i;
231       *(p0r + posA) = t1r;
232       *(p0r + posAi) = t1i;
233       *(p0r + posB) = f3r;
234       *(p0r + posBi) = f3i;
235     }
236   }
237 }
238 
fft2pt(MYFLT * ioptr)239 static void fft2pt(MYFLT *ioptr)
240 {
241   /***   RADIX 2 fft      ***/
242   MYFLT f0r, f0i, f1r, f1i;
243   MYFLT t0r, t0i;
244 
245   /* bit reversed load */
246   f0r = ioptr[0];
247   f0i = ioptr[1];
248   f1r = ioptr[2];
249   f1i = ioptr[3];
250 
251   /* Butterflys           */
252   /*
253     f0   -       -       t0
254     f1   -  1 -  f1
255   */
256 
257   t0r = f0r + f1r;
258   t0i = f0i + f1i;
259   f1r = f0r - f1r;
260   f1i = f0i - f1i;
261 
262   /* store result */
263   ioptr[0] = t0r;
264   ioptr[1] = t0i;
265   ioptr[2] = f1r;
266   ioptr[3] = f1i;
267 }
268 
fft4pt(MYFLT * ioptr)269 static void fft4pt(MYFLT *ioptr)
270 {
271   /***   RADIX 4 fft      ***/
272   MYFLT f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i;
273   MYFLT t0r, t0i, t1r, t1i;
274 
275   /* bit reversed load */
276   f0r = ioptr[0];
277   f0i = ioptr[1];
278   f1r = ioptr[4];
279   f1i = ioptr[5];
280   f2r = ioptr[2];
281   f2i = ioptr[3];
282   f3r = ioptr[6];
283   f3i = ioptr[7];
284 
285   /* Butterflys           */
286   /*
287     f0   -       -       t0      -       -       f0
288     f1   -  1 -  f1      -       -       f1
289     f2   -       -       f2      -  1 -  f2
290     f3   -  1 -  t1      - -i -  f3
291   */
292 
293   t0r = f0r + f1r;
294   t0i = f0i + f1i;
295   f1r = f0r - f1r;
296   f1i = f0i - f1i;
297 
298   t1r = f2r - f3r;
299   t1i = f2i - f3i;
300   f2r = f2r + f3r;
301   f2i = f2i + f3i;
302 
303   f0r = t0r + f2r;
304   f0i = t0i + f2i;
305   f2r = t0r - f2r;
306   f2i = t0i - f2i;
307 
308   f3r = f1r - t1i;
309   f3i = f1i + t1r;
310   f1r = f1r + t1i;
311   f1i = f1i - t1r;
312 
313   /* store result */
314   ioptr[0] = f0r;
315   ioptr[1] = f0i;
316   ioptr[2] = f1r;
317   ioptr[3] = f1i;
318   ioptr[4] = f2r;
319   ioptr[5] = f2i;
320   ioptr[6] = f3r;
321   ioptr[7] = f3i;
322 }
323 
fft8pt(MYFLT * ioptr)324 static void fft8pt(MYFLT *ioptr)
325 {
326   /***   RADIX 8 fft      ***/
327   MYFLT w0r = (MYFLT)(1.0 / ROOT2);    /* cos(pi/4)   */
328   MYFLT f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i;
329   MYFLT f4r, f4i, f5r, f5i, f6r, f6i, f7r, f7i;
330   MYFLT t0r, t0i, t1r, t1i;
331   const MYFLT Two = FL(2.0);
332 
333   /* bit reversed load */
334   f0r = ioptr[0];
335   f0i = ioptr[1];
336   f1r = ioptr[8];
337   f1i = ioptr[9];
338   f2r = ioptr[4];
339   f2i = ioptr[5];
340   f3r = ioptr[12];
341   f3i = ioptr[13];
342   f4r = ioptr[2];
343   f4i = ioptr[3];
344   f5r = ioptr[10];
345   f5i = ioptr[11];
346   f6r = ioptr[6];
347   f6i = ioptr[7];
348   f7r = ioptr[14];
349   f7i = ioptr[15];
350   /* Butterflys           */
351   /*
352     f0   -       -       t0      -       -       f0      -       -       f0
353     f1   -  1 -  f1      -       -       f1      -       -       f1
354     f2   -       -       f2      -  1 -  f2      -       -       f2
355     f3   -  1 -  t1      - -i -  f3      -       -       f3
356     f4   -       -       t0      -       -       f4      -  1 -  t0
357     f5   -  1 -  f5      -       -       f5      - w3 -  f4
358     f6   -       -       f6      -  1 -  f6      - -i -  t1
359     f7   -  1 -  t1      - -i -  f7      - iw3-  f6
360   */
361 
362   t0r = f0r + f1r;
363   t0i = f0i + f1i;
364   f1r = f0r - f1r;
365   f1i = f0i - f1i;
366 
367   t1r = f2r - f3r;
368   t1i = f2i - f3i;
369   f2r = f2r + f3r;
370   f2i = f2i + f3i;
371 
372   f0r = t0r + f2r;
373   f0i = t0i + f2i;
374   f2r = t0r - f2r;
375   f2i = t0i - f2i;
376 
377   f3r = f1r - t1i;
378   f3i = f1i + t1r;
379   f1r = f1r + t1i;
380   f1i = f1i - t1r;
381 
382   t0r = f4r + f5r;
383   t0i = f4i + f5i;
384   f5r = f4r - f5r;
385   f5i = f4i - f5i;
386 
387   t1r = f6r - f7r;
388   t1i = f6i - f7i;
389   f6r = f6r + f7r;
390   f6i = f6i + f7i;
391 
392   f4r = t0r + f6r;
393   f4i = t0i + f6i;
394   f6r = t0r - f6r;
395   f6i = t0i - f6i;
396 
397   f7r = f5r - t1i;
398   f7i = f5i + t1r;
399   f5r = f5r + t1i;
400   f5i = f5i - t1r;
401 
402   t0r = f0r - f4r;
403   t0i = f0i - f4i;
404   f0r = f0r + f4r;
405   f0i = f0i + f4i;
406 
407   t1r = f2r - f6i;
408   t1i = f2i + f6r;
409   f2r = f2r + f6i;
410   f2i = f2i - f6r;
411 
412   f4r = f1r - f5r * w0r - f5i * w0r;
413   f4i = f1i + f5r * w0r - f5i * w0r;
414   f1r = f1r * Two - f4r;
415   f1i = f1i * Two - f4i;
416 
417   f6r = f3r + f7r * w0r - f7i * w0r;
418   f6i = f3i + f7r * w0r + f7i * w0r;
419   f3r = f3r * Two - f6r;
420   f3i = f3i * Two - f6i;
421 
422   /* store result */
423   ioptr[0] = f0r;
424   ioptr[1] = f0i;
425   ioptr[2] = f1r;
426   ioptr[3] = f1i;
427   ioptr[4] = f2r;
428   ioptr[5] = f2i;
429   ioptr[6] = f3r;
430   ioptr[7] = f3i;
431   ioptr[8] = t0r;
432   ioptr[9] = t0i;
433   ioptr[10] = f4r;
434   ioptr[11] = f4i;
435   ioptr[12] = t1r;
436   ioptr[13] = t1i;
437   ioptr[14] = f6r;
438   ioptr[15] = f6i;
439 }
440 
bfR2(MYFLT * ioptr,int32_t M,int32_t NDiffU)441 static void bfR2(MYFLT *ioptr, int32_t M, int32_t NDiffU)
442 {
443   /*** 2nd radix 2 stage ***/
444   uint32_t pos;
445   uint32_t posi;
446   uint32_t pinc;
447   uint32_t pnext;
448   uint32_t NSameU;
449   uint32_t SameUCnt;
450 
451   MYFLT *pstrt;
452   MYFLT *p0r, *p1r, *p2r, *p3r;
453 
454   MYFLT f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i;
455   MYFLT f4r, f4i, f5r, f5i, f6r, f6i, f7r, f7i;
456 
457   pinc = NDiffU * 2;            /* 2 floats per complex */
458   pnext = pinc * 4;
459   pos = 2;
460   posi = pos + 1;
461   NSameU = POW2(M) / 4 / NDiffU;        /* 4 Us at a time */
462   pstrt = ioptr;
463   p0r = pstrt;
464   p1r = pstrt + pinc;
465   p2r = p1r + pinc;
466   p3r = p2r + pinc;
467 
468   /* Butterflys           */
469   /*
470     f0   -       -       f4
471     f1   -  1 -  f5
472     f2   -       -       f6
473     f3   -  1 -  f7
474   */
475   /* Butterflys           */
476   /*
477     f0   -       -       f4
478     f1   -  1 -  f5
479     f2   -       -       f6
480     f3   -  1 -  f7
481   */
482 
483   for (SameUCnt = NSameU; SameUCnt > 0; SameUCnt--) {
484 
485     f0r = *p0r;
486     f1r = *p1r;
487     f0i = *(p0r + 1);
488     f1i = *(p1r + 1);
489     f2r = *p2r;
490     f3r = *p3r;
491     f2i = *(p2r + 1);
492     f3i = *(p3r + 1);
493 
494     f4r = f0r + f1r;
495     f4i = f0i + f1i;
496     f5r = f0r - f1r;
497     f5i = f0i - f1i;
498 
499     f6r = f2r + f3r;
500     f6i = f2i + f3i;
501     f7r = f2r - f3r;
502     f7i = f2i - f3i;
503 
504     *p0r = f4r;
505     *(p0r + 1) = f4i;
506     *p1r = f5r;
507     *(p1r + 1) = f5i;
508     *p2r = f6r;
509     *(p2r + 1) = f6i;
510     *p3r = f7r;
511     *(p3r + 1) = f7i;
512 
513     f0r = *(p0r + pos);
514     f1i = *(p1r + posi);
515     f0i = *(p0r + posi);
516     f1r = *(p1r + pos);
517     f2r = *(p2r + pos);
518     f3i = *(p3r + posi);
519     f2i = *(p2r + posi);
520     f3r = *(p3r + pos);
521 
522     f4r = f0r + f1i;
523     f4i = f0i - f1r;
524     f5r = f0r - f1i;
525     f5i = f0i + f1r;
526 
527     f6r = f2r + f3i;
528     f6i = f2i - f3r;
529     f7r = f2r - f3i;
530     f7i = f2i + f3r;
531 
532     *(p0r + pos) = f4r;
533     *(p0r + posi) = f4i;
534     *(p1r + pos) = f5r;
535     *(p1r + posi) = f5i;
536     *(p2r + pos) = f6r;
537     *(p2r + posi) = f6i;
538     *(p3r + pos) = f7r;
539     *(p3r + posi) = f7i;
540 
541     p0r += pnext;
542     p1r += pnext;
543     p2r += pnext;
544     p3r += pnext;
545   }
546 }
547 
bfR4(MYFLT * ioptr,int32_t M,int32_t NDiffU)548 static void bfR4(MYFLT *ioptr, int32_t M, int32_t NDiffU)
549 {
550   /*** 1 radix 4 stage ***/
551   uint32_t pos;
552   uint32_t posi;
553   uint32_t pinc;
554   uint32_t pnext;
555   uint32_t pnexti;
556   uint32_t NSameU;
557   uint32_t SameUCnt;
558 
559   MYFLT *pstrt;
560   MYFLT *p0r, *p1r, *p2r, *p3r;
561 
562   MYFLT w1r = FL(1.0) / FL(ROOT2);    /* cos(pi/4)   */
563   MYFLT f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i;
564   MYFLT f4r, f4i, f5r, f5i, f6r, f6i, f7r, f7i;
565   MYFLT t1r, t1i;
566   const MYFLT Two = FL(2.0);
567 
568   pinc = NDiffU * 2;            /* 2 floats per complex */
569   pnext = pinc * 4;
570   pnexti = pnext + 1;
571   pos = 2;
572   posi = pos + 1;
573   NSameU = POW2(M) / 4 / NDiffU;        /* 4 pts per butterfly */
574   pstrt = ioptr;
575   p0r = pstrt;
576   p1r = pstrt + pinc;
577   p2r = p1r + pinc;
578   p3r = p2r + pinc;
579 
580   /* Butterflys           */
581   /*
582     f0   -       -       f0      -       -       f4
583     f1   -  1 -  f5      -       -       f5
584     f2   -       -       f6      -  1 -  f6
585     f3   -  1 -  f3      - -i -  f7
586   */
587   /* Butterflys           */
588   /*
589     f0   -       -       f4      -       -       f4
590     f1   - -i -  t1      -       -       f5
591     f2   -       -       f2      - w1 -  f6
592     f3   - -i -  f7      - iw1-  f7
593   */
594 
595   f0r = *p0r;
596   f1r = *p1r;
597   f2r = *p2r;
598   f3r = *p3r;
599   f0i = *(p0r + 1);
600   f1i = *(p1r + 1);
601   f2i = *(p2r + 1);
602   f3i = *(p3r + 1);
603 
604   f5r = f0r - f1r;
605   f5i = f0i - f1i;
606   f0r = f0r + f1r;
607   f0i = f0i + f1i;
608 
609   f6r = f2r + f3r;
610   f6i = f2i + f3i;
611   f3r = f2r - f3r;
612   f3i = f2i - f3i;
613 
614   for (SameUCnt = NSameU - 1; SameUCnt > 0; SameUCnt--) {
615 
616     f7r = f5r - f3i;
617     f7i = f5i + f3r;
618     f5r = f5r + f3i;
619     f5i = f5i - f3r;
620 
621     f4r = f0r + f6r;
622     f4i = f0i + f6i;
623     f6r = f0r - f6r;
624     f6i = f0i - f6i;
625 
626     f2r = *(p2r + pos);
627     f2i = *(p2r + posi);
628     f1r = *(p1r + pos);
629     f1i = *(p1r + posi);
630     f3i = *(p3r + posi);
631     f0r = *(p0r + pos);
632     f3r = *(p3r + pos);
633     f0i = *(p0r + posi);
634 
635     *p3r = f7r;
636     *p0r = f4r;
637     *(p3r + 1) = f7i;
638     *(p0r + 1) = f4i;
639     *p1r = f5r;
640     *p2r = f6r;
641     *(p1r + 1) = f5i;
642     *(p2r + 1) = f6i;
643 
644     f7r = f2r - f3i;
645     f7i = f2i + f3r;
646     f2r = f2r + f3i;
647     f2i = f2i - f3r;
648 
649     f4r = f0r + f1i;
650     f4i = f0i - f1r;
651     t1r = f0r - f1i;
652     t1i = f0i + f1r;
653 
654     f5r = t1r - f7r * w1r + f7i * w1r;
655     f5i = t1i - f7r * w1r - f7i * w1r;
656     f7r = t1r * Two - f5r;
657     f7i = t1i * Two - f5i;
658 
659     f6r = f4r - f2r * w1r - f2i * w1r;
660     f6i = f4i + f2r * w1r - f2i * w1r;
661     f4r = f4r * Two - f6r;
662     f4i = f4i * Two - f6i;
663 
664     f3r = *(p3r + pnext);
665     f0r = *(p0r + pnext);
666     f3i = *(p3r + pnexti);
667     f0i = *(p0r + pnexti);
668     f2r = *(p2r + pnext);
669     f2i = *(p2r + pnexti);
670     f1r = *(p1r + pnext);
671     f1i = *(p1r + pnexti);
672 
673     *(p2r + pos) = f6r;
674     *(p1r + pos) = f5r;
675     *(p2r + posi) = f6i;
676     *(p1r + posi) = f5i;
677     *(p3r + pos) = f7r;
678     *(p0r + pos) = f4r;
679     *(p3r + posi) = f7i;
680     *(p0r + posi) = f4i;
681 
682     f6r = f2r + f3r;
683     f6i = f2i + f3i;
684     f3r = f2r - f3r;
685     f3i = f2i - f3i;
686 
687     f5r = f0r - f1r;
688     f5i = f0i - f1i;
689     f0r = f0r + f1r;
690     f0i = f0i + f1i;
691 
692     p3r += pnext;
693     p0r += pnext;
694     p1r += pnext;
695     p2r += pnext;
696   }
697   f7r = f5r - f3i;
698   f7i = f5i + f3r;
699   f5r = f5r + f3i;
700   f5i = f5i - f3r;
701 
702   f4r = f0r + f6r;
703   f4i = f0i + f6i;
704   f6r = f0r - f6r;
705   f6i = f0i - f6i;
706 
707   f2r = *(p2r + pos);
708   f2i = *(p2r + posi);
709   f1r = *(p1r + pos);
710   f1i = *(p1r + posi);
711   f3i = *(p3r + posi);
712   f0r = *(p0r + pos);
713   f3r = *(p3r + pos);
714   f0i = *(p0r + posi);
715 
716   *p3r = f7r;
717   *p0r = f4r;
718   *(p3r + 1) = f7i;
719   *(p0r + 1) = f4i;
720   *p1r = f5r;
721   *p2r = f6r;
722   *(p1r + 1) = f5i;
723   *(p2r + 1) = f6i;
724 
725   f7r = f2r - f3i;
726   f7i = f2i + f3r;
727   f2r = f2r + f3i;
728   f2i = f2i - f3r;
729 
730   f4r = f0r + f1i;
731   f4i = f0i - f1r;
732   t1r = f0r - f1i;
733   t1i = f0i + f1r;
734 
735   f5r = t1r - f7r * w1r + f7i * w1r;
736   f5i = t1i - f7r * w1r - f7i * w1r;
737   f7r = t1r * Two - f5r;
738   f7i = t1i * Two - f5i;
739 
740   f6r = f4r - f2r * w1r - f2i * w1r;
741   f6i = f4i + f2r * w1r - f2i * w1r;
742   f4r = f4r * Two - f6r;
743   f4i = f4i * Two - f6i;
744 
745   *(p2r + pos) = f6r;
746   *(p1r + pos) = f5r;
747   *(p2r + posi) = f6i;
748   *(p1r + posi) = f5i;
749   *(p3r + pos) = f7r;
750   *(p0r + pos) = f4r;
751   *(p3r + posi) = f7i;
752   *(p0r + posi) = f4i;
753 }
754 
bfstages(MYFLT * ioptr,int32_t M,MYFLT * Utbl,int32_t Ustride,int32_t NDiffU,int32_t StageCnt)755 static void bfstages(MYFLT *ioptr, int32_t M, MYFLT *Utbl, int32_t Ustride,
756                      int32_t NDiffU, int32_t StageCnt)
757 {
758   /***   RADIX 8 Stages   ***/
759   uint32_t pos;
760   uint32_t posi;
761   uint32_t pinc;
762   uint32_t pnext;
763   uint32_t NSameU;
764   int32_t          Uinc;
765   int32_t          Uinc2;
766   int32_t          Uinc4;
767   uint32_t DiffUCnt;
768   uint32_t SameUCnt;
769   uint32_t U2toU3;
770 
771   MYFLT *pstrt;
772   MYFLT *p0r, *p1r, *p2r, *p3r;
773   MYFLT *u0r, *u0i, *u1r, *u1i, *u2r, *u2i;
774 
775   MYFLT w0r, w0i, w1r, w1i, w2r, w2i, w3r, w3i;
776   MYFLT f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i;
777   MYFLT f4r, f4i, f5r, f5i, f6r, f6i, f7r, f7i;
778   MYFLT t0r, t0i, t1r, t1i;
779   const MYFLT Two = FL(2.0);
780 
781   pinc = NDiffU * 2;            /* 2 floats per complex */
782   pnext = pinc * 8;
783   pos = pinc * 4;
784   posi = pos + 1;
785   NSameU = POW2(M) / 8 / NDiffU;        /* 8 pts per butterfly */
786   Uinc = (int32_t) NSameU * Ustride;
787   Uinc2 = Uinc * 2;
788   Uinc4 = Uinc * 4;
789   U2toU3 = (POW2(M) / 8) * Ustride;
790   for (; StageCnt > 0; StageCnt--) {
791 
792     u0r = &Utbl[0];
793     u0i = &Utbl[POW2(M - 2) * Ustride];
794     u1r = u0r;
795     u1i = u0i;
796     u2r = u0r;
797     u2i = u0i;
798 
799     w0r = *u0r;
800     w0i = *u0i;
801     w1r = *u1r;
802     w1i = *u1i;
803     w2r = *u2r;
804     w2i = *u2i;
805     w3r = *(u2r + U2toU3);
806     w3i = *(u2i - U2toU3);
807 
808     pstrt = ioptr;
809 
810     p0r = pstrt;
811     p1r = pstrt + pinc;
812     p2r = p1r + pinc;
813     p3r = p2r + pinc;
814 
815     /* Butterflys           */
816     /*
817       f0   -       -       t0      -       -       f0      -       -       f0
818       f1   - w0-   f1      -       -       f1      -       -       f1
819       f2   -       -       f2      - w1-   f2      -       -       f4
820       f3   - w0-   t1      - iw1-  f3      -       -       f5
821 
822       f4   -       -       t0      -       -       f4      - w2-   t0
823       f5   - w0-   f5      -       -       f5      - w3-   t1
824       f6   -       -       f6      - w1-   f6      - iw2-  f6
825       f7   - w0-   t1      - iw1-  f7      - iw3-  f7
826     */
827 
828     for (DiffUCnt = NDiffU; DiffUCnt > 0; DiffUCnt--) {
829       f0r = *p0r;
830       f0i = *(p0r + 1);
831       f1r = *p1r;
832       f1i = *(p1r + 1);
833       for (SameUCnt = NSameU - 1; SameUCnt > 0; SameUCnt--) {
834         f2r = *p2r;
835         f2i = *(p2r + 1);
836         f3r = *p3r;
837         f3i = *(p3r + 1);
838 
839         t0r = f0r + f1r * w0r + f1i * w0i;
840         t0i = f0i - f1r * w0i + f1i * w0r;
841         f1r = f0r * Two - t0r;
842         f1i = f0i * Two - t0i;
843 
844         f4r = *(p0r + pos);
845         f4i = *(p0r + posi);
846         f5r = *(p1r + pos);
847         f5i = *(p1r + posi);
848 
849         f6r = *(p2r + pos);
850         f6i = *(p2r + posi);
851         f7r = *(p3r + pos);
852         f7i = *(p3r + posi);
853 
854         t1r = f2r - f3r * w0r - f3i * w0i;
855         t1i = f2i + f3r * w0i - f3i * w0r;
856         f2r = f2r * Two - t1r;
857         f2i = f2i * Two - t1i;
858 
859         f0r = t0r + f2r * w1r + f2i * w1i;
860         f0i = t0i - f2r * w1i + f2i * w1r;
861         f2r = t0r * Two - f0r;
862         f2i = t0i * Two - f0i;
863 
864         f3r = f1r + t1r * w1i - t1i * w1r;
865         f3i = f1i + t1r * w1r + t1i * w1i;
866         f1r = f1r * Two - f3r;
867         f1i = f1i * Two - f3i;
868 
869         t0r = f4r + f5r * w0r + f5i * w0i;
870         t0i = f4i - f5r * w0i + f5i * w0r;
871         f5r = f4r * Two - t0r;
872         f5i = f4i * Two - t0i;
873 
874         t1r = f6r - f7r * w0r - f7i * w0i;
875         t1i = f6i + f7r * w0i - f7i * w0r;
876         f6r = f6r * Two - t1r;
877         f6i = f6i * Two - t1i;
878 
879         f4r = t0r + f6r * w1r + f6i * w1i;
880         f4i = t0i - f6r * w1i + f6i * w1r;
881         f6r = t0r * Two - f4r;
882         f6i = t0i * Two - f4i;
883 
884         f7r = f5r + t1r * w1i - t1i * w1r;
885         f7i = f5i + t1r * w1r + t1i * w1i;
886         f5r = f5r * Two - f7r;
887         f5i = f5i * Two - f7i;
888 
889         t0r = f0r - f4r * w2r - f4i * w2i;
890         t0i = f0i + f4r * w2i - f4i * w2r;
891         f0r = f0r * Two - t0r;
892         f0i = f0i * Two - t0i;
893 
894         t1r = f1r - f5r * w3r - f5i * w3i;
895         t1i = f1i + f5r * w3i - f5i * w3r;
896         f1r = f1r * Two - t1r;
897         f1i = f1i * Two - t1i;
898 
899         *(p0r + pos) = t0r;
900         *(p1r + pos) = t1r;
901         *(p0r + posi) = t0i;
902         *(p1r + posi) = t1i;
903         *p0r = f0r;
904         *p1r = f1r;
905         *(p0r + 1) = f0i;
906         *(p1r + 1) = f1i;
907 
908         p0r += pnext;
909         f0r = *p0r;
910         f0i = *(p0r + 1);
911 
912         p1r += pnext;
913 
914         f1r = *p1r;
915         f1i = *(p1r + 1);
916 
917         f4r = f2r - f6r * w2i + f6i * w2r;
918         f4i = f2i - f6r * w2r - f6i * w2i;
919         f6r = f2r * Two - f4r;
920         f6i = f2i * Two - f4i;
921 
922         f5r = f3r - f7r * w3i + f7i * w3r;
923         f5i = f3i - f7r * w3r - f7i * w3i;
924         f7r = f3r * Two - f5r;
925         f7i = f3i * Two - f5i;
926 
927         *p2r = f4r;
928         *p3r = f5r;
929         *(p2r + 1) = f4i;
930         *(p3r + 1) = f5i;
931         *(p2r + pos) = f6r;
932         *(p3r + pos) = f7r;
933         *(p2r + posi) = f6i;
934         *(p3r + posi) = f7i;
935 
936         p2r += pnext;
937         p3r += pnext;
938       }
939 
940       f2r = *p2r;
941       f2i = *(p2r + 1);
942       f3r = *p3r;
943       f3i = *(p3r + 1);
944 
945       t0r = f0r + f1r * w0r + f1i * w0i;
946       t0i = f0i - f1r * w0i + f1i * w0r;
947       f1r = f0r * Two - t0r;
948       f1i = f0i * Two - t0i;
949 
950       f4r = *(p0r + pos);
951       f4i = *(p0r + posi);
952       f5r = *(p1r + pos);
953       f5i = *(p1r + posi);
954 
955       f6r = *(p2r + pos);
956       f6i = *(p2r + posi);
957       f7r = *(p3r + pos);
958       f7i = *(p3r + posi);
959 
960       t1r = f2r - f3r * w0r - f3i * w0i;
961       t1i = f2i + f3r * w0i - f3i * w0r;
962       f2r = f2r * Two - t1r;
963       f2i = f2i * Two - t1i;
964 
965       f0r = t0r + f2r * w1r + f2i * w1i;
966       f0i = t0i - f2r * w1i + f2i * w1r;
967       f2r = t0r * Two - f0r;
968       f2i = t0i * Two - f0i;
969 
970       f3r = f1r + t1r * w1i - t1i * w1r;
971       f3i = f1i + t1r * w1r + t1i * w1i;
972       f1r = f1r * Two - f3r;
973       f1i = f1i * Two - f3i;
974 
975       if ((int32_t) DiffUCnt == NDiffU / 2)
976         Uinc4 = -Uinc4;
977 
978       u0r += Uinc4;
979       u0i -= Uinc4;
980       u1r += Uinc2;
981       u1i -= Uinc2;
982       u2r += Uinc;
983       u2i -= Uinc;
984 
985       pstrt += 2;
986 
987       t0r = f4r + f5r * w0r + f5i * w0i;
988       t0i = f4i - f5r * w0i + f5i * w0r;
989       f5r = f4r * Two - t0r;
990       f5i = f4i * Two - t0i;
991 
992       t1r = f6r - f7r * w0r - f7i * w0i;
993       t1i = f6i + f7r * w0i - f7i * w0r;
994       f6r = f6r * Two - t1r;
995       f6i = f6i * Two - t1i;
996 
997       f4r = t0r + f6r * w1r + f6i * w1i;
998       f4i = t0i - f6r * w1i + f6i * w1r;
999       f6r = t0r * Two - f4r;
1000       f6i = t0i * Two - f4i;
1001 
1002       f7r = f5r + t1r * w1i - t1i * w1r;
1003       f7i = f5i + t1r * w1r + t1i * w1i;
1004       f5r = f5r * Two - f7r;
1005       f5i = f5i * Two - f7i;
1006 
1007       w0r = *u0r;
1008       w0i = *u0i;
1009       w1r = *u1r;
1010       w1i = *u1i;
1011 
1012       if ((int32_t) DiffUCnt <= NDiffU / 2)
1013         w0r = -w0r;
1014 
1015       t0r = f0r - f4r * w2r - f4i * w2i;
1016       t0i = f0i + f4r * w2i - f4i * w2r;
1017       f0r = f0r * Two - t0r;
1018       f0i = f0i * Two - t0i;
1019 
1020       f4r = f2r - f6r * w2i + f6i * w2r;
1021       f4i = f2i - f6r * w2r - f6i * w2i;
1022       f6r = f2r * Two - f4r;
1023       f6i = f2i * Two - f4i;
1024 
1025       *(p0r + pos) = t0r;
1026       *p2r = f4r;
1027       *(p0r + posi) = t0i;
1028       *(p2r + 1) = f4i;
1029       w2r = *u2r;
1030       w2i = *u2i;
1031       *p0r = f0r;
1032       *(p2r + pos) = f6r;
1033       *(p0r + 1) = f0i;
1034       *(p2r + posi) = f6i;
1035 
1036       p0r = pstrt;
1037       p2r = pstrt + pinc + pinc;
1038 
1039       t1r = f1r - f5r * w3r - f5i * w3i;
1040       t1i = f1i + f5r * w3i - f5i * w3r;
1041       f1r = f1r * Two - t1r;
1042       f1i = f1i * Two - t1i;
1043 
1044       f5r = f3r - f7r * w3i + f7i * w3r;
1045       f5i = f3i - f7r * w3r - f7i * w3i;
1046       f7r = f3r * Two - f5r;
1047       f7i = f3i * Two - f5i;
1048 
1049       *(p1r + pos) = t1r;
1050       *p3r = f5r;
1051       *(p1r + posi) = t1i;
1052       *(p3r + 1) = f5i;
1053       w3r = *(u2r + U2toU3);
1054       w3i = *(u2i - U2toU3);
1055       *p1r = f1r;
1056       *(p3r + pos) = f7r;
1057       *(p1r + 1) = f1i;
1058       *(p3r + posi) = f7i;
1059 
1060       p1r = pstrt + pinc;
1061       p3r = p2r + pinc;
1062     }
1063     NSameU /= 8;
1064     Uinc /= 8;
1065     Uinc2 /= 8;
1066     Uinc4 = Uinc * 4;
1067     NDiffU *= 8;
1068     pinc *= 8;
1069     pnext *= 8;
1070     pos *= 8;
1071     posi = pos + 1;
1072   }
1073 }
1074 
fftrecurs(MYFLT * ioptr,int32_t M,MYFLT * Utbl,int32_t Ustride,int32_t NDiffU,int32_t StageCnt)1075 static void fftrecurs(MYFLT *ioptr, int32_t M, MYFLT *Utbl, int32_t Ustride,
1076                       int32_t NDiffU, int32_t StageCnt)
1077 {
1078   /* recursive bfstages calls to maximize on chip cache efficiency */
1079   int32_t i1;
1080 
1081   if (M <= (int32_t) MCACHE)              /* fits on chip ? */
1082     bfstages(ioptr, M, Utbl, Ustride, NDiffU, StageCnt); /* RADIX 8 Stages */
1083   else {
1084     for (i1 = 0; i1 < 8; i1++) {
1085       fftrecurs(&ioptr[i1 * POW2(M - 3) * 2], M - 3, Utbl, 8 * Ustride,
1086                 NDiffU, StageCnt - 1);  /*  RADIX 8 Stages      */
1087     }
1088     bfstages(ioptr, M, Utbl, Ustride, POW2(M - 3), 1);  /*  RADIX 8 Stage */
1089   }
1090 }
1091 
ffts1(MYFLT * ioptr,int32_t M,MYFLT * Utbl,int16 * BRLow)1092 static void ffts1(MYFLT *ioptr, int32_t M, MYFLT *Utbl, int16 *BRLow)
1093 {
1094   /* Compute in-place complex fft on the rows of the input array  */
1095   /* INPUTS                                                       */
1096   /*   *ioptr = input data array                                  */
1097   /*   M = log2 of fft size (ex M=10 for 1024 point fft)          */
1098   /*   *Utbl = cosine table                                       */
1099   /*   *BRLow = bit reversed counter table                        */
1100   /* OUTPUTS                                                      */
1101   /*   *ioptr = output data array                                 */
1102 
1103   int32_t StageCnt;
1104   int32_t NDiffU;
1105 
1106   switch (M) {
1107   case 0:
1108     break;
1109   case 1:
1110     fft2pt(ioptr);            /* a 2 pt fft */
1111     break;
1112   case 2:
1113     fft4pt(ioptr);            /* a 4 pt fft */
1114     break;
1115   case 3:
1116     fft8pt(ioptr);            /* an 8 pt fft */
1117     break;
1118   default:
1119     bitrevR2(ioptr, M, BRLow);  /* bit reverse and first radix 2 stage */
1120     StageCnt = (M - 1) / 3;     /* number of radix 8 stages           */
1121     NDiffU = 2;                 /* one radix 2 stage already complete */
1122     if ((M - 1 - (StageCnt * 3)) == 1) {
1123       bfR2(ioptr, M, NDiffU); /* 1 radix 2 stage */
1124       NDiffU *= 2;
1125     }
1126     if ((M - 1 - (StageCnt * 3)) == 2) {
1127       bfR4(ioptr, M, NDiffU); /* 1 radix 4 stage */
1128       NDiffU *= 4;
1129     }
1130     if (M <= (int32_t) MCACHE)
1131       bfstages(ioptr, M, Utbl, 1, NDiffU, StageCnt);  /* RADIX 8 Stages */
1132     else
1133       fftrecurs(ioptr, M, Utbl, 1, NDiffU, StageCnt); /* RADIX 8 Stages */
1134   }
1135 }
1136 
1137 /******************
1138  * parts of iffts1 *
1139  ******************/
1140 
scbitrevR2(MYFLT * ioptr,int32_t M,int16 * BRLow,MYFLT scale)1141 static void scbitrevR2(MYFLT *ioptr, int32_t M, int16 *BRLow, MYFLT scale)
1142 {
1143   /*** scaled bit reverse and first radix 2 stage forward or inverse fft ***/
1144   MYFLT f0r;
1145   MYFLT f0i;
1146   MYFLT f1r;
1147   MYFLT f1i;
1148   MYFLT f2r;
1149   MYFLT f2i;
1150   MYFLT f3r;
1151   MYFLT f3i;
1152   MYFLT f4r;
1153   MYFLT f4i;
1154   MYFLT f5r;
1155   MYFLT f5i;
1156   MYFLT f6r;
1157   MYFLT f6i;
1158   MYFLT f7r;
1159   MYFLT f7i;
1160   MYFLT t0r;
1161   MYFLT t0i;
1162   MYFLT t1r;
1163   MYFLT t1i;
1164   MYFLT *p0r;
1165   MYFLT *p1r;
1166   MYFLT *IOP;
1167   MYFLT *iolimit;
1168   int32_t Colstart;
1169   int32_t iCol;
1170   uint32_t posA;
1171   uint32_t posAi;
1172   uint32_t posB;
1173   uint32_t posBi;
1174 
1175   const uint32_t Nrems2 = POW2((M + 3) / 2);
1176   const uint32_t Nroot_1_ColInc = POW2(M) - Nrems2;
1177   const uint32_t Nroot_1 = POW2(M / 2 - 1) - 1;
1178   const uint32_t ColstartShift = (M + 1) / 2 + 1;
1179 
1180   posA = POW2(M);               /* 1/2 of POW2(M) complexes */
1181   posAi = posA + 1;
1182   posB = posA + 2;
1183   posBi = posB + 1;
1184 
1185   iolimit = ioptr + Nrems2;
1186   for (; ioptr < iolimit; ioptr += POW2(M / 2 + 1)) {
1187     for (Colstart = Nroot_1; Colstart >= 0; Colstart--) {
1188       iCol = Nroot_1;
1189       p0r = ioptr + Nroot_1_ColInc + BRLow[Colstart] * 4;
1190       IOP = ioptr + (Colstart << ColstartShift);
1191       p1r = IOP + BRLow[iCol] * 4;
1192       f0r = *(p0r);
1193       f0i = *(p0r + 1);
1194       f1r = *(p0r + posA);
1195       f1i = *(p0r + posAi);
1196       for (; iCol > Colstart;) {
1197         f2r = *(p0r + 2);
1198         f2i = *(p0r + (2 + 1));
1199         f3r = *(p0r + posB);
1200         f3i = *(p0r + posBi);
1201         f4r = *(p1r);
1202         f4i = *(p1r + 1);
1203         f5r = *(p1r + posA);
1204         f5i = *(p1r + posAi);
1205         f6r = *(p1r + 2);
1206         f6i = *(p1r + (2 + 1));
1207         f7r = *(p1r + posB);
1208         f7i = *(p1r + posBi);
1209 
1210         t0r = f0r + f1r;
1211         t0i = f0i + f1i;
1212         f1r = f0r - f1r;
1213         f1i = f0i - f1i;
1214         t1r = f2r + f3r;
1215         t1i = f2i + f3i;
1216         f3r = f2r - f3r;
1217         f3i = f2i - f3i;
1218         f0r = f4r + f5r;
1219         f0i = f4i + f5i;
1220         f5r = f4r - f5r;
1221         f5i = f4i - f5i;
1222         f2r = f6r + f7r;
1223         f2i = f6i + f7i;
1224         f7r = f6r - f7r;
1225         f7i = f6i - f7i;
1226 
1227         *(p1r) = scale * t0r;
1228         *(p1r + 1) = scale * t0i;
1229         *(p1r + 2) = scale * f1r;
1230         *(p1r + (2 + 1)) = scale * f1i;
1231         *(p1r + posA) = scale * t1r;
1232         *(p1r + posAi) = scale * t1i;
1233         *(p1r + posB) = scale * f3r;
1234         *(p1r + posBi) = scale * f3i;
1235         *(p0r) = scale * f0r;
1236         *(p0r + 1) = scale * f0i;
1237         *(p0r + 2) = scale * f5r;
1238         *(p0r + (2 + 1)) = scale * f5i;
1239         *(p0r + posA) = scale * f2r;
1240         *(p0r + posAi) = scale * f2i;
1241         *(p0r + posB) = scale * f7r;
1242         *(p0r + posBi) = scale * f7i;
1243 
1244         p0r -= Nrems2;
1245         f0r = *(p0r);
1246         f0i = *(p0r + 1);
1247         f1r = *(p0r + posA);
1248         f1i = *(p0r + posAi);
1249         iCol -= 1;
1250         p1r = IOP + BRLow[iCol] * 4;
1251       }
1252       f2r = *(p0r + 2);
1253       f2i = *(p0r + (2 + 1));
1254       f3r = *(p0r + posB);
1255       f3i = *(p0r + posBi);
1256 
1257       t0r = f0r + f1r;
1258       t0i = f0i + f1i;
1259       f1r = f0r - f1r;
1260       f1i = f0i - f1i;
1261       t1r = f2r + f3r;
1262       t1i = f2i + f3i;
1263       f3r = f2r - f3r;
1264       f3i = f2i - f3i;
1265 
1266       *(p0r) = scale * t0r;
1267       *(p0r + 1) = scale * t0i;
1268       *(p0r + 2) = scale * f1r;
1269       *(p0r + (2 + 1)) = scale * f1i;
1270       *(p0r + posA) = scale * t1r;
1271       *(p0r + posAi) = scale * t1i;
1272       *(p0r + posB) = scale * f3r;
1273       *(p0r + posBi) = scale * f3i;
1274     }
1275   }
1276 }
1277 
ifft2pt(MYFLT * ioptr,MYFLT scale)1278 static void ifft2pt(MYFLT *ioptr, MYFLT scale)
1279 {
1280   /***   RADIX 2 ifft     ***/
1281   MYFLT f0r, f0i, f1r, f1i;
1282   MYFLT t0r, t0i;
1283 
1284   /* bit reversed load */
1285   f0r = ioptr[0];
1286   f0i = ioptr[1];
1287   f1r = ioptr[2];
1288   f1i = ioptr[3];
1289 
1290   /* Butterflys           */
1291   /*
1292     f0   -       -       t0
1293     f1   -  1 -  f1
1294   */
1295 
1296   t0r = f0r + f1r;
1297   t0i = f0i + f1i;
1298   f1r = f0r - f1r;
1299   f1i = f0i - f1i;
1300 
1301   /* store result */
1302   ioptr[0] = scale * t0r;
1303   ioptr[1] = scale * t0i;
1304   ioptr[2] = scale * f1r;
1305   ioptr[3] = scale * f1i;
1306 }
1307 
ifft4pt(MYFLT * ioptr,MYFLT scale)1308 static void ifft4pt(MYFLT *ioptr, MYFLT scale)
1309 {
1310   /***   RADIX 4 ifft     ***/
1311   MYFLT f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i;
1312   MYFLT t0r, t0i, t1r, t1i;
1313 
1314   /* bit reversed load */
1315   f0r = ioptr[0];
1316   f0i = ioptr[1];
1317   f1r = ioptr[4];
1318   f1i = ioptr[5];
1319   f2r = ioptr[2];
1320   f2i = ioptr[3];
1321   f3r = ioptr[6];
1322   f3i = ioptr[7];
1323 
1324   /* Butterflys           */
1325   /*
1326     f0   -       -       t0      -       -       f0
1327     f1   -  1 -  f1      -       -       f1
1328     f2   -       -       f2      -  1 -  f2
1329     f3   -  1 -  t1      -  i -  f3
1330   */
1331 
1332   t0r = f0r + f1r;
1333   t0i = f0i + f1i;
1334   f1r = f0r - f1r;
1335   f1i = f0i - f1i;
1336 
1337   t1r = f2r - f3r;
1338   t1i = f2i - f3i;
1339   f2r = f2r + f3r;
1340   f2i = f2i + f3i;
1341 
1342   f0r = t0r + f2r;
1343   f0i = t0i + f2i;
1344   f2r = t0r - f2r;
1345   f2i = t0i - f2i;
1346 
1347   f3r = f1r + t1i;
1348   f3i = f1i - t1r;
1349   f1r = f1r - t1i;
1350   f1i = f1i + t1r;
1351 
1352   /* store result */
1353   ioptr[0] = scale * f0r;
1354   ioptr[1] = scale * f0i;
1355   ioptr[2] = scale * f1r;
1356   ioptr[3] = scale * f1i;
1357   ioptr[4] = scale * f2r;
1358   ioptr[5] = scale * f2i;
1359   ioptr[6] = scale * f3r;
1360   ioptr[7] = scale * f3i;
1361 }
1362 
ifft8pt(MYFLT * ioptr,MYFLT scale)1363 static void ifft8pt(MYFLT *ioptr, MYFLT scale)
1364 {
1365   /***   RADIX 8 ifft     ***/
1366   MYFLT w0r = FL(1.0) / FL(ROOT2);    /* cos(pi/4)   */
1367   MYFLT f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i;
1368   MYFLT f4r, f4i, f5r, f5i, f6r, f6i, f7r, f7i;
1369   MYFLT t0r, t0i, t1r, t1i;
1370   const MYFLT Two = FL(2.0);
1371 
1372   /* bit reversed load */
1373   f0r = ioptr[0];
1374   f0i = ioptr[1];
1375   f1r = ioptr[8];
1376   f1i = ioptr[9];
1377   f2r = ioptr[4];
1378   f2i = ioptr[5];
1379   f3r = ioptr[12];
1380   f3i = ioptr[13];
1381   f4r = ioptr[2];
1382   f4i = ioptr[3];
1383   f5r = ioptr[10];
1384   f5i = ioptr[11];
1385   f6r = ioptr[6];
1386   f6i = ioptr[7];
1387   f7r = ioptr[14];
1388   f7i = ioptr[15];
1389 
1390   /* Butterflys           */
1391   /*
1392     f0   -       -       t0      -       -       f0      -       -       f0
1393     f1   -  1 -  f1      -       -       f1      -       -       f1
1394     f2   -       -       f2      -  1 -  f2      -       -       f2
1395     f3   -  1 -  t1      -  i -  f3      -       -       f3
1396     f4   -       -       t0      -       -       f4      -  1 -  t0
1397     f5   -  1 -  f5      -       -       f5      - w3 -  f4
1398     f6   -       -       f6      -  1 -  f6      -  i -  t1
1399     f7   -  1 -  t1      -  i -  f7      - iw3-  f6
1400   */
1401 
1402   t0r = f0r + f1r;
1403   t0i = f0i + f1i;
1404   f1r = f0r - f1r;
1405   f1i = f0i - f1i;
1406 
1407   t1r = f2r - f3r;
1408   t1i = f2i - f3i;
1409   f2r = f2r + f3r;
1410   f2i = f2i + f3i;
1411 
1412   f0r = t0r + f2r;
1413   f0i = t0i + f2i;
1414   f2r = t0r - f2r;
1415   f2i = t0i - f2i;
1416 
1417   f3r = f1r + t1i;
1418   f3i = f1i - t1r;
1419   f1r = f1r - t1i;
1420   f1i = f1i + t1r;
1421 
1422   t0r = f4r + f5r;
1423   t0i = f4i + f5i;
1424   f5r = f4r - f5r;
1425   f5i = f4i - f5i;
1426 
1427   t1r = f6r - f7r;
1428   t1i = f6i - f7i;
1429   f6r = f6r + f7r;
1430   f6i = f6i + f7i;
1431 
1432   f4r = t0r + f6r;
1433   f4i = t0i + f6i;
1434   f6r = t0r - f6r;
1435   f6i = t0i - f6i;
1436 
1437   f7r = f5r + t1i;
1438   f7i = f5i - t1r;
1439   f5r = f5r - t1i;
1440   f5i = f5i + t1r;
1441 
1442   t0r = f0r - f4r;
1443   t0i = f0i - f4i;
1444   f0r = f0r + f4r;
1445   f0i = f0i + f4i;
1446 
1447   t1r = f2r + f6i;
1448   t1i = f2i - f6r;
1449   f2r = f2r - f6i;
1450   f2i = f2i + f6r;
1451 
1452   f4r = f1r - f5r * w0r + f5i * w0r;
1453   f4i = f1i - f5r * w0r - f5i * w0r;
1454   f1r = f1r * Two - f4r;
1455   f1i = f1i * Two - f4i;
1456 
1457   f6r = f3r + f7r * w0r + f7i * w0r;
1458   f6i = f3i - f7r * w0r + f7i * w0r;
1459   f3r = f3r * Two - f6r;
1460   f3i = f3i * Two - f6i;
1461 
1462   /* store result */
1463   ioptr[0] = scale * f0r;
1464   ioptr[1] = scale * f0i;
1465   ioptr[2] = scale * f1r;
1466   ioptr[3] = scale * f1i;
1467   ioptr[4] = scale * f2r;
1468   ioptr[5] = scale * f2i;
1469   ioptr[6] = scale * f3r;
1470   ioptr[7] = scale * f3i;
1471   ioptr[8] = scale * t0r;
1472   ioptr[9] = scale * t0i;
1473   ioptr[10] = scale * f4r;
1474   ioptr[11] = scale * f4i;
1475   ioptr[12] = scale * t1r;
1476   ioptr[13] = scale * t1i;
1477   ioptr[14] = scale * f6r;
1478   ioptr[15] = scale * f6i;
1479 }
1480 
ibfR2(MYFLT * ioptr,int32_t M,int32_t NDiffU)1481 static void ibfR2(MYFLT *ioptr, int32_t M, int32_t NDiffU)
1482 {
1483   /*** 2nd radix 2 stage ***/
1484   uint32_t pos;
1485   uint32_t posi;
1486   uint32_t pinc;
1487   uint32_t pnext;
1488   uint32_t NSameU;
1489   uint32_t SameUCnt;
1490 
1491   MYFLT *pstrt;
1492   MYFLT *p0r, *p1r, *p2r, *p3r;
1493 
1494   MYFLT f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i;
1495   MYFLT f4r, f4i, f5r, f5i, f6r, f6i, f7r, f7i;
1496 
1497   pinc = NDiffU * 2;            /* 2 floats per complex */
1498   pnext = pinc * 4;
1499   pos = 2;
1500   posi = pos + 1;
1501   NSameU = POW2(M) / 4 / NDiffU;        /* 4 Us at a time */
1502   pstrt = ioptr;
1503   p0r = pstrt;
1504   p1r = pstrt + pinc;
1505   p2r = p1r + pinc;
1506   p3r = p2r + pinc;
1507 
1508   /* Butterflys           */
1509   /*
1510     f0   -       -       f4
1511     f1   -  1 -  f5
1512     f2   -       -       f6
1513     f3   -  1 -  f7
1514   */
1515   /* Butterflys           */
1516   /*
1517     f0   -       -       f4
1518     f1   -  1 -  f5
1519     f2   -       -       f6
1520     f3   -  1 -  f7
1521   */
1522 
1523   for (SameUCnt = NSameU; SameUCnt > 0; SameUCnt--) {
1524 
1525     f0r = *p0r;
1526     f1r = *p1r;
1527     f0i = *(p0r + 1);
1528     f1i = *(p1r + 1);
1529     f2r = *p2r;
1530     f3r = *p3r;
1531     f2i = *(p2r + 1);
1532     f3i = *(p3r + 1);
1533 
1534     f4r = f0r + f1r;
1535     f4i = f0i + f1i;
1536     f5r = f0r - f1r;
1537     f5i = f0i - f1i;
1538 
1539     f6r = f2r + f3r;
1540     f6i = f2i + f3i;
1541     f7r = f2r - f3r;
1542     f7i = f2i - f3i;
1543 
1544     *p0r = f4r;
1545     *(p0r + 1) = f4i;
1546     *p1r = f5r;
1547     *(p1r + 1) = f5i;
1548     *p2r = f6r;
1549     *(p2r + 1) = f6i;
1550     *p3r = f7r;
1551     *(p3r + 1) = f7i;
1552 
1553     f0r = *(p0r + pos);
1554     f1i = *(p1r + posi);
1555     f0i = *(p0r + posi);
1556     f1r = *(p1r + pos);
1557     f2r = *(p2r + pos);
1558     f3i = *(p3r + posi);
1559     f2i = *(p2r + posi);
1560     f3r = *(p3r + pos);
1561 
1562     f4r = f0r - f1i;
1563     f4i = f0i + f1r;
1564     f5r = f0r + f1i;
1565     f5i = f0i - f1r;
1566 
1567     f6r = f2r - f3i;
1568     f6i = f2i + f3r;
1569     f7r = f2r + f3i;
1570     f7i = f2i - f3r;
1571 
1572     *(p0r + pos) = f4r;
1573     *(p0r + posi) = f4i;
1574     *(p1r + pos) = f5r;
1575     *(p1r + posi) = f5i;
1576     *(p2r + pos) = f6r;
1577     *(p2r + posi) = f6i;
1578     *(p3r + pos) = f7r;
1579     *(p3r + posi) = f7i;
1580 
1581     p0r += pnext;
1582     p1r += pnext;
1583     p2r += pnext;
1584     p3r += pnext;
1585   }
1586 }
1587 
ibfR4(MYFLT * ioptr,int32_t M,int32_t NDiffU)1588 static void ibfR4(MYFLT *ioptr, int32_t M, int32_t NDiffU)
1589 {
1590   /*** 1 radix 4 stage ***/
1591   uint32_t pos;
1592   uint32_t posi;
1593   uint32_t pinc;
1594   uint32_t pnext;
1595   uint32_t pnexti;
1596   uint32_t NSameU;
1597   uint32_t SameUCnt;
1598 
1599   MYFLT *pstrt;
1600   MYFLT *p0r, *p1r, *p2r, *p3r;
1601 
1602   MYFLT w1r = FL(1.0) / FL(ROOT2);    /* cos(pi/4)   */
1603   MYFLT f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i;
1604   MYFLT f4r, f4i, f5r, f5i, f6r, f6i, f7r, f7i;
1605   MYFLT t1r, t1i;
1606   const MYFLT Two = FL(2.0);
1607 
1608   pinc = NDiffU * 2;            /* 2 floats per complex */
1609   pnext = pinc * 4;
1610   pnexti = pnext + 1;
1611   pos = 2;
1612   posi = pos + 1;
1613   NSameU = POW2(M) / 4 / NDiffU;        /* 4 pts per butterfly */
1614   pstrt = ioptr;
1615   p0r = pstrt;
1616   p1r = pstrt + pinc;
1617   p2r = p1r + pinc;
1618   p3r = p2r + pinc;
1619 
1620   /* Butterflys           */
1621   /*
1622     f0   -       -       f0      -       -       f4
1623     f1   -  1 -  f5      -       -       f5
1624     f2   -       -       f6      -  1 -  f6
1625     f3   -  1 -  f3      - -i -  f7
1626   */
1627   /* Butterflys           */
1628   /*
1629     f0   -       -       f4      -       -       f4
1630     f1   - -i -  t1      -       -       f5
1631     f2   -       -       f2      - w1 -  f6
1632     f3   - -i -  f7      - iw1-  f7
1633   */
1634 
1635   f0r = *p0r;
1636   f1r = *p1r;
1637   f2r = *p2r;
1638   f3r = *p3r;
1639   f0i = *(p0r + 1);
1640   f1i = *(p1r + 1);
1641   f2i = *(p2r + 1);
1642   f3i = *(p3r + 1);
1643 
1644   f5r = f0r - f1r;
1645   f5i = f0i - f1i;
1646   f0r = f0r + f1r;
1647   f0i = f0i + f1i;
1648 
1649   f6r = f2r + f3r;
1650   f6i = f2i + f3i;
1651   f3r = f2r - f3r;
1652   f3i = f2i - f3i;
1653 
1654   for (SameUCnt = NSameU - 1; SameUCnt > 0; SameUCnt--) {
1655 
1656     f7r = f5r + f3i;
1657     f7i = f5i - f3r;
1658     f5r = f5r - f3i;
1659     f5i = f5i + f3r;
1660 
1661     f4r = f0r + f6r;
1662     f4i = f0i + f6i;
1663     f6r = f0r - f6r;
1664     f6i = f0i - f6i;
1665 
1666     f2r = *(p2r + pos);
1667     f2i = *(p2r + posi);
1668     f1r = *(p1r + pos);
1669     f1i = *(p1r + posi);
1670     f3i = *(p3r + posi);
1671     f0r = *(p0r + pos);
1672     f3r = *(p3r + pos);
1673     f0i = *(p0r + posi);
1674 
1675     *p3r = f7r;
1676     *p0r = f4r;
1677     *(p3r + 1) = f7i;
1678     *(p0r + 1) = f4i;
1679     *p1r = f5r;
1680     *p2r = f6r;
1681     *(p1r + 1) = f5i;
1682     *(p2r + 1) = f6i;
1683 
1684     f7r = f2r + f3i;
1685     f7i = f2i - f3r;
1686     f2r = f2r - f3i;
1687     f2i = f2i + f3r;
1688 
1689     f4r = f0r - f1i;
1690     f4i = f0i + f1r;
1691     t1r = f0r + f1i;
1692     t1i = f0i - f1r;
1693 
1694     f5r = t1r - f7r * w1r - f7i * w1r;
1695     f5i = t1i + f7r * w1r - f7i * w1r;
1696     f7r = t1r * Two - f5r;
1697     f7i = t1i * Two - f5i;
1698 
1699     f6r = f4r - f2r * w1r + f2i * w1r;
1700     f6i = f4i - f2r * w1r - f2i * w1r;
1701     f4r = f4r * Two - f6r;
1702     f4i = f4i * Two - f6i;
1703 
1704     f3r = *(p3r + pnext);
1705     f0r = *(p0r + pnext);
1706     f3i = *(p3r + pnexti);
1707     f0i = *(p0r + pnexti);
1708     f2r = *(p2r + pnext);
1709     f2i = *(p2r + pnexti);
1710     f1r = *(p1r + pnext);
1711     f1i = *(p1r + pnexti);
1712 
1713     *(p2r + pos) = f6r;
1714     *(p1r + pos) = f5r;
1715     *(p2r + posi) = f6i;
1716     *(p1r + posi) = f5i;
1717     *(p3r + pos) = f7r;
1718     *(p0r + pos) = f4r;
1719     *(p3r + posi) = f7i;
1720     *(p0r + posi) = f4i;
1721 
1722     f6r = f2r + f3r;
1723     f6i = f2i + f3i;
1724     f3r = f2r - f3r;
1725     f3i = f2i - f3i;
1726 
1727     f5r = f0r - f1r;
1728     f5i = f0i - f1i;
1729     f0r = f0r + f1r;
1730     f0i = f0i + f1i;
1731 
1732     p3r += pnext;
1733     p0r += pnext;
1734     p1r += pnext;
1735     p2r += pnext;
1736   }
1737   f7r = f5r + f3i;
1738   f7i = f5i - f3r;
1739   f5r = f5r - f3i;
1740   f5i = f5i + f3r;
1741 
1742   f4r = f0r + f6r;
1743   f4i = f0i + f6i;
1744   f6r = f0r - f6r;
1745   f6i = f0i - f6i;
1746 
1747   f2r = *(p2r + pos);
1748   f2i = *(p2r + posi);
1749   f1r = *(p1r + pos);
1750   f1i = *(p1r + posi);
1751   f3i = *(p3r + posi);
1752   f0r = *(p0r + pos);
1753   f3r = *(p3r + pos);
1754   f0i = *(p0r + posi);
1755 
1756   *p3r = f7r;
1757   *p0r = f4r;
1758   *(p3r + 1) = f7i;
1759   *(p0r + 1) = f4i;
1760   *p1r = f5r;
1761   *p2r = f6r;
1762   *(p1r + 1) = f5i;
1763   *(p2r + 1) = f6i;
1764 
1765   f7r = f2r + f3i;
1766   f7i = f2i - f3r;
1767   f2r = f2r - f3i;
1768   f2i = f2i + f3r;
1769 
1770   f4r = f0r - f1i;
1771   f4i = f0i + f1r;
1772   t1r = f0r + f1i;
1773   t1i = f0i - f1r;
1774 
1775   f5r = t1r - f7r * w1r - f7i * w1r;
1776   f5i = t1i + f7r * w1r - f7i * w1r;
1777   f7r = t1r * Two - f5r;
1778   f7i = t1i * Two - f5i;
1779 
1780   f6r = f4r - f2r * w1r + f2i * w1r;
1781   f6i = f4i - f2r * w1r - f2i * w1r;
1782   f4r = f4r * Two - f6r;
1783   f4i = f4i * Two - f6i;
1784 
1785   *(p2r + pos) = f6r;
1786   *(p1r + pos) = f5r;
1787   *(p2r + posi) = f6i;
1788   *(p1r + posi) = f5i;
1789   *(p3r + pos) = f7r;
1790   *(p0r + pos) = f4r;
1791   *(p3r + posi) = f7i;
1792   *(p0r + posi) = f4i;
1793 }
1794 
ibfstages(MYFLT * ioptr,int32_t M,MYFLT * Utbl,int32_t Ustride,int32_t NDiffU,int32_t StageCnt)1795 static void ibfstages(MYFLT *ioptr, int32_t M, MYFLT *Utbl, int32_t Ustride,
1796                       int32_t NDiffU, int32_t StageCnt)
1797 {
1798   /***   RADIX 8 Stages   ***/
1799   uint32_t pos;
1800   uint32_t posi;
1801   uint32_t pinc;
1802   uint32_t pnext;
1803   uint32_t NSameU;
1804   int32_t          Uinc;
1805   int32_t          Uinc2;
1806   int32_t          Uinc4;
1807   uint32_t DiffUCnt;
1808   uint32_t SameUCnt;
1809   uint32_t U2toU3;
1810 
1811   MYFLT *pstrt;
1812   MYFLT *p0r, *p1r, *p2r, *p3r;
1813   MYFLT *u0r, *u0i, *u1r, *u1i, *u2r, *u2i;
1814 
1815   MYFLT w0r, w0i, w1r, w1i, w2r, w2i, w3r, w3i;
1816   MYFLT f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i;
1817   MYFLT f4r, f4i, f5r, f5i, f6r, f6i, f7r, f7i;
1818   MYFLT t0r, t0i, t1r, t1i;
1819   const MYFLT Two = FL(2.0);
1820 
1821   pinc = NDiffU * 2;            /* 2 floats per complex */
1822   pnext = pinc * 8;
1823   pos = pinc * 4;
1824   posi = pos + 1;
1825   NSameU = POW2(M) / 8 / NDiffU;        /* 8 pts per butterfly */
1826   Uinc = (int32_t) NSameU * Ustride;
1827   Uinc2 = Uinc * 2;
1828   Uinc4 = Uinc * 4;
1829   U2toU3 = (POW2(M) / 8) * Ustride;
1830   for (; StageCnt > 0; StageCnt--) {
1831 
1832     u0r = &Utbl[0];
1833     u0i = &Utbl[POW2(M - 2) * Ustride];
1834     u1r = u0r;
1835     u1i = u0i;
1836     u2r = u0r;
1837     u2i = u0i;
1838 
1839     w0r = *u0r;
1840     w0i = *u0i;
1841     w1r = *u1r;
1842     w1i = *u1i;
1843     w2r = *u2r;
1844     w2i = *u2i;
1845     w3r = *(u2r + U2toU3);
1846     w3i = *(u2i - U2toU3);
1847 
1848     pstrt = ioptr;
1849 
1850     p0r = pstrt;
1851     p1r = pstrt + pinc;
1852     p2r = p1r + pinc;
1853     p3r = p2r + pinc;
1854 
1855     /* Butterflys           */
1856     /*
1857       f0   -       -       t0      -       -       f0      -       -       f0
1858       f1   - w0-   f1      -       -       f1      -       -       f1
1859       f2   -       -       f2      - w1-   f2      -       -       f4
1860       f3   - w0-   t1      - iw1-  f3      -       -       f5
1861 
1862       f4   -       -       t0      -       -       f4      - w2-   t0
1863       f5   - w0-   f5      -       -       f5      - w3-   t1
1864       f6   -       -       f6      - w1-   f6      - iw2-  f6
1865       f7   - w0-   t1      - iw1-  f7      - iw3-  f7
1866     */
1867 
1868     for (DiffUCnt = NDiffU; DiffUCnt > 0; DiffUCnt--) {
1869       f0r = *p0r;
1870       f0i = *(p0r + 1);
1871       f1r = *p1r;
1872       f1i = *(p1r + 1);
1873       for (SameUCnt = NSameU - 1; SameUCnt > 0; SameUCnt--) {
1874         f2r = *p2r;
1875         f2i = *(p2r + 1);
1876         f3r = *p3r;
1877         f3i = *(p3r + 1);
1878 
1879         t0r = f0r + f1r * w0r - f1i * w0i;
1880         t0i = f0i + f1r * w0i + f1i * w0r;
1881         f1r = f0r * Two - t0r;
1882         f1i = f0i * Two - t0i;
1883 
1884         f4r = *(p0r + pos);
1885         f4i = *(p0r + posi);
1886         f5r = *(p1r + pos);
1887         f5i = *(p1r + posi);
1888 
1889         f6r = *(p2r + pos);
1890         f6i = *(p2r + posi);
1891         f7r = *(p3r + pos);
1892         f7i = *(p3r + posi);
1893 
1894         t1r = f2r - f3r * w0r + f3i * w0i;
1895         t1i = f2i - f3r * w0i - f3i * w0r;
1896         f2r = f2r * Two - t1r;
1897         f2i = f2i * Two - t1i;
1898 
1899         f0r = t0r + f2r * w1r - f2i * w1i;
1900         f0i = t0i + f2r * w1i + f2i * w1r;
1901         f2r = t0r * Two - f0r;
1902         f2i = t0i * Two - f0i;
1903 
1904         f3r = f1r + t1r * w1i + t1i * w1r;
1905         f3i = f1i - t1r * w1r + t1i * w1i;
1906         f1r = f1r * Two - f3r;
1907         f1i = f1i * Two - f3i;
1908 
1909         t0r = f4r + f5r * w0r - f5i * w0i;
1910         t0i = f4i + f5r * w0i + f5i * w0r;
1911         f5r = f4r * Two - t0r;
1912         f5i = f4i * Two - t0i;
1913 
1914         t1r = f6r - f7r * w0r + f7i * w0i;
1915         t1i = f6i - f7r * w0i - f7i * w0r;
1916         f6r = f6r * Two - t1r;
1917         f6i = f6i * Two - t1i;
1918 
1919         f4r = t0r + f6r * w1r - f6i * w1i;
1920         f4i = t0i + f6r * w1i + f6i * w1r;
1921         f6r = t0r * Two - f4r;
1922         f6i = t0i * Two - f4i;
1923 
1924         f7r = f5r + t1r * w1i + t1i * w1r;
1925         f7i = f5i - t1r * w1r + t1i * w1i;
1926         f5r = f5r * Two - f7r;
1927         f5i = f5i * Two - f7i;
1928 
1929         t0r = f0r - f4r * w2r + f4i * w2i;
1930         t0i = f0i - f4r * w2i - f4i * w2r;
1931         f0r = f0r * Two - t0r;
1932         f0i = f0i * Two - t0i;
1933 
1934         t1r = f1r - f5r * w3r + f5i * w3i;
1935         t1i = f1i - f5r * w3i - f5i * w3r;
1936         f1r = f1r * Two - t1r;
1937         f1i = f1i * Two - t1i;
1938 
1939         *(p0r + pos) = t0r;
1940         *(p0r + posi) = t0i;
1941         *p0r = f0r;
1942         *(p0r + 1) = f0i;
1943 
1944         p0r += pnext;
1945         f0r = *p0r;
1946         f0i = *(p0r + 1);
1947 
1948         *(p1r + pos) = t1r;
1949         *(p1r + posi) = t1i;
1950         *p1r = f1r;
1951         *(p1r + 1) = f1i;
1952 
1953         p1r += pnext;
1954 
1955         f1r = *p1r;
1956         f1i = *(p1r + 1);
1957 
1958         f4r = f2r - f6r * w2i - f6i * w2r;
1959         f4i = f2i + f6r * w2r - f6i * w2i;
1960         f6r = f2r * Two - f4r;
1961         f6i = f2i * Two - f4i;
1962 
1963         f5r = f3r - f7r * w3i - f7i * w3r;
1964         f5i = f3i + f7r * w3r - f7i * w3i;
1965         f7r = f3r * Two - f5r;
1966         f7i = f3i * Two - f5i;
1967 
1968         *p2r = f4r;
1969         *(p2r + 1) = f4i;
1970         *(p2r + pos) = f6r;
1971         *(p2r + posi) = f6i;
1972 
1973         p2r += pnext;
1974 
1975         *p3r = f5r;
1976         *(p3r + 1) = f5i;
1977         *(p3r + pos) = f7r;
1978         *(p3r + posi) = f7i;
1979 
1980         p3r += pnext;
1981       }
1982 
1983       f2r = *p2r;
1984       f2i = *(p2r + 1);
1985       f3r = *p3r;
1986       f3i = *(p3r + 1);
1987 
1988       t0r = f0r + f1r * w0r - f1i * w0i;
1989       t0i = f0i + f1r * w0i + f1i * w0r;
1990       f1r = f0r * Two - t0r;
1991       f1i = f0i * Two - t0i;
1992 
1993       f4r = *(p0r + pos);
1994       f4i = *(p0r + posi);
1995       f5r = *(p1r + pos);
1996       f5i = *(p1r + posi);
1997 
1998       f6r = *(p2r + pos);
1999       f6i = *(p2r + posi);
2000       f7r = *(p3r + pos);
2001       f7i = *(p3r + posi);
2002 
2003       t1r = f2r - f3r * w0r + f3i * w0i;
2004       t1i = f2i - f3r * w0i - f3i * w0r;
2005       f2r = f2r * Two - t1r;
2006       f2i = f2i * Two - t1i;
2007 
2008       f0r = t0r + f2r * w1r - f2i * w1i;
2009       f0i = t0i + f2r * w1i + f2i * w1r;
2010       f2r = t0r * Two - f0r;
2011       f2i = t0i * Two - f0i;
2012 
2013       f3r = f1r + t1r * w1i + t1i * w1r;
2014       f3i = f1i - t1r * w1r + t1i * w1i;
2015       f1r = f1r * Two - f3r;
2016       f1i = f1i * Two - f3i;
2017 
2018       if ((int32_t) DiffUCnt == NDiffU / 2)
2019         Uinc4 = -Uinc4;
2020 
2021       u0r += Uinc4;
2022       u0i -= Uinc4;
2023       u1r += Uinc2;
2024       u1i -= Uinc2;
2025       u2r += Uinc;
2026       u2i -= Uinc;
2027 
2028       pstrt += 2;
2029 
2030       t0r = f4r + f5r * w0r - f5i * w0i;
2031       t0i = f4i + f5r * w0i + f5i * w0r;
2032       f5r = f4r * Two - t0r;
2033       f5i = f4i * Two - t0i;
2034 
2035       t1r = f6r - f7r * w0r + f7i * w0i;
2036       t1i = f6i - f7r * w0i - f7i * w0r;
2037       f6r = f6r * Two - t1r;
2038       f6i = f6i * Two - t1i;
2039 
2040       f4r = t0r + f6r * w1r - f6i * w1i;
2041       f4i = t0i + f6r * w1i + f6i * w1r;
2042       f6r = t0r * Two - f4r;
2043       f6i = t0i * Two - f4i;
2044 
2045       f7r = f5r + t1r * w1i + t1i * w1r;
2046       f7i = f5i - t1r * w1r + t1i * w1i;
2047       f5r = f5r * Two - f7r;
2048       f5i = f5i * Two - f7i;
2049 
2050       w0r = *u0r;
2051       w0i = *u0i;
2052       w1r = *u1r;
2053       w1i = *u1i;
2054 
2055       if ((int32_t) DiffUCnt <= NDiffU / 2)
2056         w0r = -w0r;
2057 
2058       t0r = f0r - f4r * w2r + f4i * w2i;
2059       t0i = f0i - f4r * w2i - f4i * w2r;
2060       f0r = f0r * Two - t0r;
2061       f0i = f0i * Two - t0i;
2062 
2063       f4r = f2r - f6r * w2i - f6i * w2r;
2064       f4i = f2i + f6r * w2r - f6i * w2i;
2065       f6r = f2r * Two - f4r;
2066       f6i = f2i * Two - f4i;
2067 
2068       *(p0r + pos) = t0r;
2069       *p2r = f4r;
2070       *(p0r + posi) = t0i;
2071       *(p2r + 1) = f4i;
2072       w2r = *u2r;
2073       w2i = *u2i;
2074       *p0r = f0r;
2075       *(p2r + pos) = f6r;
2076       *(p0r + 1) = f0i;
2077       *(p2r + posi) = f6i;
2078 
2079       p0r = pstrt;
2080       p2r = pstrt + pinc + pinc;
2081 
2082       t1r = f1r - f5r * w3r + f5i * w3i;
2083       t1i = f1i - f5r * w3i - f5i * w3r;
2084       f1r = f1r * Two - t1r;
2085       f1i = f1i * Two - t1i;
2086 
2087       f5r = f3r - f7r * w3i - f7i * w3r;
2088       f5i = f3i + f7r * w3r - f7i * w3i;
2089       f7r = f3r * Two - f5r;
2090       f7i = f3i * Two - f5i;
2091 
2092       *(p1r + pos) = t1r;
2093       *p3r = f5r;
2094       *(p1r + posi) = t1i;
2095       *(p3r + 1) = f5i;
2096       w3r = *(u2r + U2toU3);
2097       w3i = *(u2i - U2toU3);
2098       *p1r = f1r;
2099       *(p3r + pos) = f7r;
2100       *(p1r + 1) = f1i;
2101       *(p3r + posi) = f7i;
2102 
2103       p1r = pstrt + pinc;
2104       p3r = p2r + pinc;
2105     }
2106     NSameU /= 8;
2107     Uinc /= 8;
2108     Uinc2 /= 8;
2109     Uinc4 = Uinc * 4;
2110     NDiffU *= 8;
2111     pinc *= 8;
2112     pnext *= 8;
2113     pos *= 8;
2114     posi = pos + 1;
2115   }
2116 }
2117 
ifftrecurs(MYFLT * ioptr,int32_t M,MYFLT * Utbl,int32_t Ustride,int32_t NDiffU,int32_t StageCnt)2118 static void ifftrecurs(MYFLT *ioptr, int32_t M, MYFLT *Utbl, int32_t Ustride,
2119                        int32_t NDiffU, int32_t StageCnt)
2120 {
2121   /* recursive bfstages calls to maximize on chip cache efficiency */
2122   int32_t i1;
2123 
2124   if (M <= (int32_t) MCACHE)
2125     ibfstages(ioptr, M, Utbl, Ustride, NDiffU, StageCnt); /* RADIX 8 Stages */
2126   else {
2127     for (i1 = 0; i1 < 8; i1++) {
2128       ifftrecurs(&ioptr[i1 * POW2(M - 3) * 2], M - 3, Utbl, 8 * Ustride,
2129                  NDiffU, StageCnt - 1);           /*  RADIX 8 Stages       */
2130     }
2131     ibfstages(ioptr, M, Utbl, Ustride, POW2(M - 3), 1);   /* RADIX 8 Stage */
2132   }
2133 }
2134 
iffts1(MYFLT * ioptr,int32_t M,MYFLT * Utbl,int16 * BRLow)2135 static void iffts1(MYFLT *ioptr, int32_t M, MYFLT *Utbl, int16 *BRLow)
2136 {
2137   /* Compute in-place inverse complex fft on the rows of the input array  */
2138   /* INPUTS                                                               */
2139   /*   *ioptr = input data array                                          */
2140   /*   M = log2 of fft size                                               */
2141   /*   *Utbl = cosine table                                               */
2142   /*   *BRLow = bit reversed counter table                                */
2143   /* OUTPUTS                                                              */
2144   /*   *ioptr = output data array                                         */
2145 
2146   int32_t StageCnt;
2147   int32_t NDiffU;
2148   const MYFLT scale = FL(1.0) / POW2(M);
2149 
2150   switch (M) {
2151   case 0:
2152     break;
2153   case 1:
2154     ifft2pt(ioptr, scale);    /* a 2 pt fft */
2155     break;
2156   case 2:
2157     ifft4pt(ioptr, scale);    /* a 4 pt fft */
2158     break;
2159   case 3:
2160     ifft8pt(ioptr, scale);    /* an 8 pt fft */
2161     break;
2162   default:
2163     /* bit reverse and first radix 2 stage */
2164     scbitrevR2(ioptr, M, BRLow, scale);
2165     StageCnt = (M - 1) / 3;   /* number of radix 8 stages */
2166     NDiffU = 2;               /* one radix 2 stage already complete */
2167     if ((M - 1 - (StageCnt * 3)) == 1) {
2168       ibfR2(ioptr, M, NDiffU);        /* 1 radix 2 stage */
2169       NDiffU *= 2;
2170     }
2171     if ((M - 1 - (StageCnt * 3)) == 2) {
2172       ibfR4(ioptr, M, NDiffU);        /* 1 radix 4 stage */
2173       NDiffU *= 4;
2174     }
2175     if (M <= (int32_t) MCACHE)
2176       ibfstages(ioptr, M, Utbl, 1, NDiffU, StageCnt);  /* RADIX 8 Stages */
2177     else
2178       ifftrecurs(ioptr, M, Utbl, 1, NDiffU, StageCnt); /* RADIX 8 Stages */
2179   }
2180 }
2181 
2182 /******************
2183  * parts of rffts1 *
2184  ******************/
2185 
rfft1pt(MYFLT * ioptr)2186 static void rfft1pt(MYFLT *ioptr)
2187 {
2188   /***   RADIX 2 rfft     ***/
2189   MYFLT f0r, f0i;
2190   MYFLT t0r, t0i;
2191 
2192   /* bit reversed load */
2193   f0r = ioptr[0];
2194   f0i = ioptr[1];
2195 
2196   /* finish rfft */
2197   t0r = f0r + f0i;
2198   t0i = f0r - f0i;
2199 
2200   /* store result */
2201   ioptr[0] = t0r;
2202   ioptr[1] = t0i;
2203 }
2204 
rfft2pt(MYFLT * ioptr)2205 static void rfft2pt(MYFLT *ioptr)
2206 {
2207   /***   RADIX 4 rfft     ***/
2208   MYFLT f0r, f0i, f1r, f1i;
2209   MYFLT t0r, t0i;
2210 
2211   /* bit reversed load */
2212   f0r = ioptr[0];
2213   f0i = ioptr[1];
2214   f1r = ioptr[2];
2215   f1i = ioptr[3];
2216 
2217   /* Butterflys           */
2218   /*
2219     f0   -       -       t0
2220     f1   -  1 -  f1
2221   */
2222 
2223   t0r = f0r + f1r;
2224   t0i = f0i + f1i;
2225   f1r = f0r - f1r;
2226   f1i = f1i - f0i;
2227   /* finish rfft */
2228   f0r = t0r + t0i;
2229   f0i = t0r - t0i;
2230 
2231   /* store result */
2232   ioptr[0] = f0r;
2233   ioptr[1] = f0i;
2234   ioptr[2] = f1r;
2235   ioptr[3] = f1i;
2236 }
2237 
rfft4pt(MYFLT * ioptr)2238 static void rfft4pt(MYFLT *ioptr)
2239 {
2240   /***   RADIX 8 rfft     ***/
2241   MYFLT f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i;
2242   MYFLT t0r, t0i, t1r, t1i;
2243   MYFLT w0r = 1.0 / ROOT2;    /* cos(pi/4)   */
2244   const MYFLT Two = FL(2.0);
2245   const MYFLT scale = FL(0.5);
2246 
2247   /* bit reversed load */
2248   f0r = ioptr[0];
2249   f0i = ioptr[1];
2250   f1r = ioptr[4];
2251   f1i = ioptr[5];
2252   f2r = ioptr[2];
2253   f2i = ioptr[3];
2254   f3r = ioptr[6];
2255   f3i = ioptr[7];
2256 
2257   /* Butterflys           */
2258   /*
2259     f0   -       -       t0      -       -       f0
2260     f1   -  1 -  f1      -       -       f1
2261     f2   -       -       f2      -  1 -  f2
2262     f3   -  1 -  t1      - -i -  f3
2263   */
2264 
2265   t0r = f0r + f1r;
2266   t0i = f0i + f1i;
2267   f1r = f0r - f1r;
2268   f1i = f0i - f1i;
2269 
2270   t1r = f2r - f3r;
2271   t1i = f2i - f3i;
2272   f2r = f2r + f3r;
2273   f2i = f2i + f3i;
2274 
2275   f0r = t0r + f2r;
2276   f0i = t0i + f2i;
2277   f2r = t0r - f2r;
2278   f2i = f2i - t0i;              /* neg for rfft */
2279 
2280   f3r = f1r - t1i;
2281   f3i = f1i + t1r;
2282   f1r = f1r + t1i;
2283   f1i = f1i - t1r;
2284 
2285   /* finish rfft */
2286   t0r = f0r + f0i;              /* compute Re(x[0]) */
2287   t0i = f0r - f0i;              /* compute Re(x[N/2]) */
2288 
2289   t1r = f1r + f3r;
2290   t1i = f1i - f3i;
2291   f0r = f1i + f3i;
2292   f0i = f3r - f1r;
2293 
2294   f1r = t1r + w0r * f0r + w0r * f0i;
2295   f1i = t1i - w0r * f0r + w0r * f0i;
2296   f3r = Two * t1r - f1r;
2297   f3i = f1i - Two * t1i;
2298 
2299   /* store result */
2300   ioptr[4] = f2r;
2301   ioptr[5] = f2i;
2302   ioptr[0] = t0r;
2303   ioptr[1] = t0i;
2304 
2305   ioptr[2] = scale * f1r;
2306   ioptr[3] = scale * f1i;
2307   ioptr[6] = scale * f3r;
2308   ioptr[7] = scale * f3i;
2309 }
2310 
rfft8pt(MYFLT * ioptr)2311 static void rfft8pt(MYFLT *ioptr)
2312 {
2313   /***   RADIX 16 rfft    ***/
2314   MYFLT w0r = 1.0 / ROOT2;    /* cos(pi/4)   */
2315   MYFLT w1r = MYCOSPID8;        /* cos(pi/8)     */
2316   MYFLT w1i = MYSINPID8;        /* sin(pi/8)     */
2317   MYFLT f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i;
2318   MYFLT f4r, f4i, f5r, f5i, f6r, f6i, f7r, f7i;
2319   MYFLT t0r, t0i, t1r, t1i;
2320   const MYFLT Two = FL(2.0);
2321   const MYFLT scale = FL(0.5);
2322 
2323   /* bit reversed load */
2324   f0r = ioptr[0];
2325   f0i = ioptr[1];
2326   f1r = ioptr[8];
2327   f1i = ioptr[9];
2328   f2r = ioptr[4];
2329   f2i = ioptr[5];
2330   f3r = ioptr[12];
2331   f3i = ioptr[13];
2332   f4r = ioptr[2];
2333   f4i = ioptr[3];
2334   f5r = ioptr[10];
2335   f5i = ioptr[11];
2336   f6r = ioptr[6];
2337   f6i = ioptr[7];
2338   f7r = ioptr[14];
2339   f7i = ioptr[15];
2340   /* Butterflys           */
2341   /*
2342     f0   -       -       t0      -       -       f0      -       -       f0
2343     f1   -  1 -  f1      -       -       f1      -       -       f1
2344     f2   -       -       f2      -  1 -  f2      -       -       f2
2345     f3   -  1 -  t1      - -i -  f3      -       -       f3
2346     f4   -       -       t0      -       -       f4      -  1 -  t0
2347     f5   -  1 -  f5      -       -       f5      - w3 -  f4
2348     f6   -       -       f6      -  1 -  f6      - -i -  t1
2349     f7   -  1 -  t1      - -i -  f7      - iw3-  f6
2350   */
2351 
2352   t0r = f0r + f1r;
2353   t0i = f0i + f1i;
2354   f1r = f0r - f1r;
2355   f1i = f0i - f1i;
2356 
2357   t1r = f2r - f3r;
2358   t1i = f2i - f3i;
2359   f2r = f2r + f3r;
2360   f2i = f2i + f3i;
2361 
2362   f0r = t0r + f2r;
2363   f0i = t0i + f2i;
2364   f2r = t0r - f2r;
2365   f2i = t0i - f2i;
2366 
2367   f3r = f1r - t1i;
2368   f3i = f1i + t1r;
2369   f1r = f1r + t1i;
2370   f1i = f1i - t1r;
2371 
2372   t0r = f4r + f5r;
2373   t0i = f4i + f5i;
2374   f5r = f4r - f5r;
2375   f5i = f4i - f5i;
2376 
2377   t1r = f6r - f7r;
2378   t1i = f6i - f7i;
2379   f6r = f6r + f7r;
2380   f6i = f6i + f7i;
2381 
2382   f4r = t0r + f6r;
2383   f4i = t0i + f6i;
2384   f6r = t0r - f6r;
2385   f6i = t0i - f6i;
2386 
2387   f7r = f5r - t1i;
2388   f7i = f5i + t1r;
2389   f5r = f5r + t1i;
2390   f5i = f5i - t1r;
2391 
2392   t0r = f0r - f4r;
2393   t0i = f4i - f0i;              /* neg for rfft */
2394   f0r = f0r + f4r;
2395   f0i = f0i + f4i;
2396 
2397   t1r = f2r - f6i;
2398   t1i = f2i + f6r;
2399   f2r = f2r + f6i;
2400   f2i = f2i - f6r;
2401 
2402   f4r = f1r - f5r * w0r - f5i * w0r;
2403   f4i = f1i + f5r * w0r - f5i * w0r;
2404   f1r = f1r * Two - f4r;
2405   f1i = f1i * Two - f4i;
2406 
2407   f6r = f3r + f7r * w0r - f7i * w0r;
2408   f6i = f3i + f7r * w0r + f7i * w0r;
2409   f3r = f3r * Two - f6r;
2410   f3i = f3i * Two - f6i;
2411 
2412   /* finish rfft */
2413   f5r = f0r + f0i;              /* compute Re(x[0]) */
2414   f5i = f0r - f0i;              /* compute Re(x[N/2]) */
2415 
2416   f0r = f2r + t1r;
2417   f0i = f2i - t1i;
2418   f7r = f2i + t1i;
2419   f7i = t1r - f2r;
2420 
2421   f2r = f0r + w0r * f7r + w0r * f7i;
2422   f2i = f0i - w0r * f7r + w0r * f7i;
2423   t1r = Two * f0r - f2r;
2424   t1i = f2i - Two * f0i;
2425 
2426   f0r = f1r + f6r;
2427   f0i = f1i - f6i;
2428   f7r = f1i + f6i;
2429   f7i = f6r - f1r;
2430 
2431   f1r = f0r + w1r * f7r + w1i * f7i;
2432   f1i = f0i - w1i * f7r + w1r * f7i;
2433   f6r = Two * f0r - f1r;
2434   f6i = f1i - Two * f0i;
2435 
2436   f0r = f3r + f4r;
2437   f0i = f3i - f4i;
2438   f7r = f3i + f4i;
2439   f7i = f4r - f3r;
2440 
2441   f3r = f0r + w1i * f7r + w1r * f7i;
2442   f3i = f0i - w1r * f7r + w1i * f7i;
2443   f4r = Two * f0r - f3r;
2444   f4i = f3i - Two * f0i;
2445 
2446   /* store result */
2447   ioptr[8] = t0r;
2448   ioptr[9] = t0i;
2449   ioptr[0] = f5r;
2450   ioptr[1] = f5i;
2451 
2452   ioptr[4] = scale * f2r;
2453   ioptr[5] = scale * f2i;
2454   ioptr[12] = scale * t1r;
2455   ioptr[13] = scale * t1i;
2456 
2457   ioptr[2] = scale * f1r;
2458   ioptr[3] = scale * f1i;
2459   ioptr[6] = scale * f3r;
2460   ioptr[7] = scale * f3i;
2461   ioptr[10] = scale * f4r;
2462   ioptr[11] = scale * f4i;
2463   ioptr[14] = scale * f6r;
2464   ioptr[15] = scale * f6i;
2465 }
2466 
frstage(MYFLT * ioptr,int32_t M,MYFLT * Utbl)2467 static void frstage(MYFLT *ioptr, int32_t M, MYFLT *Utbl)
2468 {
2469   /*      Finish RFFT             */
2470 
2471   uint32_t pos;
2472   uint32_t posi;
2473   uint32_t diffUcnt;
2474 
2475   MYFLT *p0r, *p1r;
2476   MYFLT *u0r, *u0i;
2477 
2478   MYFLT w0r, w0i;
2479   MYFLT f0r, f0i, f1r, f1i, f4r, f4i, f5r, f5i;
2480   MYFLT t0r, t0i, t1r, t1i;
2481   const MYFLT Two = FL(2.0);
2482 
2483   pos = POW2(M - 1);
2484   posi = pos + 1;
2485 
2486   p0r = ioptr;
2487   p1r = ioptr + pos / 2;
2488 
2489   u0r = Utbl + POW2(M - 3);
2490 
2491   w0r = *u0r, f0r = *(p0r);
2492   f0i = *(p0r + 1);
2493   f4r = *(p0r + pos);
2494   f4i = *(p0r + posi);
2495   f1r = *(p1r);
2496   f1i = *(p1r + 1);
2497   f5r = *(p1r + pos);
2498   f5i = *(p1r + posi);
2499 
2500   t0r = Two * f0r + Two * f0i;  /* compute Re(x[0]) */
2501   t0i = Two * f0r - Two * f0i;  /* compute Re(x[N/2]) */
2502   t1r = f4r + f4r;
2503   t1i = -f4i - f4i;
2504 
2505   f0r = f1r + f5r;
2506   f0i = f1i - f5i;
2507   f4r = f1i + f5i;
2508   f4i = f5r - f1r;
2509 
2510   f1r = f0r + w0r * f4r + w0r * f4i;
2511   f1i = f0i - w0r * f4r + w0r * f4i;
2512   f5r = Two * f0r - f1r;
2513   f5i = f1i - Two * f0i;
2514 
2515   *(p0r) = t0r;
2516   *(p0r + 1) = t0i;
2517   *(p0r + pos) = t1r;
2518   *(p0r + posi) = t1i;
2519   *(p1r) = f1r;
2520   *(p1r + 1) = f1i;
2521   *(p1r + pos) = f5r;
2522   *(p1r + posi) = f5i;
2523 
2524   u0r = Utbl + 1;
2525   u0i = Utbl + (POW2(M - 2) - 1);
2526 
2527   w0r = *u0r, w0i = *u0i;
2528 
2529   p0r = (ioptr + 2);
2530   p1r = (ioptr + (POW2(M - 2) - 1) * 2);
2531 
2532   /* Butterflys */
2533   /*
2534     f0   -       t0      -       -       f0
2535     f5   -       t1      - w0    -       f5
2536 
2537     f1   -       t0      -       -       f1
2538     f4   -       t1      -iw0 -  f4
2539   */
2540 
2541   for (diffUcnt = POW2(M - 3) - 1; diffUcnt > 0; diffUcnt--) {
2542 
2543     f0r = *(p0r);
2544     f0i = *(p0r + 1);
2545     f5r = *(p1r + pos);
2546     f5i = *(p1r + posi);
2547     f1r = *(p1r);
2548     f1i = *(p1r + 1);
2549     f4r = *(p0r + pos);
2550     f4i = *(p0r + posi);
2551 
2552     t0r = f0r + f5r;
2553     t0i = f0i - f5i;
2554     t1r = f0i + f5i;
2555     t1i = f5r - f0r;
2556 
2557     f0r = t0r + w0r * t1r + w0i * t1i;
2558     f0i = t0i - w0i * t1r + w0r * t1i;
2559     f5r = Two * t0r - f0r;
2560     f5i = f0i - Two * t0i;
2561 
2562     t0r = f1r + f4r;
2563     t0i = f1i - f4i;
2564     t1r = f1i + f4i;
2565     t1i = f4r - f1r;
2566 
2567     f1r = t0r + w0i * t1r + w0r * t1i;
2568     f1i = t0i - w0r * t1r + w0i * t1i;
2569     f4r = Two * t0r - f1r;
2570     f4i = f1i - Two * t0i;
2571 
2572     *(p0r) = f0r;
2573     *(p0r + 1) = f0i;
2574     *(p1r + pos) = f5r;
2575     *(p1r + posi) = f5i;
2576 
2577     w0r = *++u0r;
2578     w0i = *--u0i;
2579 
2580     *(p1r) = f1r;
2581     *(p1r + 1) = f1i;
2582     *(p0r + pos) = f4r;
2583     *(p0r + posi) = f4i;
2584 
2585     p0r += 2;
2586     p1r -= 2;
2587   }
2588 }
2589 
rffts1(MYFLT * ioptr,int32_t M,MYFLT * Utbl,int16 * BRLow)2590 static void rffts1(MYFLT *ioptr, int32_t M, MYFLT *Utbl, int16 *BRLow)
2591 {
2592   /* Compute in-place real fft on the rows of the input array           */
2593   /* The result is the complex spectra of the positive frequencies      */
2594   /* except the location for the first complex number contains the real */
2595   /* values for DC and Nyquest                                          */
2596   /* INPUTS                                                             */
2597   /*   *ioptr = real input data array                                   */
2598   /*   M = log2 of fft size                                             */
2599   /*   *Utbl = cosine table                                             */
2600   /*   *BRLow = bit reversed counter table                              */
2601   /* OUTPUTS                                                            */
2602   /*   *ioptr = output data array   in the following order              */
2603   /*     Re(x[0]), Re(x[N/2]), Re(x[1]), Im(x[1]), Re(x[2]), Im(x[2]),  */
2604   /*     ... Re(x[N/2-1]), Im(x[N/2-1]).                                */
2605 
2606   MYFLT scale;
2607   int32_t StageCnt;
2608   int32_t NDiffU;
2609 
2610   M = M - 1;
2611   switch (M) {
2612   case -1:
2613     break;
2614   case 0:
2615     rfft1pt(ioptr);           /* a 2 pt fft */
2616     break;
2617   case 1:
2618     rfft2pt(ioptr);           /* a 4 pt fft */
2619     break;
2620   case 2:
2621     rfft4pt(ioptr);           /* an 8 pt fft */
2622     break;
2623   case 3:
2624     rfft8pt(ioptr);           /* a 16 pt fft */
2625     break;
2626   default:
2627     scale = 0.5;
2628     /* bit reverse and first radix 2 stage */
2629     scbitrevR2(ioptr, M, BRLow, scale);
2630     StageCnt = (M - 1) / 3;   /* number of radix 8 stages           */
2631     NDiffU = 2;               /* one radix 2 stage already complete */
2632     if ((M - 1 - (StageCnt * 3)) == 1) {
2633       bfR2(ioptr, M, NDiffU); /* 1 radix 2 stage */
2634       NDiffU *= 2;
2635     }
2636     if ((M - 1 - (StageCnt * 3)) == 2) {
2637       bfR4(ioptr, M, NDiffU); /* 1 radix 4 stage */
2638       NDiffU *= 4;
2639     }
2640     if (M <= (int32_t) MCACHE)
2641       bfstages(ioptr, M, Utbl, 2, NDiffU, StageCnt);  /* RADIX 8 Stages */
2642     else
2643       fftrecurs(ioptr, M, Utbl, 2, NDiffU, StageCnt); /* RADIX 8 Stages */
2644     frstage(ioptr, M + 1, Utbl);
2645   }
2646 }
2647 
2648 /*******************
2649  * parts of riffts1 *
2650  *******************/
2651 
rifft1pt(MYFLT * ioptr,MYFLT scale)2652 static void rifft1pt(MYFLT *ioptr, MYFLT scale)
2653 {
2654   /***   RADIX 2 rifft    ***/
2655   MYFLT f0r, f0i;
2656   MYFLT t0r, t0i;
2657 
2658   /* bit reversed load */
2659   f0r = ioptr[0];
2660   f0i = ioptr[1];
2661 
2662   /* finish rfft */
2663   t0r = f0r + f0i;
2664   t0i = f0r - f0i;
2665 
2666   /* store result */
2667   ioptr[0] = scale * t0r;
2668   ioptr[1] = scale * t0i;
2669 }
2670 
rifft2pt(MYFLT * ioptr,MYFLT scale)2671 static void rifft2pt(MYFLT *ioptr, MYFLT scale)
2672 {
2673   /***   RADIX 4 rifft    ***/
2674   MYFLT f0r, f0i, f1r, f1i;
2675   MYFLT t0r, t0i;
2676   const MYFLT Two = FL(2.0);
2677 
2678   /* bit reversed load */
2679   t0r = ioptr[0];
2680   t0i = ioptr[1];
2681   f1r = Two * ioptr[2];
2682   f1i = Two * ioptr[3];
2683 
2684   /* start rifft */
2685   f0r = t0r + t0i;
2686   f0i = t0r - t0i;
2687   /* Butterflys           */
2688   /*
2689     f0   -       -       t0
2690     f1   -  1 -  f1
2691   */
2692 
2693   t0r = f0r + f1r;
2694   t0i = f0i - f1i;
2695   f1r = f0r - f1r;
2696   f1i = f0i + f1i;
2697 
2698   /* store result */
2699   ioptr[0] = scale * t0r;
2700   ioptr[1] = scale * t0i;
2701   ioptr[2] = scale * f1r;
2702   ioptr[3] = scale * f1i;
2703 }
2704 
rifft4pt(MYFLT * ioptr,MYFLT scale)2705 static void rifft4pt(MYFLT *ioptr, MYFLT scale)
2706 {
2707   /***   RADIX 8 rifft    ***/
2708   MYFLT f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i;
2709   MYFLT t0r, t0i, t1r, t1i;
2710   MYFLT w0r = 1.0 / ROOT2;    /* cos(pi/4)   */
2711   const MYFLT Two = FL(2.0);
2712 
2713   /* bit reversed load */
2714   t0r = ioptr[0];
2715   t0i = ioptr[1];
2716   f2r = ioptr[2];
2717   f2i = ioptr[3];
2718   f1r = Two * ioptr[4];
2719   f1i = Two * ioptr[5];
2720   f3r = ioptr[6];
2721   f3i = ioptr[7];
2722 
2723   /* start rfft */
2724   f0r = t0r + t0i;              /* compute Re(x[0]) */
2725   f0i = t0r - t0i;              /* compute Re(x[N/2]) */
2726 
2727   t1r = f2r + f3r;
2728   t1i = f2i - f3i;
2729   t0r = f2r - f3r;
2730   t0i = f2i + f3i;
2731 
2732   f2r = t1r - w0r * t0r - w0r * t0i;
2733   f2i = t1i + w0r * t0r - w0r * t0i;
2734   f3r = Two * t1r - f2r;
2735   f3i = f2i - Two * t1i;
2736 
2737   /* Butterflys           */
2738   /*
2739     f0   -       -       t0      -       -       f0
2740     f1   -  1 -  f1      -       -       f1
2741     f2   -       -       f2      -  1 -  f2
2742     f3   -  1 -  t1      -  i -  f3
2743   */
2744 
2745   t0r = f0r + f1r;
2746   t0i = f0i - f1i;
2747   f1r = f0r - f1r;
2748   f1i = f0i + f1i;
2749 
2750   t1r = f2r - f3r;
2751   t1i = f2i - f3i;
2752   f2r = f2r + f3r;
2753   f2i = f2i + f3i;
2754 
2755   f0r = t0r + f2r;
2756   f0i = t0i + f2i;
2757   f2r = t0r - f2r;
2758   f2i = t0i - f2i;
2759 
2760   f3r = f1r + t1i;
2761   f3i = f1i - t1r;
2762   f1r = f1r - t1i;
2763   f1i = f1i + t1r;
2764 
2765   /* store result */
2766   ioptr[0] = scale * f0r;
2767   ioptr[1] = scale * f0i;
2768   ioptr[2] = scale * f1r;
2769   ioptr[3] = scale * f1i;
2770   ioptr[4] = scale * f2r;
2771   ioptr[5] = scale * f2i;
2772   ioptr[6] = scale * f3r;
2773   ioptr[7] = scale * f3i;
2774 }
2775 
rifft8pt(MYFLT * ioptr,MYFLT scale)2776 static void rifft8pt(MYFLT *ioptr, MYFLT scale)
2777 {
2778   /***   RADIX 16 rifft   ***/
2779   MYFLT w0r = (MYFLT) (1.0 / ROOT2);    /* cos(pi/4)    */
2780   MYFLT w1r = MYCOSPID8;                  /* cos(pi/8)    */
2781   MYFLT w1i = MYSINPID8;                  /* sin(pi/8)    */
2782   MYFLT f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i;
2783   MYFLT f4r, f4i, f5r, f5i, f6r, f6i, f7r, f7i;
2784   MYFLT t0r, t0i, t1r, t1i;
2785   const MYFLT Two = FL(2.0);
2786 
2787   /* bit reversed load */
2788   t0r = ioptr[0];
2789   t0i = ioptr[1];
2790   f4r = ioptr[2];
2791   f4i = ioptr[3];
2792   f2r = ioptr[4];
2793   f2i = ioptr[5];
2794   f6r = ioptr[6];
2795   f6i = ioptr[7];
2796   f1r = Two * ioptr[8];
2797   f1i = Two * ioptr[9];
2798   f5r = ioptr[10];
2799   f5i = ioptr[11];
2800   f3r = ioptr[12];
2801   f3i = ioptr[13];
2802   f7r = ioptr[14];
2803   f7i = ioptr[15];
2804 
2805   /* start rfft */
2806   f0r = t0r + t0i;              /* compute Re(x[0]) */
2807   f0i = t0r - t0i;              /* compute Re(x[N/2]) */
2808 
2809   t0r = f2r + f3r;
2810   t0i = f2i - f3i;
2811   t1r = f2r - f3r;
2812   t1i = f2i + f3i;
2813 
2814   f2r = t0r - w0r * t1r - w0r * t1i;
2815   f2i = t0i + w0r * t1r - w0r * t1i;
2816   f3r = Two * t0r - f2r;
2817   f3i = f2i - Two * t0i;
2818 
2819   t0r = f4r + f7r;
2820   t0i = f4i - f7i;
2821   t1r = f4r - f7r;
2822   t1i = f4i + f7i;
2823 
2824   f4r = t0r - w1i * t1r - w1r * t1i;
2825   f4i = t0i + w1r * t1r - w1i * t1i;
2826   f7r = Two * t0r - f4r;
2827   f7i = f4i - Two * t0i;
2828 
2829   t0r = f6r + f5r;
2830   t0i = f6i - f5i;
2831   t1r = f6r - f5r;
2832   t1i = f6i + f5i;
2833 
2834   f6r = t0r - w1r * t1r - w1i * t1i;
2835   f6i = t0i + w1i * t1r - w1r * t1i;
2836   f5r = Two * t0r - f6r;
2837   f5i = f6i - Two * t0i;
2838 
2839   /* Butterflys           */
2840   /*
2841     f0   -       -       t0      -       -       f0      -       -       f0
2842     f1*  -  1 -  f1      -       -       f1      -       -       f1
2843     f2   -       -       f2      -  1 -  f2      -       -       f2
2844     f3   -  1 -  t1      -  i -  f3      -       -       f3
2845     f4   -       -       t0      -       -       f4      -  1 -  t0
2846     f5   -  1 -  f5      -       -       f5      - w3 -  f4
2847     f6   -       -       f6      -  1 -  f6      -  i -  t1
2848     f7   -  1 -  t1      -  i -  f7      - iw3-  f6
2849   */
2850 
2851   t0r = f0r + f1r;
2852   t0i = f0i - f1i;
2853   f1r = f0r - f1r;
2854   f1i = f0i + f1i;
2855 
2856   t1r = f2r - f3r;
2857   t1i = f2i - f3i;
2858   f2r = f2r + f3r;
2859   f2i = f2i + f3i;
2860 
2861   f0r = t0r + f2r;
2862   f0i = t0i + f2i;
2863   f2r = t0r - f2r;
2864   f2i = t0i - f2i;
2865 
2866   f3r = f1r + t1i;
2867   f3i = f1i - t1r;
2868   f1r = f1r - t1i;
2869   f1i = f1i + t1r;
2870 
2871   t0r = f4r + f5r;
2872   t0i = f4i + f5i;
2873   f5r = f4r - f5r;
2874   f5i = f4i - f5i;
2875 
2876   t1r = f6r - f7r;
2877   t1i = f6i - f7i;
2878   f6r = f6r + f7r;
2879   f6i = f6i + f7i;
2880 
2881   f4r = t0r + f6r;
2882   f4i = t0i + f6i;
2883   f6r = t0r - f6r;
2884   f6i = t0i - f6i;
2885 
2886   f7r = f5r + t1i;
2887   f7i = f5i - t1r;
2888   f5r = f5r - t1i;
2889   f5i = f5i + t1r;
2890 
2891   t0r = f0r - f4r;
2892   t0i = f0i - f4i;
2893   f0r = f0r + f4r;
2894   f0i = f0i + f4i;
2895 
2896   t1r = f2r + f6i;
2897   t1i = f2i - f6r;
2898   f2r = f2r - f6i;
2899   f2i = f2i + f6r;
2900 
2901   f4r = f1r - f5r * w0r + f5i * w0r;
2902   f4i = f1i - f5r * w0r - f5i * w0r;
2903   f1r = f1r * Two - f4r;
2904   f1i = f1i * Two - f4i;
2905 
2906   f6r = f3r + f7r * w0r + f7i * w0r;
2907   f6i = f3i - f7r * w0r + f7i * w0r;
2908   f3r = f3r * Two - f6r;
2909   f3i = f3i * Two - f6i;
2910 
2911   /* store result */
2912   ioptr[0] = scale * f0r;
2913   ioptr[1] = scale * f0i;
2914   ioptr[2] = scale * f1r;
2915   ioptr[3] = scale * f1i;
2916   ioptr[4] = scale * f2r;
2917   ioptr[5] = scale * f2i;
2918   ioptr[6] = scale * f3r;
2919   ioptr[7] = scale * f3i;
2920   ioptr[8] = scale * t0r;
2921   ioptr[9] = scale * t0i;
2922   ioptr[10] = scale * f4r;
2923   ioptr[11] = scale * f4i;
2924   ioptr[12] = scale * t1r;
2925   ioptr[13] = scale * t1i;
2926   ioptr[14] = scale * f6r;
2927   ioptr[15] = scale * f6i;
2928 }
2929 
ifrstage(MYFLT * ioptr,int32_t M,MYFLT * Utbl)2930 static void ifrstage(MYFLT *ioptr, int32_t M, MYFLT *Utbl)
2931 {
2932   /*      Start RIFFT             */
2933 
2934   uint32_t pos;
2935   uint32_t posi;
2936   uint32_t diffUcnt;
2937 
2938   MYFLT *p0r, *p1r;
2939   MYFLT *u0r, *u0i;
2940 
2941   MYFLT w0r, w0i;
2942   MYFLT f0r, f0i, f1r, f1i, f4r, f4i, f5r, f5i;
2943   MYFLT t0r, t0i, t1r, t1i;
2944   const MYFLT Two = FL(2.0);
2945 
2946   pos = POW2(M - 1);
2947   posi = pos + 1;
2948 
2949   p0r = ioptr;
2950   p1r = ioptr + pos / 2;
2951 
2952   u0r = Utbl + POW2(M - 3);
2953 
2954   w0r = *u0r, f0r = *(p0r);
2955   f0i = *(p0r + 1);
2956   f4r = *(p0r + pos);
2957   f4i = *(p0r + posi);
2958   f1r = *(p1r);
2959   f1i = *(p1r + 1);
2960   f5r = *(p1r + pos);
2961   f5i = *(p1r + posi);
2962 
2963   t0r = f0r + f0i;
2964   t0i = f0r - f0i;
2965   t1r = f4r + f4r;
2966   t1i = -f4i - f4i;
2967 
2968   f0r = f1r + f5r;
2969   f0i = f1i - f5i;
2970   f4r = f1r - f5r;
2971   f4i = f1i + f5i;
2972 
2973   f1r = f0r - w0r * f4r - w0r * f4i;
2974   f1i = f0i + w0r * f4r - w0r * f4i;
2975   f5r = Two * f0r - f1r;
2976   f5i = f1i - Two * f0i;
2977 
2978   *(p0r) = t0r;
2979   *(p0r + 1) = t0i;
2980   *(p0r + pos) = t1r;
2981   *(p0r + posi) = t1i;
2982   *(p1r) = f1r;
2983   *(p1r + 1) = f1i;
2984   *(p1r + pos) = f5r;
2985   *(p1r + posi) = f5i;
2986 
2987   u0r = Utbl + 1;
2988   u0i = Utbl + (POW2(M - 2) - 1);
2989 
2990   w0r = *u0r, w0i = *u0i;
2991 
2992   p0r = (ioptr + 2);
2993   p1r = (ioptr + (POW2(M - 2) - 1) * 2);
2994 
2995   /* Butterflys */
2996   /*
2997     f0   -        t0             -       f0
2998     f1   -     t1     -w0-   f1
2999 
3000     f2   -        t0             -       f2
3001     f3   -     t1           -iw0-  f3
3002   */
3003 
3004   for (diffUcnt = POW2(M - 3) - 1; diffUcnt > 0; diffUcnt--) {
3005 
3006     f0r = *(p0r);
3007     f0i = *(p0r + 1);
3008     f5r = *(p1r + pos);
3009     f5i = *(p1r + posi);
3010     f1r = *(p1r);
3011     f1i = *(p1r + 1);
3012     f4r = *(p0r + pos);
3013     f4i = *(p0r + posi);
3014 
3015     t0r = f0r + f5r;
3016     t0i = f0i - f5i;
3017     t1r = f0r - f5r;
3018     t1i = f0i + f5i;
3019 
3020     f0r = t0r - w0i * t1r - w0r * t1i;
3021     f0i = t0i + w0r * t1r - w0i * t1i;
3022     f5r = Two * t0r - f0r;
3023     f5i = f0i - Two * t0i;
3024 
3025     t0r = f1r + f4r;
3026     t0i = f1i - f4i;
3027     t1r = f1r - f4r;
3028     t1i = f1i + f4i;
3029 
3030     f1r = t0r - w0r * t1r - w0i * t1i;
3031     f1i = t0i + w0i * t1r - w0r * t1i;
3032     f4r = Two * t0r - f1r;
3033     f4i = f1i - Two * t0i;
3034 
3035     *(p0r) = f0r;
3036     *(p0r + 1) = f0i;
3037     *(p1r + pos) = f5r;
3038     *(p1r + posi) = f5i;
3039 
3040     w0r = *++u0r;
3041     w0i = *--u0i;
3042 
3043     *(p1r) = f1r;
3044     *(p1r + 1) = f1i;
3045     *(p0r + pos) = f4r;
3046     *(p0r + posi) = f4i;
3047 
3048     p0r += 2;
3049     p1r -= 2;
3050   }
3051 }
3052 
riffts1(MYFLT * ioptr,int32_t M,MYFLT * Utbl,int16 * BRLow)3053 static void riffts1(MYFLT *ioptr, int32_t M, MYFLT *Utbl, int16 *BRLow)
3054 {
3055   /* Compute in-place real ifft on the rows of the input array    */
3056   /* data order as from rffts1                                    */
3057   /* INPUTS                                                       */
3058   /*   *ioptr = input data array in the following order           */
3059   /*   M = log2 of fft size                                       */
3060   /*   Re(x[0]), Re(x[N/2]), Re(x[1]), Im(x[1]),                  */
3061   /*   Re(x[2]), Im(x[2]), ... Re(x[N/2-1]), Im(x[N/2-1]).        */
3062   /*   *Utbl = cosine table                                       */
3063   /*   *BRLow = bit reversed counter table                        */
3064   /* OUTPUTS                                                      */
3065   /*   *ioptr = real output data array                            */
3066 
3067   MYFLT scale;
3068   int32_t StageCnt;
3069   int32_t NDiffU;
3070 
3071   scale = (MYFLT)(1.0 / (double)((int32_t)POW2(M)));
3072   M = M - 1;
3073   switch (M) {
3074   case -1:
3075     break;
3076   case 0:
3077     rifft1pt(ioptr, scale);   /* a 2 pt fft */
3078     break;
3079   case 1:
3080     rifft2pt(ioptr, scale);   /* a 4 pt fft */
3081     break;
3082   case 2:
3083     rifft4pt(ioptr, scale);   /* an 8 pt fft */
3084     break;
3085   case 3:
3086     rifft8pt(ioptr, scale);   /* a 16 pt fft */
3087     break;
3088   default:
3089     ifrstage(ioptr, M + 1, Utbl);
3090     /* bit reverse and first radix 2 stage */
3091     scbitrevR2(ioptr, M, BRLow, scale);
3092     StageCnt = (M - 1) / 3;   /* number of radix 8 stages           */
3093     NDiffU = 2;               /* one radix 2 stage already complete */
3094     if ((M - 1 - (StageCnt * 3)) == 1) {
3095       ibfR2(ioptr, M, NDiffU);        /* 1 radix 2 stage */
3096       NDiffU *= 2;
3097     }
3098     if ((M - 1 - (StageCnt * 3)) == 2) {
3099       ibfR4(ioptr, M, NDiffU);        /* 1 radix 4 stage */
3100       NDiffU *= 4;
3101     }
3102     if (M <= (int32_t) MCACHE)
3103       ibfstages(ioptr, M, Utbl, 2, NDiffU, StageCnt); /*  RADIX 8 Stages */
3104     else
3105       ifftrecurs(ioptr, M, Utbl, 2, NDiffU, StageCnt); /* RADIX 8 Stages */
3106   }
3107 }
3108 
3109 
fftInit(CSOUND * csound,int32_t M)3110 static void fftInit(CSOUND *csound, int32_t M)
3111 {
3112   /* malloc and init cosine and bit reversed tables for a given size  */
3113   /* fft, ifft, rfft, rifft                                           */
3114   /* INPUTS                                                           */
3115   /*   M = log2 of fft size (ex M=10 for 1024 point fft)              */
3116   /* OUTPUTS                                                          */
3117   /*   private cosine and bit reversed tables                         */
3118 
3119   MYFLT **UtblArray;
3120   int16 **BRLowArray;
3121   int32_t   i;
3122 
3123   if (!csound->FFT_max_size) {
3124     if (csound->FFT_table_1 == NULL)
3125       csound->FFT_table_1 = csound->Malloc(csound, sizeof(MYFLT*) * 32);
3126     if (csound->FFT_table_2 == NULL)
3127       csound->FFT_table_2 = csound->Malloc(csound, sizeof(int16*) * 32);
3128     for (i = 0; i < 32; i++) {
3129       ((MYFLT**) csound->FFT_table_1)[i] = (MYFLT*) NULL;
3130       ((int16**) csound->FFT_table_2)[i] = (int16*) NULL;
3131     }
3132   }
3133   UtblArray = (MYFLT**) csound->FFT_table_1;
3134   BRLowArray = (int16**) csound->FFT_table_2;
3135 
3136   /*** I did NOT test cases with M>27 ***/
3137   /* init cos table */
3138   UtblArray[M] = (MYFLT*) csound->Malloc(csound,
3139                                          (POW2(M) / 4 + 1) * sizeof(MYFLT));
3140   fftCosInit(M, UtblArray[M]);
3141   if (M > 1) {
3142     /* init bit reversed table for cmplx FFT */
3143     if (BRLowArray[M / 2] == NULL) {
3144       BRLowArray[M / 2] =
3145         (int16*) csound->Malloc(csound, POW2(M / 2 - 1) * sizeof(int16));
3146       fftBRInit(M, BRLowArray[M / 2]);
3147     }
3148   }
3149   if (M > 2) {
3150     /* init bit reversed table for real FFT */
3151     if (BRLowArray[(M - 1) / 2] == 0) {
3152       BRLowArray[(M - 1) / 2] =
3153         (int16*) csound->Malloc(csound,
3154                                 POW2((M - 1) / 2 - 1) * sizeof(int16));
3155       fftBRInit(M - 1, BRLowArray[(M - 1) / 2]);
3156     }
3157   }
3158 
3159   csound->FFT_max_size |= (1 << M);
3160 }
3161 
3162 
ConvertFFTSize(CSOUND * csound,int32_t N)3163 static inline int32_t ConvertFFTSize(CSOUND *csound, int32_t N)
3164 {
3165   if (N <= 0)
3166     return (-N);
3167   switch (N) {
3168   case 0x00000001:  return 0;
3169   case 0x00000002:  return 1;
3170   case 0x00000004:  return 2;
3171   case 0x00000008:  return 3;
3172   case 0x00000010:  return 4;
3173   case 0x00000020:  return 5;
3174   case 0x00000040:  return 6;
3175   case 0x00000080:  return 7;
3176   case 0x00000100:  return 8;
3177   case 0x00000200:  return 9;
3178   case 0x00000400:  return 10;
3179   case 0x00000800:  return 11;
3180   case 0x00001000:  return 12;
3181   case 0x00002000:  return 13;
3182   case 0x00004000:  return 14;
3183   case 0x00008000:  return 15;
3184   case 0x00010000:  return 16;
3185   case 0x00020000:  return 17;
3186   case 0x00040000:  return 18;
3187   case 0x00080000:  return 19;
3188   case 0x00100000:  return 20;
3189   case 0x00200000:  return 21;
3190   case 0x00400000:  return 22;
3191   case 0x00800000:  return 23;
3192   case 0x01000000:  return 24;
3193   case 0x02000000:  return 25;
3194   case 0x04000000:  return 26;
3195   case 0x08000000:  return 27;
3196   case 0x10000000:  return 28;
3197   }
3198   csound->Warning(csound, Str(" *** fftlib.c: internal error: "
3199                               "invalid FFT size: %d"), N);
3200   return 0;
3201 }
3202 
getTablePointers(CSOUND * p,MYFLT ** ct,int16 ** bt,int32_t cn,int32_t bn)3203 static inline void getTablePointers(CSOUND *p, MYFLT **ct, int16 **bt,
3204                                     int32_t cn, int32_t bn)
3205 {
3206   if (!(p->FFT_max_size & (1 << cn)))
3207     fftInit(p, cn);
3208   *ct = ((MYFLT**) p->FFT_table_1)[cn];
3209   *bt = ((int16**) p->FFT_table_2)[bn];
3210 }
3211 
3212 
3213 /**
3214  * Returns the amplitude scale that should be applied to the result of
3215  * an inverse complex FFT with a length of 'FFTsize' samples.
3216  */
csoundGetInverseComplexFFTScale(CSOUND * csound,int32_t FFTsize)3217 MYFLT csoundGetInverseComplexFFTScale(CSOUND *csound, int32_t FFTsize)
3218 {
3219   IGN(FFTsize);
3220   IGN(csound);
3221   return FL(1.0);
3222 }
3223 
3224 /**
3225  * Returns the amplitude scale that should be applied to the result of
3226  * an inverse real FFT with a length of 'FFTsize' samples.
3227  */
csoundGetInverseRealFFTScale(CSOUND * csound,int32_t FFTsize)3228 MYFLT csoundGetInverseRealFFTScale(CSOUND *csound, int32_t FFTsize)
3229 {
3230   IGN(FFTsize);
3231   IGN(csound);
3232   return FL(1.0);
3233 }
3234 
3235 /**
3236  * Compute in-place complex FFT
3237  * FFTsize: FFT length in samples
3238  * buf:     array of FFTsize*2 MYFLT values,
3239  *          in interleaved real/imaginary format
3240  */
csoundComplexFFT(CSOUND * csound,MYFLT * buf,int32_t FFTsize)3241 void csoundComplexFFT(CSOUND *csound, MYFLT *buf, int32_t FFTsize)
3242 {
3243   MYFLT *Utbl;
3244   int16 *BRLow;
3245   int32_t   M;
3246 
3247   M = ConvertFFTSize(csound, FFTsize);
3248   getTablePointers(csound, &Utbl, &BRLow, M, M / 2);
3249   ffts1(buf, M, Utbl, BRLow);
3250 }
3251 
3252 /**
3253  * Compute in-place inverse complex FFT
3254  * FFTsize: FFT length in samples
3255  * buf:     array of FFTsize*2 MYFLT values,
3256  *          in interleaved real/imaginary format
3257  * Output should be scaled by the return value of
3258  * csoundGetInverseComplexFFTScale(csound, FFTsize).
3259  */
3260 
csoundInverseComplexFFT(CSOUND * csound,MYFLT * buf,int32_t FFTsize)3261 void csoundInverseComplexFFT(CSOUND *csound, MYFLT *buf, int32_t FFTsize)
3262 {
3263   MYFLT *Utbl;
3264   int16 *BRLow;
3265   int32_t   M;
3266 
3267   M = ConvertFFTSize(csound, FFTsize);
3268   getTablePointers(csound, &Utbl, &BRLow, M, M / 2);
3269   iffts1(buf, M, Utbl, BRLow);
3270 }
3271 
3272 /**
3273  * Compute in-place real FFT
3274  * FFTsize: FFT length in samples
3275  * buf:     array of FFTsize MYFLT values; output is in interleaved
3276  *          real/imaginary format, except for buf[1] which is the real
3277  *          part for the Nyquist frequency
3278  */
3279 
csoundRealFFT(CSOUND * csound,MYFLT * buf,int32_t FFTsize)3280 void csoundRealFFT(CSOUND *csound, MYFLT *buf, int32_t FFTsize)
3281 {
3282 
3283     MYFLT *Utbl;
3284     int16 *BRLow;
3285     int32_t   M;
3286     M = ConvertFFTSize(csound, FFTsize);
3287     getTablePointers(csound, &Utbl, &BRLow, M, (M - 1) / 2);
3288     rffts1(buf, M, Utbl, BRLow);
3289 }
3290 
3291 /**
3292  * Compute in-place inverse real FFT
3293  * FFTsize: FFT length in samples
3294  * buf:     array of FFTsize MYFLT values; input is expected to be in
3295  *          interleaved real/imaginary format, except for buf[1] which
3296  *          is the real part for the Nyquist frequency
3297  * Output should be scaled by the return value of
3298  * csoundGetInverseRealFFTScale(csound, FFTsize).
3299  */
3300 
csoundInverseRealFFT(CSOUND * csound,MYFLT * buf,int32_t FFTsize)3301 void csoundInverseRealFFT(CSOUND *csound, MYFLT *buf, int32_t FFTsize)
3302 {
3303     MYFLT *Utbl;
3304     int16 *BRLow;
3305     int32_t   M;
3306     M = ConvertFFTSize(csound, FFTsize);
3307     getTablePointers(csound, &Utbl, &BRLow, M, (M - 1) / 2);
3308     riffts1(buf, M, Utbl, BRLow);
3309 }
3310 
3311 /**
3312  * Multiply two arrays (buf1 and buf2) of complex data in the format
3313  * returned by csoundRealFFT(), and leave the result in outbuf, which
3314  * may be the same as either buf1 or buf2.
3315  * An amplitude scale of 'scaleFac' is also applied.
3316  * The arrays should contain 'FFTsize' MYFLT values.
3317  */
3318 
csoundRealFFTMult(CSOUND * csound,MYFLT * outbuf,MYFLT * buf1,MYFLT * buf2,int32_t FFTsize,MYFLT scaleFac)3319 void csoundRealFFTMult(CSOUND *csound, MYFLT *outbuf,
3320                        MYFLT *buf1, MYFLT *buf2, int32_t FFTsize, MYFLT scaleFac)
3321 {
3322   MYFLT re, im;
3323   int32_t   i;
3324   IGN(csound);
3325 
3326   if (scaleFac != FL(1.0)) {
3327     outbuf[0] = buf1[0] * buf2[0] * scaleFac;
3328     if (FFTsize < 2)
3329       return;
3330     outbuf[1] = buf1[1] * buf2[1] * scaleFac;
3331     for (i = 2; i < FFTsize; ) {
3332       re = ((buf1[i] * buf2[i]) - (buf1[i + 1] * buf2[i + 1])) * scaleFac;
3333       im = ((buf1[i] * buf2[i + 1]) + (buf2[i] * buf1[i + 1])) * scaleFac;
3334       outbuf[i++] = re;
3335       outbuf[i++] = im;
3336     }
3337   }
3338   else {
3339     outbuf[0] = buf1[0] * buf2[0];
3340     if (FFTsize < 2)
3341       return;
3342     outbuf[1] = buf1[1] * buf2[1];
3343     for (i = 2; i < FFTsize; ) {
3344       re = (buf1[i] * buf2[i]) - (buf1[i + 1] * buf2[i + 1]);
3345       im = (buf1[i] * buf2[i + 1]) + (buf2[i] * buf1[i + 1]);
3346       outbuf[i++] = re;
3347       outbuf[i++] = im;
3348     }
3349   }
3350 }
3351 
3352 
3353 
3354 /*
3355   New FFT interface
3356   VL, 2016
3357 */
3358 static
pffft_execute(CSOUND_FFT_SETUP * setup,MYFLT * sig)3359 void pffft_execute(CSOUND_FFT_SETUP *setup,
3360                    MYFLT *sig) {
3361   int32_t i, N = setup->N;
3362   float s, *buf;
3363   buf = (float *) setup->buffer;
3364   for(i=0;i<N;i++)
3365     buf[i] = sig[i];
3366   pffft_transform_ordered((PFFFT_Setup *)
3367                           setup->setup,
3368                           buf,buf,NULL,setup->d);
3369   s = (setup->d == PFFFT_BACKWARD ?
3370        (MYFLT) setup->N : FL(1.0));
3371   for(i=0;i<N;i++)
3372     sig[i] = buf[i]/s;
3373 }
3374 
3375 #if defined(__MACH__)
3376 /* vDSP FFT implementation */
3377 #include <Accelerate/Accelerate.h>
3378 static
vDSP_execute(CSOUND_FFT_SETUP * setup,MYFLT * sig)3379 void vDSP_execute(CSOUND_FFT_SETUP *setup,
3380                    MYFLT *sig){
3381 #ifdef USE_DOUBLE
3382   DSPDoubleSplitComplex tmp;
3383 #else
3384   DSPSplitComplex tmp;
3385 #endif
3386   int32_t i,j;
3387   MYFLT s;
3388   int32_t N = setup->N;
3389   tmp.realp = &setup->buffer[0];
3390   tmp.imagp = &setup->buffer[N>>1];
3391   for(i=j=0;i<N;i+=2,j++){
3392     tmp.realp[j] = sig[i];
3393     tmp.imagp[j] = sig[i+1];
3394     }
3395 #ifdef USE_DOUBLE
3396   vDSP_fft_zripD((FFTSetupD) setup->setup,
3397                  &tmp, 1,
3398                  setup->M,setup->d);
3399 #else
3400   vDSP_fft_zrip((FFTSetup) setup->setup,
3401                  &tmp, 1,
3402                  setup->M,setup->d);
3403 #endif
3404  s = (setup->d == -1 ? (MYFLT)(N) : FL(2.0));
3405  for(i=j=0;i<N;i+=2,j++){
3406     sig[i] = tmp.realp[j]/s;
3407     sig[i+1] = tmp.imagp[j]/s;
3408     }
3409 }
3410 #endif
3411 
3412 #define ALIGN_BYTES 64
3413 static
align_alloc(CSOUND * csound,size_t nb_bytes)3414 void *align_alloc(CSOUND *csound, size_t nb_bytes){
3415   void *p, *p0 = csound->Malloc(csound, nb_bytes + ALIGN_BYTES);
3416   if(!p0) return (void *) 0;
3417   p = (void *) (((size_t) p0 + ALIGN_BYTES)
3418                 & (~((size_t) (ALIGN_BYTES-1))));
3419   *((void **) p - 1) = p0;
3420   return p;
3421 }
3422 
setupDispose(CSOUND * csound,void * pp)3423 int32_t setupDispose(CSOUND *csound, void *pp){
3424   IGN(csound);
3425   CSOUND_FFT_SETUP *setup =(CSOUND_FFT_SETUP *) pp;
3426   switch(setup->lib){
3427 #if defined(__MACH__)
3428   case VDSP_LIB:
3429 #ifdef USE_DOUBLE
3430      vDSP_destroy_fftsetupD((FFTSetupD)
3431 #else
3432      vDSP_destroy_fftsetup((FFTSetup)
3433 #endif
3434                            setup->setup);
3435     break;
3436 #endif
3437   case PFFT_LIB:
3438     pffft_destroy_setup((PFFFT_Setup *)setup->setup);
3439     break;
3440   }
3441   return OK;
3442 }
3443 
isPowTwo(int32_t N)3444 int32_t isPowTwo(int32_t N) {
3445   return (N != 0) ? !(N & (N - 1)) : 0;
3446 }
3447 
csoundRealFFT2Setup(CSOUND * csound,int32_t FFTsize,int32_t d)3448 void *csoundRealFFT2Setup(CSOUND *csound,
3449                          int32_t FFTsize,
3450                          int32_t d){
3451   CSOUND_FFT_SETUP *setup;
3452   int32_t lib = csound->oparms->fft_lib;
3453   if(lib == PFFT_LIB && FFTsize <= 16){
3454     csound->Warning(csound,
3455       "FFTsize %d \n"
3456       "Cannot use PFFT with sizes <= 16\n"
3457       "--defaulting to FFTLIB",
3458         FFTsize);
3459     lib = 0;
3460   }
3461   setup = (CSOUND_FFT_SETUP *)
3462     csound->Calloc(csound, sizeof(CSOUND_FFT_SETUP));
3463   setup->N = FFTsize;
3464   setup->p2 = isPowTwo(FFTsize);
3465   switch(lib){
3466 #if defined(__MACH__)
3467   case VDSP_LIB:
3468     setup->M = ConvertFFTSize(csound, FFTsize);
3469     setup->setup = (void *)
3470 #ifdef USE_DOUBLE
3471       vDSP_create_fftsetupD(setup->M,kFFTRadix2);
3472 #else
3473       vDSP_create_fftsetup(setup->M,kFFTRadix2);
3474 #endif
3475       setup->d = (d ==  FFT_FWD ?
3476                 kFFTDirection_Forward :
3477                 kFFTDirection_Inverse);
3478     setup->lib = lib;
3479     break;
3480 #endif
3481   case PFFT_LIB:
3482     setup->setup = (void *)
3483       pffft_new_setup(FFTsize,PFFFT_REAL);
3484     setup->d = (d ==  FFT_FWD ?
3485                 PFFFT_FORWARD :
3486                 PFFFT_BACKWARD);
3487     setup->lib = lib;
3488     break;
3489   default:
3490     setup->lib = 0;
3491     setup->d = d;
3492     return (void *) setup;
3493   }
3494   setup->buffer = (MYFLT *) align_alloc(csound, sizeof(MYFLT)*FFTsize);
3495   csound->RegisterResetCallback(csound, (void*) setup,
3496                                 (int32_t (*)(CSOUND *, void *))
3497                                 setupDispose);
3498   return (void *) setup;
3499 }
3500 
csoundRealFFT2(CSOUND * csound,void * p,MYFLT * sig)3501 void csoundRealFFT2(CSOUND *csound,
3502                      void *p, MYFLT *sig){
3503   CSOUND_FFT_SETUP *setup =
3504         (CSOUND_FFT_SETUP *) p;
3505   switch(setup->lib) {
3506 #if defined(__MACH__)
3507   case VDSP_LIB:
3508     vDSP_execute(setup,sig);
3509     break;
3510 #endif
3511   case PFFT_LIB:
3512     pffft_execute(setup,sig);
3513     break;
3514   default:
3515     (setup->d == FFT_FWD ?
3516       csoundRealFFT(csound,
3517                      sig,setup->N) :
3518       csoundInverseRealFFT(csound,
3519                      sig,setup->N));
3520     setup->lib = 0;
3521   }
3522 }
3523 
3524 
csoundDCTSetup(CSOUND * csound,int32_t FFTsize,int32_t d)3525 void *csoundDCTSetup(CSOUND *csound,
3526                      int32_t FFTsize, int32_t d){
3527  CSOUND_FFT_SETUP *setup;
3528  setup = (CSOUND_FFT_SETUP *)
3529    csoundRealFFT2Setup(csound,
3530                        FFTsize*4,d);
3531  if(setup->lib == 0){
3532   setup->buffer = (MYFLT *)
3533     csound->Calloc(csound, sizeof(MYFLT)*setup->N);
3534  }
3535  return setup;
3536 }
3537 
3538 
pffft_DCT_execute(CSOUND * csound,void * p,MYFLT * sig)3539 void pffft_DCT_execute(CSOUND *csound,
3540                      void *p, MYFLT *sig){
3541   IGN(csound);
3542   CSOUND_FFT_SETUP *setup =
3543         (CSOUND_FFT_SETUP *) p;
3544   int32_t i,j, N= setup->N;
3545   float *buffer = (float *)setup->buffer;
3546   if(setup->d == FFT_FWD){
3547   for(i=j = 0; i < N/2; i+=2, j++){
3548     buffer[i] = FL(0.0);
3549     buffer[i+1] = sig[j];
3550   }
3551   for(i=N/2,j=N/4-1; i < N; i+=2, j--){
3552     buffer[i] = FL(0.0);
3553     buffer[i+1] = sig[j];
3554   }
3555   pffft_transform_ordered((PFFFT_Setup *)
3556                           setup->setup,
3557                           buffer,buffer,
3558                           NULL,PFFFT_FORWARD);
3559   for(i=j=0; i < N/2; i+=2, j++){
3560     sig[j] = buffer[i];
3561   }
3562   } else {
3563   buffer[0] = sig[0];
3564   buffer[1] = -sig[0];
3565   for(i=2,j=1; i < N/2; i+=2, j++){
3566     buffer[i] = sig[j];
3567     buffer[i+1] = FL(0.0);
3568   }
3569   buffer[N/2] = buffer[N/2+1] = FL(0.0);
3570   for(i=N/2+2,j=N/4-1; i < N; i+=2, j--){
3571     buffer[i] = -sig[j];
3572     buffer[i+1] = FL(0.0);
3573   }
3574   pffft_transform_ordered((PFFFT_Setup *)
3575                           setup->setup,
3576                           buffer,buffer,
3577                           NULL,PFFFT_BACKWARD);
3578   for(i=j=0; i < N/2; i+=2, j++){
3579     sig[j] = buffer[i+1]/N;
3580   }
3581   }
3582 }
3583 
3584 #if defined(__MACH__)
vDSP_DCT_execute(CSOUND * csound,void * p,MYFLT * sig)3585 void vDSP_DCT_execute(CSOUND *csound,
3586                      void *p, MYFLT *sig){
3587   IGN(csound);
3588   CSOUND_FFT_SETUP *setup =
3589         (CSOUND_FFT_SETUP *) p;
3590   int32_t i,j, N= setup->N;
3591 #ifdef USE_DOUBLE
3592   DSPDoubleSplitComplex tmp;
3593 #else
3594   DSPSplitComplex tmp;
3595 #endif
3596   tmp.realp = &setup->buffer[0];
3597   tmp.imagp = &setup->buffer[N>>1];
3598   if(setup->d ==  kFFTDirection_Forward){
3599   for(j=0;j<N/4;j++){
3600     tmp.realp[j] = FL(0.0);
3601     tmp.imagp[j] = sig[j];
3602     }
3603   for(i=N/4,j=N/4-1;i<N/2;i++,j--){
3604     tmp.realp[i] = FL(0.0);
3605     tmp.imagp[i] = sig[j];
3606   }
3607 #ifdef USE_DOUBLE
3608   vDSP_fft_zripD((FFTSetupD) setup->setup,
3609 #else
3610   vDSP_fft_zrip((FFTSetup) setup->setup,
3611 #endif
3612                  &tmp, 1,
3613                  setup->M,setup->d);
3614 
3615   for(j=0; j < N/4; j++){
3616     sig[j] = tmp.realp[j]/FL(2.0);
3617   }
3618   } else {
3619   tmp.realp[0] = sig[0];
3620   tmp.imagp[0] = -sig[0];
3621   for(j=1; j < N/4; j++){
3622     tmp.realp[j] = sig[j];
3623     tmp.imagp[j] = FL(0.0);
3624   }
3625   tmp.realp[N/4] = tmp.imagp[N/4] = FL(0.0);
3626   for(i=N/4+1,j=N/4-1; i < N/2; i++, j--){
3627     tmp.realp[i] = -sig[j];
3628     tmp.imagp[i] = FL(0.0);
3629   }
3630 #ifdef USE_DOUBLE
3631   vDSP_fft_zripD((FFTSetupD) setup->setup,
3632 #else
3633   vDSP_fft_zrip((FFTSetup) setup->setup,
3634 #endif
3635                  &tmp, 1,
3636                  setup->M,setup->d);
3637   for(j=0; j < N/4; j++){
3638     sig[j] = tmp.imagp[j]/N;
3639   }
3640   }
3641 }
3642 #endif
3643 
DCT_execute(CSOUND * csound,void * p,MYFLT * sig)3644 void DCT_execute(CSOUND *csound,
3645                      void *p, MYFLT *sig){
3646   CSOUND_FFT_SETUP *setup =
3647         (CSOUND_FFT_SETUP *) p;
3648   int32_t i,j, N= setup->N;
3649   MYFLT *buffer = setup->buffer;
3650   if(setup->d == FFT_FWD){
3651   for(i=j = 0; i < N/2; i+=2, j++){
3652     buffer[i] = FL(0.0);
3653     buffer[i+1] = sig[j];
3654   }
3655   for(i=N/2,j=N/4-1; i < N; i+=2, j--){
3656     buffer[i] = FL(0.0);
3657     buffer[i+1] = sig[j];
3658   }
3659   csoundRealFFT(csound,buffer,N);
3660   for(i=j=0; i < N/2; i+=2, j++){
3661     sig[j] = buffer[i];
3662   }
3663   } else {
3664   buffer[0] = sig[0];
3665   buffer[1] = -sig[0];
3666   for(i=2,j=1; i < N/2; i+=2, j++){
3667     buffer[i] = sig[j];
3668     buffer[i+1] = FL(0.0);
3669   }
3670   buffer[N/2] = buffer[N/2+1] = FL(0.0);
3671   for(i=N/2+2,j=N/4-1; i < N; i+=2, j--){
3672     buffer[i] = -sig[j];
3673     buffer[i+1] = FL(0.0);
3674   }
3675   csoundInverseRealFFT(csound,buffer,N);
3676   for(i=j=0; i < N/2; i+=2, j++){
3677     sig[j] = buffer[i+1];
3678   }
3679   }
3680 }
3681 
csoundDCT(CSOUND * csound,void * p,MYFLT * sig)3682 void csoundDCT(CSOUND *csound,
3683                void *p, MYFLT *sig){
3684 CSOUND_FFT_SETUP *setup =
3685         (CSOUND_FFT_SETUP *) p;
3686   switch(setup->lib) {
3687 #if defined(__MACH__)
3688   case VDSP_LIB:
3689     vDSP_DCT_execute(csound,setup,sig);
3690     break;
3691 #endif
3692   case PFFT_LIB:
3693     pffft_DCT_execute(csound,setup,sig);
3694     break;
3695   default:
3696     DCT_execute(csound,setup,sig);
3697     setup->lib = 0;
3698   }
3699 }
3700 
3701 /* =======--====================*/
3702 #if 0
3703 #ifdef HAVE_VECLIB
3704 /* vDSP FFT interface */
3705 
3706 #define ALIGN64 64
3707 static
3708 void *vDSP_alloc(CSOUND *csound, size_t nb_bytes){
3709   void *p, *p0 = csound->Malloc(csound, nb_bytes + ALIGN64);
3710   if(!p0) return (void *) 0;
3711   p = (void *) (((size_t) p0 + ALIGN64)
3712                 & (~((size_t) (ALIGN64-1))));
3713   *((void **) p - 1) = p0;
3714   return p;
3715 }
3716 
3717 static
3718 void vDSP_free(CSOUND *csound, void *p) {
3719   if(p)csound->Free(csound, *((void **) p - 1));
3720 }
3721 
3722 /* reset vdsp */
3723 int32_t vDSP_reset(CSOUND *csound, void *pp){
3724   IGN(pp);
3725 #ifdef USE_DOUBLE
3726   vDSP_destroy_fftsetupD(csound->vdsp_setup);
3727 #else
3728   vDSP_destroy_fftsetup(csound->vdsp_setup);
3729 #endif
3730   csound->vdsp_setup = NULL;
3731   vDSP_free(csound, csound->vdsp_buffer);
3732   return OK;
3733 }
3734 
3735 /* create setup if setup does not exist */
3736 static
3737 void vDSP_setup(CSOUND *csound, int32_t FFTsize){
3738   int32_t M = ConvertFFTSize(csound, FFTsize);
3739   if(csound->FFT_max_size < FFTsize){
3740 #ifdef USE_DOUBLE
3741     if(csound->vdsp_setup != NULL)
3742       vDSP_destroy_fftsetupD(csound->vdsp_setup);
3743     csound->vdsp_setup =
3744       vDSP_create_fftsetupD(M,kFFTRadix2);
3745 #else
3746     if(csound->vdsp_setup != NULL)
3747       vDSP_destroy_fftsetup(csound->vdsp_setup);
3748     csound->vdsp_setup =
3749       vDSP_create_fftsetup(M,kFFTRadix2);
3750 #endif
3751     if(csound->vdsp_buffer != NULL)
3752       vDSP_free(csound, csound->vdsp_buffer);
3753     csound->vdsp_buffer = vDSP_alloc(csound,
3754                                      FFTsize*(sizeof(MYFLT)));
3755     if(csound->FFT_max_size == 0)
3756       csound->RegisterResetCallback(csound, (void*) NULL,
3757                                     (int32_t (*)(CSOUND *, void *))
3758                                     vDSP_reset);
3759     csound->FFT_max_size = FFTsize;
3760   }
3761 }
3762 
3763 static
3764 void vDSP_RealFFT(CSOUND *csound,int32_t FFTsize,MYFLT *sig,FFTDirection d){
3765 #ifdef USE_DOUBLE
3766   DSPDoubleSplitComplex tmp;
3767 #else
3768   DSPSplitComplex tmp;
3769 #endif
3770   int32_t i,j;
3771   MYFLT s;
3772   vDSP_setup(csound, FFTsize);
3773   tmp.realp = &csound->vdsp_buffer[0];
3774   tmp.imagp = &csound->vdsp_buffer[FFTsize>>1];
3775   for(i=j=0;i<FFTsize;i+=2,j++){
3776     tmp.realp[j] = sig[i];
3777     tmp.imagp[j] = sig[i+1];
3778     }
3779 #ifdef USE_DOUBLE
3780   vDSP_fft_zripD(csound->vdsp_setup, &tmp, 1,
3781                  ConvertFFTSize(csound, FFTsize),
3782                  d);
3783 #else
3784   vDSP_fft_zrip(csound->vdsp_setup, &tmp, 1,
3785                 ConvertFFTSize(csound, FFTsize),
3786                 d);
3787 #endif
3788  s = (d == -1 ? (MYFLT)(FFTsize) : FL(2.0));
3789  for(i=j=0;i<FFTsize;i+=2,j++){
3790     sig[i] = tmp.realp[j]/s;
3791     sig[i+1] = tmp.imagp[j]/s;
3792     }
3793 
3794 }
3795 
3796 /* these functions are available from OSX 10.9 */
3797 int32_t vDSP_reset_New(CSOUND *csound, void *pp){
3798   IGN(pp);
3799 #ifdef USE_DOUBLE
3800   vDSP_DFT_DestroySetupD(csound->vdsp_setup_fwd);
3801   vDSP_DFT_DestroySetupD(csound->vdsp_setup_inv);
3802 #else
3803   vDSP_DFT_DestroySetup(csound->vdsp_setup_fwd);
3804   vDSP_DFT_DestroySetup(csound->vdsp_setup_inv);
3805 #endif
3806   vDSP_free(csound, csound->vdsp_buffer);
3807   return OK;
3808 }
3809 
3810 /* create setup if setup does not exist */
3811 static
3812 #ifdef USE_DOUBLE
3813 vDSP_DFT_SetupD
3814 #else
3815 vDSP_DFT_Setup
3816 #endif
3817 vDSP_setup_New(CSOUND *csound, int32_t FFTsize, int32_t d){
3818   if(csound->vdsp_setup_fwd == NULL ||
3819      csound->vdsp_setup_inv == NULL){
3820 #ifdef USE_DOUBLE
3821     if(d == 1)
3822       csound->vdsp_setup_fwd =
3823         vDSP_DFT_zrop_CreateSetupD(csound->vdsp_setup_fwd,
3824                                    FFTsize, d);
3825     else
3826       csound->vdsp_setup_inv =
3827         vDSP_DFT_zrop_CreateSetupD(csound->vdsp_setup_inv,
3828                                    FFTsize, d);
3829 #else
3830     if(d == 1)
3831       csound->vdsp_setup_fwd =
3832         vDSP_DFT_zrop_CreateSetup(csound->vdsp_setup_fwd,
3833                                   FFTsize, d);
3834     else
3835       csound->vdsp_setup_inv =
3836         vDSP_DFT_zrop_CreateSetup(csound->vdsp_setup_inv,
3837                                   FFTsize, d);
3838 #endif
3839   }
3840   if(csound->FFT_max_size < FFTsize) {
3841     vDSP_free(csound, csound->vdsp_buffer);
3842     csound->vdsp_buffer = vDSP_alloc(csound,
3843                                      FFTsize*(sizeof(MYFLT)));
3844   }
3845 
3846   if(csound->FFT_max_size == 0)
3847     csound->RegisterResetCallback(csound, (void*) NULL,
3848                                   (int32_t (*)(CSOUND *, void *))
3849                                   vDSP_reset_New);
3850   csound->FFT_max_size = FFTsize;
3851   return d == 1 ? csound->vdsp_setup_fwd :
3852     csound->vdsp_setup_inv;
3853 }
3854 
3855 void vDSP_RealFFT_New(CSOUND *csound,int32_t FFTsize,MYFLT *sig,
3856                   FFTDirection d){
3857   int32_t i,j;
3858   MYFLT s;
3859 #ifdef USE_DOUBLE
3860   vDSP_DFT_SetupD setup;
3861 #else
3862   vDSP_DFT_Setup setup;
3863 #endif
3864   setup = vDSP_setup_New(csound, FFTsize, d);
3865   MYFLT *inr = &csound->vdsp_buffer[0];
3866   MYFLT *ini = &csound->vdsp_buffer[FFTsize>>1];
3867 
3868   for(i=j=0;i<FFTsize;i+=2,j++){
3869     inr[j] = sig[i];
3870     ini[j] = sig[i+1];
3871   }
3872 #ifdef USE_DOUBLE
3873   vDSP_DFT_ExecuteD(setup,inr,ini,inr,ini);
3874 #else
3875   vDSP_DFT_Execute(setup,inr,ini,inr,ini);
3876 #endif
3877     s = (d == -1 ? (MYFLT)(FFTsize) : FL(2.0));
3878   for(i=j=0;i<FFTsize;i+=2,j++){
3879     sig[i] = inr[j]/s;
3880     sig[i+1] = ini[j]/s;
3881   }
3882 }
3883 
3884 #else /* HAVE_VECLIB */
3885 int32_t pffft_reset(CSOUND *csound, void *pp){
3886   IGN(pp);
3887   int32_t i;
3888   for(i=0; i < 32; i++)
3889     if(csound->setup[i]) pffft_destroy_setup(csound->setup[i]);
3890   pffft_aligned_free(csound->vdsp_buffer);
3891   return OK;
3892 }
3893 
3894 /* create setup if setup does not exist */
3895 static
3896 void pffft_setup(CSOUND *csound, int32_t FFTsize, int32_t M){
3897   if(csound->FFT_max_size != FFTsize){
3898     if(csound->FFT_max_size == 0)
3899       csound->RegisterResetCallback(csound, (void*) NULL,
3900                                     (int32_t (*)(CSOUND *, void *)) pffft_reset);
3901     if(csound->setup[M] == NULL)
3902        csound->setup[M] = pffft_new_setup(FFTsize,PFFFT_REAL);
3903     if(csound->FFT_max_size < FFTsize) {
3904       pffft_aligned_free(csound->vdsp_buffer);
3905       csound->vdsp_buffer = (MYFLT *) pffft_aligned_malloc(FFTsize*(sizeof(float)));
3906       memset(csound->vdsp_buffer,0, FFTsize*(sizeof(float)));
3907       csound->FFT_max_size = FFTsize;
3908     }
3909   }
3910 }
3911 
3912 static
3913 void pffft_RealFFT(CSOUND *csound,
3914                    int32_t FFTsize,MYFLT *sig,
3915                    int32_t d){
3916   int32_t i;
3917   float s, *buf;
3918   int32_t M = ConvertFFTSize(csound, FFTsize);
3919   pffft_setup(csound, FFTsize, M);
3920   buf = (float *)csound->vdsp_buffer;
3921   for(i=0;i<FFTsize;i++)
3922     buf[i] = sig[i];
3923   pffft_transform_ordered(csound->setup[M],
3924                           buf,buf,NULL,d);
3925   s = (d == PFFFT_BACKWARD ? (MYFLT)FFTsize : FL(1.0));
3926   for(i=0;i<FFTsize;i++)
3927     sig[i] = buf[i]/s;
3928 }
3929 #endif
3930 #endif
3931 
3932