1 /*
2     FFT library
3     based on public domain code by John Green <green_jt@vsdec.npt.nuwc.navy.mil>
4     original version is available at
5       http://hyperarchive.lcs.mit.edu/
6             /HyperArchive/Archive/dev/src/ffts-for-risc-2-c.hqx
7     ported to Csound by Istvan Varga, 2005
8 */
9 
10 #include <stdint.h>
11 #include <stdlib.h>
12 #include <math.h>
13 #include "base.h"
14 
15 #ifndef M_PI
16 #define M_PI 3.14159265358979323846
17 #endif
18 
19 #define POW2(m) ((uint32_t) 1 << (m))       /* integer power of 2 for m<32 */
20 
21 /* fft's with M bigger than this bust primary cache */
22 #define MCACHE  (11 - (sizeof(SPFLOAT) / 8))
23 
24 /* some math constants to 40 decimal places */
25 #define MYPI      3.141592653589793238462643383279502884197   /* pi         */
26 #define MYROOT2   1.414213562373095048801688724209698078569   /* sqrt(2)    */
27 #define MYCOSPID8 0.9238795325112867561281831893967882868224  /* cos(pi/8)  */
28 #define MYSINPID8 0.3826834323650897717284599840303988667613  /* sin(pi/8)  */
29 
30 /*****************************************************
31 * routines to initialize tables used by fft routines *
32 *****************************************************/
33 
34 static void fftCosInit(int M, SPFLOAT *Utbl)
35 {
36     /* Compute Utbl, the cosine table for ffts  */
37     /* of size (pow(2,M)/4 +1)                  */
38     /* INPUTS                                   */
39     /*   M = log2 of fft size                   */
40     /* OUTPUTS                                  */
41     /*   *Utbl = cosine table                   */
42     unsigned int fftN = POW2(M);
43     unsigned int i1;
44 
45     Utbl[0] = 1.0;
46     for (i1 = 1; i1 < fftN/4; i1++)
47       Utbl[i1] = cos((2.0 * M_PI * (SPFLOAT)i1) / (SPFLOAT)fftN);
48     Utbl[fftN/4] = 0.0;
49 }
50 
51 void fftBRInit(int M, int16_t *BRLow)
52 {
53     /* Compute BRLow, the bit reversed table for ffts */
54     /* of size pow(2,M/2 -1)                          */
55     /* INPUTS                                         */
56     /*   M = log2 of fft size                         */
57     /* OUTPUTS                                        */
58     /*   *BRLow = bit reversed counter table          */
59     int Mroot_1 = M / 2 - 1;
60     int Nroot_1 = POW2(Mroot_1);
61     int i1;
62     int bitsum;
63     int bitmask;
64     int bit;
65 
66     for (i1 = 0; i1 < Nroot_1; i1++) {
67       bitsum = 0;
68       bitmask = 1;
69       for (bit = 1; bit <= Mroot_1; bitmask <<= 1, bit++)
70         if (i1 & bitmask)
71           bitsum = bitsum + (Nroot_1 >> bit);
72       BRLow[i1] = bitsum;
73     }
74 }
75 
76 /*****************
77 * parts of ffts1 *
78 *****************/
79 
80 static void bitrevR2(SPFLOAT *ioptr, int M, int16_t *BRLow)
81 {
82     /*** bit reverse and first radix 2 stage of forward or inverse fft ***/
83     SPFLOAT f0r;
84     SPFLOAT f0i;
85     SPFLOAT f1r;
86     SPFLOAT f1i;
87     SPFLOAT f2r;
88     SPFLOAT f2i;
89     SPFLOAT f3r;
90     SPFLOAT f3i;
91     SPFLOAT f4r;
92     SPFLOAT f4i;
93     SPFLOAT f5r;
94     SPFLOAT f5i;
95     SPFLOAT f6r;
96     SPFLOAT f6i;
97     SPFLOAT f7r;
98     SPFLOAT f7i;
99     SPFLOAT t0r;
100     SPFLOAT t0i;
101     SPFLOAT t1r;
102     SPFLOAT t1i;
103     SPFLOAT *p0r;
104     SPFLOAT *p1r;
105     SPFLOAT *IOP;
106     SPFLOAT *iolimit;
107     int Colstart;
108     int iCol;
109     unsigned int posA;
110     unsigned int posAi;
111     unsigned int posB;
112     unsigned int posBi;
113 
114     const unsigned int Nrems2 = POW2((M + 3) / 2);
115     const unsigned int Nroot_1_ColInc = POW2(M) - Nrems2;
116     const unsigned int Nroot_1 = POW2(M / 2 - 1) - 1;
117     const unsigned int ColstartShift = (M + 1) / 2 + 1;
118 
119     posA = POW2(M);               /* 1/2 of POW2(M) complexes */
120     posAi = posA + 1;
121     posB = posA + 2;
122     posBi = posB + 1;
123 
124     iolimit = ioptr + Nrems2;
125     for (; ioptr < iolimit; ioptr += POW2(M / 2 + 1)) {
126       for (Colstart = Nroot_1; Colstart >= 0; Colstart--) {
127         iCol = Nroot_1;
128         p0r = ioptr + Nroot_1_ColInc + BRLow[Colstart] * 4;
129         IOP = ioptr + (Colstart << ColstartShift);
130         p1r = IOP + BRLow[iCol] * 4;
131         f0r = *(p0r);
132         f0i = *(p0r + 1);
133         f1r = *(p0r + posA);
134         f1i = *(p0r + posAi);
135         for (; iCol > Colstart;) {
136           f2r = *(p0r + 2);
137           f2i = *(p0r + (2 + 1));
138           f3r = *(p0r + posB);
139           f3i = *(p0r + posBi);
140           f4r = *(p1r);
141           f4i = *(p1r + 1);
142           f5r = *(p1r + posA);
143           f5i = *(p1r + posAi);
144           f6r = *(p1r + 2);
145           f6i = *(p1r + (2 + 1));
146           f7r = *(p1r + posB);
147           f7i = *(p1r + posBi);
148 
149           t0r = f0r + f1r;
150           t0i = f0i + f1i;
151           f1r = f0r - f1r;
152           f1i = f0i - f1i;
153           t1r = f2r + f3r;
154           t1i = f2i + f3i;
155           f3r = f2r - f3r;
156           f3i = f2i - f3i;
157           f0r = f4r + f5r;
158           f0i = f4i + f5i;
159           f5r = f4r - f5r;
160           f5i = f4i - f5i;
161           f2r = f6r + f7r;
162           f2i = f6i + f7i;
163           f7r = f6r - f7r;
164           f7i = f6i - f7i;
165 
166           *(p1r) = t0r;
167           *(p1r + 1) = t0i;
168           *(p1r + 2) = f1r;
169           *(p1r + (2 + 1)) = f1i;
170           *(p1r + posA) = t1r;
171           *(p1r + posAi) = t1i;
172           *(p1r + posB) = f3r;
173           *(p1r + posBi) = f3i;
174           *(p0r) = f0r;
175           *(p0r + 1) = f0i;
176           *(p0r + 2) = f5r;
177           *(p0r + (2 + 1)) = f5i;
178           *(p0r + posA) = f2r;
179           *(p0r + posAi) = f2i;
180           *(p0r + posB) = f7r;
181           *(p0r + posBi) = f7i;
182 
183           p0r -= Nrems2;
184           f0r = *(p0r);
185           f0i = *(p0r + 1);
186           f1r = *(p0r + posA);
187           f1i = *(p0r + posAi);
188           iCol -= 1;
189           p1r = IOP + BRLow[iCol] * 4;
190         }
191         f2r = *(p0r + 2);
192         f2i = *(p0r + (2 + 1));
193         f3r = *(p0r + posB);
194         f3i = *(p0r + posBi);
195 
196         t0r = f0r + f1r;
197         t0i = f0i + f1i;
198         f1r = f0r - f1r;
199         f1i = f0i - f1i;
200         t1r = f2r + f3r;
201         t1i = f2i + f3i;
202         f3r = f2r - f3r;
203         f3i = f2i - f3i;
204 
205         *(p0r) = t0r;
206         *(p0r + 1) = t0i;
207         *(p0r + 2) = f1r;
208         *(p0r + (2 + 1)) = f1i;
209         *(p0r + posA) = t1r;
210         *(p0r + posAi) = t1i;
211         *(p0r + posB) = f3r;
212         *(p0r + posBi) = f3i;
213       }
214     }
215 }
216 
217 static void fft2pt(SPFLOAT *ioptr)
218 {
219     /***   RADIX 2 fft      ***/
220     SPFLOAT f0r, f0i, f1r, f1i;
221     SPFLOAT t0r, t0i;
222 
223     /* bit reversed load */
224     f0r = ioptr[0];
225     f0i = ioptr[1];
226     f1r = ioptr[2];
227     f1i = ioptr[3];
228 
229     /* Butterflys           */
230     /*
231        f0   -       -       t0
232        f1   -  1 -  f1
233      */
234 
235     t0r = f0r + f1r;
236     t0i = f0i + f1i;
237     f1r = f0r - f1r;
238     f1i = f0i - f1i;
239 
240     /* store result */
241     ioptr[0] = t0r;
242     ioptr[1] = t0i;
243     ioptr[2] = f1r;
244     ioptr[3] = f1i;
245 }
246 
247 static void fft4pt(SPFLOAT *ioptr)
248 {
249     /***   RADIX 4 fft      ***/
250     SPFLOAT f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i;
251     SPFLOAT t0r, t0i, t1r, t1i;
252 
253     /* bit reversed load */
254     f0r = ioptr[0];
255     f0i = ioptr[1];
256     f1r = ioptr[4];
257     f1i = ioptr[5];
258     f2r = ioptr[2];
259     f2i = ioptr[3];
260     f3r = ioptr[6];
261     f3i = ioptr[7];
262 
263     /* Butterflys           */
264     /*
265        f0   -       -       t0      -       -       f0
266        f1   -  1 -  f1      -       -       f1
267        f2   -       -       f2      -  1 -  f2
268        f3   -  1 -  t1      - -i -  f3
269      */
270 
271     t0r = f0r + f1r;
272     t0i = f0i + f1i;
273     f1r = f0r - f1r;
274     f1i = f0i - f1i;
275 
276     t1r = f2r - f3r;
277     t1i = f2i - f3i;
278     f2r = f2r + f3r;
279     f2i = f2i + f3i;
280 
281     f0r = t0r + f2r;
282     f0i = t0i + f2i;
283     f2r = t0r - f2r;
284     f2i = t0i - f2i;
285 
286     f3r = f1r - t1i;
287     f3i = f1i + t1r;
288     f1r = f1r + t1i;
289     f1i = f1i - t1r;
290 
291     /* store result */
292     ioptr[0] = f0r;
293     ioptr[1] = f0i;
294     ioptr[2] = f1r;
295     ioptr[3] = f1i;
296     ioptr[4] = f2r;
297     ioptr[5] = f2i;
298     ioptr[6] = f3r;
299     ioptr[7] = f3i;
300 }
301 
302 static void fft8pt(SPFLOAT *ioptr)
303 {
304     /***   RADIX 8 fft      ***/
305     SPFLOAT w0r = (SPFLOAT)(1.0 / MYROOT2);    /* cos(pi/4)   */
306     SPFLOAT f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i;
307     SPFLOAT f4r, f4i, f5r, f5i, f6r, f6i, f7r, f7i;
308     SPFLOAT t0r, t0i, t1r, t1i;
309     const SPFLOAT Two = 2.0;
310 
311     /* bit reversed load */
312     f0r = ioptr[0];
313     f0i = ioptr[1];
314     f1r = ioptr[8];
315     f1i = ioptr[9];
316     f2r = ioptr[4];
317     f2i = ioptr[5];
318     f3r = ioptr[12];
319     f3i = ioptr[13];
320     f4r = ioptr[2];
321     f4i = ioptr[3];
322     f5r = ioptr[10];
323     f5i = ioptr[11];
324     f6r = ioptr[6];
325     f6i = ioptr[7];
326     f7r = ioptr[14];
327     f7i = ioptr[15];
328     /* Butterflys           */
329     /*
330        f0   -       -       t0      -       -       f0      -       -       f0
331        f1   -  1 -  f1      -       -       f1      -       -       f1
332        f2   -       -       f2      -  1 -  f2      -       -       f2
333        f3   -  1 -  t1      - -i -  f3      -       -       f3
334        f4   -       -       t0      -       -       f4      -  1 -  t0
335        f5   -  1 -  f5      -       -       f5      - w3 -  f4
336        f6   -       -       f6      -  1 -  f6      - -i -  t1
337        f7   -  1 -  t1      - -i -  f7      - iw3-  f6
338      */
339 
340     t0r = f0r + f1r;
341     t0i = f0i + f1i;
342     f1r = f0r - f1r;
343     f1i = f0i - f1i;
344 
345     t1r = f2r - f3r;
346     t1i = f2i - f3i;
347     f2r = f2r + f3r;
348     f2i = f2i + f3i;
349 
350     f0r = t0r + f2r;
351     f0i = t0i + f2i;
352     f2r = t0r - f2r;
353     f2i = t0i - f2i;
354 
355     f3r = f1r - t1i;
356     f3i = f1i + t1r;
357     f1r = f1r + t1i;
358     f1i = f1i - t1r;
359 
360     t0r = f4r + f5r;
361     t0i = f4i + f5i;
362     f5r = f4r - f5r;
363     f5i = f4i - f5i;
364 
365     t1r = f6r - f7r;
366     t1i = f6i - f7i;
367     f6r = f6r + f7r;
368     f6i = f6i + f7i;
369 
370     f4r = t0r + f6r;
371     f4i = t0i + f6i;
372     f6r = t0r - f6r;
373     f6i = t0i - f6i;
374 
375     f7r = f5r - t1i;
376     f7i = f5i + t1r;
377     f5r = f5r + t1i;
378     f5i = f5i - t1r;
379 
380     t0r = f0r - f4r;
381     t0i = f0i - f4i;
382     f0r = f0r + f4r;
383     f0i = f0i + f4i;
384 
385     t1r = f2r - f6i;
386     t1i = f2i + f6r;
387     f2r = f2r + f6i;
388     f2i = f2i - f6r;
389 
390     f4r = f1r - f5r * w0r - f5i * w0r;
391     f4i = f1i + f5r * w0r - f5i * w0r;
392     f1r = f1r * Two - f4r;
393     f1i = f1i * Two - f4i;
394 
395     f6r = f3r + f7r * w0r - f7i * w0r;
396     f6i = f3i + f7r * w0r + f7i * w0r;
397     f3r = f3r * Two - f6r;
398     f3i = f3i * Two - f6i;
399 
400     /* store result */
401     ioptr[0] = f0r;
402     ioptr[1] = f0i;
403     ioptr[2] = f1r;
404     ioptr[3] = f1i;
405     ioptr[4] = f2r;
406     ioptr[5] = f2i;
407     ioptr[6] = f3r;
408     ioptr[7] = f3i;
409     ioptr[8] = t0r;
410     ioptr[9] = t0i;
411     ioptr[10] = f4r;
412     ioptr[11] = f4i;
413     ioptr[12] = t1r;
414     ioptr[13] = t1i;
415     ioptr[14] = f6r;
416     ioptr[15] = f6i;
417 }
418 
419 static void bfR2(SPFLOAT *ioptr, int M, int NDiffU)
420 {
421     /*** 2nd radix 2 stage ***/
422     unsigned int pos;
423     unsigned int posi;
424     unsigned int pinc;
425     unsigned int pnext;
426     unsigned int NSameU;
427     unsigned int SameUCnt;
428 
429     SPFLOAT *pstrt;
430     SPFLOAT *p0r, *p1r, *p2r, *p3r;
431 
432     SPFLOAT f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i;
433     SPFLOAT f4r, f4i, f5r, f5i, f6r, f6i, f7r, f7i;
434 
435     pinc = NDiffU * 2;            /* 2 floats per complex */
436     pnext = pinc * 4;
437     pos = 2;
438     posi = pos + 1;
439     NSameU = POW2(M) / 4 / NDiffU;        /* 4 Us at a time */
440     pstrt = ioptr;
441     p0r = pstrt;
442     p1r = pstrt + pinc;
443     p2r = p1r + pinc;
444     p3r = p2r + pinc;
445 
446     /* Butterflys           */
447     /*
448        f0   -       -       f4
449        f1   -  1 -  f5
450        f2   -       -       f6
451        f3   -  1 -  f7
452      */
453     /* Butterflys           */
454     /*
455        f0   -       -       f4
456        f1   -  1 -  f5
457        f2   -       -       f6
458        f3   -  1 -  f7
459      */
460 
461     for (SameUCnt = NSameU; SameUCnt > 0; SameUCnt--) {
462 
463       f0r = *p0r;
464       f1r = *p1r;
465       f0i = *(p0r + 1);
466       f1i = *(p1r + 1);
467       f2r = *p2r;
468       f3r = *p3r;
469       f2i = *(p2r + 1);
470       f3i = *(p3r + 1);
471 
472       f4r = f0r + f1r;
473       f4i = f0i + f1i;
474       f5r = f0r - f1r;
475       f5i = f0i - f1i;
476 
477       f6r = f2r + f3r;
478       f6i = f2i + f3i;
479       f7r = f2r - f3r;
480       f7i = f2i - f3i;
481 
482       *p0r = f4r;
483       *(p0r + 1) = f4i;
484       *p1r = f5r;
485       *(p1r + 1) = f5i;
486       *p2r = f6r;
487       *(p2r + 1) = f6i;
488       *p3r = f7r;
489       *(p3r + 1) = f7i;
490 
491       f0r = *(p0r + pos);
492       f1i = *(p1r + posi);
493       f0i = *(p0r + posi);
494       f1r = *(p1r + pos);
495       f2r = *(p2r + pos);
496       f3i = *(p3r + posi);
497       f2i = *(p2r + posi);
498       f3r = *(p3r + pos);
499 
500       f4r = f0r + f1i;
501       f4i = f0i - f1r;
502       f5r = f0r - f1i;
503       f5i = f0i + f1r;
504 
505       f6r = f2r + f3i;
506       f6i = f2i - f3r;
507       f7r = f2r - f3i;
508       f7i = f2i + f3r;
509 
510       *(p0r + pos) = f4r;
511       *(p0r + posi) = f4i;
512       *(p1r + pos) = f5r;
513       *(p1r + posi) = f5i;
514       *(p2r + pos) = f6r;
515       *(p2r + posi) = f6i;
516       *(p3r + pos) = f7r;
517       *(p3r + posi) = f7i;
518 
519       p0r += pnext;
520       p1r += pnext;
521       p2r += pnext;
522       p3r += pnext;
523     }
524 }
525 
526 static void bfR4(SPFLOAT *ioptr, int M, int NDiffU)
527 {
528     /*** 1 radix 4 stage ***/
529     unsigned int pos;
530     unsigned int posi;
531     unsigned int pinc;
532     unsigned int pnext;
533     unsigned int pnexti;
534     unsigned int NSameU;
535     unsigned int SameUCnt;
536 
537     SPFLOAT *pstrt;
538     SPFLOAT *p0r, *p1r, *p2r, *p3r;
539 
540     SPFLOAT w1r = 1.0 / MYROOT2;    /* cos(pi/4)   */
541     SPFLOAT f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i;
542     SPFLOAT f4r, f4i, f5r, f5i, f6r, f6i, f7r, f7i;
543     SPFLOAT t1r, t1i;
544     const SPFLOAT Two = 2.0;
545 
546     pinc = NDiffU * 2;            /* 2 floats per complex */
547     pnext = pinc * 4;
548     pnexti = pnext + 1;
549     pos = 2;
550     posi = pos + 1;
551     NSameU = POW2(M) / 4 / NDiffU;        /* 4 pts per butterfly */
552     pstrt = ioptr;
553     p0r = pstrt;
554     p1r = pstrt + pinc;
555     p2r = p1r + pinc;
556     p3r = p2r + pinc;
557 
558     /* Butterflys           */
559     /*
560        f0   -       -       f0      -       -       f4
561        f1   -  1 -  f5      -       -       f5
562        f2   -       -       f6      -  1 -  f6
563        f3   -  1 -  f3      - -i -  f7
564      */
565     /* Butterflys           */
566     /*
567        f0   -       -       f4      -       -       f4
568        f1   - -i -  t1      -       -       f5
569        f2   -       -       f2      - w1 -  f6
570        f3   - -i -  f7      - iw1-  f7
571      */
572 
573     f0r = *p0r;
574     f1r = *p1r;
575     f2r = *p2r;
576     f3r = *p3r;
577     f0i = *(p0r + 1);
578     f1i = *(p1r + 1);
579     f2i = *(p2r + 1);
580     f3i = *(p3r + 1);
581 
582     f5r = f0r - f1r;
583     f5i = f0i - f1i;
584     f0r = f0r + f1r;
585     f0i = f0i + f1i;
586 
587     f6r = f2r + f3r;
588     f6i = f2i + f3i;
589     f3r = f2r - f3r;
590     f3i = f2i - f3i;
591 
592     for (SameUCnt = NSameU - 1; SameUCnt > 0; SameUCnt--) {
593 
594       f7r = f5r - f3i;
595       f7i = f5i + f3r;
596       f5r = f5r + f3i;
597       f5i = f5i - f3r;
598 
599       f4r = f0r + f6r;
600       f4i = f0i + f6i;
601       f6r = f0r - f6r;
602       f6i = f0i - f6i;
603 
604       f2r = *(p2r + pos);
605       f2i = *(p2r + posi);
606       f1r = *(p1r + pos);
607       f1i = *(p1r + posi);
608       f3i = *(p3r + posi);
609       f0r = *(p0r + pos);
610       f3r = *(p3r + pos);
611       f0i = *(p0r + posi);
612 
613       *p3r = f7r;
614       *p0r = f4r;
615       *(p3r + 1) = f7i;
616       *(p0r + 1) = f4i;
617       *p1r = f5r;
618       *p2r = f6r;
619       *(p1r + 1) = f5i;
620       *(p2r + 1) = f6i;
621 
622       f7r = f2r - f3i;
623       f7i = f2i + f3r;
624       f2r = f2r + f3i;
625       f2i = f2i - f3r;
626 
627       f4r = f0r + f1i;
628       f4i = f0i - f1r;
629       t1r = f0r - f1i;
630       t1i = f0i + f1r;
631 
632       f5r = t1r - f7r * w1r + f7i * w1r;
633       f5i = t1i - f7r * w1r - f7i * w1r;
634       f7r = t1r * Two - f5r;
635       f7i = t1i * Two - f5i;
636 
637       f6r = f4r - f2r * w1r - f2i * w1r;
638       f6i = f4i + f2r * w1r - f2i * w1r;
639       f4r = f4r * Two - f6r;
640       f4i = f4i * Two - f6i;
641 
642       f3r = *(p3r + pnext);
643       f0r = *(p0r + pnext);
644       f3i = *(p3r + pnexti);
645       f0i = *(p0r + pnexti);
646       f2r = *(p2r + pnext);
647       f2i = *(p2r + pnexti);
648       f1r = *(p1r + pnext);
649       f1i = *(p1r + pnexti);
650 
651       *(p2r + pos) = f6r;
652       *(p1r + pos) = f5r;
653       *(p2r + posi) = f6i;
654       *(p1r + posi) = f5i;
655       *(p3r + pos) = f7r;
656       *(p0r + pos) = f4r;
657       *(p3r + posi) = f7i;
658       *(p0r + posi) = f4i;
659 
660       f6r = f2r + f3r;
661       f6i = f2i + f3i;
662       f3r = f2r - f3r;
663       f3i = f2i - f3i;
664 
665       f5r = f0r - f1r;
666       f5i = f0i - f1i;
667       f0r = f0r + f1r;
668       f0i = f0i + f1i;
669 
670       p3r += pnext;
671       p0r += pnext;
672       p1r += pnext;
673       p2r += pnext;
674     }
675     f7r = f5r - f3i;
676     f7i = f5i + f3r;
677     f5r = f5r + f3i;
678     f5i = f5i - f3r;
679 
680     f4r = f0r + f6r;
681     f4i = f0i + f6i;
682     f6r = f0r - f6r;
683     f6i = f0i - f6i;
684 
685     f2r = *(p2r + pos);
686     f2i = *(p2r + posi);
687     f1r = *(p1r + pos);
688     f1i = *(p1r + posi);
689     f3i = *(p3r + posi);
690     f0r = *(p0r + pos);
691     f3r = *(p3r + pos);
692     f0i = *(p0r + posi);
693 
694     *p3r = f7r;
695     *p0r = f4r;
696     *(p3r + 1) = f7i;
697     *(p0r + 1) = f4i;
698     *p1r = f5r;
699     *p2r = f6r;
700     *(p1r + 1) = f5i;
701     *(p2r + 1) = f6i;
702 
703     f7r = f2r - f3i;
704     f7i = f2i + f3r;
705     f2r = f2r + f3i;
706     f2i = f2i - f3r;
707 
708     f4r = f0r + f1i;
709     f4i = f0i - f1r;
710     t1r = f0r - f1i;
711     t1i = f0i + f1r;
712 
713     f5r = t1r - f7r * w1r + f7i * w1r;
714     f5i = t1i - f7r * w1r - f7i * w1r;
715     f7r = t1r * Two - f5r;
716     f7i = t1i * Two - f5i;
717 
718     f6r = f4r - f2r * w1r - f2i * w1r;
719     f6i = f4i + f2r * w1r - f2i * w1r;
720     f4r = f4r * Two - f6r;
721     f4i = f4i * Two - f6i;
722 
723     *(p2r + pos) = f6r;
724     *(p1r + pos) = f5r;
725     *(p2r + posi) = f6i;
726     *(p1r + posi) = f5i;
727     *(p3r + pos) = f7r;
728     *(p0r + pos) = f4r;
729     *(p3r + posi) = f7i;
730     *(p0r + posi) = f4i;
731 }
732 
733 static void bfstages(SPFLOAT *ioptr, int M, SPFLOAT *Utbl, int Ustride,
734                      int NDiffU, int StageCnt)
735 {
736     /***   RADIX 8 Stages   ***/
737     unsigned int pos;
738     unsigned int posi;
739     unsigned int pinc;
740     unsigned int pnext;
741     unsigned int NSameU;
742     int          Uinc;
743     int          Uinc2;
744     int          Uinc4;
745     unsigned int DiffUCnt;
746     unsigned int SameUCnt;
747     unsigned int U2toU3;
748 
749     SPFLOAT *pstrt;
750     SPFLOAT *p0r, *p1r, *p2r, *p3r;
751     SPFLOAT *u0r, *u0i, *u1r, *u1i, *u2r, *u2i;
752 
753     SPFLOAT w0r, w0i, w1r, w1i, w2r, w2i, w3r, w3i;
754     SPFLOAT f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i;
755     SPFLOAT f4r, f4i, f5r, f5i, f6r, f6i, f7r, f7i;
756     SPFLOAT t0r, t0i, t1r, t1i;
757     const SPFLOAT Two = 2.0;
758 
759     pinc = NDiffU * 2;            /* 2 floats per complex */
760     pnext = pinc * 8;
761     pos = pinc * 4;
762     posi = pos + 1;
763     NSameU = POW2(M) / 8 / NDiffU;        /* 8 pts per butterfly */
764     Uinc = (int) NSameU * Ustride;
765     Uinc2 = Uinc * 2;
766     Uinc4 = Uinc * 4;
767     U2toU3 = (POW2(M) / 8) * Ustride;
768     for (; StageCnt > 0; StageCnt--) {
769 
770       u0r = &Utbl[0];
771       u0i = &Utbl[POW2(M - 2) * Ustride];
772       u1r = u0r;
773       u1i = u0i;
774       u2r = u0r;
775       u2i = u0i;
776 
777       w0r = *u0r;
778       w0i = *u0i;
779       w1r = *u1r;
780       w1i = *u1i;
781       w2r = *u2r;
782       w2i = *u2i;
783       w3r = *(u2r + U2toU3);
784       w3i = *(u2i - U2toU3);
785 
786       pstrt = ioptr;
787 
788       p0r = pstrt;
789       p1r = pstrt + pinc;
790       p2r = p1r + pinc;
791       p3r = p2r + pinc;
792 
793       /* Butterflys           */
794       /*
795          f0   -       -       t0      -       -       f0      -       -       f0
796          f1   - w0-   f1      -       -       f1      -       -       f1
797          f2   -       -       f2      - w1-   f2      -       -       f4
798          f3   - w0-   t1      - iw1-  f3      -       -       f5
799 
800          f4   -       -       t0      -       -       f4      - w2-   t0
801          f5   - w0-   f5      -       -       f5      - w3-   t1
802          f6   -       -       f6      - w1-   f6      - iw2-  f6
803          f7   - w0-   t1      - iw1-  f7      - iw3-  f7
804        */
805 
806       for (DiffUCnt = NDiffU; DiffUCnt > 0; DiffUCnt--) {
807         f0r = *p0r;
808         f0i = *(p0r + 1);
809         f1r = *p1r;
810         f1i = *(p1r + 1);
811         for (SameUCnt = NSameU - 1; SameUCnt > 0; SameUCnt--) {
812           f2r = *p2r;
813           f2i = *(p2r + 1);
814           f3r = *p3r;
815           f3i = *(p3r + 1);
816 
817           t0r = f0r + f1r * w0r + f1i * w0i;
818           t0i = f0i - f1r * w0i + f1i * w0r;
819           f1r = f0r * Two - t0r;
820           f1i = f0i * Two - t0i;
821 
822           f4r = *(p0r + pos);
823           f4i = *(p0r + posi);
824           f5r = *(p1r + pos);
825           f5i = *(p1r + posi);
826 
827           f6r = *(p2r + pos);
828           f6i = *(p2r + posi);
829           f7r = *(p3r + pos);
830           f7i = *(p3r + posi);
831 
832           t1r = f2r - f3r * w0r - f3i * w0i;
833           t1i = f2i + f3r * w0i - f3i * w0r;
834           f2r = f2r * Two - t1r;
835           f2i = f2i * Two - t1i;
836 
837           f0r = t0r + f2r * w1r + f2i * w1i;
838           f0i = t0i - f2r * w1i + f2i * w1r;
839           f2r = t0r * Two - f0r;
840           f2i = t0i * Two - f0i;
841 
842           f3r = f1r + t1r * w1i - t1i * w1r;
843           f3i = f1i + t1r * w1r + t1i * w1i;
844           f1r = f1r * Two - f3r;
845           f1i = f1i * Two - f3i;
846 
847           t0r = f4r + f5r * w0r + f5i * w0i;
848           t0i = f4i - f5r * w0i + f5i * w0r;
849           f5r = f4r * Two - t0r;
850           f5i = f4i * Two - t0i;
851 
852           t1r = f6r - f7r * w0r - f7i * w0i;
853           t1i = f6i + f7r * w0i - f7i * w0r;
854           f6r = f6r * Two - t1r;
855           f6i = f6i * Two - t1i;
856 
857           f4r = t0r + f6r * w1r + f6i * w1i;
858           f4i = t0i - f6r * w1i + f6i * w1r;
859           f6r = t0r * Two - f4r;
860           f6i = t0i * Two - f4i;
861 
862           f7r = f5r + t1r * w1i - t1i * w1r;
863           f7i = f5i + t1r * w1r + t1i * w1i;
864           f5r = f5r * Two - f7r;
865           f5i = f5i * Two - f7i;
866 
867           t0r = f0r - f4r * w2r - f4i * w2i;
868           t0i = f0i + f4r * w2i - f4i * w2r;
869           f0r = f0r * Two - t0r;
870           f0i = f0i * Two - t0i;
871 
872           t1r = f1r - f5r * w3r - f5i * w3i;
873           t1i = f1i + f5r * w3i - f5i * w3r;
874           f1r = f1r * Two - t1r;
875           f1i = f1i * Two - t1i;
876 
877           *(p0r + pos) = t0r;
878           *(p1r + pos) = t1r;
879           *(p0r + posi) = t0i;
880           *(p1r + posi) = t1i;
881           *p0r = f0r;
882           *p1r = f1r;
883           *(p0r + 1) = f0i;
884           *(p1r + 1) = f1i;
885 
886           p0r += pnext;
887           f0r = *p0r;
888           f0i = *(p0r + 1);
889 
890           p1r += pnext;
891 
892           f1r = *p1r;
893           f1i = *(p1r + 1);
894 
895           f4r = f2r - f6r * w2i + f6i * w2r;
896           f4i = f2i - f6r * w2r - f6i * w2i;
897           f6r = f2r * Two - f4r;
898           f6i = f2i * Two - f4i;
899 
900           f5r = f3r - f7r * w3i + f7i * w3r;
901           f5i = f3i - f7r * w3r - f7i * w3i;
902           f7r = f3r * Two - f5r;
903           f7i = f3i * Two - f5i;
904 
905           *p2r = f4r;
906           *p3r = f5r;
907           *(p2r + 1) = f4i;
908           *(p3r + 1) = f5i;
909           *(p2r + pos) = f6r;
910           *(p3r + pos) = f7r;
911           *(p2r + posi) = f6i;
912           *(p3r + posi) = f7i;
913 
914           p2r += pnext;
915           p3r += pnext;
916         }
917 
918         f2r = *p2r;
919         f2i = *(p2r + 1);
920         f3r = *p3r;
921         f3i = *(p3r + 1);
922 
923         t0r = f0r + f1r * w0r + f1i * w0i;
924         t0i = f0i - f1r * w0i + f1i * w0r;
925         f1r = f0r * Two - t0r;
926         f1i = f0i * Two - t0i;
927 
928         f4r = *(p0r + pos);
929         f4i = *(p0r + posi);
930         f5r = *(p1r + pos);
931         f5i = *(p1r + posi);
932 
933         f6r = *(p2r + pos);
934         f6i = *(p2r + posi);
935         f7r = *(p3r + pos);
936         f7i = *(p3r + posi);
937 
938         t1r = f2r - f3r * w0r - f3i * w0i;
939         t1i = f2i + f3r * w0i - f3i * w0r;
940         f2r = f2r * Two - t1r;
941         f2i = f2i * Two - t1i;
942 
943         f0r = t0r + f2r * w1r + f2i * w1i;
944         f0i = t0i - f2r * w1i + f2i * w1r;
945         f2r = t0r * Two - f0r;
946         f2i = t0i * Two - f0i;
947 
948         f3r = f1r + t1r * w1i - t1i * w1r;
949         f3i = f1i + t1r * w1r + t1i * w1i;
950         f1r = f1r * Two - f3r;
951         f1i = f1i * Two - f3i;
952 
953         if ((int) DiffUCnt == NDiffU / 2)
954           Uinc4 = -Uinc4;
955 
956         u0r += Uinc4;
957         u0i -= Uinc4;
958         u1r += Uinc2;
959         u1i -= Uinc2;
960         u2r += Uinc;
961         u2i -= Uinc;
962 
963         pstrt += 2;
964 
965         t0r = f4r + f5r * w0r + f5i * w0i;
966         t0i = f4i - f5r * w0i + f5i * w0r;
967         f5r = f4r * Two - t0r;
968         f5i = f4i * Two - t0i;
969 
970         t1r = f6r - f7r * w0r - f7i * w0i;
971         t1i = f6i + f7r * w0i - f7i * w0r;
972         f6r = f6r * Two - t1r;
973         f6i = f6i * Two - t1i;
974 
975         f4r = t0r + f6r * w1r + f6i * w1i;
976         f4i = t0i - f6r * w1i + f6i * w1r;
977         f6r = t0r * Two - f4r;
978         f6i = t0i * Two - f4i;
979 
980         f7r = f5r + t1r * w1i - t1i * w1r;
981         f7i = f5i + t1r * w1r + t1i * w1i;
982         f5r = f5r * Two - f7r;
983         f5i = f5i * Two - f7i;
984 
985         w0r = *u0r;
986         w0i = *u0i;
987         w1r = *u1r;
988         w1i = *u1i;
989 
990         if ((int) DiffUCnt <= NDiffU / 2)
991           w0r = -w0r;
992 
993         t0r = f0r - f4r * w2r - f4i * w2i;
994         t0i = f0i + f4r * w2i - f4i * w2r;
995         f0r = f0r * Two - t0r;
996         f0i = f0i * Two - t0i;
997 
998         f4r = f2r - f6r * w2i + f6i * w2r;
999         f4i = f2i - f6r * w2r - f6i * w2i;
1000         f6r = f2r * Two - f4r;
1001         f6i = f2i * Two - f4i;
1002 
1003         *(p0r + pos) = t0r;
1004         *p2r = f4r;
1005         *(p0r + posi) = t0i;
1006         *(p2r + 1) = f4i;
1007         w2r = *u2r;
1008         w2i = *u2i;
1009         *p0r = f0r;
1010         *(p2r + pos) = f6r;
1011         *(p0r + 1) = f0i;
1012         *(p2r + posi) = f6i;
1013 
1014         p0r = pstrt;
1015         p2r = pstrt + pinc + pinc;
1016 
1017         t1r = f1r - f5r * w3r - f5i * w3i;
1018         t1i = f1i + f5r * w3i - f5i * w3r;
1019         f1r = f1r * Two - t1r;
1020         f1i = f1i * Two - t1i;
1021 
1022         f5r = f3r - f7r * w3i + f7i * w3r;
1023         f5i = f3i - f7r * w3r - f7i * w3i;
1024         f7r = f3r * Two - f5r;
1025         f7i = f3i * Two - f5i;
1026 
1027         *(p1r + pos) = t1r;
1028         *p3r = f5r;
1029         *(p1r + posi) = t1i;
1030         *(p3r + 1) = f5i;
1031         w3r = *(u2r + U2toU3);
1032         w3i = *(u2i - U2toU3);
1033         *p1r = f1r;
1034         *(p3r + pos) = f7r;
1035         *(p1r + 1) = f1i;
1036         *(p3r + posi) = f7i;
1037 
1038         p1r = pstrt + pinc;
1039         p3r = p2r + pinc;
1040       }
1041       NSameU /= 8;
1042       Uinc /= 8;
1043       Uinc2 /= 8;
1044       Uinc4 = Uinc * 4;
1045       NDiffU *= 8;
1046       pinc *= 8;
1047       pnext *= 8;
1048       pos *= 8;
1049       posi = pos + 1;
1050     }
1051 }
1052 
1053 static void fftrecurs(SPFLOAT *ioptr, int M, SPFLOAT *Utbl, int Ustride, int NDiffU,
1054                       int StageCnt)
1055 {
1056     /* recursive bfstages calls to maximize on chip cache efficiency */
1057     int i1;
1058 
1059     if (M <= (int) MCACHE)              /* fits on chip ? */
1060       bfstages(ioptr, M, Utbl, Ustride, NDiffU, StageCnt); /* RADIX 8 Stages */
1061     else {
1062       for (i1 = 0; i1 < 8; i1++) {
1063         fftrecurs(&ioptr[i1 * POW2(M - 3) * 2], M - 3, Utbl, 8 * Ustride,
1064                   NDiffU, StageCnt - 1);  /*  RADIX 8 Stages      */
1065       }
1066       bfstages(ioptr, M, Utbl, Ustride, POW2(M - 3), 1);  /*  RADIX 8 Stage */
1067     }
1068 }
1069 
1070 static void ffts1(SPFLOAT *ioptr, int M, SPFLOAT *Utbl, int16_t *BRLow)
1071 {
1072     /* Compute in-place complex fft on the rows of the input array  */
1073     /* INPUTS                                                       */
1074     /*   *ioptr = input data array                                  */
1075     /*   M = log2 of fft size (ex M=10 for 1024 point fft)          */
1076     /*   *Utbl = cosine table                                       */
1077     /*   *BRLow = bit reversed counter table                        */
1078     /* OUTPUTS                                                      */
1079     /*   *ioptr = output data array                                 */
1080 
1081     int StageCnt;
1082     int NDiffU;
1083 
1084     switch (M) {
1085     case 0:
1086       break;
1087     case 1:
1088       fft2pt(ioptr);            /* a 2 pt fft */
1089       break;
1090     case 2:
1091       fft4pt(ioptr);            /* a 4 pt fft */
1092       break;
1093     case 3:
1094       fft8pt(ioptr);            /* an 8 pt fft */
1095       break;
1096     default:
1097       bitrevR2(ioptr, M, BRLow);  /* bit reverse and first radix 2 stage */
1098       StageCnt = (M - 1) / 3;     /* number of radix 8 stages           */
1099       NDiffU = 2;                 /* one radix 2 stage already complete */
1100       if ((M - 1 - (StageCnt * 3)) == 1) {
1101         bfR2(ioptr, M, NDiffU); /* 1 radix 2 stage */
1102         NDiffU *= 2;
1103       }
1104       if ((M - 1 - (StageCnt * 3)) == 2) {
1105         bfR4(ioptr, M, NDiffU); /* 1 radix 4 stage */
1106         NDiffU *= 4;
1107       }
1108       if (M <= (int) MCACHE)
1109         bfstages(ioptr, M, Utbl, 1, NDiffU, StageCnt);  /* RADIX 8 Stages */
1110       else
1111         fftrecurs(ioptr, M, Utbl, 1, NDiffU, StageCnt); /* RADIX 8 Stages */
1112     }
1113 }
1114 
1115 /******************
1116 * parts of iffts1 *
1117 ******************/
1118 
1119 static void scbitrevR2(SPFLOAT *ioptr, int M, int16_t *BRLow, SPFLOAT scale)
1120 {
1121     /*** scaled bit reverse and first radix 2 stage forward or inverse fft ***/
1122     SPFLOAT f0r;
1123     SPFLOAT f0i;
1124     SPFLOAT f1r;
1125     SPFLOAT f1i;
1126     SPFLOAT f2r;
1127     SPFLOAT f2i;
1128     SPFLOAT f3r;
1129     SPFLOAT f3i;
1130     SPFLOAT f4r;
1131     SPFLOAT f4i;
1132     SPFLOAT f5r;
1133     SPFLOAT f5i;
1134     SPFLOAT f6r;
1135     SPFLOAT f6i;
1136     SPFLOAT f7r;
1137     SPFLOAT f7i;
1138     SPFLOAT t0r;
1139     SPFLOAT t0i;
1140     SPFLOAT t1r;
1141     SPFLOAT t1i;
1142     SPFLOAT *p0r;
1143     SPFLOAT *p1r;
1144     SPFLOAT *IOP;
1145     SPFLOAT *iolimit;
1146     int Colstart;
1147     int iCol;
1148     unsigned int posA;
1149     unsigned int posAi;
1150     unsigned int posB;
1151     unsigned int posBi;
1152 
1153     const unsigned int Nrems2 = POW2((M + 3) / 2);
1154     const unsigned int Nroot_1_ColInc = POW2(M) - Nrems2;
1155     const unsigned int Nroot_1 = POW2(M / 2 - 1) - 1;
1156     const unsigned int ColstartShift = (M + 1) / 2 + 1;
1157 
1158     posA = POW2(M);               /* 1/2 of POW2(M) complexes */
1159     posAi = posA + 1;
1160     posB = posA + 2;
1161     posBi = posB + 1;
1162 
1163     iolimit = ioptr + Nrems2;
1164     for (; ioptr < iolimit; ioptr += POW2(M / 2 + 1)) {
1165       for (Colstart = Nroot_1; Colstart >= 0; Colstart--) {
1166         iCol = Nroot_1;
1167         p0r = ioptr + Nroot_1_ColInc + BRLow[Colstart] * 4;
1168         IOP = ioptr + (Colstart << ColstartShift);
1169         p1r = IOP + BRLow[iCol] * 4;
1170         f0r = *(p0r);
1171         f0i = *(p0r + 1);
1172         f1r = *(p0r + posA);
1173         f1i = *(p0r + posAi);
1174         for (; iCol > Colstart;) {
1175           f2r = *(p0r + 2);
1176           f2i = *(p0r + (2 + 1));
1177           f3r = *(p0r + posB);
1178           f3i = *(p0r + posBi);
1179           f4r = *(p1r);
1180           f4i = *(p1r + 1);
1181           f5r = *(p1r + posA);
1182           f5i = *(p1r + posAi);
1183           f6r = *(p1r + 2);
1184           f6i = *(p1r + (2 + 1));
1185           f7r = *(p1r + posB);
1186           f7i = *(p1r + posBi);
1187 
1188           t0r = f0r + f1r;
1189           t0i = f0i + f1i;
1190           f1r = f0r - f1r;
1191           f1i = f0i - f1i;
1192           t1r = f2r + f3r;
1193           t1i = f2i + f3i;
1194           f3r = f2r - f3r;
1195           f3i = f2i - f3i;
1196           f0r = f4r + f5r;
1197           f0i = f4i + f5i;
1198           f5r = f4r - f5r;
1199           f5i = f4i - f5i;
1200           f2r = f6r + f7r;
1201           f2i = f6i + f7i;
1202           f7r = f6r - f7r;
1203           f7i = f6i - f7i;
1204 
1205           *(p1r) = scale * t0r;
1206           *(p1r + 1) = scale * t0i;
1207           *(p1r + 2) = scale * f1r;
1208           *(p1r + (2 + 1)) = scale * f1i;
1209           *(p1r + posA) = scale * t1r;
1210           *(p1r + posAi) = scale * t1i;
1211           *(p1r + posB) = scale * f3r;
1212           *(p1r + posBi) = scale * f3i;
1213           *(p0r) = scale * f0r;
1214           *(p0r + 1) = scale * f0i;
1215           *(p0r + 2) = scale * f5r;
1216           *(p0r + (2 + 1)) = scale * f5i;
1217           *(p0r + posA) = scale * f2r;
1218           *(p0r + posAi) = scale * f2i;
1219           *(p0r + posB) = scale * f7r;
1220           *(p0r + posBi) = scale * f7i;
1221 
1222           p0r -= Nrems2;
1223           f0r = *(p0r);
1224           f0i = *(p0r + 1);
1225           f1r = *(p0r + posA);
1226           f1i = *(p0r + posAi);
1227           iCol -= 1;
1228           p1r = IOP + BRLow[iCol] * 4;
1229         }
1230         f2r = *(p0r + 2);
1231         f2i = *(p0r + (2 + 1));
1232         f3r = *(p0r + posB);
1233         f3i = *(p0r + posBi);
1234 
1235         t0r = f0r + f1r;
1236         t0i = f0i + f1i;
1237         f1r = f0r - f1r;
1238         f1i = f0i - f1i;
1239         t1r = f2r + f3r;
1240         t1i = f2i + f3i;
1241         f3r = f2r - f3r;
1242         f3i = f2i - f3i;
1243 
1244         *(p0r) = scale * t0r;
1245         *(p0r + 1) = scale * t0i;
1246         *(p0r + 2) = scale * f1r;
1247         *(p0r + (2 + 1)) = scale * f1i;
1248         *(p0r + posA) = scale * t1r;
1249         *(p0r + posAi) = scale * t1i;
1250         *(p0r + posB) = scale * f3r;
1251         *(p0r + posBi) = scale * f3i;
1252       }
1253     }
1254 }
1255 
1256 static void ifft2pt(SPFLOAT *ioptr, SPFLOAT scale)
1257 {
1258     /***   RADIX 2 ifft     ***/
1259     SPFLOAT f0r, f0i, f1r, f1i;
1260     SPFLOAT t0r, t0i;
1261 
1262     /* bit reversed load */
1263     f0r = ioptr[0];
1264     f0i = ioptr[1];
1265     f1r = ioptr[2];
1266     f1i = ioptr[3];
1267 
1268     /* Butterflys           */
1269     /*
1270        f0   -       -       t0
1271        f1   -  1 -  f1
1272      */
1273 
1274     t0r = f0r + f1r;
1275     t0i = f0i + f1i;
1276     f1r = f0r - f1r;
1277     f1i = f0i - f1i;
1278 
1279     /* store result */
1280     ioptr[0] = scale * t0r;
1281     ioptr[1] = scale * t0i;
1282     ioptr[2] = scale * f1r;
1283     ioptr[3] = scale * f1i;
1284 }
1285 
1286 static void ifft4pt(SPFLOAT *ioptr, SPFLOAT scale)
1287 {
1288     /***   RADIX 4 ifft     ***/
1289     SPFLOAT f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i;
1290     SPFLOAT t0r, t0i, t1r, t1i;
1291 
1292     /* bit reversed load */
1293     f0r = ioptr[0];
1294     f0i = ioptr[1];
1295     f1r = ioptr[4];
1296     f1i = ioptr[5];
1297     f2r = ioptr[2];
1298     f2i = ioptr[3];
1299     f3r = ioptr[6];
1300     f3i = ioptr[7];
1301 
1302     /* Butterflys           */
1303     /*
1304        f0   -       -       t0      -       -       f0
1305        f1   -  1 -  f1      -       -       f1
1306        f2   -       -       f2      -  1 -  f2
1307        f3   -  1 -  t1      -  i -  f3
1308      */
1309 
1310     t0r = f0r + f1r;
1311     t0i = f0i + f1i;
1312     f1r = f0r - f1r;
1313     f1i = f0i - f1i;
1314 
1315     t1r = f2r - f3r;
1316     t1i = f2i - f3i;
1317     f2r = f2r + f3r;
1318     f2i = f2i + f3i;
1319 
1320     f0r = t0r + f2r;
1321     f0i = t0i + f2i;
1322     f2r = t0r - f2r;
1323     f2i = t0i - f2i;
1324 
1325     f3r = f1r + t1i;
1326     f3i = f1i - t1r;
1327     f1r = f1r - t1i;
1328     f1i = f1i + t1r;
1329 
1330     /* store result */
1331     ioptr[0] = scale * f0r;
1332     ioptr[1] = scale * f0i;
1333     ioptr[2] = scale * f1r;
1334     ioptr[3] = scale * f1i;
1335     ioptr[4] = scale * f2r;
1336     ioptr[5] = scale * f2i;
1337     ioptr[6] = scale * f3r;
1338     ioptr[7] = scale * f3i;
1339 }
1340 
1341 static void ifft8pt(SPFLOAT *ioptr, SPFLOAT scale)
1342 {
1343     /***   RADIX 8 ifft     ***/
1344     SPFLOAT w0r = 1.0 / MYROOT2;    /* cos(pi/4)   */
1345     SPFLOAT f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i;
1346     SPFLOAT f4r, f4i, f5r, f5i, f6r, f6i, f7r, f7i;
1347     SPFLOAT t0r, t0i, t1r, t1i;
1348     const SPFLOAT Two = 2.0;
1349 
1350     /* bit reversed load */
1351     f0r = ioptr[0];
1352     f0i = ioptr[1];
1353     f1r = ioptr[8];
1354     f1i = ioptr[9];
1355     f2r = ioptr[4];
1356     f2i = ioptr[5];
1357     f3r = ioptr[12];
1358     f3i = ioptr[13];
1359     f4r = ioptr[2];
1360     f4i = ioptr[3];
1361     f5r = ioptr[10];
1362     f5i = ioptr[11];
1363     f6r = ioptr[6];
1364     f6i = ioptr[7];
1365     f7r = ioptr[14];
1366     f7i = ioptr[15];
1367 
1368     /* Butterflys           */
1369     /*
1370        f0   -       -       t0      -       -       f0      -       -       f0
1371        f1   -  1 -  f1      -       -       f1      -       -       f1
1372        f2   -       -       f2      -  1 -  f2      -       -       f2
1373        f3   -  1 -  t1      -  i -  f3      -       -       f3
1374        f4   -       -       t0      -       -       f4      -  1 -  t0
1375        f5   -  1 -  f5      -       -       f5      - w3 -  f4
1376        f6   -       -       f6      -  1 -  f6      -  i -  t1
1377        f7   -  1 -  t1      -  i -  f7      - iw3-  f6
1378      */
1379 
1380     t0r = f0r + f1r;
1381     t0i = f0i + f1i;
1382     f1r = f0r - f1r;
1383     f1i = f0i - f1i;
1384 
1385     t1r = f2r - f3r;
1386     t1i = f2i - f3i;
1387     f2r = f2r + f3r;
1388     f2i = f2i + f3i;
1389 
1390     f0r = t0r + f2r;
1391     f0i = t0i + f2i;
1392     f2r = t0r - f2r;
1393     f2i = t0i - f2i;
1394 
1395     f3r = f1r + t1i;
1396     f3i = f1i - t1r;
1397     f1r = f1r - t1i;
1398     f1i = f1i + t1r;
1399 
1400     t0r = f4r + f5r;
1401     t0i = f4i + f5i;
1402     f5r = f4r - f5r;
1403     f5i = f4i - f5i;
1404 
1405     t1r = f6r - f7r;
1406     t1i = f6i - f7i;
1407     f6r = f6r + f7r;
1408     f6i = f6i + f7i;
1409 
1410     f4r = t0r + f6r;
1411     f4i = t0i + f6i;
1412     f6r = t0r - f6r;
1413     f6i = t0i - f6i;
1414 
1415     f7r = f5r + t1i;
1416     f7i = f5i - t1r;
1417     f5r = f5r - t1i;
1418     f5i = f5i + t1r;
1419 
1420     t0r = f0r - f4r;
1421     t0i = f0i - f4i;
1422     f0r = f0r + f4r;
1423     f0i = f0i + f4i;
1424 
1425     t1r = f2r + f6i;
1426     t1i = f2i - f6r;
1427     f2r = f2r - f6i;
1428     f2i = f2i + f6r;
1429 
1430     f4r = f1r - f5r * w0r + f5i * w0r;
1431     f4i = f1i - f5r * w0r - f5i * w0r;
1432     f1r = f1r * Two - f4r;
1433     f1i = f1i * Two - f4i;
1434 
1435     f6r = f3r + f7r * w0r + f7i * w0r;
1436     f6i = f3i - f7r * w0r + f7i * w0r;
1437     f3r = f3r * Two - f6r;
1438     f3i = f3i * Two - f6i;
1439 
1440     /* store result */
1441     ioptr[0] = scale * f0r;
1442     ioptr[1] = scale * f0i;
1443     ioptr[2] = scale * f1r;
1444     ioptr[3] = scale * f1i;
1445     ioptr[4] = scale * f2r;
1446     ioptr[5] = scale * f2i;
1447     ioptr[6] = scale * f3r;
1448     ioptr[7] = scale * f3i;
1449     ioptr[8] = scale * t0r;
1450     ioptr[9] = scale * t0i;
1451     ioptr[10] = scale * f4r;
1452     ioptr[11] = scale * f4i;
1453     ioptr[12] = scale * t1r;
1454     ioptr[13] = scale * t1i;
1455     ioptr[14] = scale * f6r;
1456     ioptr[15] = scale * f6i;
1457 }
1458 
1459 static void ibfR2(SPFLOAT *ioptr, int M, int NDiffU)
1460 {
1461     /*** 2nd radix 2 stage ***/
1462     unsigned int pos;
1463     unsigned int posi;
1464     unsigned int pinc;
1465     unsigned int pnext;
1466     unsigned int NSameU;
1467     unsigned int SameUCnt;
1468 
1469     SPFLOAT *pstrt;
1470     SPFLOAT *p0r, *p1r, *p2r, *p3r;
1471 
1472     SPFLOAT f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i;
1473     SPFLOAT f4r, f4i, f5r, f5i, f6r, f6i, f7r, f7i;
1474 
1475     pinc = NDiffU * 2;            /* 2 floats per complex */
1476     pnext = pinc * 4;
1477     pos = 2;
1478     posi = pos + 1;
1479     NSameU = POW2(M) / 4 / NDiffU;        /* 4 Us at a time */
1480     pstrt = ioptr;
1481     p0r = pstrt;
1482     p1r = pstrt + pinc;
1483     p2r = p1r + pinc;
1484     p3r = p2r + pinc;
1485 
1486     /* Butterflys           */
1487     /*
1488        f0   -       -       f4
1489        f1   -  1 -  f5
1490        f2   -       -       f6
1491        f3   -  1 -  f7
1492      */
1493     /* Butterflys           */
1494     /*
1495        f0   -       -       f4
1496        f1   -  1 -  f5
1497        f2   -       -       f6
1498        f3   -  1 -  f7
1499      */
1500 
1501     for (SameUCnt = NSameU; SameUCnt > 0; SameUCnt--) {
1502 
1503       f0r = *p0r;
1504       f1r = *p1r;
1505       f0i = *(p0r + 1);
1506       f1i = *(p1r + 1);
1507       f2r = *p2r;
1508       f3r = *p3r;
1509       f2i = *(p2r + 1);
1510       f3i = *(p3r + 1);
1511 
1512       f4r = f0r + f1r;
1513       f4i = f0i + f1i;
1514       f5r = f0r - f1r;
1515       f5i = f0i - f1i;
1516 
1517       f6r = f2r + f3r;
1518       f6i = f2i + f3i;
1519       f7r = f2r - f3r;
1520       f7i = f2i - f3i;
1521 
1522       *p0r = f4r;
1523       *(p0r + 1) = f4i;
1524       *p1r = f5r;
1525       *(p1r + 1) = f5i;
1526       *p2r = f6r;
1527       *(p2r + 1) = f6i;
1528       *p3r = f7r;
1529       *(p3r + 1) = f7i;
1530 
1531       f0r = *(p0r + pos);
1532       f1i = *(p1r + posi);
1533       f0i = *(p0r + posi);
1534       f1r = *(p1r + pos);
1535       f2r = *(p2r + pos);
1536       f3i = *(p3r + posi);
1537       f2i = *(p2r + posi);
1538       f3r = *(p3r + pos);
1539 
1540       f4r = f0r - f1i;
1541       f4i = f0i + f1r;
1542       f5r = f0r + f1i;
1543       f5i = f0i - f1r;
1544 
1545       f6r = f2r - f3i;
1546       f6i = f2i + f3r;
1547       f7r = f2r + f3i;
1548       f7i = f2i - f3r;
1549 
1550       *(p0r + pos) = f4r;
1551       *(p0r + posi) = f4i;
1552       *(p1r + pos) = f5r;
1553       *(p1r + posi) = f5i;
1554       *(p2r + pos) = f6r;
1555       *(p2r + posi) = f6i;
1556       *(p3r + pos) = f7r;
1557       *(p3r + posi) = f7i;
1558 
1559       p0r += pnext;
1560       p1r += pnext;
1561       p2r += pnext;
1562       p3r += pnext;
1563     }
1564 }
1565 
1566 static void ibfR4(SPFLOAT *ioptr, int M, int NDiffU)
1567 {
1568     /*** 1 radix 4 stage ***/
1569     unsigned int pos;
1570     unsigned int posi;
1571     unsigned int pinc;
1572     unsigned int pnext;
1573     unsigned int pnexti;
1574     unsigned int NSameU;
1575     unsigned int SameUCnt;
1576 
1577     SPFLOAT *pstrt;
1578     SPFLOAT *p0r, *p1r, *p2r, *p3r;
1579 
1580     SPFLOAT w1r = 1.0 / MYROOT2;    /* cos(pi/4)   */
1581     SPFLOAT f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i;
1582     SPFLOAT f4r, f4i, f5r, f5i, f6r, f6i, f7r, f7i;
1583     SPFLOAT t1r, t1i;
1584     const SPFLOAT Two = 2.0;
1585 
1586     pinc = NDiffU * 2;            /* 2 floats per complex */
1587     pnext = pinc * 4;
1588     pnexti = pnext + 1;
1589     pos = 2;
1590     posi = pos + 1;
1591     NSameU = POW2(M) / 4 / NDiffU;        /* 4 pts per butterfly */
1592     pstrt = ioptr;
1593     p0r = pstrt;
1594     p1r = pstrt + pinc;
1595     p2r = p1r + pinc;
1596     p3r = p2r + pinc;
1597 
1598     /* Butterflys           */
1599     /*
1600        f0   -       -       f0      -       -       f4
1601        f1   -  1 -  f5      -       -       f5
1602        f2   -       -       f6      -  1 -  f6
1603        f3   -  1 -  f3      - -i -  f7
1604      */
1605     /* Butterflys           */
1606     /*
1607        f0   -       -       f4      -       -       f4
1608        f1   - -i -  t1      -       -       f5
1609        f2   -       -       f2      - w1 -  f6
1610        f3   - -i -  f7      - iw1-  f7
1611      */
1612 
1613     f0r = *p0r;
1614     f1r = *p1r;
1615     f2r = *p2r;
1616     f3r = *p3r;
1617     f0i = *(p0r + 1);
1618     f1i = *(p1r + 1);
1619     f2i = *(p2r + 1);
1620     f3i = *(p3r + 1);
1621 
1622     f5r = f0r - f1r;
1623     f5i = f0i - f1i;
1624     f0r = f0r + f1r;
1625     f0i = f0i + f1i;
1626 
1627     f6r = f2r + f3r;
1628     f6i = f2i + f3i;
1629     f3r = f2r - f3r;
1630     f3i = f2i - f3i;
1631 
1632     for (SameUCnt = NSameU - 1; SameUCnt > 0; SameUCnt--) {
1633 
1634       f7r = f5r + f3i;
1635       f7i = f5i - f3r;
1636       f5r = f5r - f3i;
1637       f5i = f5i + f3r;
1638 
1639       f4r = f0r + f6r;
1640       f4i = f0i + f6i;
1641       f6r = f0r - f6r;
1642       f6i = f0i - f6i;
1643 
1644       f2r = *(p2r + pos);
1645       f2i = *(p2r + posi);
1646       f1r = *(p1r + pos);
1647       f1i = *(p1r + posi);
1648       f3i = *(p3r + posi);
1649       f0r = *(p0r + pos);
1650       f3r = *(p3r + pos);
1651       f0i = *(p0r + posi);
1652 
1653       *p3r = f7r;
1654       *p0r = f4r;
1655       *(p3r + 1) = f7i;
1656       *(p0r + 1) = f4i;
1657       *p1r = f5r;
1658       *p2r = f6r;
1659       *(p1r + 1) = f5i;
1660       *(p2r + 1) = f6i;
1661 
1662       f7r = f2r + f3i;
1663       f7i = f2i - f3r;
1664       f2r = f2r - f3i;
1665       f2i = f2i + f3r;
1666 
1667       f4r = f0r - f1i;
1668       f4i = f0i + f1r;
1669       t1r = f0r + f1i;
1670       t1i = f0i - f1r;
1671 
1672       f5r = t1r - f7r * w1r - f7i * w1r;
1673       f5i = t1i + f7r * w1r - f7i * w1r;
1674       f7r = t1r * Two - f5r;
1675       f7i = t1i * Two - f5i;
1676 
1677       f6r = f4r - f2r * w1r + f2i * w1r;
1678       f6i = f4i - f2r * w1r - f2i * w1r;
1679       f4r = f4r * Two - f6r;
1680       f4i = f4i * Two - f6i;
1681 
1682       f3r = *(p3r + pnext);
1683       f0r = *(p0r + pnext);
1684       f3i = *(p3r + pnexti);
1685       f0i = *(p0r + pnexti);
1686       f2r = *(p2r + pnext);
1687       f2i = *(p2r + pnexti);
1688       f1r = *(p1r + pnext);
1689       f1i = *(p1r + pnexti);
1690 
1691       *(p2r + pos) = f6r;
1692       *(p1r + pos) = f5r;
1693       *(p2r + posi) = f6i;
1694       *(p1r + posi) = f5i;
1695       *(p3r + pos) = f7r;
1696       *(p0r + pos) = f4r;
1697       *(p3r + posi) = f7i;
1698       *(p0r + posi) = f4i;
1699 
1700       f6r = f2r + f3r;
1701       f6i = f2i + f3i;
1702       f3r = f2r - f3r;
1703       f3i = f2i - f3i;
1704 
1705       f5r = f0r - f1r;
1706       f5i = f0i - f1i;
1707       f0r = f0r + f1r;
1708       f0i = f0i + f1i;
1709 
1710       p3r += pnext;
1711       p0r += pnext;
1712       p1r += pnext;
1713       p2r += pnext;
1714     }
1715     f7r = f5r + f3i;
1716     f7i = f5i - f3r;
1717     f5r = f5r - f3i;
1718     f5i = f5i + f3r;
1719 
1720     f4r = f0r + f6r;
1721     f4i = f0i + f6i;
1722     f6r = f0r - f6r;
1723     f6i = f0i - f6i;
1724 
1725     f2r = *(p2r + pos);
1726     f2i = *(p2r + posi);
1727     f1r = *(p1r + pos);
1728     f1i = *(p1r + posi);
1729     f3i = *(p3r + posi);
1730     f0r = *(p0r + pos);
1731     f3r = *(p3r + pos);
1732     f0i = *(p0r + posi);
1733 
1734     *p3r = f7r;
1735     *p0r = f4r;
1736     *(p3r + 1) = f7i;
1737     *(p0r + 1) = f4i;
1738     *p1r = f5r;
1739     *p2r = f6r;
1740     *(p1r + 1) = f5i;
1741     *(p2r + 1) = f6i;
1742 
1743     f7r = f2r + f3i;
1744     f7i = f2i - f3r;
1745     f2r = f2r - f3i;
1746     f2i = f2i + f3r;
1747 
1748     f4r = f0r - f1i;
1749     f4i = f0i + f1r;
1750     t1r = f0r + f1i;
1751     t1i = f0i - f1r;
1752 
1753     f5r = t1r - f7r * w1r - f7i * w1r;
1754     f5i = t1i + f7r * w1r - f7i * w1r;
1755     f7r = t1r * Two - f5r;
1756     f7i = t1i * Two - f5i;
1757 
1758     f6r = f4r - f2r * w1r + f2i * w1r;
1759     f6i = f4i - f2r * w1r - f2i * w1r;
1760     f4r = f4r * Two - f6r;
1761     f4i = f4i * Two - f6i;
1762 
1763     *(p2r + pos) = f6r;
1764     *(p1r + pos) = f5r;
1765     *(p2r + posi) = f6i;
1766     *(p1r + posi) = f5i;
1767     *(p3r + pos) = f7r;
1768     *(p0r + pos) = f4r;
1769     *(p3r + posi) = f7i;
1770     *(p0r + posi) = f4i;
1771 }
1772 
1773 static void ibfstages(SPFLOAT *ioptr, int M, SPFLOAT *Utbl, int Ustride,
1774                       int NDiffU, int StageCnt)
1775 {
1776     /***   RADIX 8 Stages   ***/
1777     unsigned int pos;
1778     unsigned int posi;
1779     unsigned int pinc;
1780     unsigned int pnext;
1781     unsigned int NSameU;
1782     int          Uinc;
1783     int          Uinc2;
1784     int          Uinc4;
1785     unsigned int DiffUCnt;
1786     unsigned int SameUCnt;
1787     unsigned int U2toU3;
1788 
1789     SPFLOAT *pstrt;
1790     SPFLOAT *p0r, *p1r, *p2r, *p3r;
1791     SPFLOAT *u0r, *u0i, *u1r, *u1i, *u2r, *u2i;
1792 
1793     SPFLOAT w0r, w0i, w1r, w1i, w2r, w2i, w3r, w3i;
1794     SPFLOAT f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i;
1795     SPFLOAT f4r, f4i, f5r, f5i, f6r, f6i, f7r, f7i;
1796     SPFLOAT t0r, t0i, t1r, t1i;
1797     const SPFLOAT Two = 2.0;
1798 
1799     pinc = NDiffU * 2;            /* 2 floats per complex */
1800     pnext = pinc * 8;
1801     pos = pinc * 4;
1802     posi = pos + 1;
1803     NSameU = POW2(M) / 8 / NDiffU;        /* 8 pts per butterfly */
1804     Uinc = (int) NSameU * Ustride;
1805     Uinc2 = Uinc * 2;
1806     Uinc4 = Uinc * 4;
1807     U2toU3 = (POW2(M) / 8) * Ustride;
1808     for (; StageCnt > 0; StageCnt--) {
1809 
1810       u0r = &Utbl[0];
1811       u0i = &Utbl[POW2(M - 2) * Ustride];
1812       u1r = u0r;
1813       u1i = u0i;
1814       u2r = u0r;
1815       u2i = u0i;
1816 
1817       w0r = *u0r;
1818       w0i = *u0i;
1819       w1r = *u1r;
1820       w1i = *u1i;
1821       w2r = *u2r;
1822       w2i = *u2i;
1823       w3r = *(u2r + U2toU3);
1824       w3i = *(u2i - U2toU3);
1825 
1826       pstrt = ioptr;
1827 
1828       p0r = pstrt;
1829       p1r = pstrt + pinc;
1830       p2r = p1r + pinc;
1831       p3r = p2r + pinc;
1832 
1833       /* Butterflys           */
1834       /*
1835          f0   -       -       t0      -       -       f0      -       -       f0
1836          f1   - w0-   f1      -       -       f1      -       -       f1
1837          f2   -       -       f2      - w1-   f2      -       -       f4
1838          f3   - w0-   t1      - iw1-  f3      -       -       f5
1839 
1840          f4   -       -       t0      -       -       f4      - w2-   t0
1841          f5   - w0-   f5      -       -       f5      - w3-   t1
1842          f6   -       -       f6      - w1-   f6      - iw2-  f6
1843          f7   - w0-   t1      - iw1-  f7      - iw3-  f7
1844        */
1845 
1846       for (DiffUCnt = NDiffU; DiffUCnt > 0; DiffUCnt--) {
1847         f0r = *p0r;
1848         f0i = *(p0r + 1);
1849         f1r = *p1r;
1850         f1i = *(p1r + 1);
1851         for (SameUCnt = NSameU - 1; SameUCnt > 0; SameUCnt--) {
1852           f2r = *p2r;
1853           f2i = *(p2r + 1);
1854           f3r = *p3r;
1855           f3i = *(p3r + 1);
1856 
1857           t0r = f0r + f1r * w0r - f1i * w0i;
1858           t0i = f0i + f1r * w0i + f1i * w0r;
1859           f1r = f0r * Two - t0r;
1860           f1i = f0i * Two - t0i;
1861 
1862           f4r = *(p0r + pos);
1863           f4i = *(p0r + posi);
1864           f5r = *(p1r + pos);
1865           f5i = *(p1r + posi);
1866 
1867           f6r = *(p2r + pos);
1868           f6i = *(p2r + posi);
1869           f7r = *(p3r + pos);
1870           f7i = *(p3r + posi);
1871 
1872           t1r = f2r - f3r * w0r + f3i * w0i;
1873           t1i = f2i - f3r * w0i - f3i * w0r;
1874           f2r = f2r * Two - t1r;
1875           f2i = f2i * Two - t1i;
1876 
1877           f0r = t0r + f2r * w1r - f2i * w1i;
1878           f0i = t0i + f2r * w1i + f2i * w1r;
1879           f2r = t0r * Two - f0r;
1880           f2i = t0i * Two - f0i;
1881 
1882           f3r = f1r + t1r * w1i + t1i * w1r;
1883           f3i = f1i - t1r * w1r + t1i * w1i;
1884           f1r = f1r * Two - f3r;
1885           f1i = f1i * Two - f3i;
1886 
1887           t0r = f4r + f5r * w0r - f5i * w0i;
1888           t0i = f4i + f5r * w0i + f5i * w0r;
1889           f5r = f4r * Two - t0r;
1890           f5i = f4i * Two - t0i;
1891 
1892           t1r = f6r - f7r * w0r + f7i * w0i;
1893           t1i = f6i - f7r * w0i - f7i * w0r;
1894           f6r = f6r * Two - t1r;
1895           f6i = f6i * Two - t1i;
1896 
1897           f4r = t0r + f6r * w1r - f6i * w1i;
1898           f4i = t0i + f6r * w1i + f6i * w1r;
1899           f6r = t0r * Two - f4r;
1900           f6i = t0i * Two - f4i;
1901 
1902           f7r = f5r + t1r * w1i + t1i * w1r;
1903           f7i = f5i - t1r * w1r + t1i * w1i;
1904           f5r = f5r * Two - f7r;
1905           f5i = f5i * Two - f7i;
1906 
1907           t0r = f0r - f4r * w2r + f4i * w2i;
1908           t0i = f0i - f4r * w2i - f4i * w2r;
1909           f0r = f0r * Two - t0r;
1910           f0i = f0i * Two - t0i;
1911 
1912           t1r = f1r - f5r * w3r + f5i * w3i;
1913           t1i = f1i - f5r * w3i - f5i * w3r;
1914           f1r = f1r * Two - t1r;
1915           f1i = f1i * Two - t1i;
1916 
1917           *(p0r + pos) = t0r;
1918           *(p0r + posi) = t0i;
1919           *p0r = f0r;
1920           *(p0r + 1) = f0i;
1921 
1922           p0r += pnext;
1923           f0r = *p0r;
1924           f0i = *(p0r + 1);
1925 
1926           *(p1r + pos) = t1r;
1927           *(p1r + posi) = t1i;
1928           *p1r = f1r;
1929           *(p1r + 1) = f1i;
1930 
1931           p1r += pnext;
1932 
1933           f1r = *p1r;
1934           f1i = *(p1r + 1);
1935 
1936           f4r = f2r - f6r * w2i - f6i * w2r;
1937           f4i = f2i + f6r * w2r - f6i * w2i;
1938           f6r = f2r * Two - f4r;
1939           f6i = f2i * Two - f4i;
1940 
1941           f5r = f3r - f7r * w3i - f7i * w3r;
1942           f5i = f3i + f7r * w3r - f7i * w3i;
1943           f7r = f3r * Two - f5r;
1944           f7i = f3i * Two - f5i;
1945 
1946           *p2r = f4r;
1947           *(p2r + 1) = f4i;
1948           *(p2r + pos) = f6r;
1949           *(p2r + posi) = f6i;
1950 
1951           p2r += pnext;
1952 
1953           *p3r = f5r;
1954           *(p3r + 1) = f5i;
1955           *(p3r + pos) = f7r;
1956           *(p3r + posi) = f7i;
1957 
1958           p3r += pnext;
1959         }
1960 
1961         f2r = *p2r;
1962         f2i = *(p2r + 1);
1963         f3r = *p3r;
1964         f3i = *(p3r + 1);
1965 
1966         t0r = f0r + f1r * w0r - f1i * w0i;
1967         t0i = f0i + f1r * w0i + f1i * w0r;
1968         f1r = f0r * Two - t0r;
1969         f1i = f0i * Two - t0i;
1970 
1971         f4r = *(p0r + pos);
1972         f4i = *(p0r + posi);
1973         f5r = *(p1r + pos);
1974         f5i = *(p1r + posi);
1975 
1976         f6r = *(p2r + pos);
1977         f6i = *(p2r + posi);
1978         f7r = *(p3r + pos);
1979         f7i = *(p3r + posi);
1980 
1981         t1r = f2r - f3r * w0r + f3i * w0i;
1982         t1i = f2i - f3r * w0i - f3i * w0r;
1983         f2r = f2r * Two - t1r;
1984         f2i = f2i * Two - t1i;
1985 
1986         f0r = t0r + f2r * w1r - f2i * w1i;
1987         f0i = t0i + f2r * w1i + f2i * w1r;
1988         f2r = t0r * Two - f0r;
1989         f2i = t0i * Two - f0i;
1990 
1991         f3r = f1r + t1r * w1i + t1i * w1r;
1992         f3i = f1i - t1r * w1r + t1i * w1i;
1993         f1r = f1r * Two - f3r;
1994         f1i = f1i * Two - f3i;
1995 
1996         if ((int) DiffUCnt == NDiffU / 2)
1997           Uinc4 = -Uinc4;
1998 
1999         u0r += Uinc4;
2000         u0i -= Uinc4;
2001         u1r += Uinc2;
2002         u1i -= Uinc2;
2003         u2r += Uinc;
2004         u2i -= Uinc;
2005 
2006         pstrt += 2;
2007 
2008         t0r = f4r + f5r * w0r - f5i * w0i;
2009         t0i = f4i + f5r * w0i + f5i * w0r;
2010         f5r = f4r * Two - t0r;
2011         f5i = f4i * Two - t0i;
2012 
2013         t1r = f6r - f7r * w0r + f7i * w0i;
2014         t1i = f6i - f7r * w0i - f7i * w0r;
2015         f6r = f6r * Two - t1r;
2016         f6i = f6i * Two - t1i;
2017 
2018         f4r = t0r + f6r * w1r - f6i * w1i;
2019         f4i = t0i + f6r * w1i + f6i * w1r;
2020         f6r = t0r * Two - f4r;
2021         f6i = t0i * Two - f4i;
2022 
2023         f7r = f5r + t1r * w1i + t1i * w1r;
2024         f7i = f5i - t1r * w1r + t1i * w1i;
2025         f5r = f5r * Two - f7r;
2026         f5i = f5i * Two - f7i;
2027 
2028         w0r = *u0r;
2029         w0i = *u0i;
2030         w1r = *u1r;
2031         w1i = *u1i;
2032 
2033         if ((int) DiffUCnt <= NDiffU / 2)
2034           w0r = -w0r;
2035 
2036         t0r = f0r - f4r * w2r + f4i * w2i;
2037         t0i = f0i - f4r * w2i - f4i * w2r;
2038         f0r = f0r * Two - t0r;
2039         f0i = f0i * Two - t0i;
2040 
2041         f4r = f2r - f6r * w2i - f6i * w2r;
2042         f4i = f2i + f6r * w2r - f6i * w2i;
2043         f6r = f2r * Two - f4r;
2044         f6i = f2i * Two - f4i;
2045 
2046         *(p0r + pos) = t0r;
2047         *p2r = f4r;
2048         *(p0r + posi) = t0i;
2049         *(p2r + 1) = f4i;
2050         w2r = *u2r;
2051         w2i = *u2i;
2052         *p0r = f0r;
2053         *(p2r + pos) = f6r;
2054         *(p0r + 1) = f0i;
2055         *(p2r + posi) = f6i;
2056 
2057         p0r = pstrt;
2058         p2r = pstrt + pinc + pinc;
2059 
2060         t1r = f1r - f5r * w3r + f5i * w3i;
2061         t1i = f1i - f5r * w3i - f5i * w3r;
2062         f1r = f1r * Two - t1r;
2063         f1i = f1i * Two - t1i;
2064 
2065         f5r = f3r - f7r * w3i - f7i * w3r;
2066         f5i = f3i + f7r * w3r - f7i * w3i;
2067         f7r = f3r * Two - f5r;
2068         f7i = f3i * Two - f5i;
2069 
2070         *(p1r + pos) = t1r;
2071         *p3r = f5r;
2072         *(p1r + posi) = t1i;
2073         *(p3r + 1) = f5i;
2074         w3r = *(u2r + U2toU3);
2075         w3i = *(u2i - U2toU3);
2076         *p1r = f1r;
2077         *(p3r + pos) = f7r;
2078         *(p1r + 1) = f1i;
2079         *(p3r + posi) = f7i;
2080 
2081         p1r = pstrt + pinc;
2082         p3r = p2r + pinc;
2083       }
2084       NSameU /= 8;
2085       Uinc /= 8;
2086       Uinc2 /= 8;
2087       Uinc4 = Uinc * 4;
2088       NDiffU *= 8;
2089       pinc *= 8;
2090       pnext *= 8;
2091       pos *= 8;
2092       posi = pos + 1;
2093     }
2094 }
2095 
2096 static void ifftrecurs(SPFLOAT *ioptr, int M, SPFLOAT *Utbl, int Ustride,
2097                        int NDiffU, int StageCnt)
2098 {
2099     /* recursive bfstages calls to maximize on chip cache efficiency */
2100     int i1;
2101 
2102     if (M <= (int) MCACHE)
2103       ibfstages(ioptr, M, Utbl, Ustride, NDiffU, StageCnt); /* RADIX 8 Stages */
2104     else {
2105       for (i1 = 0; i1 < 8; i1++) {
2106         ifftrecurs(&ioptr[i1 * POW2(M - 3) * 2], M - 3, Utbl, 8 * Ustride,
2107                    NDiffU, StageCnt - 1);           /*  RADIX 8 Stages       */
2108       }
2109       ibfstages(ioptr, M, Utbl, Ustride, POW2(M - 3), 1);   /* RADIX 8 Stage */
2110     }
2111 }
2112 
2113 static void iffts1(SPFLOAT *ioptr, int M, SPFLOAT *Utbl, int16_t *BRLow)
2114 {
2115     /* Compute in-place inverse complex fft on the rows of the input array  */
2116     /* INPUTS                                                               */
2117     /*   *ioptr = input data array                                          */
2118     /*   M = log2 of fft size                                               */
2119     /*   *Utbl = cosine table                                               */
2120     /*   *BRLow = bit reversed counter table                                */
2121     /* OUTPUTS                                                              */
2122     /*   *ioptr = output data array                                         */
2123 
2124     int StageCnt;
2125     int NDiffU;
2126     const SPFLOAT scale = 1.0 / POW2(M);
2127 
2128     switch (M) {
2129     case 0:
2130       break;
2131     case 1:
2132       ifft2pt(ioptr, scale);    /* a 2 pt fft */
2133       break;
2134     case 2:
2135       ifft4pt(ioptr, scale);    /* a 4 pt fft */
2136       break;
2137     case 3:
2138       ifft8pt(ioptr, scale);    /* an 8 pt fft */
2139       break;
2140     default:
2141       /* bit reverse and first radix 2 stage */
2142       scbitrevR2(ioptr, M, BRLow, scale);
2143       StageCnt = (M - 1) / 3;   /* number of radix 8 stages */
2144       NDiffU = 2;               /* one radix 2 stage already complete */
2145       if ((M - 1 - (StageCnt * 3)) == 1) {
2146         ibfR2(ioptr, M, NDiffU);        /* 1 radix 2 stage */
2147         NDiffU *= 2;
2148       }
2149       if ((M - 1 - (StageCnt * 3)) == 2) {
2150         ibfR4(ioptr, M, NDiffU);        /* 1 radix 4 stage */
2151         NDiffU *= 4;
2152       }
2153       if (M <= (int) MCACHE)
2154         ibfstages(ioptr, M, Utbl, 1, NDiffU, StageCnt);  /* RADIX 8 Stages */
2155       else
2156         ifftrecurs(ioptr, M, Utbl, 1, NDiffU, StageCnt); /* RADIX 8 Stages */
2157     }
2158 }
2159 
2160 /******************
2161 * parts of rffts1 *
2162 ******************/
2163 
2164 static void rfft1pt(SPFLOAT *ioptr)
2165 {
2166     /***   RADIX 2 rfft     ***/
2167     SPFLOAT f0r, f0i;
2168     SPFLOAT t0r, t0i;
2169 
2170     /* bit reversed load */
2171     f0r = ioptr[0];
2172     f0i = ioptr[1];
2173 
2174     /* finish rfft */
2175     t0r = f0r + f0i;
2176     t0i = f0r - f0i;
2177 
2178     /* store result */
2179     ioptr[0] = t0r;
2180     ioptr[1] = t0i;
2181 }
2182 
2183 static void rfft2pt(SPFLOAT *ioptr)
2184 {
2185     /***   RADIX 4 rfft     ***/
2186     SPFLOAT f0r, f0i, f1r, f1i;
2187     SPFLOAT t0r, t0i;
2188 
2189     /* bit reversed load */
2190     f0r = ioptr[0];
2191     f0i = ioptr[1];
2192     f1r = ioptr[2];
2193     f1i = ioptr[3];
2194 
2195     /* Butterflys           */
2196     /*
2197        f0   -       -       t0
2198        f1   -  1 -  f1
2199      */
2200 
2201     t0r = f0r + f1r;
2202     t0i = f0i + f1i;
2203     f1r = f0r - f1r;
2204     f1i = f1i - f0i;
2205     /* finish rfft */
2206     f0r = t0r + t0i;
2207     f0i = t0r - t0i;
2208 
2209     /* store result */
2210     ioptr[0] = f0r;
2211     ioptr[1] = f0i;
2212     ioptr[2] = f1r;
2213     ioptr[3] = f1i;
2214 }
2215 
2216 static void rfft4pt(SPFLOAT *ioptr)
2217 {
2218     /***   RADIX 8 rfft     ***/
2219     SPFLOAT f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i;
2220     SPFLOAT t0r, t0i, t1r, t1i;
2221     SPFLOAT w0r = 1.0 / MYROOT2;    /* cos(pi/4)   */
2222     const SPFLOAT Two = 2.0;
2223     const SPFLOAT scale = 0.5;
2224 
2225     /* bit reversed load */
2226     f0r = ioptr[0];
2227     f0i = ioptr[1];
2228     f1r = ioptr[4];
2229     f1i = ioptr[5];
2230     f2r = ioptr[2];
2231     f2i = ioptr[3];
2232     f3r = ioptr[6];
2233     f3i = ioptr[7];
2234 
2235     /* Butterflys           */
2236     /*
2237        f0   -       -       t0      -       -       f0
2238        f1   -  1 -  f1      -       -       f1
2239        f2   -       -       f2      -  1 -  f2
2240        f3   -  1 -  t1      - -i -  f3
2241      */
2242 
2243     t0r = f0r + f1r;
2244     t0i = f0i + f1i;
2245     f1r = f0r - f1r;
2246     f1i = f0i - f1i;
2247 
2248     t1r = f2r - f3r;
2249     t1i = f2i - f3i;
2250     f2r = f2r + f3r;
2251     f2i = f2i + f3i;
2252 
2253     f0r = t0r + f2r;
2254     f0i = t0i + f2i;
2255     f2r = t0r - f2r;
2256     f2i = f2i - t0i;              /* neg for rfft */
2257 
2258     f3r = f1r - t1i;
2259     f3i = f1i + t1r;
2260     f1r = f1r + t1i;
2261     f1i = f1i - t1r;
2262 
2263     /* finish rfft */
2264     t0r = f0r + f0i;              /* compute Re(x[0]) */
2265     t0i = f0r - f0i;              /* compute Re(x[N/2]) */
2266 
2267     t1r = f1r + f3r;
2268     t1i = f1i - f3i;
2269     f0r = f1i + f3i;
2270     f0i = f3r - f1r;
2271 
2272     f1r = t1r + w0r * f0r + w0r * f0i;
2273     f1i = t1i - w0r * f0r + w0r * f0i;
2274     f3r = Two * t1r - f1r;
2275     f3i = f1i - Two * t1i;
2276 
2277     /* store result */
2278     ioptr[4] = f2r;
2279     ioptr[5] = f2i;
2280     ioptr[0] = t0r;
2281     ioptr[1] = t0i;
2282 
2283     ioptr[2] = scale * f1r;
2284     ioptr[3] = scale * f1i;
2285     ioptr[6] = scale * f3r;
2286     ioptr[7] = scale * f3i;
2287 }
2288 
2289 static void rfft8pt(SPFLOAT *ioptr)
2290 {
2291     /***   RADIX 16 rfft    ***/
2292     SPFLOAT w0r = 1.0 / MYROOT2;    /* cos(pi/4)   */
2293     SPFLOAT w1r = MYCOSPID8;        /* cos(pi/8)     */
2294     SPFLOAT w1i = MYSINPID8;        /* sin(pi/8)     */
2295     SPFLOAT f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i;
2296     SPFLOAT f4r, f4i, f5r, f5i, f6r, f6i, f7r, f7i;
2297     SPFLOAT t0r, t0i, t1r, t1i;
2298     const SPFLOAT Two = 2.0;
2299     const SPFLOAT scale = 0.5;
2300 
2301     /* bit reversed load */
2302     f0r = ioptr[0];
2303     f0i = ioptr[1];
2304     f1r = ioptr[8];
2305     f1i = ioptr[9];
2306     f2r = ioptr[4];
2307     f2i = ioptr[5];
2308     f3r = ioptr[12];
2309     f3i = ioptr[13];
2310     f4r = ioptr[2];
2311     f4i = ioptr[3];
2312     f5r = ioptr[10];
2313     f5i = ioptr[11];
2314     f6r = ioptr[6];
2315     f6i = ioptr[7];
2316     f7r = ioptr[14];
2317     f7i = ioptr[15];
2318     /* Butterflys           */
2319     /*
2320        f0   -       -       t0      -       -       f0      -       -       f0
2321        f1   -  1 -  f1      -       -       f1      -       -       f1
2322        f2   -       -       f2      -  1 -  f2      -       -       f2
2323        f3   -  1 -  t1      - -i -  f3      -       -       f3
2324        f4   -       -       t0      -       -       f4      -  1 -  t0
2325        f5   -  1 -  f5      -       -       f5      - w3 -  f4
2326        f6   -       -       f6      -  1 -  f6      - -i -  t1
2327        f7   -  1 -  t1      - -i -  f7      - iw3-  f6
2328      */
2329 
2330     t0r = f0r + f1r;
2331     t0i = f0i + f1i;
2332     f1r = f0r - f1r;
2333     f1i = f0i - f1i;
2334 
2335     t1r = f2r - f3r;
2336     t1i = f2i - f3i;
2337     f2r = f2r + f3r;
2338     f2i = f2i + f3i;
2339 
2340     f0r = t0r + f2r;
2341     f0i = t0i + f2i;
2342     f2r = t0r - f2r;
2343     f2i = t0i - f2i;
2344 
2345     f3r = f1r - t1i;
2346     f3i = f1i + t1r;
2347     f1r = f1r + t1i;
2348     f1i = f1i - t1r;
2349 
2350     t0r = f4r + f5r;
2351     t0i = f4i + f5i;
2352     f5r = f4r - f5r;
2353     f5i = f4i - f5i;
2354 
2355     t1r = f6r - f7r;
2356     t1i = f6i - f7i;
2357     f6r = f6r + f7r;
2358     f6i = f6i + f7i;
2359 
2360     f4r = t0r + f6r;
2361     f4i = t0i + f6i;
2362     f6r = t0r - f6r;
2363     f6i = t0i - f6i;
2364 
2365     f7r = f5r - t1i;
2366     f7i = f5i + t1r;
2367     f5r = f5r + t1i;
2368     f5i = f5i - t1r;
2369 
2370     t0r = f0r - f4r;
2371     t0i = f4i - f0i;              /* neg for rfft */
2372     f0r = f0r + f4r;
2373     f0i = f0i + f4i;
2374 
2375     t1r = f2r - f6i;
2376     t1i = f2i + f6r;
2377     f2r = f2r + f6i;
2378     f2i = f2i - f6r;
2379 
2380     f4r = f1r - f5r * w0r - f5i * w0r;
2381     f4i = f1i + f5r * w0r - f5i * w0r;
2382     f1r = f1r * Two - f4r;
2383     f1i = f1i * Two - f4i;
2384 
2385     f6r = f3r + f7r * w0r - f7i * w0r;
2386     f6i = f3i + f7r * w0r + f7i * w0r;
2387     f3r = f3r * Two - f6r;
2388     f3i = f3i * Two - f6i;
2389 
2390     /* finish rfft */
2391     f5r = f0r + f0i;              /* compute Re(x[0]) */
2392     f5i = f0r - f0i;              /* compute Re(x[N/2]) */
2393 
2394     f0r = f2r + t1r;
2395     f0i = f2i - t1i;
2396     f7r = f2i + t1i;
2397     f7i = t1r - f2r;
2398 
2399     f2r = f0r + w0r * f7r + w0r * f7i;
2400     f2i = f0i - w0r * f7r + w0r * f7i;
2401     t1r = Two * f0r - f2r;
2402     t1i = f2i - Two * f0i;
2403 
2404     f0r = f1r + f6r;
2405     f0i = f1i - f6i;
2406     f7r = f1i + f6i;
2407     f7i = f6r - f1r;
2408 
2409     f1r = f0r + w1r * f7r + w1i * f7i;
2410     f1i = f0i - w1i * f7r + w1r * f7i;
2411     f6r = Two * f0r - f1r;
2412     f6i = f1i - Two * f0i;
2413 
2414     f0r = f3r + f4r;
2415     f0i = f3i - f4i;
2416     f7r = f3i + f4i;
2417     f7i = f4r - f3r;
2418 
2419     f3r = f0r + w1i * f7r + w1r * f7i;
2420     f3i = f0i - w1r * f7r + w1i * f7i;
2421     f4r = Two * f0r - f3r;
2422     f4i = f3i - Two * f0i;
2423 
2424     /* store result */
2425     ioptr[8] = t0r;
2426     ioptr[9] = t0i;
2427     ioptr[0] = f5r;
2428     ioptr[1] = f5i;
2429 
2430     ioptr[4] = scale * f2r;
2431     ioptr[5] = scale * f2i;
2432     ioptr[12] = scale * t1r;
2433     ioptr[13] = scale * t1i;
2434 
2435     ioptr[2] = scale * f1r;
2436     ioptr[3] = scale * f1i;
2437     ioptr[6] = scale * f3r;
2438     ioptr[7] = scale * f3i;
2439     ioptr[10] = scale * f4r;
2440     ioptr[11] = scale * f4i;
2441     ioptr[14] = scale * f6r;
2442     ioptr[15] = scale * f6i;
2443 }
2444 
2445 static void frstage(SPFLOAT *ioptr, int M, SPFLOAT *Utbl)
2446 {
2447     /*      Finish RFFT             */
2448 
2449     unsigned int pos;
2450     unsigned int posi;
2451     unsigned int diffUcnt;
2452 
2453     SPFLOAT *p0r, *p1r;
2454     SPFLOAT *u0r, *u0i;
2455 
2456     SPFLOAT w0r, w0i;
2457     SPFLOAT f0r, f0i, f1r, f1i, f4r, f4i, f5r, f5i;
2458     SPFLOAT t0r, t0i, t1r, t1i;
2459     const SPFLOAT Two = 2.0;
2460 
2461     pos = POW2(M - 1);
2462     posi = pos + 1;
2463 
2464     p0r = ioptr;
2465     p1r = ioptr + pos / 2;
2466 
2467     u0r = Utbl + POW2(M - 3);
2468 
2469     w0r = *u0r; f0r = *(p0r);
2470     f0i = *(p0r + 1);
2471     f4r = *(p0r + pos);
2472     f4i = *(p0r + posi);
2473     f1r = *(p1r);
2474     f1i = *(p1r + 1);
2475     f5r = *(p1r + pos);
2476     f5i = *(p1r + posi);
2477 
2478     t0r = Two * f0r + Two * f0i;  /* compute Re(x[0]) */
2479     t0i = Two * f0r - Two * f0i;  /* compute Re(x[N/2]) */
2480     t1r = f4r + f4r;
2481     t1i = -f4i - f4i;
2482 
2483     f0r = f1r + f5r;
2484     f0i = f1i - f5i;
2485     f4r = f1i + f5i;
2486     f4i = f5r - f1r;
2487 
2488     f1r = f0r + w0r * f4r + w0r * f4i;
2489     f1i = f0i - w0r * f4r + w0r * f4i;
2490     f5r = Two * f0r - f1r;
2491     f5i = f1i - Two * f0i;
2492 
2493     *(p0r) = t0r;
2494     *(p0r + 1) = t0i;
2495     *(p0r + pos) = t1r;
2496     *(p0r + posi) = t1i;
2497     *(p1r) = f1r;
2498     *(p1r + 1) = f1i;
2499     *(p1r + pos) = f5r;
2500     *(p1r + posi) = f5i;
2501 
2502     u0r = Utbl + 1;
2503     u0i = Utbl + (POW2(M - 2) - 1);
2504 
2505     w0r = *u0r; w0i = *u0i;
2506 
2507     p0r = (ioptr + 2);
2508     p1r = (ioptr + (POW2(M - 2) - 1) * 2);
2509 
2510     /* Butterflys */
2511     /*
2512        f0   -       t0      -       -       f0
2513        f5   -       t1      - w0    -       f5
2514 
2515        f1   -       t0      -       -       f1
2516        f4   -       t1      -iw0 -  f4
2517      */
2518 
2519     for (diffUcnt = POW2(M - 3) - 1; diffUcnt > 0; diffUcnt--) {
2520 
2521       f0r = *(p0r);
2522       f0i = *(p0r + 1);
2523       f5r = *(p1r + pos);
2524       f5i = *(p1r + posi);
2525       f1r = *(p1r);
2526       f1i = *(p1r + 1);
2527       f4r = *(p0r + pos);
2528       f4i = *(p0r + posi);
2529 
2530       t0r = f0r + f5r;
2531       t0i = f0i - f5i;
2532       t1r = f0i + f5i;
2533       t1i = f5r - f0r;
2534 
2535       f0r = t0r + w0r * t1r + w0i * t1i;
2536       f0i = t0i - w0i * t1r + w0r * t1i;
2537       f5r = Two * t0r - f0r;
2538       f5i = f0i - Two * t0i;
2539 
2540       t0r = f1r + f4r;
2541       t0i = f1i - f4i;
2542       t1r = f1i + f4i;
2543       t1i = f4r - f1r;
2544 
2545       f1r = t0r + w0i * t1r + w0r * t1i;
2546       f1i = t0i - w0r * t1r + w0i * t1i;
2547       f4r = Two * t0r - f1r;
2548       f4i = f1i - Two * t0i;
2549 
2550       *(p0r) = f0r;
2551       *(p0r + 1) = f0i;
2552       *(p1r + pos) = f5r;
2553       *(p1r + posi) = f5i;
2554 
2555       w0r = *++u0r;
2556       w0i = *--u0i;
2557 
2558       *(p1r) = f1r;
2559       *(p1r + 1) = f1i;
2560       *(p0r + pos) = f4r;
2561       *(p0r + posi) = f4i;
2562 
2563       p0r += 2;
2564       p1r -= 2;
2565     }
2566 }
2567 
2568 static void rffts1(SPFLOAT *ioptr, int M, SPFLOAT *Utbl, int16_t *BRLow)
2569 {
2570     /* Compute in-place real fft on the rows of the input array           */
2571     /* The result is the complex spectra of the positive frequencies      */
2572     /* except the location for the first complex number contains the real */
2573     /* values for DC and Nyquest                                          */
2574     /* INPUTS                                                             */
2575     /*   *ioptr = real input data array                                   */
2576     /*   M = log2 of fft size                                             */
2577     /*   *Utbl = cosine table                                             */
2578     /*   *BRLow = bit reversed counter table                              */
2579     /* OUTPUTS                                                            */
2580     /*   *ioptr = output data array   in the following order              */
2581     /*     Re(x[0]), Re(x[N/2]), Re(x[1]), Im(x[1]), Re(x[2]), Im(x[2]),  */
2582     /*     ... Re(x[N/2-1]), Im(x[N/2-1]).                                */
2583 
2584     SPFLOAT scale;
2585     int StageCnt;
2586     int NDiffU;
2587 
2588     M = M - 1;
2589     switch (M) {
2590     case -1:
2591       break;
2592     case 0:
2593       rfft1pt(ioptr);           /* a 2 pt fft */
2594       break;
2595     case 1:
2596       rfft2pt(ioptr);           /* a 4 pt fft */
2597       break;
2598     case 2:
2599       rfft4pt(ioptr);           /* an 8 pt fft */
2600       break;
2601     case 3:
2602       rfft8pt(ioptr);           /* a 16 pt fft */
2603       break;
2604     default:
2605       scale = 0.5;
2606       /* bit reverse and first radix 2 stage */
2607       scbitrevR2(ioptr, M, BRLow, scale);
2608       StageCnt = (M - 1) / 3;   /* number of radix 8 stages           */
2609       NDiffU = 2;               /* one radix 2 stage already complete */
2610       if ((M - 1 - (StageCnt * 3)) == 1) {
2611         bfR2(ioptr, M, NDiffU); /* 1 radix 2 stage */
2612         NDiffU *= 2;
2613       }
2614       if ((M - 1 - (StageCnt * 3)) == 2) {
2615         bfR4(ioptr, M, NDiffU); /* 1 radix 4 stage */
2616         NDiffU *= 4;
2617       }
2618       if (M <= (int) MCACHE)
2619         bfstages(ioptr, M, Utbl, 2, NDiffU, StageCnt);  /* RADIX 8 Stages */
2620       else
2621         fftrecurs(ioptr, M, Utbl, 2, NDiffU, StageCnt); /* RADIX 8 Stages */
2622       frstage(ioptr, M + 1, Utbl);
2623     }
2624 }
2625 
2626 /*******************
2627 * parts of riffts1 *
2628 *******************/
2629 
2630 static void rifft1pt(SPFLOAT *ioptr, SPFLOAT scale)
2631 {
2632     /***   RADIX 2 rifft    ***/
2633     SPFLOAT f0r, f0i;
2634     SPFLOAT t0r, t0i;
2635 
2636     /* bit reversed load */
2637     f0r = ioptr[0];
2638     f0i = ioptr[1];
2639 
2640     /* finish rfft */
2641     t0r = f0r + f0i;
2642     t0i = f0r - f0i;
2643 
2644     /* store result */
2645     ioptr[0] = scale * t0r;
2646     ioptr[1] = scale * t0i;
2647 }
2648 
2649 static void rifft2pt(SPFLOAT *ioptr, SPFLOAT scale)
2650 {
2651     /***   RADIX 4 rifft    ***/
2652     SPFLOAT f0r, f0i, f1r, f1i;
2653     SPFLOAT t0r, t0i;
2654     const SPFLOAT Two = 2.0;
2655 
2656     /* bit reversed load */
2657     t0r = ioptr[0];
2658     t0i = ioptr[1];
2659     f1r = Two * ioptr[2];
2660     f1i = Two * ioptr[3];
2661 
2662     /* start rifft */
2663     f0r = t0r + t0i;
2664     f0i = t0r - t0i;
2665     /* Butterflys           */
2666     /*
2667        f0   -       -       t0
2668        f1   -  1 -  f1
2669      */
2670 
2671     t0r = f0r + f1r;
2672     t0i = f0i - f1i;
2673     f1r = f0r - f1r;
2674     f1i = f0i + f1i;
2675 
2676     /* store result */
2677     ioptr[0] = scale * t0r;
2678     ioptr[1] = scale * t0i;
2679     ioptr[2] = scale * f1r;
2680     ioptr[3] = scale * f1i;
2681 }
2682 
2683 static void rifft4pt(SPFLOAT *ioptr, SPFLOAT scale)
2684 {
2685     /***   RADIX 8 rifft    ***/
2686     SPFLOAT f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i;
2687     SPFLOAT t0r, t0i, t1r, t1i;
2688     SPFLOAT w0r = 1.0 / MYROOT2;    /* cos(pi/4)   */
2689     const SPFLOAT Two = 2.0;
2690 
2691     /* bit reversed load */
2692     t0r = ioptr[0];
2693     t0i = ioptr[1];
2694     f2r = ioptr[2];
2695     f2i = ioptr[3];
2696     f1r = Two * ioptr[4];
2697     f1i = Two * ioptr[5];
2698     f3r = ioptr[6];
2699     f3i = ioptr[7];
2700 
2701     /* start rfft */
2702     f0r = t0r + t0i;              /* compute Re(x[0]) */
2703     f0i = t0r - t0i;              /* compute Re(x[N/2]) */
2704 
2705     t1r = f2r + f3r;
2706     t1i = f2i - f3i;
2707     t0r = f2r - f3r;
2708     t0i = f2i + f3i;
2709 
2710     f2r = t1r - w0r * t0r - w0r * t0i;
2711     f2i = t1i + w0r * t0r - w0r * t0i;
2712     f3r = Two * t1r - f2r;
2713     f3i = f2i - Two * t1i;
2714 
2715     /* Butterflys           */
2716     /*
2717        f0   -       -       t0      -       -       f0
2718        f1   -  1 -  f1      -       -       f1
2719        f2   -       -       f2      -  1 -  f2
2720        f3   -  1 -  t1      -  i -  f3
2721      */
2722 
2723     t0r = f0r + f1r;
2724     t0i = f0i - f1i;
2725     f1r = f0r - f1r;
2726     f1i = f0i + f1i;
2727 
2728     t1r = f2r - f3r;
2729     t1i = f2i - f3i;
2730     f2r = f2r + f3r;
2731     f2i = f2i + f3i;
2732 
2733     f0r = t0r + f2r;
2734     f0i = t0i + f2i;
2735     f2r = t0r - f2r;
2736     f2i = t0i - f2i;
2737 
2738     f3r = f1r + t1i;
2739     f3i = f1i - t1r;
2740     f1r = f1r - t1i;
2741     f1i = f1i + t1r;
2742 
2743     /* store result */
2744     ioptr[0] = scale * f0r;
2745     ioptr[1] = scale * f0i;
2746     ioptr[2] = scale * f1r;
2747     ioptr[3] = scale * f1i;
2748     ioptr[4] = scale * f2r;
2749     ioptr[5] = scale * f2i;
2750     ioptr[6] = scale * f3r;
2751     ioptr[7] = scale * f3i;
2752 }
2753 
2754 static void rifft8pt(SPFLOAT *ioptr, SPFLOAT scale)
2755 {
2756     /***   RADIX 16 rifft   ***/
2757     SPFLOAT w0r = (SPFLOAT) (1.0 / MYROOT2);    /* cos(pi/4)    */
2758     SPFLOAT w1r = MYCOSPID8;                  /* cos(pi/8)    */
2759     SPFLOAT w1i = MYSINPID8;                  /* sin(pi/8)    */
2760     SPFLOAT f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i;
2761     SPFLOAT f4r, f4i, f5r, f5i, f6r, f6i, f7r, f7i;
2762     SPFLOAT t0r, t0i, t1r, t1i;
2763     const SPFLOAT Two = 2.0;
2764 
2765     /* bit reversed load */
2766     t0r = ioptr[0];
2767     t0i = ioptr[1];
2768     f4r = ioptr[2];
2769     f4i = ioptr[3];
2770     f2r = ioptr[4];
2771     f2i = ioptr[5];
2772     f6r = ioptr[6];
2773     f6i = ioptr[7];
2774     f1r = Two * ioptr[8];
2775     f1i = Two * ioptr[9];
2776     f5r = ioptr[10];
2777     f5i = ioptr[11];
2778     f3r = ioptr[12];
2779     f3i = ioptr[13];
2780     f7r = ioptr[14];
2781     f7i = ioptr[15];
2782 
2783     /* start rfft */
2784     f0r = t0r + t0i;              /* compute Re(x[0]) */
2785     f0i = t0r - t0i;              /* compute Re(x[N/2]) */
2786 
2787     t0r = f2r + f3r;
2788     t0i = f2i - f3i;
2789     t1r = f2r - f3r;
2790     t1i = f2i + f3i;
2791 
2792     f2r = t0r - w0r * t1r - w0r * t1i;
2793     f2i = t0i + w0r * t1r - w0r * t1i;
2794     f3r = Two * t0r - f2r;
2795     f3i = f2i - Two * t0i;
2796 
2797     t0r = f4r + f7r;
2798     t0i = f4i - f7i;
2799     t1r = f4r - f7r;
2800     t1i = f4i + f7i;
2801 
2802     f4r = t0r - w1i * t1r - w1r * t1i;
2803     f4i = t0i + w1r * t1r - w1i * t1i;
2804     f7r = Two * t0r - f4r;
2805     f7i = f4i - Two * t0i;
2806 
2807     t0r = f6r + f5r;
2808     t0i = f6i - f5i;
2809     t1r = f6r - f5r;
2810     t1i = f6i + f5i;
2811 
2812     f6r = t0r - w1r * t1r - w1i * t1i;
2813     f6i = t0i + w1i * t1r - w1r * t1i;
2814     f5r = Two * t0r - f6r;
2815     f5i = f6i - Two * t0i;
2816 
2817     /* Butterflys           */
2818     /*
2819        f0   -       -       t0      -       -       f0      -       -       f0
2820        f1*  -  1 -  f1      -       -       f1      -       -       f1
2821        f2   -       -       f2      -  1 -  f2      -       -       f2
2822        f3   -  1 -  t1      -  i -  f3      -       -       f3
2823        f4   -       -       t0      -       -       f4      -  1 -  t0
2824        f5   -  1 -  f5      -       -       f5      - w3 -  f4
2825        f6   -       -       f6      -  1 -  f6      -  i -  t1
2826        f7   -  1 -  t1      -  i -  f7      - iw3-  f6
2827      */
2828 
2829     t0r = f0r + f1r;
2830     t0i = f0i - f1i;
2831     f1r = f0r - f1r;
2832     f1i = f0i + f1i;
2833 
2834     t1r = f2r - f3r;
2835     t1i = f2i - f3i;
2836     f2r = f2r + f3r;
2837     f2i = f2i + f3i;
2838 
2839     f0r = t0r + f2r;
2840     f0i = t0i + f2i;
2841     f2r = t0r - f2r;
2842     f2i = t0i - f2i;
2843 
2844     f3r = f1r + t1i;
2845     f3i = f1i - t1r;
2846     f1r = f1r - t1i;
2847     f1i = f1i + t1r;
2848 
2849     t0r = f4r + f5r;
2850     t0i = f4i + f5i;
2851     f5r = f4r - f5r;
2852     f5i = f4i - f5i;
2853 
2854     t1r = f6r - f7r;
2855     t1i = f6i - f7i;
2856     f6r = f6r + f7r;
2857     f6i = f6i + f7i;
2858 
2859     f4r = t0r + f6r;
2860     f4i = t0i + f6i;
2861     f6r = t0r - f6r;
2862     f6i = t0i - f6i;
2863 
2864     f7r = f5r + t1i;
2865     f7i = f5i - t1r;
2866     f5r = f5r - t1i;
2867     f5i = f5i + t1r;
2868 
2869     t0r = f0r - f4r;
2870     t0i = f0i - f4i;
2871     f0r = f0r + f4r;
2872     f0i = f0i + f4i;
2873 
2874     t1r = f2r + f6i;
2875     t1i = f2i - f6r;
2876     f2r = f2r - f6i;
2877     f2i = f2i + f6r;
2878 
2879     f4r = f1r - f5r * w0r + f5i * w0r;
2880     f4i = f1i - f5r * w0r - f5i * w0r;
2881     f1r = f1r * Two - f4r;
2882     f1i = f1i * Two - f4i;
2883 
2884     f6r = f3r + f7r * w0r + f7i * w0r;
2885     f6i = f3i - f7r * w0r + f7i * w0r;
2886     f3r = f3r * Two - f6r;
2887     f3i = f3i * Two - f6i;
2888 
2889     /* store result */
2890     ioptr[0] = scale * f0r;
2891     ioptr[1] = scale * f0i;
2892     ioptr[2] = scale * f1r;
2893     ioptr[3] = scale * f1i;
2894     ioptr[4] = scale * f2r;
2895     ioptr[5] = scale * f2i;
2896     ioptr[6] = scale * f3r;
2897     ioptr[7] = scale * f3i;
2898     ioptr[8] = scale * t0r;
2899     ioptr[9] = scale * t0i;
2900     ioptr[10] = scale * f4r;
2901     ioptr[11] = scale * f4i;
2902     ioptr[12] = scale * t1r;
2903     ioptr[13] = scale * t1i;
2904     ioptr[14] = scale * f6r;
2905     ioptr[15] = scale * f6i;
2906 }
2907 
2908 static void ifrstage(SPFLOAT *ioptr, int M, SPFLOAT *Utbl)
2909 {
2910     /*      Start RIFFT             */
2911 
2912     unsigned int pos;
2913     unsigned int posi;
2914     unsigned int diffUcnt;
2915 
2916     SPFLOAT *p0r, *p1r;
2917     SPFLOAT *u0r, *u0i;
2918 
2919     SPFLOAT w0r, w0i;
2920     SPFLOAT f0r, f0i, f1r, f1i, f4r, f4i, f5r, f5i;
2921     SPFLOAT t0r, t0i, t1r, t1i;
2922     const SPFLOAT Two = 2.0;
2923 
2924     pos = POW2(M - 1);
2925     posi = pos + 1;
2926 
2927     p0r = ioptr;
2928     p1r = ioptr + pos / 2;
2929 
2930     u0r = Utbl + POW2(M - 3);
2931 
2932     w0r = *u0r; f0r = *(p0r);
2933     f0i = *(p0r + 1);
2934     f4r = *(p0r + pos);
2935     f4i = *(p0r + posi);
2936     f1r = *(p1r);
2937     f1i = *(p1r + 1);
2938     f5r = *(p1r + pos);
2939     f5i = *(p1r + posi);
2940 
2941     t0r = f0r + f0i;
2942     t0i = f0r - f0i;
2943     t1r = f4r + f4r;
2944     t1i = -f4i - f4i;
2945 
2946     f0r = f1r + f5r;
2947     f0i = f1i - f5i;
2948     f4r = f1r - f5r;
2949     f4i = f1i + f5i;
2950 
2951     f1r = f0r - w0r * f4r - w0r * f4i;
2952     f1i = f0i + w0r * f4r - w0r * f4i;
2953     f5r = Two * f0r - f1r;
2954     f5i = f1i - Two * f0i;
2955 
2956     *(p0r) = t0r;
2957     *(p0r + 1) = t0i;
2958     *(p0r + pos) = t1r;
2959     *(p0r + posi) = t1i;
2960     *(p1r) = f1r;
2961     *(p1r + 1) = f1i;
2962     *(p1r + pos) = f5r;
2963     *(p1r + posi) = f5i;
2964 
2965     u0r = Utbl + 1;
2966     u0i = Utbl + (POW2(M - 2) - 1);
2967 
2968     w0r = *u0r; w0i = *u0i;
2969 
2970     p0r = (ioptr + 2);
2971     p1r = (ioptr + (POW2(M - 2) - 1) * 2);
2972 
2973     /* Butterflys */
2974     /*
2975        f0   -        t0             -       f0
2976        f1   -     t1     -w0-   f1
2977 
2978        f2   -        t0             -       f2
2979        f3   -     t1           -iw0-  f3
2980      */
2981 
2982     for (diffUcnt = POW2(M - 3) - 1; diffUcnt > 0; diffUcnt--) {
2983 
2984       f0r = *(p0r);
2985       f0i = *(p0r + 1);
2986       f5r = *(p1r + pos);
2987       f5i = *(p1r + posi);
2988       f1r = *(p1r);
2989       f1i = *(p1r + 1);
2990       f4r = *(p0r + pos);
2991       f4i = *(p0r + posi);
2992 
2993       t0r = f0r + f5r;
2994       t0i = f0i - f5i;
2995       t1r = f0r - f5r;
2996       t1i = f0i + f5i;
2997 
2998       f0r = t0r - w0i * t1r - w0r * t1i;
2999       f0i = t0i + w0r * t1r - w0i * t1i;
3000       f5r = Two * t0r - f0r;
3001       f5i = f0i - Two * t0i;
3002 
3003       t0r = f1r + f4r;
3004       t0i = f1i - f4i;
3005       t1r = f1r - f4r;
3006       t1i = f1i + f4i;
3007 
3008       f1r = t0r - w0r * t1r - w0i * t1i;
3009       f1i = t0i + w0i * t1r - w0r * t1i;
3010       f4r = Two * t0r - f1r;
3011       f4i = f1i - Two * t0i;
3012 
3013       *(p0r) = f0r;
3014       *(p0r + 1) = f0i;
3015       *(p1r + pos) = f5r;
3016       *(p1r + posi) = f5i;
3017 
3018       w0r = *++u0r;
3019       w0i = *--u0i;
3020 
3021       *(p1r) = f1r;
3022       *(p1r + 1) = f1i;
3023       *(p0r + pos) = f4r;
3024       *(p0r + posi) = f4i;
3025 
3026       p0r += 2;
3027       p1r -= 2;
3028     }
3029 }
3030 
3031 static void riffts1(SPFLOAT *ioptr, int M, SPFLOAT *Utbl, int16_t *BRLow)
3032 {
3033     /* Compute in-place real ifft on the rows of the input array    */
3034     /* data order as from rffts1                                    */
3035     /* INPUTS                                                       */
3036     /*   *ioptr = input data array in the following order           */
3037     /*   M = log2 of fft size                                       */
3038     /*   Re(x[0]), Re(x[N/2]), Re(x[1]), Im(x[1]),                  */
3039     /*   Re(x[2]), Im(x[2]), ... Re(x[N/2-1]), Im(x[N/2-1]).        */
3040     /*   *Utbl = cosine table                                       */
3041     /*   *BRLow = bit reversed counter table                        */
3042     /* OUTPUTS                                                      */
3043     /*   *ioptr = real output data array                            */
3044 
3045     SPFLOAT scale;
3046     int StageCnt;
3047     int NDiffU;
3048 
3049     scale = (SPFLOAT)(1.0 / (double)((int)POW2(M)));
3050     M = M - 1;
3051     switch (M) {
3052     case -1:
3053       break;
3054     case 0:
3055       rifft1pt(ioptr, scale);   /* a 2 pt fft */
3056       break;
3057     case 1:
3058       rifft2pt(ioptr, scale);   /* a 4 pt fft */
3059       break;
3060     case 2:
3061       rifft4pt(ioptr, scale);   /* an 8 pt fft */
3062       break;
3063     case 3:
3064       rifft8pt(ioptr, scale);   /* a 16 pt fft */
3065       break;
3066     default:
3067       ifrstage(ioptr, M + 1, Utbl);
3068       /* bit reverse and first radix 2 stage */
3069       scbitrevR2(ioptr, M, BRLow, scale);
3070       StageCnt = (M - 1) / 3;   /* number of radix 8 stages           */
3071       NDiffU = 2;               /* one radix 2 stage already complete */
3072       if ((M - 1 - (StageCnt * 3)) == 1) {
3073         ibfR2(ioptr, M, NDiffU);        /* 1 radix 2 stage */
3074         NDiffU *= 2;
3075       }
3076       if ((M - 1 - (StageCnt * 3)) == 2) {
3077         ibfR4(ioptr, M, NDiffU);        /* 1 radix 4 stage */
3078         NDiffU *= 4;
3079       }
3080       if (M <= (int) MCACHE)
3081         ibfstages(ioptr, M, Utbl, 2, NDiffU, StageCnt); /*  RADIX 8 Stages */
3082       else
3083         ifftrecurs(ioptr, M, Utbl, 2, NDiffU, StageCnt); /* RADIX 8 Stages */
3084     }
3085 }
3086