1 /*******************************************************************
2 lower level fft stuff including routines called in fftext.c and fft2d.c
3 *******************************************************************/
4 /* inline declarations modified by RBD for C99 compiler */
5
6
7 #include <stdint.h>
8 #include "fftlib.h"
9 #include <math.h>
10 #define MCACHE (11-(sizeof(float)/8)) // fft's with M bigger than this bust primary cache
11 #ifdef WIN32
12 #define inline __inline
13 #endif
14
15 // some math constants to 40 decimal places
16 #define MYPI 3.141592653589793238462643383279502884197F // pi
17 #define MYROOT2 1.414213562373095048801688724209698078569F // sqrt(2)
18 #define MYCOSPID8 0.9238795325112867561281831893967882868224F // cos(pi/8)
19 #define MYSINPID8 0.3826834323650897717284599840303988667613F // sin(pi/8)
20
21
22 /*************************************************
23 routines to initialize tables used by fft routines
24 **************************************************/
25
fftCosInit(long M,float * Utbl)26 void fftCosInit(long M, float *Utbl){
27 /* Compute Utbl, the cosine table for ffts */
28 /* of size (pow(2,M)/4 +1) */
29 /* INPUTS */
30 /* M = log2 of fft size */
31 /* OUTPUTS */
32 /* *Utbl = cosine table */
33 unsigned long fftN = POW2(M);
34 unsigned long i1;
35 Utbl[0] = 1.0;
36 for (i1 = 1; i1 < fftN/4; i1++)
37 Utbl[i1] = (float) cos( (2.0 * MYPI * i1) / fftN );
38 Utbl[fftN/4] = 0.0;
39 }
40
fftBRInit(long M,short * BRLow)41 void fftBRInit(long M, short *BRLow){
42 /* Compute BRLow, the bit reversed table for ffts */
43 /* of size pow(2,M/2 -1) */
44 /* INPUTS */
45 /* M = log2 of fft size */
46 /* OUTPUTS */
47 /* *BRLow = bit reversed counter table */
48 long Mroot_1 = M / 2 - 1;
49 long Nroot_1 = POW2(Mroot_1);
50 long i1;
51 long bitsum;
52 long bitmask;
53 long bit;
54 for (i1 = 0; i1 < Nroot_1; i1++){
55 bitsum = 0;
56 bitmask = 1;
57 for (bit=1; bit <= Mroot_1; bitmask<<=1, bit++)
58 if (i1 & bitmask)
59 bitsum = bitsum + (Nroot_1 >> bit);
60 BRLow[i1] = (short) bitsum;
61 };
62 }
63
64 /************************************************
65 parts of ffts1
66 *************************************************/
67
68 //inline void bitrevR2(float *ioptr, long M, short *BRLow);
bitrevR2(float * ioptr,long M,short * BRLow)69 static inline void bitrevR2(float *ioptr, long M, short *BRLow){
70 /*** bit reverse and first radix 2 stage of forward or inverse fft ***/
71 float f0r;
72 float f0i;
73 float f1r;
74 float f1i;
75 float f2r;
76 float f2i;
77 float f3r;
78 float f3i;
79 float f4r;
80 float f4i;
81 float f5r;
82 float f5i;
83 float f6r;
84 float f6i;
85 float f7r;
86 float f7i;
87 float t0r;
88 float t0i;
89 float t1r;
90 float t1i;
91 float *p0r;
92 float *p1r;
93 float *IOP;
94 float *iolimit;
95 long Colstart;
96 long iCol;
97 unsigned long posA;
98 unsigned long posAi;
99 unsigned long posB;
100 unsigned long posBi;
101
102 const unsigned long Nrems2 = POW2((M+3)/2);
103 const unsigned long Nroot_1_ColInc = POW2(M)-Nrems2;
104 const unsigned long Nroot_1 = POW2(M/2-1)-1;
105 const unsigned long ColstartShift = (M+1)/2 +1;
106 posA = POW2(M); // 1/2 of POW2(M) complexes
107 posAi = posA + 1;
108 posB = posA + 2;
109 posBi = posB + 1;
110
111 iolimit = ioptr + Nrems2;
112 for (; ioptr < iolimit; ioptr += (int64_t) POW2(M/2+1)){
113 for (Colstart = Nroot_1; Colstart >= 0; Colstart--){
114 iCol = Nroot_1;
115 p0r = ioptr+ Nroot_1_ColInc + BRLow[Colstart]*4;
116 IOP = ioptr + (Colstart << ColstartShift);
117 p1r = IOP + BRLow[iCol]*4;
118 f0r = *(p0r);
119 f0i = *(p0r+1);
120 f1r = *(p0r+posA);
121 f1i = *(p0r+posAi);
122 for (; iCol > Colstart;){
123 f2r = *(p0r+2);
124 f2i = *(p0r+(2+1));
125 f3r = *(p0r+posB);
126 f3i = *(p0r+posBi);
127 f4r = *(p1r);
128 f4i = *(p1r+1);
129 f5r = *(p1r+posA);
130 f5i = *(p1r+posAi);
131 f6r = *(p1r+2);
132 f6i = *(p1r+(2+1));
133 f7r = *(p1r+posB);
134 f7i = *(p1r+posBi);
135
136 t0r = f0r + f1r;
137 t0i = f0i + f1i;
138 f1r = f0r - f1r;
139 f1i = f0i - f1i;
140 t1r = f2r + f3r;
141 t1i = f2i + f3i;
142 f3r = f2r - f3r;
143 f3i = f2i - f3i;
144 f0r = f4r + f5r;
145 f0i = f4i + f5i;
146 f5r = f4r - f5r;
147 f5i = f4i - f5i;
148 f2r = f6r + f7r;
149 f2i = f6i + f7i;
150 f7r = f6r - f7r;
151 f7i = f6i - f7i;
152
153 *(p1r) = t0r;
154 *(p1r+1) = t0i;
155 *(p1r+2) = f1r;
156 *(p1r+(2+1)) = f1i;
157 *(p1r+posA) = t1r;
158 *(p1r+posAi) = t1i;
159 *(p1r+posB) = f3r;
160 *(p1r+posBi) = f3i;
161 *(p0r) = f0r;
162 *(p0r+1) = f0i;
163 *(p0r+2) = f5r;
164 *(p0r+(2+1)) = f5i;
165 *(p0r+posA) = f2r;
166 *(p0r+posAi) = f2i;
167 *(p0r+posB) = f7r;
168 *(p0r+posBi) = f7i;
169
170 p0r -= Nrems2;
171 f0r = *(p0r);
172 f0i = *(p0r+1);
173 f1r = *(p0r+posA);
174 f1i = *(p0r+posAi);
175 iCol -= 1;
176 p1r = IOP + BRLow[iCol]*4;
177 };
178 f2r = *(p0r+2);
179 f2i = *(p0r+(2+1));
180 f3r = *(p0r+posB);
181 f3i = *(p0r+posBi);
182
183 t0r = f0r + f1r;
184 t0i = f0i + f1i;
185 f1r = f0r - f1r;
186 f1i = f0i - f1i;
187 t1r = f2r + f3r;
188 t1i = f2i + f3i;
189 f3r = f2r - f3r;
190 f3i = f2i - f3i;
191
192 *(p0r) = t0r;
193 *(p0r+1) = t0i;
194 *(p0r+2) = f1r;
195 *(p0r+(2+1)) = f1i;
196 *(p0r+posA) = t1r;
197 *(p0r+posAi) = t1i;
198 *(p0r+posB) = f3r;
199 *(p0r+posBi) = f3i;
200
201 };
202 };
203 }
204
205 //inline void fft2pt(float *ioptr);
fft2pt(float * ioptr)206 static inline void fft2pt(float *ioptr){
207 /*** RADIX 2 fft ***/
208 float f0r, f0i, f1r, f1i;
209 float t0r, t0i;
210
211 /* bit reversed load */
212 f0r = ioptr[0];
213 f0i = ioptr[1];
214 f1r = ioptr[2];
215 f1i = ioptr[3];
216
217 /* Butterflys */
218 /*
219 f0 - - t0
220 f1 - 1 - f1
221 */
222
223 t0r = f0r + f1r;
224 t0i = f0i + f1i;
225 f1r = f0r - f1r;
226 f1i = f0i - f1i;
227
228 /* store result */
229 ioptr[0] = t0r;
230 ioptr[1] = t0i;
231 ioptr[2] = f1r;
232 ioptr[3] = f1i;
233 }
234
235
236 //inline void fft4pt(float *ioptr);
fft4pt(float * ioptr)237 static inline void fft4pt(float *ioptr){
238 /*** RADIX 4 fft ***/
239 float f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i;
240 float t0r, t0i, t1r, t1i;
241
242 /* bit reversed load */
243 f0r = ioptr[0];
244 f0i = ioptr[1];
245 f1r = ioptr[4];
246 f1i = ioptr[5];
247 f2r = ioptr[2];
248 f2i = ioptr[3];
249 f3r = ioptr[6];
250 f3i = ioptr[7];
251
252 /* Butterflys */
253 /*
254 f0 - - t0 - - f0
255 f1 - 1 - f1 - - f1
256 f2 - - f2 - 1 - f2
257 f3 - 1 - t1 - -i - f3
258 */
259
260 t0r = f0r + f1r;
261 t0i = f0i + f1i;
262 f1r = f0r - f1r;
263 f1i = f0i - f1i;
264
265 t1r = f2r - f3r;
266 t1i = f2i - f3i;
267 f2r = f2r + f3r;
268 f2i = f2i + f3i;
269
270 f0r = t0r + f2r;
271 f0i = t0i + f2i;
272 f2r = t0r - f2r;
273 f2i = t0i - f2i;
274
275 f3r = f1r - t1i;
276 f3i = f1i + t1r;
277 f1r = f1r + t1i;
278 f1i = f1i - t1r;
279
280 /* store result */
281 ioptr[0] = f0r;
282 ioptr[1] = f0i;
283 ioptr[2] = f1r;
284 ioptr[3] = f1i;
285 ioptr[4] = f2r;
286 ioptr[5] = f2i;
287 ioptr[6] = f3r;
288 ioptr[7] = f3i;
289 }
290
291 //inline void fft8pt(float *ioptr);
fft8pt(float * ioptr)292 static inline void fft8pt(float *ioptr){
293 /*** RADIX 8 fft ***/
294 float w0r = 1.0F/MYROOT2; /* cos(pi/4) */
295 float f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i;
296 float f4r, f4i, f5r, f5i, f6r, f6i, f7r, f7i;
297 float t0r, t0i, t1r, t1i;
298 const float Two = 2.0;
299
300 /* bit reversed load */
301 f0r = ioptr[0];
302 f0i = ioptr[1];
303 f1r = ioptr[8];
304 f1i = ioptr[9];
305 f2r = ioptr[4];
306 f2i = ioptr[5];
307 f3r = ioptr[12];
308 f3i = ioptr[13];
309 f4r = ioptr[2];
310 f4i = ioptr[3];
311 f5r = ioptr[10];
312 f5i = ioptr[11];
313 f6r = ioptr[6];
314 f6i = ioptr[7];
315 f7r = ioptr[14];
316 f7i = ioptr[15];
317 /* Butterflys */
318 /*
319 f0 - - t0 - - f0 - - f0
320 f1 - 1 - f1 - - f1 - - f1
321 f2 - - f2 - 1 - f2 - - f2
322 f3 - 1 - t1 - -i - f3 - - f3
323 f4 - - t0 - - f4 - 1 - t0
324 f5 - 1 - f5 - - f5 - w3 - f4
325 f6 - - f6 - 1 - f6 - -i - t1
326 f7 - 1 - t1 - -i - f7 - iw3- f6
327 */
328
329 t0r = f0r + f1r;
330 t0i = f0i + f1i;
331 f1r = f0r - f1r;
332 f1i = f0i - f1i;
333
334 t1r = f2r - f3r;
335 t1i = f2i - f3i;
336 f2r = f2r + f3r;
337 f2i = f2i + f3i;
338
339 f0r = t0r + f2r;
340 f0i = t0i + f2i;
341 f2r = t0r - f2r;
342 f2i = t0i - f2i;
343
344 f3r = f1r - t1i;
345 f3i = f1i + t1r;
346 f1r = f1r + t1i;
347 f1i = f1i - t1r;
348
349
350 t0r = f4r + f5r;
351 t0i = f4i + f5i;
352 f5r = f4r - f5r;
353 f5i = f4i - f5i;
354
355 t1r = f6r - f7r;
356 t1i = f6i - f7i;
357 f6r = f6r + f7r;
358 f6i = f6i + f7i;
359
360 f4r = t0r + f6r;
361 f4i = t0i + f6i;
362 f6r = t0r - f6r;
363 f6i = t0i - f6i;
364
365 f7r = f5r - t1i;
366 f7i = f5i + t1r;
367 f5r = f5r + t1i;
368 f5i = f5i - t1r;
369
370
371 t0r = f0r - f4r;
372 t0i = f0i - f4i;
373 f0r = f0r + f4r;
374 f0i = f0i + f4i;
375
376 t1r = f2r - f6i;
377 t1i = f2i + f6r;
378 f2r = f2r + f6i;
379 f2i = f2i - f6r;
380
381 f4r = f1r - f5r * w0r - f5i * w0r;
382 f4i = f1i + f5r * w0r - f5i * w0r;
383 f1r = f1r * Two - f4r;
384 f1i = f1i * Two - f4i;
385
386 f6r = f3r + f7r * w0r - f7i * w0r;
387 f6i = f3i + f7r * w0r + f7i * w0r;
388 f3r = f3r * Two - f6r;
389 f3i = f3i * Two - f6i;
390
391 /* store result */
392 ioptr[0] = f0r;
393 ioptr[1] = f0i;
394 ioptr[2] = f1r;
395 ioptr[3] = f1i;
396 ioptr[4] = f2r;
397 ioptr[5] = f2i;
398 ioptr[6] = f3r;
399 ioptr[7] = f3i;
400 ioptr[8] = t0r;
401 ioptr[9] = t0i;
402 ioptr[10] = f4r;
403 ioptr[11] = f4i;
404 ioptr[12] = t1r;
405 ioptr[13] = t1i;
406 ioptr[14] = f6r;
407 ioptr[15] = f6i;
408 }
409
410 //inline void bfR2(float *ioptr, long M, long NDiffU);
bfR2(float * ioptr,long M,long NDiffU)411 static inline void bfR2(float *ioptr, long M, long NDiffU){
412 /*** 2nd radix 2 stage ***/
413 unsigned long pos;
414 unsigned long posi;
415 unsigned long pinc;
416 unsigned long pnext;
417 unsigned long NSameU;
418 unsigned long SameUCnt;
419
420 float *pstrt;
421 float *p0r, *p1r, *p2r, *p3r;
422
423 float f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i;
424 float f4r, f4i, f5r, f5i, f6r, f6i, f7r, f7i;
425 /* UNUSED: const float Two = 2.0; */
426
427 pinc = NDiffU * 2; // 2 floats per complex
428 pnext = pinc * 4;
429 pos = 2;
430 posi = pos+1;
431 NSameU = POW2(M) / 4 /NDiffU; // 4 Us at a time
432 pstrt = ioptr;
433 p0r = pstrt;
434 p1r = pstrt+pinc;
435 p2r = p1r+pinc;
436 p3r = p2r+pinc;
437
438 /* Butterflys */
439 /*
440 f0 - - f4
441 f1 - 1 - f5
442 f2 - - f6
443 f3 - 1 - f7
444 */
445 /* Butterflys */
446 /*
447 f0 - - f4
448 f1 - 1 - f5
449 f2 - - f6
450 f3 - 1 - f7
451 */
452
453 for (SameUCnt = NSameU; SameUCnt > 0 ; SameUCnt--){
454
455 f0r = *p0r;
456 f1r = *p1r;
457 f0i = *(p0r + 1);
458 f1i = *(p1r + 1);
459 f2r = *p2r;
460 f3r = *p3r;
461 f2i = *(p2r + 1);
462 f3i = *(p3r + 1);
463
464 f4r = f0r + f1r;
465 f4i = f0i + f1i;
466 f5r = f0r - f1r;
467 f5i = f0i - f1i;
468
469 f6r = f2r + f3r;
470 f6i = f2i + f3i;
471 f7r = f2r - f3r;
472 f7i = f2i - f3i;
473
474 *p0r = f4r;
475 *(p0r + 1) = f4i;
476 *p1r = f5r;
477 *(p1r + 1) = f5i;
478 *p2r = f6r;
479 *(p2r + 1) = f6i;
480 *p3r = f7r;
481 *(p3r + 1) = f7i;
482
483 f0r = *(p0r + pos);
484 f1i = *(p1r + posi);
485 f0i = *(p0r + posi);
486 f1r = *(p1r + pos);
487 f2r = *(p2r + pos);
488 f3i = *(p3r + posi);
489 f2i = *(p2r + posi);
490 f3r = *(p3r + pos);
491
492 f4r = f0r + f1i;
493 f4i = f0i - f1r;
494 f5r = f0r - f1i;
495 f5i = f0i + f1r;
496
497 f6r = f2r + f3i;
498 f6i = f2i - f3r;
499 f7r = f2r - f3i;
500 f7i = f2i + f3r;
501
502 *(p0r + pos) = f4r;
503 *(p0r + posi) = f4i;
504 *(p1r + pos) = f5r;
505 *(p1r + posi) = f5i;
506 *(p2r + pos) = f6r;
507 *(p2r + posi) = f6i;
508 *(p3r + pos) = f7r;
509 *(p3r + posi) = f7i;
510
511 p0r += pnext;
512 p1r += pnext;
513 p2r += pnext;
514 p3r += pnext;
515
516 }
517 }
518
519 //inline void bfR4(float *ioptr, long M, long NDiffU);
bfR4(float * ioptr,long M,long NDiffU)520 static inline void bfR4(float *ioptr, long M, long NDiffU){
521 /*** 1 radix 4 stage ***/
522 unsigned long pos;
523 unsigned long posi;
524 unsigned long pinc;
525 unsigned long pnext;
526 unsigned long pnexti;
527 unsigned long NSameU;
528 unsigned long SameUCnt;
529
530 float *pstrt;
531 float *p0r, *p1r, *p2r, *p3r;
532
533 float w1r = 1.0F/MYROOT2; /* cos(pi/4) */
534 float f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i;
535 float f4r, f4i, f5r, f5i, f6r, f6i, f7r, f7i;
536 float t1r, t1i;
537 const float Two = 2.0;
538
539 pinc = NDiffU * 2; // 2 floats per complex
540 pnext = pinc * 4;
541 pnexti = pnext + 1;
542 pos = 2;
543 posi = pos+1;
544 NSameU = POW2(M) / 4 /NDiffU; // 4 pts per butterfly
545 pstrt = ioptr;
546 p0r = pstrt;
547 p1r = pstrt+pinc;
548 p2r = p1r+pinc;
549 p3r = p2r+pinc;
550
551 /* Butterflys */
552 /*
553 f0 - - f0 - - f4
554 f1 - 1 - f5 - - f5
555 f2 - - f6 - 1 - f6
556 f3 - 1 - f3 - -i - f7
557 */
558 /* Butterflys */
559 /*
560 f0 - - f4 - - f4
561 f1 - -i - t1 - - f5
562 f2 - - f2 - w1 - f6
563 f3 - -i - f7 - iw1- f7
564 */
565
566 f0r = *p0r;
567 f1r = *p1r;
568 f2r = *p2r;
569 f3r = *p3r;
570 f0i = *(p0r + 1);
571 f1i = *(p1r + 1);
572 f2i = *(p2r + 1);
573 f3i = *(p3r + 1);
574
575 f5r = f0r - f1r;
576 f5i = f0i - f1i;
577 f0r = f0r + f1r;
578 f0i = f0i + f1i;
579
580 f6r = f2r + f3r;
581 f6i = f2i + f3i;
582 f3r = f2r - f3r;
583 f3i = f2i - f3i;
584
585 for (SameUCnt = NSameU-1; SameUCnt > 0 ; SameUCnt--){
586
587 f7r = f5r - f3i;
588 f7i = f5i + f3r;
589 f5r = f5r + f3i;
590 f5i = f5i - f3r;
591
592 f4r = f0r + f6r;
593 f4i = f0i + f6i;
594 f6r = f0r - f6r;
595 f6i = f0i - f6i;
596
597 f2r = *(p2r + pos);
598 f2i = *(p2r + posi);
599 f1r = *(p1r + pos);
600 f1i = *(p1r + posi);
601 f3i = *(p3r + posi);
602 f0r = *(p0r + pos);
603 f3r = *(p3r + pos);
604 f0i = *(p0r + posi);
605
606 *p3r = f7r;
607 *p0r = f4r;
608 *(p3r + 1) = f7i;
609 *(p0r + 1) = f4i;
610 *p1r = f5r;
611 *p2r = f6r;
612 *(p1r + 1) = f5i;
613 *(p2r + 1) = f6i;
614
615 f7r = f2r - f3i;
616 f7i = f2i + f3r;
617 f2r = f2r + f3i;
618 f2i = f2i - f3r;
619
620 f4r = f0r + f1i;
621 f4i = f0i - f1r;
622 t1r = f0r - f1i;
623 t1i = f0i + f1r;
624
625 f5r = t1r - f7r * w1r + f7i * w1r;
626 f5i = t1i - f7r * w1r - f7i * w1r;
627 f7r = t1r * Two - f5r;
628 f7i = t1i * Two - f5i;
629
630 f6r = f4r - f2r * w1r - f2i * w1r;
631 f6i = f4i + f2r * w1r - f2i * w1r;
632 f4r = f4r * Two - f6r;
633 f4i = f4i * Two - f6i;
634
635 f3r = *(p3r + pnext);
636 f0r = *(p0r + pnext);
637 f3i = *(p3r + pnexti);
638 f0i = *(p0r + pnexti);
639 f2r = *(p2r + pnext);
640 f2i = *(p2r + pnexti);
641 f1r = *(p1r + pnext);
642 f1i = *(p1r + pnexti);
643
644 *(p2r + pos) = f6r;
645 *(p1r + pos) = f5r;
646 *(p2r + posi) = f6i;
647 *(p1r + posi) = f5i;
648 *(p3r + pos) = f7r;
649 *(p0r + pos) = f4r;
650 *(p3r + posi) = f7i;
651 *(p0r + posi) = f4i;
652
653 f6r = f2r + f3r;
654 f6i = f2i + f3i;
655 f3r = f2r - f3r;
656 f3i = f2i - f3i;
657
658 f5r = f0r - f1r;
659 f5i = f0i - f1i;
660 f0r = f0r + f1r;
661 f0i = f0i + f1i;
662
663 p3r += pnext;
664 p0r += pnext;
665 p1r += pnext;
666 p2r += pnext;
667
668 }
669 f7r = f5r - f3i;
670 f7i = f5i + f3r;
671 f5r = f5r + f3i;
672 f5i = f5i - f3r;
673
674 f4r = f0r + f6r;
675 f4i = f0i + f6i;
676 f6r = f0r - f6r;
677 f6i = f0i - f6i;
678
679 f2r = *(p2r + pos);
680 f2i = *(p2r + posi);
681 f1r = *(p1r + pos);
682 f1i = *(p1r + posi);
683 f3i = *(p3r + posi);
684 f0r = *(p0r + pos);
685 f3r = *(p3r + pos);
686 f0i = *(p0r + posi);
687
688 *p3r = f7r;
689 *p0r = f4r;
690 *(p3r + 1) = f7i;
691 *(p0r + 1) = f4i;
692 *p1r = f5r;
693 *p2r = f6r;
694 *(p1r + 1) = f5i;
695 *(p2r + 1) = f6i;
696
697 f7r = f2r - f3i;
698 f7i = f2i + f3r;
699 f2r = f2r + f3i;
700 f2i = f2i - f3r;
701
702 f4r = f0r + f1i;
703 f4i = f0i - f1r;
704 t1r = f0r - f1i;
705 t1i = f0i + f1r;
706
707 f5r = t1r - f7r * w1r + f7i * w1r;
708 f5i = t1i - f7r * w1r - f7i * w1r;
709 f7r = t1r * Two - f5r;
710 f7i = t1i * Two - f5i;
711
712 f6r = f4r - f2r * w1r - f2i * w1r;
713 f6i = f4i + f2r * w1r - f2i * w1r;
714 f4r = f4r * Two - f6r;
715 f4i = f4i * Two - f6i;
716
717 *(p2r + pos) = f6r;
718 *(p1r + pos) = f5r;
719 *(p2r + posi) = f6i;
720 *(p1r + posi) = f5i;
721 *(p3r + pos) = f7r;
722 *(p0r + pos) = f4r;
723 *(p3r + posi) = f7i;
724 *(p0r + posi) = f4i;
725
726 }
727
728 // inline void bfstages(float *ioptr, long M, float *Utbl, long Ustride, long NDiffU, long StageCnt);
bfstages(float * ioptr,long M,float * Utbl,long Ustride,long NDiffU,long StageCnt)729 static inline void bfstages(float *ioptr, long M, float *Utbl, long Ustride, long NDiffU, long StageCnt){
730 /*** RADIX 8 Stages ***/
731 unsigned long pos;
732 unsigned long posi;
733 unsigned long pinc;
734 unsigned long pnext;
735 unsigned long NSameU;
736 long Uinc;
737 long Uinc2;
738 long Uinc4;
739 unsigned long DiffUCnt;
740 unsigned long SameUCnt;
741 unsigned long U2toU3;
742
743 float *pstrt;
744 float *p0r, *p1r, *p2r, *p3r;
745 float *u0r, *u0i, *u1r, *u1i, *u2r, *u2i;
746
747 float w0r, w0i, w1r, w1i, w2r, w2i, w3r, w3i;
748 float f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i;
749 float f4r, f4i, f5r, f5i, f6r, f6i, f7r, f7i;
750 float t0r, t0i, t1r, t1i;
751 const float Two = 2.0;
752
753 pinc = NDiffU * 2; // 2 floats per complex
754 pnext = pinc * 8;
755 pos = pinc * 4;
756 posi = pos + 1;
757 NSameU = POW2(M) / 8 /NDiffU; // 8 pts per butterfly
758 Uinc = NSameU * Ustride;
759 Uinc2 = Uinc * 2;
760 Uinc4 = Uinc * 4;
761 U2toU3 = (POW2(M) / 8)*Ustride;
762 for (; StageCnt > 0 ; StageCnt--){
763
764 u0r = &Utbl[0];
765 u0i = &Utbl[POW2(M-2)*Ustride];
766 u1r = u0r;
767 u1i = u0i;
768 u2r = u0r;
769 u2i = u0i;
770
771 w0r = *u0r;
772 w0i = *u0i;
773 w1r = *u1r;
774 w1i = *u1i;
775 w2r = *u2r;
776 w2i = *u2i;
777 w3r = *(u2r+U2toU3);
778 w3i = *(u2i-U2toU3);
779
780 pstrt = ioptr;
781
782 p0r = pstrt;
783 p1r = pstrt+pinc;
784 p2r = p1r+pinc;
785 p3r = p2r+pinc;
786
787 /* Butterflys */
788 /*
789 f0 - - t0 - - f0 - - f0
790 f1 - w0- f1 - - f1 - - f1
791 f2 - - f2 - w1- f2 - - f4
792 f3 - w0- t1 - iw1- f3 - - f5
793
794 f4 - - t0 - - f4 - w2- t0
795 f5 - w0- f5 - - f5 - w3- t1
796 f6 - - f6 - w1- f6 - iw2- f6
797 f7 - w0- t1 - iw1- f7 - iw3- f7
798 */
799
800 for (DiffUCnt = NDiffU; DiffUCnt > 0 ; DiffUCnt--){
801 f0r = *p0r;
802 f0i = *(p0r + 1);
803 f1r = *p1r;
804 f1i = *(p1r + 1);
805 for (SameUCnt = NSameU-1; SameUCnt > 0 ; SameUCnt--){
806 f2r = *p2r;
807 f2i = *(p2r + 1);
808 f3r = *p3r;
809 f3i = *(p3r + 1);
810
811 t0r = f0r + f1r * w0r + f1i * w0i;
812 t0i = f0i - f1r * w0i + f1i * w0r;
813 f1r = f0r * Two - t0r;
814 f1i = f0i * Two - t0i;
815
816 f4r = *(p0r + pos);
817 f4i = *(p0r + posi);
818 f5r = *(p1r + pos);
819 f5i = *(p1r + posi);
820
821 f6r = *(p2r + pos);
822 f6i = *(p2r + posi);
823 f7r = *(p3r + pos);
824 f7i = *(p3r + posi);
825
826 t1r = f2r - f3r * w0r - f3i * w0i;
827 t1i = f2i + f3r * w0i - f3i * w0r;
828 f2r = f2r * Two - t1r;
829 f2i = f2i * Two - t1i;
830
831 f0r = t0r + f2r * w1r + f2i * w1i;
832 f0i = t0i - f2r * w1i + f2i * w1r;
833 f2r = t0r * Two - f0r;
834 f2i = t0i * Two - f0i;
835
836 f3r = f1r + t1r * w1i - t1i * w1r;
837 f3i = f1i + t1r * w1r + t1i * w1i;
838 f1r = f1r * Two - f3r;
839 f1i = f1i * Two - f3i;
840
841
842 t0r = f4r + f5r * w0r + f5i * w0i;
843 t0i = f4i - f5r * w0i + f5i * w0r;
844 f5r = f4r * Two - t0r;
845 f5i = f4i * Two - t0i;
846
847 t1r = f6r - f7r * w0r - f7i * w0i;
848 t1i = f6i + f7r * w0i - f7i * w0r;
849 f6r = f6r * Two - t1r;
850 f6i = f6i * Two - t1i;
851
852 f4r = t0r + f6r * w1r + f6i * w1i;
853 f4i = t0i - f6r * w1i + f6i * w1r;
854 f6r = t0r * Two - f4r;
855 f6i = t0i * Two - f4i;
856
857 f7r = f5r + t1r * w1i - t1i * w1r;
858 f7i = f5i + t1r * w1r + t1i * w1i;
859 f5r = f5r * Two - f7r;
860 f5i = f5i * Two - f7i;
861
862 t0r = f0r - f4r * w2r - f4i * w2i;
863 t0i = f0i + f4r * w2i - f4i * w2r;
864 f0r = f0r * Two - t0r;
865 f0i = f0i * Two - t0i;
866
867 t1r = f1r - f5r * w3r - f5i * w3i;
868 t1i = f1i + f5r * w3i - f5i * w3r;
869 f1r = f1r * Two - t1r;
870 f1i = f1i * Two - t1i;
871
872 *(p0r + pos) = t0r;
873 *(p1r + pos) = t1r;
874 *(p0r + posi) = t0i;
875 *(p1r + posi) = t1i;
876 *p0r = f0r;
877 *p1r = f1r;
878 *(p0r + 1) = f0i;
879 *(p1r + 1) = f1i;
880
881 p0r += pnext;
882 f0r = *p0r;
883 f0i = *(p0r + 1);
884
885 p1r += pnext;
886
887 f1r = *p1r;
888 f1i = *(p1r + 1);
889
890 f4r = f2r - f6r * w2i + f6i * w2r;
891 f4i = f2i - f6r * w2r - f6i * w2i;
892 f6r = f2r * Two - f4r;
893 f6i = f2i * Two - f4i;
894
895 f5r = f3r - f7r * w3i + f7i * w3r;
896 f5i = f3i - f7r * w3r - f7i * w3i;
897 f7r = f3r * Two - f5r;
898 f7i = f3i * Two - f5i;
899
900 *p2r = f4r;
901 *p3r = f5r;
902 *(p2r + 1) = f4i;
903 *(p3r + 1) = f5i;
904 *(p2r + pos) = f6r;
905 *(p3r + pos) = f7r;
906 *(p2r + posi) = f6i;
907 *(p3r + posi) = f7i;
908
909 p2r += pnext;
910 p3r += pnext;
911
912 }
913
914 f2r = *p2r;
915 f2i = *(p2r + 1);
916 f3r = *p3r;
917 f3i = *(p3r + 1);
918
919 t0r = f0r + f1r * w0r + f1i * w0i;
920 t0i = f0i - f1r * w0i + f1i * w0r;
921 f1r = f0r * Two - t0r;
922 f1i = f0i * Two - t0i;
923
924 f4r = *(p0r + pos);
925 f4i = *(p0r + posi);
926 f5r = *(p1r + pos);
927 f5i = *(p1r + posi);
928
929 f6r = *(p2r + pos);
930 f6i = *(p2r + posi);
931 f7r = *(p3r + pos);
932 f7i = *(p3r + posi);
933
934 t1r = f2r - f3r * w0r - f3i * w0i;
935 t1i = f2i + f3r * w0i - f3i * w0r;
936 f2r = f2r * Two - t1r;
937 f2i = f2i * Two - t1i;
938
939 f0r = t0r + f2r * w1r + f2i * w1i;
940 f0i = t0i - f2r * w1i + f2i * w1r;
941 f2r = t0r * Two - f0r;
942 f2i = t0i * Two - f0i;
943
944 f3r = f1r + t1r * w1i - t1i * w1r;
945 f3i = f1i + t1r * w1r + t1i * w1i;
946 f1r = f1r * Two - f3r;
947 f1i = f1i * Two - f3i;
948
949 if (DiffUCnt == NDiffU/2)
950 Uinc4 = -Uinc4;
951
952 u0r += Uinc4;
953 u0i -= Uinc4;
954 u1r += Uinc2;
955 u1i -= Uinc2;
956 u2r += Uinc;
957 u2i -= Uinc;
958
959 pstrt += 2;
960
961 t0r = f4r + f5r * w0r + f5i * w0i;
962 t0i = f4i - f5r * w0i + f5i * w0r;
963 f5r = f4r * Two - t0r;
964 f5i = f4i * Two - t0i;
965
966 t1r = f6r - f7r * w0r - f7i * w0i;
967 t1i = f6i + f7r * w0i - f7i * w0r;
968 f6r = f6r * Two - t1r;
969 f6i = f6i * Two - t1i;
970
971 f4r = t0r + f6r * w1r + f6i * w1i;
972 f4i = t0i - f6r * w1i + f6i * w1r;
973 f6r = t0r * Two - f4r;
974 f6i = t0i * Two - f4i;
975
976 f7r = f5r + t1r * w1i - t1i * w1r;
977 f7i = f5i + t1r * w1r + t1i * w1i;
978 f5r = f5r * Two - f7r;
979 f5i = f5i * Two - f7i;
980
981 w0r = *u0r;
982 w0i = *u0i;
983 w1r = *u1r;
984 w1i = *u1i;
985
986 if ((long) DiffUCnt <= NDiffU/2)
987 w0r = -w0r;
988
989 t0r = f0r - f4r * w2r - f4i * w2i;
990 t0i = f0i + f4r * w2i - f4i * w2r;
991 f0r = f0r * Two - t0r;
992 f0i = f0i * Two - t0i;
993
994 f4r = f2r - f6r * w2i + f6i * w2r;
995 f4i = f2i - f6r * w2r - f6i * w2i;
996 f6r = f2r * Two - f4r;
997 f6i = f2i * Two - f4i;
998
999 *(p0r + pos) = t0r;
1000 *p2r = f4r;
1001 *(p0r + posi) = t0i;
1002 *(p2r + 1) = f4i;
1003 w2r = *u2r;
1004 w2i = *u2i;
1005 *p0r = f0r;
1006 *(p2r + pos) = f6r;
1007 *(p0r + 1) = f0i;
1008 *(p2r + posi) = f6i;
1009
1010 p0r = pstrt;
1011 p2r = pstrt + pinc + pinc;
1012
1013 t1r = f1r - f5r * w3r - f5i * w3i;
1014 t1i = f1i + f5r * w3i - f5i * w3r;
1015 f1r = f1r * Two - t1r;
1016 f1i = f1i * Two - t1i;
1017
1018 f5r = f3r - f7r * w3i + f7i * w3r;
1019 f5i = f3i - f7r * w3r - f7i * w3i;
1020 f7r = f3r * Two - f5r;
1021 f7i = f3i * Two - f5i;
1022
1023 *(p1r + pos) = t1r;
1024 *p3r = f5r;
1025 *(p1r + posi) = t1i;
1026 *(p3r + 1) = f5i;
1027 w3r = *(u2r+U2toU3);
1028 w3i = *(u2i-U2toU3);
1029 *p1r = f1r;
1030 *(p3r + pos) = f7r;
1031 *(p1r + 1) = f1i;
1032 *(p3r + posi) = f7i;
1033
1034 p1r = pstrt + pinc;
1035 p3r = p2r + pinc;
1036 }
1037 NSameU /= 8;
1038 Uinc /= 8;
1039 Uinc2 /= 8;
1040 Uinc4 = Uinc * 4;
1041 NDiffU *= 8;
1042 pinc *= 8;
1043 pnext *= 8;
1044 pos *= 8;
1045 posi = pos + 1;
1046 }
1047 }
1048
1049 void fftrecurs(float *ioptr, long M, float *Utbl, long Ustride, long NDiffU, long StageCnt);
fftrecurs(float * ioptr,long M,float * Utbl,long Ustride,long NDiffU,long StageCnt)1050 void fftrecurs(float *ioptr, long M, float *Utbl, long Ustride, long NDiffU, long StageCnt){
1051 /* recursive bfstages calls to maximize on chip cache efficiency */
1052 long i1;
1053 if (M <= MCACHE) // fits on chip ?
1054 bfstages(ioptr, M, Utbl, Ustride, NDiffU, StageCnt); /* RADIX 8 Stages */
1055 else{
1056 for (i1=0; i1<8; i1++){
1057 fftrecurs(&ioptr[i1*POW2(M-3)*2], M-3, Utbl, 8*Ustride, NDiffU, StageCnt-1); /* RADIX 8 Stages */
1058 }
1059 bfstages(ioptr, M, Utbl, Ustride, POW2(M-3), 1); /* RADIX 8 Stage */
1060 }
1061 }
1062
ffts1(float * ioptr,long M,long Rows,float * Utbl,short * BRLow)1063 void ffts1(float *ioptr, long M, long Rows, float *Utbl, short *BRLow){
1064 /* Compute in-place complex fft on the rows of the input array */
1065 /* INPUTS */
1066 /* *ioptr = input data array */
1067 /* M = log2 of fft size (ex M=10 for 1024 point fft) */
1068 /* Rows = number of rows in ioptr array (use Rows of 1 if ioptr is a 1 dimensional array) */
1069 /* *Utbl = cosine table */
1070 /* *BRLow = bit reversed counter table */
1071 /* OUTPUTS */
1072 /* *ioptr = output data array */
1073
1074 long StageCnt;
1075 long NDiffU;
1076
1077 switch (M){
1078 case 0:
1079 break;
1080 case 1:
1081 for (;Rows>0;Rows--){
1082 fft2pt(ioptr); /* a 2 pt fft */
1083 ioptr += 2*POW2(M);
1084 }
1085 break;
1086 case 2:
1087 for (;Rows>0;Rows--){
1088 fft4pt(ioptr); /* a 4 pt fft */
1089 ioptr += 2*POW2(M);
1090 }
1091 break;
1092 case 3:
1093 for (;Rows>0;Rows--){
1094 fft8pt(ioptr); /* an 8 pt fft */
1095 ioptr += 2*POW2(M);
1096 }
1097 break;
1098 default:
1099 for (;Rows>0;Rows--){
1100
1101 bitrevR2(ioptr, M, BRLow); /* bit reverse and first radix 2 stage */
1102
1103 StageCnt = (M-1) / 3; // number of radix 8 stages
1104 NDiffU = 2; // one radix 2 stage already complete
1105
1106 if ( (M-1-(StageCnt * 3)) == 1 ){
1107 bfR2(ioptr, M, NDiffU); /* 1 radix 2 stage */
1108 NDiffU *= 2;
1109 }
1110
1111 if ( (M-1-(StageCnt * 3)) == 2 ){
1112 bfR4(ioptr, M, NDiffU); /* 1 radix 4 stage */
1113 NDiffU *= 4;
1114 }
1115
1116 if (M <= MCACHE)
1117 bfstages(ioptr, M, Utbl, 1, NDiffU, StageCnt); /* RADIX 8 Stages */
1118
1119 else{
1120 fftrecurs(ioptr, M, Utbl, 1, NDiffU, StageCnt); /* RADIX 8 Stages */
1121 }
1122
1123 ioptr += 2*POW2(M);
1124 }
1125 }
1126 }
1127
1128 /************************************************
1129 parts of iffts1
1130 *************************************************/
1131
1132 // inline void scbitrevR2(float *ioptr, long M, short *BRLow, float scale);
scbitrevR2(float * ioptr,long M,short * BRLow,float scale)1133 static inline void scbitrevR2(float *ioptr, long M, short *BRLow, float scale){
1134 /*** scaled bit reverse and first radix 2 stage forward or inverse fft ***/
1135 float f0r;
1136 float f0i;
1137 float f1r;
1138 float f1i;
1139 float f2r;
1140 float f2i;
1141 float f3r;
1142 float f3i;
1143 float f4r;
1144 float f4i;
1145 float f5r;
1146 float f5i;
1147 float f6r;
1148 float f6i;
1149 float f7r;
1150 float f7i;
1151 float t0r;
1152 float t0i;
1153 float t1r;
1154 float t1i;
1155 float *p0r;
1156 float *p1r;
1157 float *IOP;
1158 float *iolimit;
1159 long Colstart;
1160 long iCol;
1161 unsigned long posA;
1162 unsigned long posAi;
1163 unsigned long posB;
1164 unsigned long posBi;
1165
1166 const unsigned long Nrems2 = POW2((M+3)/2);
1167 const unsigned long Nroot_1_ColInc = POW2(M)-Nrems2;
1168 const unsigned long Nroot_1 = POW2(M/2-1)-1;
1169 const unsigned long ColstartShift = (M+1)/2 +1;
1170 posA = POW2(M); // 1/2 of POW2(M) complexes
1171 posAi = posA + 1;
1172 posB = posA + 2;
1173 posBi = posB + 1;
1174
1175 iolimit = ioptr + Nrems2;
1176 for (; ioptr < iolimit; ioptr += (int64_t) POW2(M/2+1)){
1177 for (Colstart = Nroot_1; Colstart >= 0; Colstart--){
1178 iCol = Nroot_1;
1179 p0r = ioptr+ Nroot_1_ColInc + BRLow[Colstart]*4;
1180 IOP = ioptr + (Colstart << ColstartShift);
1181 p1r = IOP + BRLow[iCol]*4;
1182 f0r = *(p0r);
1183 f0i = *(p0r+1);
1184 f1r = *(p0r+posA);
1185 f1i = *(p0r+posAi);
1186 for (; iCol > Colstart;){
1187 f2r = *(p0r+2);
1188 f2i = *(p0r+(2+1));
1189 f3r = *(p0r+posB);
1190 f3i = *(p0r+posBi);
1191 f4r = *(p1r);
1192 f4i = *(p1r+1);
1193 f5r = *(p1r+posA);
1194 f5i = *(p1r+posAi);
1195 f6r = *(p1r+2);
1196 f6i = *(p1r+(2+1));
1197 f7r = *(p1r+posB);
1198 f7i = *(p1r+posBi);
1199
1200 t0r = f0r + f1r;
1201 t0i = f0i + f1i;
1202 f1r = f0r - f1r;
1203 f1i = f0i - f1i;
1204 t1r = f2r + f3r;
1205 t1i = f2i + f3i;
1206 f3r = f2r - f3r;
1207 f3i = f2i - f3i;
1208 f0r = f4r + f5r;
1209 f0i = f4i + f5i;
1210 f5r = f4r - f5r;
1211 f5i = f4i - f5i;
1212 f2r = f6r + f7r;
1213 f2i = f6i + f7i;
1214 f7r = f6r - f7r;
1215 f7i = f6i - f7i;
1216
1217 *(p1r) = scale*t0r;
1218 *(p1r+1) = scale*t0i;
1219 *(p1r+2) = scale*f1r;
1220 *(p1r+(2+1)) = scale*f1i;
1221 *(p1r+posA) = scale*t1r;
1222 *(p1r+posAi) = scale*t1i;
1223 *(p1r+posB) = scale*f3r;
1224 *(p1r+posBi) = scale*f3i;
1225 *(p0r) = scale*f0r;
1226 *(p0r+1) = scale*f0i;
1227 *(p0r+2) = scale*f5r;
1228 *(p0r+(2+1)) = scale*f5i;
1229 *(p0r+posA) = scale*f2r;
1230 *(p0r+posAi) = scale*f2i;
1231 *(p0r+posB) = scale*f7r;
1232 *(p0r+posBi) = scale*f7i;
1233
1234 p0r -= Nrems2;
1235 f0r = *(p0r);
1236 f0i = *(p0r+1);
1237 f1r = *(p0r+posA);
1238 f1i = *(p0r+posAi);
1239 iCol -= 1;
1240 p1r = IOP + BRLow[iCol]*4;
1241 };
1242 f2r = *(p0r+2);
1243 f2i = *(p0r+(2+1));
1244 f3r = *(p0r+posB);
1245 f3i = *(p0r+posBi);
1246
1247 t0r = f0r + f1r;
1248 t0i = f0i + f1i;
1249 f1r = f0r - f1r;
1250 f1i = f0i - f1i;
1251 t1r = f2r + f3r;
1252 t1i = f2i + f3i;
1253 f3r = f2r - f3r;
1254 f3i = f2i - f3i;
1255
1256 *(p0r) = scale*t0r;
1257 *(p0r+1) = scale*t0i;
1258 *(p0r+2) = scale*f1r;
1259 *(p0r+(2+1)) = scale*f1i;
1260 *(p0r+posA) = scale*t1r;
1261 *(p0r+posAi) = scale*t1i;
1262 *(p0r+posB) = scale*f3r;
1263 *(p0r+posBi) = scale*f3i;
1264
1265 };
1266 };
1267 }
1268
1269 //inline void ifft2pt(float *ioptr, float scale);
ifft2pt(float * ioptr,float scale)1270 static inline void ifft2pt(float *ioptr, float scale){
1271 /*** RADIX 2 ifft ***/
1272 float f0r, f0i, f1r, f1i;
1273 float t0r, t0i;
1274
1275 /* bit reversed load */
1276 f0r = ioptr[0];
1277 f0i = ioptr[1];
1278 f1r = ioptr[2];
1279 f1i = ioptr[3];
1280
1281 /* Butterflys */
1282 /*
1283 f0 - - t0
1284 f1 - 1 - f1
1285 */
1286
1287 t0r = f0r + f1r;
1288 t0i = f0i + f1i;
1289 f1r = f0r - f1r;
1290 f1i = f0i - f1i;
1291
1292 /* store result */
1293 ioptr[0] = scale*t0r;
1294 ioptr[1] = scale*t0i;
1295 ioptr[2] = scale*f1r;
1296 ioptr[3] = scale*f1i;
1297 }
1298
1299 // inline void ifft4pt(float *ioptr, float scale);
ifft4pt(float * ioptr,float scale)1300 static inline void ifft4pt(float *ioptr, float scale){
1301 /*** RADIX 4 ifft ***/
1302 float f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i;
1303 float t0r, t0i, t1r, t1i;
1304
1305 /* bit reversed load */
1306 f0r = ioptr[0];
1307 f0i = ioptr[1];
1308 f1r = ioptr[4];
1309 f1i = ioptr[5];
1310 f2r = ioptr[2];
1311 f2i = ioptr[3];
1312 f3r = ioptr[6];
1313 f3i = ioptr[7];
1314
1315 /* Butterflys */
1316 /*
1317 f0 - - t0 - - f0
1318 f1 - 1 - f1 - - f1
1319 f2 - - f2 - 1 - f2
1320 f3 - 1 - t1 - i - f3
1321 */
1322
1323 t0r = f0r + f1r;
1324 t0i = f0i + f1i;
1325 f1r = f0r - f1r;
1326 f1i = f0i - f1i;
1327
1328 t1r = f2r - f3r;
1329 t1i = f2i - f3i;
1330 f2r = f2r + f3r;
1331 f2i = f2i + f3i;
1332
1333 f0r = t0r + f2r;
1334 f0i = t0i + f2i;
1335 f2r = t0r - f2r;
1336 f2i = t0i - f2i;
1337
1338 f3r = f1r + t1i;
1339 f3i = f1i - t1r;
1340 f1r = f1r - t1i;
1341 f1i = f1i + t1r;
1342
1343 /* store result */
1344 ioptr[0] = scale*f0r;
1345 ioptr[1] = scale*f0i;
1346 ioptr[2] = scale*f1r;
1347 ioptr[3] = scale*f1i;
1348 ioptr[4] = scale*f2r;
1349 ioptr[5] = scale*f2i;
1350 ioptr[6] = scale*f3r;
1351 ioptr[7] = scale*f3i;
1352 }
1353
1354 //inline void ifft8pt(float *ioptr, float scale);
ifft8pt(float * ioptr,float scale)1355 static inline void ifft8pt(float *ioptr, float scale){
1356 /*** RADIX 8 ifft ***/
1357 float w0r = 1.0F/MYROOT2; /* cos(pi/4) */
1358 float f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i;
1359 float f4r, f4i, f5r, f5i, f6r, f6i, f7r, f7i;
1360 float t0r, t0i, t1r, t1i;
1361 const float Two = 2.0;
1362
1363 /* bit reversed load */
1364 f0r = ioptr[0];
1365 f0i = ioptr[1];
1366 f1r = ioptr[8];
1367 f1i = ioptr[9];
1368 f2r = ioptr[4];
1369 f2i = ioptr[5];
1370 f3r = ioptr[12];
1371 f3i = ioptr[13];
1372 f4r = ioptr[2];
1373 f4i = ioptr[3];
1374 f5r = ioptr[10];
1375 f5i = ioptr[11];
1376 f6r = ioptr[6];
1377 f6i = ioptr[7];
1378 f7r = ioptr[14];
1379 f7i = ioptr[15];
1380
1381 /* Butterflys */
1382 /*
1383 f0 - - t0 - - f0 - - f0
1384 f1 - 1 - f1 - - f1 - - f1
1385 f2 - - f2 - 1 - f2 - - f2
1386 f3 - 1 - t1 - i - f3 - - f3
1387 f4 - - t0 - - f4 - 1 - t0
1388 f5 - 1 - f5 - - f5 - w3 - f4
1389 f6 - - f6 - 1 - f6 - i - t1
1390 f7 - 1 - t1 - i - f7 - iw3- f6
1391 */
1392
1393 t0r = f0r + f1r;
1394 t0i = f0i + f1i;
1395 f1r = f0r - f1r;
1396 f1i = f0i - f1i;
1397
1398 t1r = f2r - f3r;
1399 t1i = f2i - f3i;
1400 f2r = f2r + f3r;
1401 f2i = f2i + f3i;
1402
1403 f0r = t0r + f2r;
1404 f0i = t0i + f2i;
1405 f2r = t0r - f2r;
1406 f2i = t0i - f2i;
1407
1408 f3r = f1r + t1i;
1409 f3i = f1i - t1r;
1410 f1r = f1r - t1i;
1411 f1i = f1i + t1r;
1412
1413
1414 t0r = f4r + f5r;
1415 t0i = f4i + f5i;
1416 f5r = f4r - f5r;
1417 f5i = f4i - f5i;
1418
1419 t1r = f6r - f7r;
1420 t1i = f6i - f7i;
1421 f6r = f6r + f7r;
1422 f6i = f6i + f7i;
1423
1424 f4r = t0r + f6r;
1425 f4i = t0i + f6i;
1426 f6r = t0r - f6r;
1427 f6i = t0i - f6i;
1428
1429 f7r = f5r + t1i;
1430 f7i = f5i - t1r;
1431 f5r = f5r - t1i;
1432 f5i = f5i + t1r;
1433
1434
1435 t0r = f0r - f4r;
1436 t0i = f0i - f4i;
1437 f0r = f0r + f4r;
1438 f0i = f0i + f4i;
1439
1440 t1r = f2r + f6i;
1441 t1i = f2i - f6r;
1442 f2r = f2r - f6i;
1443 f2i = f2i + f6r;
1444
1445 f4r = f1r - f5r * w0r + f5i * w0r;
1446 f4i = f1i - f5r * w0r - f5i * w0r;
1447 f1r = f1r * Two - f4r;
1448 f1i = f1i * Two - f4i;
1449
1450 f6r = f3r + f7r * w0r + f7i * w0r;
1451 f6i = f3i - f7r * w0r + f7i * w0r;
1452 f3r = f3r * Two - f6r;
1453 f3i = f3i * Two - f6i;
1454
1455 /* store result */
1456 ioptr[0] = scale*f0r;
1457 ioptr[1] = scale*f0i;
1458 ioptr[2] = scale*f1r;
1459 ioptr[3] = scale*f1i;
1460 ioptr[4] = scale*f2r;
1461 ioptr[5] = scale*f2i;
1462 ioptr[6] = scale*f3r;
1463 ioptr[7] = scale*f3i;
1464 ioptr[8] = scale*t0r;
1465 ioptr[9] = scale*t0i;
1466 ioptr[10] = scale*f4r;
1467 ioptr[11] = scale*f4i;
1468 ioptr[12] = scale*t1r;
1469 ioptr[13] = scale*t1i;
1470 ioptr[14] = scale*f6r;
1471 ioptr[15] = scale*f6i;
1472 }
1473
1474 //inline void ibfR2(float *ioptr, long M, long NDiffU);
ibfR2(float * ioptr,long M,long NDiffU)1475 static inline void ibfR2(float *ioptr, long M, long NDiffU){
1476 /*** 2nd radix 2 stage ***/
1477 unsigned long pos;
1478 unsigned long posi;
1479 unsigned long pinc;
1480 unsigned long pnext;
1481 unsigned long NSameU;
1482 unsigned long SameUCnt;
1483
1484 float *pstrt;
1485 float *p0r, *p1r, *p2r, *p3r;
1486
1487 float f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i;
1488 float f4r, f4i, f5r, f5i, f6r, f6i, f7r, f7i;
1489 /* UNUSED: const float Two = 2.0; */
1490
1491 pinc = NDiffU * 2; // 2 floats per complex
1492 pnext = pinc * 4;
1493 pos = 2;
1494 posi = pos+1;
1495 NSameU = POW2(M) / 4 /NDiffU; // 4 Us at a time
1496 pstrt = ioptr;
1497 p0r = pstrt;
1498 p1r = pstrt+pinc;
1499 p2r = p1r+pinc;
1500 p3r = p2r+pinc;
1501
1502 /* Butterflys */
1503 /*
1504 f0 - - f4
1505 f1 - 1 - f5
1506 f2 - - f6
1507 f3 - 1 - f7
1508 */
1509 /* Butterflys */
1510 /*
1511 f0 - - f4
1512 f1 - 1 - f5
1513 f2 - - f6
1514 f3 - 1 - f7
1515 */
1516
1517 for (SameUCnt = NSameU; SameUCnt > 0 ; SameUCnt--){
1518
1519 f0r = *p0r;
1520 f1r = *p1r;
1521 f0i = *(p0r + 1);
1522 f1i = *(p1r + 1);
1523 f2r = *p2r;
1524 f3r = *p3r;
1525 f2i = *(p2r + 1);
1526 f3i = *(p3r + 1);
1527
1528 f4r = f0r + f1r;
1529 f4i = f0i + f1i;
1530 f5r = f0r - f1r;
1531 f5i = f0i - f1i;
1532
1533 f6r = f2r + f3r;
1534 f6i = f2i + f3i;
1535 f7r = f2r - f3r;
1536 f7i = f2i - f3i;
1537
1538 *p0r = f4r;
1539 *(p0r + 1) = f4i;
1540 *p1r = f5r;
1541 *(p1r + 1) = f5i;
1542 *p2r = f6r;
1543 *(p2r + 1) = f6i;
1544 *p3r = f7r;
1545 *(p3r + 1) = f7i;
1546
1547 f0r = *(p0r + pos);
1548 f1i = *(p1r + posi);
1549 f0i = *(p0r + posi);
1550 f1r = *(p1r + pos);
1551 f2r = *(p2r + pos);
1552 f3i = *(p3r + posi);
1553 f2i = *(p2r + posi);
1554 f3r = *(p3r + pos);
1555
1556 f4r = f0r - f1i;
1557 f4i = f0i + f1r;
1558 f5r = f0r + f1i;
1559 f5i = f0i - f1r;
1560
1561 f6r = f2r - f3i;
1562 f6i = f2i + f3r;
1563 f7r = f2r + f3i;
1564 f7i = f2i - f3r;
1565
1566 *(p0r + pos) = f4r;
1567 *(p0r + posi) = f4i;
1568 *(p1r + pos) = f5r;
1569 *(p1r + posi) = f5i;
1570 *(p2r + pos) = f6r;
1571 *(p2r + posi) = f6i;
1572 *(p3r + pos) = f7r;
1573 *(p3r + posi) = f7i;
1574
1575 p0r += pnext;
1576 p1r += pnext;
1577 p2r += pnext;
1578 p3r += pnext;
1579
1580 }
1581 }
1582
1583 //inline void ibfR4(float *ioptr, long M, long NDiffU);
ibfR4(float * ioptr,long M,long NDiffU)1584 static inline void ibfR4(float *ioptr, long M, long NDiffU){
1585 /*** 1 radix 4 stage ***/
1586 unsigned long pos;
1587 unsigned long posi;
1588 unsigned long pinc;
1589 unsigned long pnext;
1590 unsigned long pnexti;
1591 unsigned long NSameU;
1592 unsigned long SameUCnt;
1593
1594 float *pstrt;
1595 float *p0r, *p1r, *p2r, *p3r;
1596
1597 float w1r = 1.0F/MYROOT2; /* cos(pi/4) */
1598 float f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i;
1599 float f4r, f4i, f5r, f5i, f6r, f6i, f7r, f7i;
1600 float t1r, t1i;
1601 const float Two = 2.0;
1602
1603 pinc = NDiffU * 2; // 2 floats per complex
1604 pnext = pinc * 4;
1605 pnexti = pnext + 1;
1606 pos = 2;
1607 posi = pos+1;
1608 NSameU = POW2(M) / 4 /NDiffU; // 4 pts per butterfly
1609 pstrt = ioptr;
1610 p0r = pstrt;
1611 p1r = pstrt+pinc;
1612 p2r = p1r+pinc;
1613 p3r = p2r+pinc;
1614
1615 /* Butterflys */
1616 /*
1617 f0 - - f0 - - f4
1618 f1 - 1 - f5 - - f5
1619 f2 - - f6 - 1 - f6
1620 f3 - 1 - f3 - -i - f7
1621 */
1622 /* Butterflys */
1623 /*
1624 f0 - - f4 - - f4
1625 f1 - -i - t1 - - f5
1626 f2 - - f2 - w1 - f6
1627 f3 - -i - f7 - iw1- f7
1628 */
1629
1630 f0r = *p0r;
1631 f1r = *p1r;
1632 f2r = *p2r;
1633 f3r = *p3r;
1634 f0i = *(p0r + 1);
1635 f1i = *(p1r + 1);
1636 f2i = *(p2r + 1);
1637 f3i = *(p3r + 1);
1638
1639 f5r = f0r - f1r;
1640 f5i = f0i - f1i;
1641 f0r = f0r + f1r;
1642 f0i = f0i + f1i;
1643
1644 f6r = f2r + f3r;
1645 f6i = f2i + f3i;
1646 f3r = f2r - f3r;
1647 f3i = f2i - f3i;
1648
1649 for (SameUCnt = NSameU-1; SameUCnt > 0 ; SameUCnt--){
1650
1651 f7r = f5r + f3i;
1652 f7i = f5i - f3r;
1653 f5r = f5r - f3i;
1654 f5i = f5i + f3r;
1655
1656 f4r = f0r + f6r;
1657 f4i = f0i + f6i;
1658 f6r = f0r - f6r;
1659 f6i = f0i - f6i;
1660
1661 f2r = *(p2r + pos);
1662 f2i = *(p2r + posi);
1663 f1r = *(p1r + pos);
1664 f1i = *(p1r + posi);
1665 f3i = *(p3r + posi);
1666 f0r = *(p0r + pos);
1667 f3r = *(p3r + pos);
1668 f0i = *(p0r + posi);
1669
1670 *p3r = f7r;
1671 *p0r = f4r;
1672 *(p3r + 1) = f7i;
1673 *(p0r + 1) = f4i;
1674 *p1r = f5r;
1675 *p2r = f6r;
1676 *(p1r + 1) = f5i;
1677 *(p2r + 1) = f6i;
1678
1679 f7r = f2r + f3i;
1680 f7i = f2i - f3r;
1681 f2r = f2r - f3i;
1682 f2i = f2i + f3r;
1683
1684 f4r = f0r - f1i;
1685 f4i = f0i + f1r;
1686 t1r = f0r + f1i;
1687 t1i = f0i - f1r;
1688
1689 f5r = t1r - f7r * w1r - f7i * w1r;
1690 f5i = t1i + f7r * w1r - f7i * w1r;
1691 f7r = t1r * Two - f5r;
1692 f7i = t1i * Two - f5i;
1693
1694 f6r = f4r - f2r * w1r + f2i * w1r;
1695 f6i = f4i - f2r * w1r - f2i * w1r;
1696 f4r = f4r * Two - f6r;
1697 f4i = f4i * Two - f6i;
1698
1699 f3r = *(p3r + pnext);
1700 f0r = *(p0r + pnext);
1701 f3i = *(p3r + pnexti);
1702 f0i = *(p0r + pnexti);
1703 f2r = *(p2r + pnext);
1704 f2i = *(p2r + pnexti);
1705 f1r = *(p1r + pnext);
1706 f1i = *(p1r + pnexti);
1707
1708 *(p2r + pos) = f6r;
1709 *(p1r + pos) = f5r;
1710 *(p2r + posi) = f6i;
1711 *(p1r + posi) = f5i;
1712 *(p3r + pos) = f7r;
1713 *(p0r + pos) = f4r;
1714 *(p3r + posi) = f7i;
1715 *(p0r + posi) = f4i;
1716
1717 f6r = f2r + f3r;
1718 f6i = f2i + f3i;
1719 f3r = f2r - f3r;
1720 f3i = f2i - f3i;
1721
1722 f5r = f0r - f1r;
1723 f5i = f0i - f1i;
1724 f0r = f0r + f1r;
1725 f0i = f0i + f1i;
1726
1727 p3r += pnext;
1728 p0r += pnext;
1729 p1r += pnext;
1730 p2r += pnext;
1731
1732 }
1733 f7r = f5r + f3i;
1734 f7i = f5i - f3r;
1735 f5r = f5r - f3i;
1736 f5i = f5i + f3r;
1737
1738 f4r = f0r + f6r;
1739 f4i = f0i + f6i;
1740 f6r = f0r - f6r;
1741 f6i = f0i - f6i;
1742
1743 f2r = *(p2r + pos);
1744 f2i = *(p2r + posi);
1745 f1r = *(p1r + pos);
1746 f1i = *(p1r + posi);
1747 f3i = *(p3r + posi);
1748 f0r = *(p0r + pos);
1749 f3r = *(p3r + pos);
1750 f0i = *(p0r + posi);
1751
1752 *p3r = f7r;
1753 *p0r = f4r;
1754 *(p3r + 1) = f7i;
1755 *(p0r + 1) = f4i;
1756 *p1r = f5r;
1757 *p2r = f6r;
1758 *(p1r + 1) = f5i;
1759 *(p2r + 1) = f6i;
1760
1761 f7r = f2r + f3i;
1762 f7i = f2i - f3r;
1763 f2r = f2r - f3i;
1764 f2i = f2i + f3r;
1765
1766 f4r = f0r - f1i;
1767 f4i = f0i + f1r;
1768 t1r = f0r + f1i;
1769 t1i = f0i - f1r;
1770
1771 f5r = t1r - f7r * w1r - f7i * w1r;
1772 f5i = t1i + f7r * w1r - f7i * w1r;
1773 f7r = t1r * Two - f5r;
1774 f7i = t1i * Two - f5i;
1775
1776 f6r = f4r - f2r * w1r + f2i * w1r;
1777 f6i = f4i - f2r * w1r - f2i * w1r;
1778 f4r = f4r * Two - f6r;
1779 f4i = f4i * Two - f6i;
1780
1781 *(p2r + pos) = f6r;
1782 *(p1r + pos) = f5r;
1783 *(p2r + posi) = f6i;
1784 *(p1r + posi) = f5i;
1785 *(p3r + pos) = f7r;
1786 *(p0r + pos) = f4r;
1787 *(p3r + posi) = f7i;
1788 *(p0r + posi) = f4i;
1789
1790 }
1791
1792 //inline void ibfstages(float *ioptr, long M, float *Utbl, long Ustride, long NDiffU, long StageCnt);
ibfstages(float * ioptr,long M,float * Utbl,long Ustride,long NDiffU,long StageCnt)1793 static inline void ibfstages(float *ioptr, long M, float *Utbl, long Ustride, long NDiffU, long StageCnt){
1794 /*** RADIX 8 Stages ***/
1795 unsigned long pos;
1796 unsigned long posi;
1797 unsigned long pinc;
1798 unsigned long pnext;
1799 unsigned long NSameU;
1800 long Uinc;
1801 long Uinc2;
1802 long Uinc4;
1803 unsigned long DiffUCnt;
1804 unsigned long SameUCnt;
1805 unsigned long U2toU3;
1806
1807 float *pstrt;
1808 float *p0r, *p1r, *p2r, *p3r;
1809 float *u0r, *u0i, *u1r, *u1i, *u2r, *u2i;
1810
1811 float w0r, w0i, w1r, w1i, w2r, w2i, w3r, w3i;
1812 float f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i;
1813 float f4r, f4i, f5r, f5i, f6r, f6i, f7r, f7i;
1814 float t0r, t0i, t1r, t1i;
1815 const float Two = 2.0;
1816
1817 pinc = NDiffU * 2; // 2 floats per complex
1818 pnext = pinc * 8;
1819 pos = pinc * 4;
1820 posi = pos + 1;
1821 NSameU = POW2(M) / 8 /NDiffU; // 8 pts per butterfly
1822 Uinc = NSameU * Ustride;
1823 Uinc2 = Uinc * 2;
1824 Uinc4 = Uinc * 4;
1825 U2toU3 = (POW2(M) / 8)*Ustride;
1826 for (; StageCnt > 0 ; StageCnt--){
1827
1828 u0r = &Utbl[0];
1829 u0i = &Utbl[POW2(M-2)*Ustride];
1830 u1r = u0r;
1831 u1i = u0i;
1832 u2r = u0r;
1833 u2i = u0i;
1834
1835 w0r = *u0r;
1836 w0i = *u0i;
1837 w1r = *u1r;
1838 w1i = *u1i;
1839 w2r = *u2r;
1840 w2i = *u2i;
1841 w3r = *(u2r+U2toU3);
1842 w3i = *(u2i-U2toU3);
1843
1844 pstrt = ioptr;
1845
1846 p0r = pstrt;
1847 p1r = pstrt+pinc;
1848 p2r = p1r+pinc;
1849 p3r = p2r+pinc;
1850
1851 /* Butterflys */
1852 /*
1853 f0 - - t0 - - f0 - - f0
1854 f1 - w0- f1 - - f1 - - f1
1855 f2 - - f2 - w1- f2 - - f4
1856 f3 - w0- t1 - iw1- f3 - - f5
1857
1858 f4 - - t0 - - f4 - w2- t0
1859 f5 - w0- f5 - - f5 - w3- t1
1860 f6 - - f6 - w1- f6 - iw2- f6
1861 f7 - w0- t1 - iw1- f7 - iw3- f7
1862 */
1863
1864 for (DiffUCnt = NDiffU; DiffUCnt > 0 ; DiffUCnt--){
1865 f0r = *p0r;
1866 f0i = *(p0r + 1);
1867 f1r = *p1r;
1868 f1i = *(p1r + 1);
1869 for (SameUCnt = NSameU-1; SameUCnt > 0 ; SameUCnt--){
1870 f2r = *p2r;
1871 f2i = *(p2r + 1);
1872 f3r = *p3r;
1873 f3i = *(p3r + 1);
1874
1875 t0r = f0r + f1r * w0r - f1i * w0i;
1876 t0i = f0i + f1r * w0i + f1i * w0r;
1877 f1r = f0r * Two - t0r;
1878 f1i = f0i * Two - t0i;
1879
1880 f4r = *(p0r + pos);
1881 f4i = *(p0r + posi);
1882 f5r = *(p1r + pos);
1883 f5i = *(p1r + posi);
1884
1885 f6r = *(p2r + pos);
1886 f6i = *(p2r + posi);
1887 f7r = *(p3r + pos);
1888 f7i = *(p3r + posi);
1889
1890 t1r = f2r - f3r * w0r + f3i * w0i;
1891 t1i = f2i - f3r * w0i - f3i * w0r;
1892 f2r = f2r * Two - t1r;
1893 f2i = f2i * Two - t1i;
1894
1895 f0r = t0r + f2r * w1r - f2i * w1i;
1896 f0i = t0i + f2r * w1i + f2i * w1r;
1897 f2r = t0r * Two - f0r;
1898 f2i = t0i * Two - f0i;
1899
1900 f3r = f1r + t1r * w1i + t1i * w1r;
1901 f3i = f1i - t1r * w1r + t1i * w1i;
1902 f1r = f1r * Two - f3r;
1903 f1i = f1i * Two - f3i;
1904
1905
1906 t0r = f4r + f5r * w0r - f5i * w0i;
1907 t0i = f4i + f5r * w0i + f5i * w0r;
1908 f5r = f4r * Two - t0r;
1909 f5i = f4i * Two - t0i;
1910
1911 t1r = f6r - f7r * w0r + f7i * w0i;
1912 t1i = f6i - f7r * w0i - f7i * w0r;
1913 f6r = f6r * Two - t1r;
1914 f6i = f6i * Two - t1i;
1915
1916 f4r = t0r + f6r * w1r - f6i * w1i;
1917 f4i = t0i + f6r * w1i + f6i * w1r;
1918 f6r = t0r * Two - f4r;
1919 f6i = t0i * Two - f4i;
1920
1921 f7r = f5r + t1r * w1i + t1i * w1r;
1922 f7i = f5i - t1r * w1r + t1i * w1i;
1923 f5r = f5r * Two - f7r;
1924 f5i = f5i * Two - f7i;
1925
1926 t0r = f0r - f4r * w2r + f4i * w2i;
1927 t0i = f0i - f4r * w2i - f4i * w2r;
1928 f0r = f0r * Two - t0r;
1929 f0i = f0i * Two - t0i;
1930
1931 t1r = f1r - f5r * w3r + f5i * w3i;
1932 t1i = f1i - f5r * w3i - f5i * w3r;
1933 f1r = f1r * Two - t1r;
1934 f1i = f1i * Two - t1i;
1935
1936 *(p0r + pos) = t0r;
1937 *(p0r + posi) = t0i;
1938 *p0r = f0r;
1939 *(p0r + 1) = f0i;
1940
1941 p0r += pnext;
1942 f0r = *p0r;
1943 f0i = *(p0r + 1);
1944
1945 *(p1r + pos) = t1r;
1946 *(p1r + posi) = t1i;
1947 *p1r = f1r;
1948 *(p1r + 1) = f1i;
1949
1950 p1r += pnext;
1951
1952 f1r = *p1r;
1953 f1i = *(p1r + 1);
1954
1955 f4r = f2r - f6r * w2i - f6i * w2r;
1956 f4i = f2i + f6r * w2r - f6i * w2i;
1957 f6r = f2r * Two - f4r;
1958 f6i = f2i * Two - f4i;
1959
1960 f5r = f3r - f7r * w3i - f7i * w3r;
1961 f5i = f3i + f7r * w3r - f7i * w3i;
1962 f7r = f3r * Two - f5r;
1963 f7i = f3i * Two - f5i;
1964
1965 *p2r = f4r;
1966 *(p2r + 1) = f4i;
1967 *(p2r + pos) = f6r;
1968 *(p2r + posi) = f6i;
1969
1970 p2r += pnext;
1971
1972 *p3r = f5r;
1973 *(p3r + 1) = f5i;
1974 *(p3r + pos) = f7r;
1975 *(p3r + posi) = f7i;
1976
1977 p3r += pnext;
1978
1979 }
1980
1981 f2r = *p2r;
1982 f2i = *(p2r + 1);
1983 f3r = *p3r;
1984 f3i = *(p3r + 1);
1985
1986 t0r = f0r + f1r * w0r - f1i * w0i;
1987 t0i = f0i + f1r * w0i + f1i * w0r;
1988 f1r = f0r * Two - t0r;
1989 f1i = f0i * Two - t0i;
1990
1991 f4r = *(p0r + pos);
1992 f4i = *(p0r + posi);
1993 f5r = *(p1r + pos);
1994 f5i = *(p1r + posi);
1995
1996 f6r = *(p2r + pos);
1997 f6i = *(p2r + posi);
1998 f7r = *(p3r + pos);
1999 f7i = *(p3r + posi);
2000
2001 t1r = f2r - f3r * w0r + f3i * w0i;
2002 t1i = f2i - f3r * w0i - f3i * w0r;
2003 f2r = f2r * Two - t1r;
2004 f2i = f2i * Two - t1i;
2005
2006 f0r = t0r + f2r * w1r - f2i * w1i;
2007 f0i = t0i + f2r * w1i + f2i * w1r;
2008 f2r = t0r * Two - f0r;
2009 f2i = t0i * Two - f0i;
2010
2011 f3r = f1r + t1r * w1i + t1i * w1r;
2012 f3i = f1i - t1r * w1r + t1i * w1i;
2013 f1r = f1r * Two - f3r;
2014 f1i = f1i * Two - f3i;
2015
2016 if (DiffUCnt == NDiffU/2)
2017 Uinc4 = -Uinc4;
2018
2019 u0r += Uinc4;
2020 u0i -= Uinc4;
2021 u1r += Uinc2;
2022 u1i -= Uinc2;
2023 u2r += Uinc;
2024 u2i -= Uinc;
2025
2026 pstrt += 2;
2027
2028 t0r = f4r + f5r * w0r - f5i * w0i;
2029 t0i = f4i + f5r * w0i + f5i * w0r;
2030 f5r = f4r * Two - t0r;
2031 f5i = f4i * Two - t0i;
2032
2033 t1r = f6r - f7r * w0r + f7i * w0i;
2034 t1i = f6i - f7r * w0i - f7i * w0r;
2035 f6r = f6r * Two - t1r;
2036 f6i = f6i * Two - t1i;
2037
2038 f4r = t0r + f6r * w1r - f6i * w1i;
2039 f4i = t0i + f6r * w1i + f6i * w1r;
2040 f6r = t0r * Two - f4r;
2041 f6i = t0i * Two - f4i;
2042
2043 f7r = f5r + t1r * w1i + t1i * w1r;
2044 f7i = f5i - t1r * w1r + t1i * w1i;
2045 f5r = f5r * Two - f7r;
2046 f5i = f5i * Two - f7i;
2047
2048 w0r = *u0r;
2049 w0i = *u0i;
2050 w1r = *u1r;
2051 w1i = *u1i;
2052
2053 if ((long) DiffUCnt <= NDiffU/2)
2054 w0r = -w0r;
2055
2056 t0r = f0r - f4r * w2r + f4i * w2i;
2057 t0i = f0i - f4r * w2i - f4i * w2r;
2058 f0r = f0r * Two - t0r;
2059 f0i = f0i * Two - t0i;
2060
2061 f4r = f2r - f6r * w2i - f6i * w2r;
2062 f4i = f2i + f6r * w2r - f6i * w2i;
2063 f6r = f2r * Two - f4r;
2064 f6i = f2i * Two - f4i;
2065
2066 *(p0r + pos) = t0r;
2067 *p2r = f4r;
2068 *(p0r + posi) = t0i;
2069 *(p2r + 1) = f4i;
2070 w2r = *u2r;
2071 w2i = *u2i;
2072 *p0r = f0r;
2073 *(p2r + pos) = f6r;
2074 *(p0r + 1) = f0i;
2075 *(p2r + posi) = f6i;
2076
2077 p0r = pstrt;
2078 p2r = pstrt + pinc + pinc;
2079
2080 t1r = f1r - f5r * w3r + f5i * w3i;
2081 t1i = f1i - f5r * w3i - f5i * w3r;
2082 f1r = f1r * Two - t1r;
2083 f1i = f1i * Two - t1i;
2084
2085 f5r = f3r - f7r * w3i - f7i * w3r;
2086 f5i = f3i + f7r * w3r - f7i * w3i;
2087 f7r = f3r * Two - f5r;
2088 f7i = f3i * Two - f5i;
2089
2090 *(p1r + pos) = t1r;
2091 *p3r = f5r;
2092 *(p1r + posi) = t1i;
2093 *(p3r + 1) = f5i;
2094 w3r = *(u2r+U2toU3);
2095 w3i = *(u2i-U2toU3);
2096 *p1r = f1r;
2097 *(p3r + pos) = f7r;
2098 *(p1r + 1) = f1i;
2099 *(p3r + posi) = f7i;
2100
2101 p1r = pstrt + pinc;
2102 p3r = p2r + pinc;
2103 }
2104 NSameU /= 8;
2105 Uinc /= 8;
2106 Uinc2 /= 8;
2107 Uinc4 = Uinc * 4;
2108 NDiffU *= 8;
2109 pinc *= 8;
2110 pnext *= 8;
2111 pos *= 8;
2112 posi = pos + 1;
2113 }
2114 }
2115
2116 void ifftrecurs(float *ioptr, long M, float *Utbl, long Ustride, long NDiffU, long StageCnt);
ifftrecurs(float * ioptr,long M,float * Utbl,long Ustride,long NDiffU,long StageCnt)2117 void ifftrecurs(float *ioptr, long M, float *Utbl, long Ustride, long NDiffU, long StageCnt){
2118 /* recursive bfstages calls to maximize on chip cache efficiency */
2119 long i1;
2120 if (M <= MCACHE)
2121 ibfstages(ioptr, M, Utbl, Ustride, NDiffU, StageCnt); /* RADIX 8 Stages */
2122 else{
2123 for (i1=0; i1<8; i1++){
2124 ifftrecurs(&ioptr[i1*POW2(M-3)*2], M-3, Utbl, 8*Ustride, NDiffU, StageCnt-1); /* RADIX 8 Stages */
2125 }
2126 ibfstages(ioptr, M, Utbl, Ustride, POW2(M-3), 1); /* RADIX 8 Stage */
2127 }
2128 }
2129
iffts1(float * ioptr,long M,long Rows,float * Utbl,short * BRLow)2130 void iffts1(float *ioptr, long M, long Rows, float *Utbl, short *BRLow){
2131 /* Compute in-place inverse complex fft on the rows of the input array */
2132 /* INPUTS */
2133 /* *ioptr = input data array */
2134 /* M = log2 of fft size */
2135 /* Rows = number of rows in ioptr array (use Rows of 1 if ioptr is a 1 dimensional array) */
2136 /* *Utbl = cosine table */
2137 /* *BRLow = bit reversed counter table */
2138 /* OUTPUTS */
2139 /* *ioptr = output data array */
2140
2141 long StageCnt;
2142 long NDiffU;
2143 const float scale = 1.0F/POW2(M);
2144
2145 switch (M){
2146 case 0:
2147 break;
2148 case 1:
2149 for (;Rows>0;Rows--){
2150 ifft2pt(ioptr, scale); /* a 2 pt fft */
2151 ioptr += 2*POW2(M);
2152 }
2153 break;
2154 case 2:
2155 for (;Rows>0;Rows--){
2156 ifft4pt(ioptr, scale); /* a 4 pt fft */
2157 ioptr += 2*POW2(M);
2158 }
2159 break;
2160 case 3:
2161 for (;Rows>0;Rows--){
2162 ifft8pt(ioptr, scale); /* an 8 pt fft */
2163 ioptr += 2*POW2(M);
2164 }
2165 break;
2166 default:
2167 for (;Rows>0;Rows--){
2168
2169 scbitrevR2(ioptr, M, BRLow, scale); /* bit reverse and first radix 2 stage */
2170
2171 StageCnt = (M-1) / 3; // number of radix 8 stages
2172 NDiffU = 2; // one radix 2 stage already complete
2173
2174 if ( (M-1-(StageCnt * 3)) == 1 ){
2175 ibfR2(ioptr, M, NDiffU); /* 1 radix 2 stage */
2176 NDiffU *= 2;
2177 }
2178
2179 if ( (M-1-(StageCnt * 3)) == 2 ){
2180 ibfR4(ioptr, M, NDiffU); /* 1 radix 4 stage */
2181 NDiffU *= 4;
2182 }
2183
2184 if (M <= MCACHE)
2185 ibfstages(ioptr, M, Utbl, 1, NDiffU, StageCnt); /* RADIX 8 Stages */
2186
2187 else{
2188 ifftrecurs(ioptr, M, Utbl, 1, NDiffU, StageCnt); /* RADIX 8 Stages */
2189 }
2190
2191 ioptr += 2*POW2(M);
2192 }
2193 }
2194 }
2195
2196 /************************************************
2197 parts of rffts1
2198 *************************************************/
2199
2200 //inline void rfft1pt(float *ioptr);
rfft1pt(float * ioptr)2201 static inline void rfft1pt(float *ioptr){
2202 /*** RADIX 2 rfft ***/
2203 float f0r, f0i;
2204 float t0r, t0i;
2205
2206 /* bit reversed load */
2207 f0r = ioptr[0];
2208 f0i = ioptr[1];
2209
2210 /* finish rfft */
2211 t0r = f0r + f0i;
2212 t0i = f0r - f0i;
2213
2214 /* store result */
2215 ioptr[0] = t0r;
2216 ioptr[1] = t0i;
2217 }
2218
2219 //inline void rfft2pt(float *ioptr);
rfft2pt(float * ioptr)2220 static inline void rfft2pt(float *ioptr){
2221 /*** RADIX 4 rfft ***/
2222 float f0r, f0i, f1r, f1i;
2223 float t0r, t0i;
2224
2225 /* bit reversed load */
2226 f0r = ioptr[0];
2227 f0i = ioptr[1];
2228 f1r = ioptr[2];
2229 f1i = ioptr[3];
2230
2231 /* Butterflys */
2232 /*
2233 f0 - - t0
2234 f1 - 1 - f1
2235 */
2236
2237 t0r = f0r + f1r;
2238 t0i = f0i + f1i;
2239 f1r = f0r - f1r;
2240 f1i = f1i - f0i;
2241 /* finish rfft */
2242 f0r = t0r + t0i;
2243 f0i = t0r - t0i;
2244
2245 /* store result */
2246 ioptr[0] = f0r;
2247 ioptr[1] = f0i;
2248 ioptr[2] = f1r;
2249 ioptr[3] = f1i;
2250 }
2251
2252 //inline void rfft4pt(float *ioptr);
rfft4pt(float * ioptr)2253 static inline void rfft4pt(float *ioptr){
2254 /*** RADIX 8 rfft ***/
2255 float f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i;
2256 float t0r, t0i, t1r, t1i;
2257 float w0r = 1.0F/MYROOT2; /* cos(pi/4) */
2258 const float Two = 2.0;
2259 const float scale = 0.5;
2260
2261 /* bit reversed load */
2262 f0r = ioptr[0];
2263 f0i = ioptr[1];
2264 f1r = ioptr[4];
2265 f1i = ioptr[5];
2266 f2r = ioptr[2];
2267 f2i = ioptr[3];
2268 f3r = ioptr[6];
2269 f3i = ioptr[7];
2270
2271 /* Butterflys */
2272 /*
2273 f0 - - t0 - - f0
2274 f1 - 1 - f1 - - f1
2275 f2 - - f2 - 1 - f2
2276 f3 - 1 - t1 - -i - f3
2277 */
2278
2279 t0r = f0r + f1r;
2280 t0i = f0i + f1i;
2281 f1r = f0r - f1r;
2282 f1i = f0i - f1i;
2283
2284 t1r = f2r - f3r;
2285 t1i = f2i - f3i;
2286 f2r = f2r + f3r;
2287 f2i = f2i + f3i;
2288
2289 f0r = t0r + f2r;
2290 f0i = t0i + f2i;
2291 f2r = t0r - f2r;
2292 f2i = f2i - t0i; // neg for rfft
2293
2294 f3r = f1r - t1i;
2295 f3i = f1i + t1r;
2296 f1r = f1r + t1i;
2297 f1i = f1i - t1r;
2298
2299 /* finish rfft */
2300 t0r = f0r + f0i; /* compute Re(x[0]) */
2301 t0i = f0r - f0i; /* compute Re(x[N/2]) */
2302
2303 t1r = f1r + f3r;
2304 t1i = f1i - f3i;
2305 f0r = f1i + f3i;
2306 f0i = f3r - f1r;
2307
2308 f1r = t1r + w0r * f0r + w0r * f0i;
2309 f1i = t1i - w0r * f0r + w0r * f0i;
2310 f3r = Two * t1r - f1r;
2311 f3i = f1i - Two * t1i;
2312
2313 /* store result */
2314 ioptr[4] = f2r;
2315 ioptr[5] = f2i;
2316 ioptr[0] = t0r;
2317 ioptr[1] = t0i;
2318
2319 ioptr[2] = scale*f1r;
2320 ioptr[3] = scale*f1i;
2321 ioptr[6] = scale*f3r;
2322 ioptr[7] = scale*f3i;
2323 }
2324
2325 //inline void rfft8pt(float *ioptr);
rfft8pt(float * ioptr)2326 static inline void rfft8pt(float *ioptr){
2327 /*** RADIX 16 rfft ***/
2328 float w0r = 1.0F/MYROOT2; /* cos(pi/4) */
2329 float w1r = MYCOSPID8; /* cos(pi/8) */
2330 float w1i = MYSINPID8; /* sin(pi/8) */
2331 float f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i;
2332 float f4r, f4i, f5r, f5i, f6r, f6i, f7r, f7i;
2333 float t0r, t0i, t1r, t1i;
2334 const float Two = 2.0;
2335 const float scale = 0.5;
2336
2337 /* bit reversed load */
2338 f0r = ioptr[0];
2339 f0i = ioptr[1];
2340 f1r = ioptr[8];
2341 f1i = ioptr[9];
2342 f2r = ioptr[4];
2343 f2i = ioptr[5];
2344 f3r = ioptr[12];
2345 f3i = ioptr[13];
2346 f4r = ioptr[2];
2347 f4i = ioptr[3];
2348 f5r = ioptr[10];
2349 f5i = ioptr[11];
2350 f6r = ioptr[6];
2351 f6i = ioptr[7];
2352 f7r = ioptr[14];
2353 f7i = ioptr[15];
2354 /* Butterflys */
2355 /*
2356 f0 - - t0 - - f0 - - f0
2357 f1 - 1 - f1 - - f1 - - f1
2358 f2 - - f2 - 1 - f2 - - f2
2359 f3 - 1 - t1 - -i - f3 - - f3
2360 f4 - - t0 - - f4 - 1 - t0
2361 f5 - 1 - f5 - - f5 - w3 - f4
2362 f6 - - f6 - 1 - f6 - -i - t1
2363 f7 - 1 - t1 - -i - f7 - iw3- f6
2364 */
2365
2366 t0r = f0r + f1r;
2367 t0i = f0i + f1i;
2368 f1r = f0r - f1r;
2369 f1i = f0i - f1i;
2370
2371 t1r = f2r - f3r;
2372 t1i = f2i - f3i;
2373 f2r = f2r + f3r;
2374 f2i = f2i + f3i;
2375
2376 f0r = t0r + f2r;
2377 f0i = t0i + f2i;
2378 f2r = t0r - f2r;
2379 f2i = t0i - f2i;
2380
2381 f3r = f1r - t1i;
2382 f3i = f1i + t1r;
2383 f1r = f1r + t1i;
2384 f1i = f1i - t1r;
2385
2386
2387 t0r = f4r + f5r;
2388 t0i = f4i + f5i;
2389 f5r = f4r - f5r;
2390 f5i = f4i - f5i;
2391
2392 t1r = f6r - f7r;
2393 t1i = f6i - f7i;
2394 f6r = f6r + f7r;
2395 f6i = f6i + f7i;
2396
2397 f4r = t0r + f6r;
2398 f4i = t0i + f6i;
2399 f6r = t0r - f6r;
2400 f6i = t0i - f6i;
2401
2402 f7r = f5r - t1i;
2403 f7i = f5i + t1r;
2404 f5r = f5r + t1i;
2405 f5i = f5i - t1r;
2406
2407
2408 t0r = f0r - f4r;
2409 t0i = f4i - f0i; // neg for rfft
2410 f0r = f0r + f4r;
2411 f0i = f0i + f4i;
2412
2413 t1r = f2r - f6i;
2414 t1i = f2i + f6r;
2415 f2r = f2r + f6i;
2416 f2i = f2i - f6r;
2417
2418 f4r = f1r - f5r * w0r - f5i * w0r;
2419 f4i = f1i + f5r * w0r - f5i * w0r;
2420 f1r = f1r * Two - f4r;
2421 f1i = f1i * Two - f4i;
2422
2423 f6r = f3r + f7r * w0r - f7i * w0r;
2424 f6i = f3i + f7r * w0r + f7i * w0r;
2425 f3r = f3r * Two - f6r;
2426 f3i = f3i * Two - f6i;
2427
2428 /* finish rfft */
2429 f5r = f0r + f0i; /* compute Re(x[0]) */
2430 f5i = f0r - f0i; /* compute Re(x[N/2]) */
2431
2432 f0r = f2r + t1r;
2433 f0i = f2i - t1i;
2434 f7r = f2i + t1i;
2435 f7i = t1r - f2r;
2436
2437 f2r = f0r + w0r * f7r + w0r * f7i;
2438 f2i = f0i - w0r * f7r + w0r * f7i;
2439 t1r = Two * f0r - f2r;
2440 t1i = f2i - Two * f0i;
2441
2442
2443 f0r = f1r + f6r;
2444 f0i = f1i - f6i;
2445 f7r = f1i + f6i;
2446 f7i = f6r - f1r;
2447
2448 f1r = f0r + w1r * f7r + w1i * f7i;
2449 f1i = f0i - w1i * f7r + w1r * f7i;
2450 f6r = Two * f0r - f1r;
2451 f6i = f1i - Two * f0i;
2452
2453 f0r = f3r + f4r;
2454 f0i = f3i - f4i;
2455 f7r = f3i + f4i;
2456 f7i = f4r - f3r;
2457
2458 f3r = f0r + w1i * f7r + w1r * f7i;
2459 f3i = f0i - w1r * f7r + w1i * f7i;
2460 f4r = Two * f0r - f3r;
2461 f4i = f3i - Two * f0i;
2462
2463 /* store result */
2464 ioptr[8] = t0r;
2465 ioptr[9] = t0i;
2466 ioptr[0] = f5r;
2467 ioptr[1] = f5i;
2468
2469 ioptr[4] = scale*f2r;
2470 ioptr[5] = scale*f2i;
2471 ioptr[12] = scale*t1r;
2472 ioptr[13] = scale*t1i;
2473
2474 ioptr[2] = scale*f1r;
2475 ioptr[3] = scale*f1i;
2476 ioptr[6] = scale*f3r;
2477 ioptr[7] = scale*f3i;
2478 ioptr[10] = scale*f4r;
2479 ioptr[11] = scale*f4i;
2480 ioptr[14] = scale*f6r;
2481 ioptr[15] = scale*f6i;
2482 }
2483
2484 //inline void frstage(float *ioptr, long M, float *Utbl);
frstage(float * ioptr,long M,float * Utbl)2485 static inline void frstage(float *ioptr, long M, float *Utbl){
2486 /* Finish RFFT */
2487
2488 unsigned long pos;
2489 unsigned long posi;
2490 unsigned long diffUcnt;
2491
2492 float *p0r, *p1r;
2493 float *u0r, *u0i;
2494
2495 float w0r, w0i;
2496 float f0r, f0i, f1r, f1i, f4r, f4i, f5r, f5i;
2497 float t0r, t0i, t1r, t1i;
2498 const float Two = 2.0;
2499
2500 pos = POW2(M-1);
2501 posi = pos + 1;
2502
2503 p0r = ioptr;
2504 p1r = ioptr + pos/2;
2505
2506 u0r = Utbl + (int) POW2(M-3);
2507
2508 w0r = *u0r;
2509
2510 f0r = *(p0r);
2511 f0i = *(p0r + 1);
2512 f4r = *(p0r + pos);
2513 f4i = *(p0r + posi);
2514 f1r = *(p1r);
2515 f1i = *(p1r + 1);
2516 f5r = *(p1r + pos);
2517 f5i = *(p1r + posi);
2518
2519 t0r = Two * f0r + Two * f0i; /* compute Re(x[0]) */
2520 t0i = Two * f0r - Two * f0i; /* compute Re(x[N/2]) */
2521 t1r = f4r + f4r;
2522 t1i = -f4i - f4i;
2523
2524 f0r = f1r + f5r;
2525 f0i = f1i - f5i;
2526 f4r = f1i + f5i;
2527 f4i = f5r - f1r;
2528
2529 f1r = f0r + w0r * f4r + w0r * f4i;
2530 f1i = f0i - w0r * f4r + w0r * f4i;
2531 f5r = Two * f0r - f1r;
2532 f5i = f1i - Two * f0i;
2533
2534 *(p0r) = t0r;
2535 *(p0r + 1) = t0i;
2536 *(p0r + pos) = t1r;
2537 *(p0r + posi) = t1i;
2538 *(p1r) = f1r;
2539 *(p1r + 1) = f1i;
2540 *(p1r + pos) = f5r;
2541 *(p1r + posi) = f5i;
2542
2543 u0r = Utbl + 1;
2544 u0i = Utbl + (POW2(M-2)-1);
2545
2546 w0r = *u0r;
2547 w0i = *u0i;
2548
2549 p0r = (ioptr + 2);
2550 p1r = (ioptr + (POW2(M-2)-1)*2);
2551
2552 /* Butterflys */
2553 /*
2554 f0 - t0 - - f0
2555 f5 - t1 - w0 - f5
2556
2557 f1 - t0 - - f1
2558 f4 - t1 -iw0 - f4
2559 */
2560
2561 for (diffUcnt = POW2(M-3)-1; diffUcnt > 0 ; diffUcnt--){
2562
2563 f0r = *(p0r);
2564 f0i = *(p0r + 1);
2565 f5r = *(p1r + pos);
2566 f5i = *(p1r + posi);
2567 f1r = *(p1r);
2568 f1i = *(p1r + 1);
2569 f4r = *(p0r + pos);
2570 f4i = *(p0r + posi);
2571
2572 t0r = f0r + f5r;
2573 t0i = f0i - f5i;
2574 t1r = f0i + f5i;
2575 t1i = f5r - f0r;
2576
2577 f0r = t0r + w0r * t1r + w0i * t1i;
2578 f0i = t0i - w0i * t1r + w0r * t1i;
2579 f5r = Two * t0r - f0r;
2580 f5i = f0i - Two * t0i;
2581
2582 t0r = f1r + f4r;
2583 t0i = f1i - f4i;
2584 t1r = f1i + f4i;
2585 t1i = f4r - f1r;
2586
2587 f1r = t0r + w0i * t1r + w0r * t1i;
2588 f1i = t0i - w0r * t1r + w0i * t1i;
2589 f4r = Two * t0r - f1r;
2590 f4i = f1i - Two * t0i;
2591
2592 *(p0r) = f0r;
2593 *(p0r + 1) = f0i;
2594 *(p1r + pos) = f5r;
2595 *(p1r + posi) = f5i;
2596
2597 w0r = *++u0r;
2598 w0i = *--u0i;
2599
2600 *(p1r) = f1r;
2601 *(p1r + 1) = f1i;
2602 *(p0r + pos) = f4r;
2603 *(p0r + posi) = f4i;
2604
2605 p0r += 2;
2606 p1r -= 2;
2607 };
2608 }
2609
rffts1(float * ioptr,long M,long Rows,float * Utbl,short * BRLow)2610 void rffts1(float *ioptr, long M, long Rows, float *Utbl, short *BRLow){
2611 /* Compute in-place real fft on the rows of the input array */
2612 /* The result is the complex spectra of the positive frequencies */
2613 /* except the location for the first complex number contains the real */
2614 /* values for DC and Nyquest */
2615 /* INPUTS */
2616 /* *ioptr = real input data array */
2617 /* M = log2 of fft size */
2618 /* Rows = number of rows in ioptr array (use Rows of 1 if ioptr is a 1 dimensional array) */
2619 /* *Utbl = cosine table */
2620 /* *BRLow = bit reversed counter table */
2621 /* OUTPUTS */
2622 /* *ioptr = output data array in the following order */
2623 /* Re(x[0]), Re(x[N/2]), Re(x[1]), Im(x[1]), Re(x[2]), Im(x[2]), ... Re(x[N/2-1]), Im(x[N/2-1]). */
2624
2625 float scale;
2626 long StageCnt;
2627 long NDiffU;
2628
2629 M=M-1;
2630 switch (M){
2631 case -1:
2632 break;
2633 case 0:
2634 for (;Rows>0;Rows--){
2635 rfft1pt(ioptr); /* a 2 pt fft */
2636 ioptr += 2*POW2(M);
2637 }
2638 case 1:
2639 for (;Rows>0;Rows--){
2640 rfft2pt(ioptr); /* a 4 pt fft */
2641 ioptr += 2*POW2(M);
2642 }
2643 break;
2644 case 2:
2645 for (;Rows>0;Rows--){
2646 rfft4pt(ioptr); /* an 8 pt fft */
2647 ioptr += 2*POW2(M);
2648 }
2649 break;
2650 case 3:
2651 for (;Rows>0;Rows--){
2652 rfft8pt(ioptr); /* a 16 pt fft */
2653 ioptr += 2*POW2(M);
2654 }
2655 break;
2656 default:
2657 scale = 0.5;
2658 for (;Rows>0;Rows--){
2659
2660 scbitrevR2(ioptr, M, BRLow, scale); /* bit reverse and first radix 2 stage */
2661
2662 StageCnt = (M-1) / 3; // number of radix 8 stages
2663 NDiffU = 2; // one radix 2 stage already complete
2664
2665 if ( (M-1-(StageCnt * 3)) == 1 ){
2666 bfR2(ioptr, M, NDiffU); /* 1 radix 2 stage */
2667 NDiffU *= 2;
2668 }
2669
2670 if ( (M-1-(StageCnt * 3)) == 2 ){
2671 bfR4(ioptr, M, NDiffU); /* 1 radix 4 stage */
2672 NDiffU *= 4;
2673 }
2674
2675 if (M <= MCACHE){
2676 bfstages(ioptr, M, Utbl, 2, NDiffU, StageCnt); /* RADIX 8 Stages */
2677 frstage(ioptr, M+1, Utbl);
2678 }
2679
2680 else{
2681 fftrecurs(ioptr, M, Utbl, 2, NDiffU, StageCnt); /* RADIX 8 Stages */
2682 frstage(ioptr, M+1, Utbl);
2683 }
2684
2685 ioptr += 2*POW2(M);
2686 }
2687 }
2688 }
2689
2690 /************************************************
2691 parts of riffts1
2692 *************************************************/
2693
2694 //inline void rifft1pt(float *ioptr, float scale);
rifft1pt(float * ioptr,float scale)2695 static inline void rifft1pt(float *ioptr, float scale){
2696 /*** RADIX 2 rifft ***/
2697 float f0r, f0i;
2698 float t0r, t0i;
2699
2700 /* bit reversed load */
2701 f0r = ioptr[0];
2702 f0i = ioptr[1];
2703
2704 /* finish rfft */
2705 t0r = f0r + f0i;
2706 t0i = f0r - f0i;
2707
2708 /* store result */
2709 ioptr[0] = scale*t0r;
2710 ioptr[1] = scale*t0i;
2711 }
2712
2713 //inline void rifft2pt(float *ioptr, float scale);
rifft2pt(float * ioptr,float scale)2714 static inline void rifft2pt(float *ioptr, float scale){
2715 /*** RADIX 4 rifft ***/
2716 float f0r, f0i, f1r, f1i;
2717 float t0r, t0i;
2718 const float Two = 2.0;
2719
2720 /* bit reversed load */
2721 t0r = ioptr[0];
2722 t0i = ioptr[1];
2723 f1r = Two * ioptr[2];
2724 f1i = Two * ioptr[3];
2725
2726 /* start rifft */
2727 f0r = t0r + t0i;
2728 f0i = t0r - t0i;
2729 /* Butterflys */
2730 /*
2731 f0 - - t0
2732 f1 - 1 - f1
2733 */
2734
2735 t0r = f0r + f1r;
2736 t0i = f0i - f1i;
2737 f1r = f0r - f1r;
2738 f1i = f0i + f1i;
2739
2740 /* store result */
2741 ioptr[0] = scale*t0r;
2742 ioptr[1] = scale*t0i;
2743 ioptr[2] = scale*f1r;
2744 ioptr[3] = scale*f1i;
2745 }
2746
2747 //inline void rifft4pt(float *ioptr, float scale);
rifft4pt(float * ioptr,float scale)2748 static inline void rifft4pt(float *ioptr, float scale){
2749 /*** RADIX 8 rifft ***/
2750 float f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i;
2751 float t0r, t0i, t1r, t1i;
2752 float w0r = 1.0F/MYROOT2; /* cos(pi/4) */
2753 const float Two = 2.0;
2754
2755 /* bit reversed load */
2756 t0r = ioptr[0];
2757 t0i = ioptr[1];
2758 f2r = ioptr[2];
2759 f2i = ioptr[3];
2760 f1r = Two * ioptr[4];
2761 f1i = Two * ioptr[5];
2762 f3r = ioptr[6];
2763 f3i = ioptr[7];
2764
2765 /* start rfft */
2766 f0r = t0r + t0i; /* compute Re(x[0]) */
2767 f0i = t0r - t0i; /* compute Re(x[N/2]) */
2768
2769 t1r = f2r + f3r;
2770 t1i = f2i - f3i;
2771 t0r = f2r - f3r;
2772 t0i = f2i + f3i;
2773
2774 f2r = t1r - w0r * t0r - w0r * t0i;
2775 f2i = t1i + w0r * t0r - w0r * t0i;
2776 f3r = Two * t1r - f2r;
2777 f3i = f2i - Two * t1i;
2778
2779 /* Butterflys */
2780 /*
2781 f0 - - t0 - - f0
2782 f1 - 1 - f1 - - f1
2783 f2 - - f2 - 1 - f2
2784 f3 - 1 - t1 - i - f3
2785 */
2786
2787 t0r = f0r + f1r;
2788 t0i = f0i - f1i;
2789 f1r = f0r - f1r;
2790 f1i = f0i + f1i;
2791
2792 t1r = f2r - f3r;
2793 t1i = f2i - f3i;
2794 f2r = f2r + f3r;
2795 f2i = f2i + f3i;
2796
2797 f0r = t0r + f2r;
2798 f0i = t0i + f2i;
2799 f2r = t0r - f2r;
2800 f2i = t0i - f2i;
2801
2802 f3r = f1r + t1i;
2803 f3i = f1i - t1r;
2804 f1r = f1r - t1i;
2805 f1i = f1i + t1r;
2806
2807 /* store result */
2808 ioptr[0] = scale*f0r;
2809 ioptr[1] = scale*f0i;
2810 ioptr[2] = scale*f1r;
2811 ioptr[3] = scale*f1i;
2812 ioptr[4] = scale*f2r;
2813 ioptr[5] = scale*f2i;
2814 ioptr[6] = scale*f3r;
2815 ioptr[7] = scale*f3i;
2816 }
2817
2818 //inline void rifft8pt(float *ioptr, float scale);
rifft8pt(float * ioptr,float scale)2819 static inline void rifft8pt(float *ioptr, float scale){
2820 /*** RADIX 16 rifft ***/
2821 float w0r = 1.0F/MYROOT2; /* cos(pi/4) */
2822 float w1r = MYCOSPID8; /* cos(pi/8) */
2823 float w1i = MYSINPID8; /* sin(pi/8) */
2824 float f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i;
2825 float f4r, f4i, f5r, f5i, f6r, f6i, f7r, f7i;
2826 float t0r, t0i, t1r, t1i;
2827 const float Two = 2.0;
2828
2829 /* bit reversed load */
2830 t0r = ioptr[0];
2831 t0i = ioptr[1];
2832 f4r = ioptr[2];
2833 f4i = ioptr[3];
2834 f2r = ioptr[4];
2835 f2i = ioptr[5];
2836 f6r = ioptr[6];
2837 f6i = ioptr[7];
2838 f1r = Two * ioptr[8];
2839 f1i = Two * ioptr[9];
2840 f5r = ioptr[10];
2841 f5i = ioptr[11];
2842 f3r = ioptr[12];
2843 f3i = ioptr[13];
2844 f7r = ioptr[14];
2845 f7i = ioptr[15];
2846
2847
2848 /* start rfft */
2849 f0r = t0r + t0i; /* compute Re(x[0]) */
2850 f0i = t0r - t0i; /* compute Re(x[N/2]) */
2851
2852 t0r = f2r + f3r;
2853 t0i = f2i - f3i;
2854 t1r = f2r - f3r;
2855 t1i = f2i + f3i;
2856
2857 f2r = t0r - w0r * t1r - w0r * t1i;
2858 f2i = t0i + w0r * t1r - w0r * t1i;
2859 f3r = Two * t0r - f2r;
2860 f3i = f2i - Two * t0i;
2861
2862 t0r = f4r + f7r;
2863 t0i = f4i - f7i;
2864 t1r = f4r - f7r;
2865 t1i = f4i + f7i;
2866
2867 f4r = t0r - w1i * t1r - w1r * t1i;
2868 f4i = t0i + w1r * t1r - w1i * t1i;
2869 f7r = Two * t0r - f4r;
2870 f7i = f4i - Two * t0i;
2871
2872 t0r = f6r + f5r;
2873 t0i = f6i - f5i;
2874 t1r = f6r - f5r;
2875 t1i = f6i + f5i;
2876
2877 f6r = t0r - w1r * t1r - w1i * t1i;
2878 f6i = t0i + w1i * t1r - w1r * t1i;
2879 f5r = Two * t0r - f6r;
2880 f5i = f6i - Two * t0i;
2881
2882 /* Butterflys */
2883 /*
2884 f0 - - t0 - - f0 - - f0
2885 f1* - 1 - f1 - - f1 - - f1
2886 f2 - - f2 - 1 - f2 - - f2
2887 f3 - 1 - t1 - i - f3 - - f3
2888 f4 - - t0 - - f4 - 1 - t0
2889 f5 - 1 - f5 - - f5 - w3 - f4
2890 f6 - - f6 - 1 - f6 - i - t1
2891 f7 - 1 - t1 - i - f7 - iw3- f6
2892 */
2893
2894 t0r = f0r + f1r;
2895 t0i = f0i - f1i;
2896 f1r = f0r - f1r;
2897 f1i = f0i + f1i;
2898
2899 t1r = f2r - f3r;
2900 t1i = f2i - f3i;
2901 f2r = f2r + f3r;
2902 f2i = f2i + f3i;
2903
2904 f0r = t0r + f2r;
2905 f0i = t0i + f2i;
2906 f2r = t0r - f2r;
2907 f2i = t0i - f2i;
2908
2909 f3r = f1r + t1i;
2910 f3i = f1i - t1r;
2911 f1r = f1r - t1i;
2912 f1i = f1i + t1r;
2913
2914
2915 t0r = f4r + f5r;
2916 t0i = f4i + f5i;
2917 f5r = f4r - f5r;
2918 f5i = f4i - f5i;
2919
2920 t1r = f6r - f7r;
2921 t1i = f6i - f7i;
2922 f6r = f6r + f7r;
2923 f6i = f6i + f7i;
2924
2925 f4r = t0r + f6r;
2926 f4i = t0i + f6i;
2927 f6r = t0r - f6r;
2928 f6i = t0i - f6i;
2929
2930 f7r = f5r + t1i;
2931 f7i = f5i - t1r;
2932 f5r = f5r - t1i;
2933 f5i = f5i + t1r;
2934
2935
2936 t0r = f0r - f4r;
2937 t0i = f0i - f4i;
2938 f0r = f0r + f4r;
2939 f0i = f0i + f4i;
2940
2941 t1r = f2r + f6i;
2942 t1i = f2i - f6r;
2943 f2r = f2r - f6i;
2944 f2i = f2i + f6r;
2945
2946 f4r = f1r - f5r * w0r + f5i * w0r;
2947 f4i = f1i - f5r * w0r - f5i * w0r;
2948 f1r = f1r * Two - f4r;
2949 f1i = f1i * Two - f4i;
2950
2951 f6r = f3r + f7r * w0r + f7i * w0r;
2952 f6i = f3i - f7r * w0r + f7i * w0r;
2953 f3r = f3r * Two - f6r;
2954 f3i = f3i * Two - f6i;
2955
2956 /* store result */
2957 ioptr[0] = scale*f0r;
2958 ioptr[1] = scale*f0i;
2959 ioptr[2] = scale*f1r;
2960 ioptr[3] = scale*f1i;
2961 ioptr[4] = scale*f2r;
2962 ioptr[5] = scale*f2i;
2963 ioptr[6] = scale*f3r;
2964 ioptr[7] = scale*f3i;
2965 ioptr[8] = scale*t0r;
2966 ioptr[9] = scale*t0i;
2967 ioptr[10] = scale*f4r;
2968 ioptr[11] = scale*f4i;
2969 ioptr[12] = scale*t1r;
2970 ioptr[13] = scale*t1i;
2971 ioptr[14] = scale*f6r;
2972 ioptr[15] = scale*f6i;
2973 }
2974
2975 //inline void ifrstage(float *ioptr, long M, float *Utbl);
ifrstage(float * ioptr,long M,float * Utbl)2976 static inline void ifrstage(float *ioptr, long M, float *Utbl){
2977 /* Start RIFFT */
2978
2979 unsigned long pos;
2980 unsigned long posi;
2981 unsigned long diffUcnt;
2982
2983 float *p0r, *p1r;
2984 float *u0r, *u0i;
2985
2986 float w0r, w0i;
2987 float f0r, f0i, f1r, f1i, f4r, f4i, f5r, f5i;
2988 float t0r, t0i, t1r, t1i;
2989 const float Two = 2.0;
2990
2991 pos = POW2(M-1);
2992 posi = pos + 1;
2993
2994 p0r = ioptr;
2995 p1r = ioptr + pos/2;
2996
2997 u0r = Utbl + (int) POW2(M-3);
2998
2999 w0r = *u0r;
3000
3001 f0r = *(p0r);
3002 f0i = *(p0r + 1);
3003 f4r = *(p0r + pos);
3004 f4i = *(p0r + posi);
3005 f1r = *(p1r);
3006 f1i = *(p1r + 1);
3007 f5r = *(p1r + pos);
3008 f5i = *(p1r + posi);
3009
3010 t0r = f0r + f0i;
3011 t0i = f0r - f0i;
3012 t1r = f4r + f4r;
3013 t1i = -f4i - f4i;
3014
3015 f0r = f1r + f5r;
3016 f0i = f1i - f5i;
3017 f4r = f1r - f5r;
3018 f4i = f1i + f5i;
3019
3020 f1r = f0r - w0r * f4r - w0r * f4i;
3021 f1i = f0i + w0r * f4r - w0r * f4i;
3022 f5r = Two * f0r - f1r;
3023 f5i = f1i - Two * f0i;
3024
3025 *(p0r) = t0r;
3026 *(p0r + 1) = t0i;
3027 *(p0r + pos) = t1r;
3028 *(p0r + posi) = t1i;
3029 *(p1r) = f1r;
3030 *(p1r + 1) = f1i;
3031 *(p1r + pos) = f5r;
3032 *(p1r + posi) = f5i;
3033
3034 u0r = Utbl + 1;
3035 u0i = Utbl + (POW2(M-2)-1);
3036
3037 w0r = *u0r;
3038 w0i = *u0i;
3039
3040 p0r = (ioptr + 2);
3041 p1r = (ioptr + (POW2(M-2)-1)*2);
3042
3043 /* Butterflys */
3044 /*
3045 f0 - t0 - f0
3046 f1 - t1 -w0- f1
3047
3048 f2 - t0 - f2
3049 f3 - t1 -iw0- f3
3050 */
3051
3052 for (diffUcnt = POW2(M-3)-1; diffUcnt > 0 ; diffUcnt--){
3053
3054 f0r = *(p0r);
3055 f0i = *(p0r + 1);
3056 f5r = *(p1r + pos);
3057 f5i = *(p1r + posi);
3058 f1r = *(p1r);
3059 f1i = *(p1r + 1);
3060 f4r = *(p0r + pos);
3061 f4i = *(p0r + posi);
3062
3063 t0r = f0r + f5r;
3064 t0i = f0i - f5i;
3065 t1r = f0r - f5r;
3066 t1i = f0i + f5i;
3067
3068 f0r = t0r - w0i * t1r - w0r * t1i;
3069 f0i = t0i + w0r * t1r - w0i * t1i;
3070 f5r = Two * t0r - f0r;
3071 f5i = f0i - Two * t0i;
3072
3073 t0r = f1r + f4r;
3074 t0i = f1i - f4i;
3075 t1r = f1r - f4r;
3076 t1i = f1i + f4i;
3077
3078 f1r = t0r - w0r * t1r - w0i * t1i;
3079 f1i = t0i + w0i * t1r - w0r * t1i;
3080 f4r = Two * t0r - f1r;
3081 f4i = f1i - Two * t0i;
3082
3083 *(p0r) = f0r;
3084 *(p0r + 1) = f0i;
3085 *(p1r + pos) = f5r;
3086 *(p1r + posi) = f5i;
3087
3088 w0r = *++u0r;
3089 w0i = *--u0i;
3090
3091 *(p1r) = f1r;
3092 *(p1r + 1) = f1i;
3093 *(p0r + pos) = f4r;
3094 *(p0r + posi) = f4i;
3095
3096 p0r += 2;
3097 p1r -= 2;
3098 };
3099 }
3100
riffts1(float * ioptr,long M,long Rows,float * Utbl,short * BRLow)3101 void riffts1(float *ioptr, long M, long Rows, float *Utbl, short *BRLow){
3102 /* Compute in-place real ifft on the rows of the input array */
3103 /* data order as from rffts1 */
3104 /* INPUTS */
3105 /* *ioptr = input data array in the following order */
3106 /* M = log2 of fft size */
3107 /* Re(x[0]), Re(x[N/2]), Re(x[1]), Im(x[1]), Re(x[2]), Im(x[2]), ... Re(x[N/2-1]), Im(x[N/2-1]). */
3108 /* Rows = number of rows in ioptr array (use Rows of 1 if ioptr is a 1 dimensional array) */
3109 /* *Utbl = cosine table */
3110 /* *BRLow = bit reversed counter table */
3111 /* OUTPUTS */
3112 /* *ioptr = real output data array */
3113
3114 float scale;
3115 long StageCnt;
3116 long NDiffU;
3117
3118 scale = 1.0F/POW2(M);
3119 M=M-1;
3120 switch (M){
3121 case -1:
3122 break;
3123 case 0:
3124 for (;Rows>0;Rows--){
3125 rifft1pt(ioptr, scale); /* a 2 pt fft */
3126 ioptr += 2*POW2(M);
3127 }
3128 case 1:
3129 for (;Rows>0;Rows--){
3130 rifft2pt(ioptr, scale); /* a 4 pt fft */
3131 ioptr += 2*POW2(M);
3132 }
3133 break;
3134 case 2:
3135 for (;Rows>0;Rows--){
3136 rifft4pt(ioptr, scale); /* an 8 pt fft */
3137 ioptr += 2*POW2(M);
3138 }
3139 break;
3140 case 3:
3141 for (;Rows>0;Rows--){
3142 rifft8pt(ioptr, scale); /* a 16 pt fft */
3143 ioptr += 2*POW2(M);
3144 }
3145 break;
3146 default:
3147 for (;Rows>0;Rows--){
3148
3149 ifrstage(ioptr, M+1, Utbl);
3150
3151 scbitrevR2(ioptr, M, BRLow, scale); /* bit reverse and first radix 2 stage */
3152
3153 StageCnt = (M-1) / 3; // number of radix 8 stages
3154 NDiffU = 2; // one radix 2 stage already complete
3155
3156 if ( (M-1-(StageCnt * 3)) == 1 ){
3157 ibfR2(ioptr, M, NDiffU); /* 1 radix 2 stage */
3158 NDiffU *= 2;
3159 }
3160
3161 if ( (M-1-(StageCnt * 3)) == 2 ){
3162 ibfR4(ioptr, M, NDiffU); /* 1 radix 4 stage */
3163 NDiffU *= 4;
3164 }
3165
3166 if (M <= MCACHE){
3167 ibfstages(ioptr, M, Utbl, 2, NDiffU, StageCnt); /* RADIX 8 Stages */
3168 }
3169
3170 else{
3171 ifftrecurs(ioptr, M, Utbl, 2, NDiffU, StageCnt); /* RADIX 8 Stages */
3172 }
3173
3174 ioptr += 2*POW2(M);
3175 }
3176 }
3177 }
3178
3179