1 /*******************************************************************
2 lower level fft stuff including routines called in fftext.c and fft2d.c
3 *******************************************************************/
4 /* inline declarations modified by RBD for C99 compiler */
5 
6 
7 #include <stdint.h>
8 #include "fftlib.h"
9 #include <math.h>
10 #define	MCACHE	(11-(sizeof(float)/8))	// fft's with M bigger than this bust primary cache
11 #ifdef WIN32
12 #define inline __inline
13 #endif
14 
15 // some math constants to 40 decimal places
16 #define MYPI		3.141592653589793238462643383279502884197F	// pi
17 #define MYROOT2 	1.414213562373095048801688724209698078569F	// sqrt(2)
18 #define MYCOSPID8	0.9238795325112867561281831893967882868224F	// cos(pi/8)
19 #define MYSINPID8	0.3826834323650897717284599840303988667613F	// sin(pi/8)
20 
21 
22 /*************************************************
23 routines to initialize tables used by fft routines
24 **************************************************/
25 
fftCosInit(long M,float * Utbl)26 void fftCosInit(long M, float *Utbl){
27 /* Compute Utbl, the cosine table for ffts	*/
28 /* of size (pow(2,M)/4 +1)	*/
29 /* INPUTS */
30 /* M = log2 of fft size	*/
31 /* OUTPUTS */
32 /* *Utbl = cosine table	*/
33 unsigned long fftN = POW2(M);
34 unsigned long i1;
35 	Utbl[0] = 1.0;
36 	for (i1 = 1; i1 < fftN/4; i1++)
37 		Utbl[i1] = (float) cos( (2.0 * MYPI * i1) / fftN );
38 	Utbl[fftN/4] = 0.0;
39 }
40 
fftBRInit(long M,short * BRLow)41 void fftBRInit(long M, short *BRLow){
42 /* Compute BRLow, the bit reversed table for ffts	*/
43 /* of size pow(2,M/2 -1)	*/
44 /* INPUTS */
45 /* M = log2 of fft size	*/
46 /* OUTPUTS */
47 /* *BRLow = bit reversed counter table	*/
48 long	Mroot_1 = M / 2 - 1;
49 long	Nroot_1 = POW2(Mroot_1);
50 long i1;
51 long bitsum;
52 long bitmask;
53 long bit;
54 for (i1 = 0; i1 < Nroot_1; i1++){
55 	bitsum = 0;
56 	bitmask = 1;
57 	for (bit=1; bit <= Mroot_1; bitmask<<=1, bit++)
58 		if (i1 & bitmask)
59 			bitsum = bitsum + (Nroot_1 >> bit);
60 	BRLow[i1] = (short) bitsum;
61 };
62 }
63 
64 /************************************************
65 parts of ffts1
66 *************************************************/
67 
68 //inline void bitrevR2(float *ioptr, long M, short *BRLow);
bitrevR2(float * ioptr,long M,short * BRLow)69 static inline void bitrevR2(float *ioptr, long M, short *BRLow){
70 /*** bit reverse and first radix 2 stage of forward or inverse fft ***/
71 float	f0r;
72 float	f0i;
73 float	f1r;
74 float	f1i;
75 float	f2r;
76 float	f2i;
77 float	f3r;
78 float	f3i;
79 float	f4r;
80 float	f4i;
81 float	f5r;
82 float	f5i;
83 float	f6r;
84 float	f6i;
85 float	f7r;
86 float	f7i;
87 float	t0r;
88 float	t0i;
89 float	t1r;
90 float	t1i;
91 float	*p0r;
92 float	*p1r;
93 float	*IOP;
94 float 	*iolimit;
95 long 	Colstart;
96 long 	iCol;
97 unsigned long 	posA;
98 unsigned long 	posAi;
99 unsigned long 	posB;
100 unsigned long 	posBi;
101 
102 const unsigned long Nrems2 = POW2((M+3)/2);
103 const unsigned long Nroot_1_ColInc = POW2(M)-Nrems2;
104 const unsigned long Nroot_1 = POW2(M/2-1)-1;
105 const unsigned long ColstartShift = (M+1)/2 +1;
106 posA = POW2(M);		// 1/2 of POW2(M) complexes
107 posAi = posA + 1;
108 posB = posA + 2;
109 posBi = posB + 1;
110 
111 iolimit = ioptr + Nrems2;
112 for (; ioptr < iolimit; ioptr += (int64_t) POW2(M/2+1)){
113 	for (Colstart = Nroot_1; Colstart >= 0; Colstart--){
114 		iCol = Nroot_1;
115 		p0r = ioptr+ Nroot_1_ColInc + BRLow[Colstart]*4;
116 		IOP = ioptr + (Colstart << ColstartShift);
117 		p1r = IOP + BRLow[iCol]*4;
118 		f0r = *(p0r);
119 		f0i = *(p0r+1);
120 		f1r = *(p0r+posA);
121 		f1i = *(p0r+posAi);
122 		for (; iCol > Colstart;){
123 			f2r = *(p0r+2);
124 			f2i = *(p0r+(2+1));
125 			f3r = *(p0r+posB);
126 			f3i = *(p0r+posBi);
127 			f4r = *(p1r);
128 			f4i = *(p1r+1);
129 			f5r = *(p1r+posA);
130 			f5i = *(p1r+posAi);
131 			f6r = *(p1r+2);
132 			f6i = *(p1r+(2+1));
133 			f7r = *(p1r+posB);
134 			f7i = *(p1r+posBi);
135 
136 			t0r	= f0r + f1r;
137 			t0i	= f0i + f1i;
138 			f1r = f0r - f1r;
139 			f1i = f0i - f1i;
140 			t1r = f2r + f3r;
141 			t1i = f2i + f3i;
142 			f3r = f2r - f3r;
143 			f3i = f2i - f3i;
144 			f0r = f4r + f5r;
145 			f0i = f4i + f5i;
146 			f5r = f4r - f5r;
147 			f5i = f4i - f5i;
148 			f2r = f6r + f7r;
149 			f2i = f6i + f7i;
150 			f7r = f6r - f7r;
151 			f7i = f6i - f7i;
152 
153 			*(p1r) = t0r;
154 			*(p1r+1) = t0i;
155 			*(p1r+2) = f1r;
156 			*(p1r+(2+1)) = f1i;
157 			*(p1r+posA) = t1r;
158 			*(p1r+posAi) = t1i;
159 			*(p1r+posB) = f3r;
160 			*(p1r+posBi) = f3i;
161 			*(p0r) = f0r;
162 			*(p0r+1) = f0i;
163 			*(p0r+2) = f5r;
164 			*(p0r+(2+1)) = f5i;
165 			*(p0r+posA) = f2r;
166 			*(p0r+posAi) = f2i;
167 			*(p0r+posB) = f7r;
168 			*(p0r+posBi) = f7i;
169 
170 			p0r -= Nrems2;
171 			f0r = *(p0r);
172 			f0i = *(p0r+1);
173 			f1r = *(p0r+posA);
174 			f1i = *(p0r+posAi);
175 			iCol -= 1;
176 			p1r = IOP + BRLow[iCol]*4;
177 		};
178 		f2r = *(p0r+2);
179 		f2i = *(p0r+(2+1));
180 		f3r = *(p0r+posB);
181 		f3i = *(p0r+posBi);
182 
183 		t0r   = f0r + f1r;
184 		t0i   = f0i + f1i;
185 		f1r = f0r - f1r;
186 		f1i = f0i - f1i;
187 		t1r   = f2r + f3r;
188 		t1i   = f2i + f3i;
189 		f3r = f2r - f3r;
190 		f3i = f2i - f3i;
191 
192 		*(p0r) = t0r;
193 		*(p0r+1) = t0i;
194 		*(p0r+2) = f1r;
195 		*(p0r+(2+1)) = f1i;
196 		*(p0r+posA) = t1r;
197 		*(p0r+posAi) = t1i;
198 		*(p0r+posB) = f3r;
199 		*(p0r+posBi) = f3i;
200 
201 	};
202 };
203 }
204 
205 //inline void fft2pt(float *ioptr);
fft2pt(float * ioptr)206 static inline void fft2pt(float *ioptr){
207 /***   RADIX 2 fft	***/
208 float f0r, f0i, f1r, f1i;
209 float t0r, t0i;
210 
211 	/* bit reversed load */
212 f0r = ioptr[0];
213 f0i = ioptr[1];
214 f1r = ioptr[2];
215 f1i = ioptr[3];
216 
217 		/* Butterflys		*/
218 		/*
219 		f0	-	-	t0
220 		f1	-  1 -	f1
221 		*/
222 
223 t0r = f0r + f1r;
224 t0i = f0i + f1i;
225 f1r = f0r - f1r;
226 f1i = f0i - f1i;
227 
228 	/* store result */
229 ioptr[0] = t0r;
230 ioptr[1] = t0i;
231 ioptr[2] = f1r;
232 ioptr[3] = f1i;
233 }
234 
235 
236 //inline void fft4pt(float *ioptr);
fft4pt(float * ioptr)237 static inline void fft4pt(float *ioptr){
238 /***   RADIX 4 fft	***/
239 float f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i;
240 float t0r, t0i, t1r, t1i;
241 
242 	/* bit reversed load */
243 f0r = ioptr[0];
244 f0i = ioptr[1];
245 f1r = ioptr[4];
246 f1i = ioptr[5];
247 f2r = ioptr[2];
248 f2i = ioptr[3];
249 f3r = ioptr[6];
250 f3i = ioptr[7];
251 
252 		/* Butterflys		*/
253 		/*
254 		f0	-	-	t0	-	-	f0
255 		f1	-  1 -	f1	-	-	f1
256 		f2	-	-	f2	-  1 -	f2
257 		f3	-  1 -	t1	- -i -	f3
258 		*/
259 
260 t0r = f0r + f1r;
261 t0i = f0i + f1i;
262 f1r = f0r - f1r;
263 f1i = f0i - f1i;
264 
265 t1r = f2r - f3r;
266 t1i = f2i - f3i;
267 f2r = f2r + f3r;
268 f2i = f2i + f3i;
269 
270 f0r = t0r + f2r;
271 f0i = t0i + f2i;
272 f2r = t0r - f2r;
273 f2i = t0i - f2i;
274 
275 f3r = f1r - t1i;
276 f3i = f1i + t1r;
277 f1r = f1r + t1i;
278 f1i = f1i - t1r;
279 
280 	/* store result */
281 ioptr[0] = f0r;
282 ioptr[1] = f0i;
283 ioptr[2] = f1r;
284 ioptr[3] = f1i;
285 ioptr[4] = f2r;
286 ioptr[5] = f2i;
287 ioptr[6] = f3r;
288 ioptr[7] = f3i;
289 }
290 
291 //inline void fft8pt(float *ioptr);
fft8pt(float * ioptr)292 static inline void fft8pt(float *ioptr){
293 /***   RADIX 8 fft	***/
294 float w0r = 1.0F/MYROOT2; /* cos(pi/4)	*/
295 float f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i;
296 float f4r, f4i, f5r, f5i, f6r, f6i, f7r, f7i;
297 float t0r, t0i, t1r, t1i;
298 const float Two = 2.0;
299 
300 	/* bit reversed load */
301 f0r = ioptr[0];
302 f0i = ioptr[1];
303 f1r = ioptr[8];
304 f1i = ioptr[9];
305 f2r = ioptr[4];
306 f2i = ioptr[5];
307 f3r = ioptr[12];
308 f3i = ioptr[13];
309 f4r = ioptr[2];
310 f4i = ioptr[3];
311 f5r = ioptr[10];
312 f5i = ioptr[11];
313 f6r = ioptr[6];
314 f6i = ioptr[7];
315 f7r = ioptr[14];
316 f7i = ioptr[15];
317 			/* Butterflys		*/
318 			/*
319 			f0	-	-	t0	-	-	f0	-	-	f0
320 			f1	-  1 -	f1	-	-	f1	-	-	f1
321 			f2	-	-	f2	-  1 -	f2	-	-	f2
322 			f3	-  1 -	t1	- -i -	f3	-	-	f3
323 			f4	-	-	t0	-	-	f4	-  1 -	t0
324 			f5	-  1 -	f5	-	-	f5	- w3 -	f4
325 			f6	-	-	f6	-  1 -	f6	- -i -	t1
326 			f7	-  1 -	t1	- -i -	f7	- iw3-	f6
327 			*/
328 
329 t0r = f0r + f1r;
330 t0i = f0i + f1i;
331 f1r = f0r - f1r;
332 f1i = f0i - f1i;
333 
334 t1r = f2r - f3r;
335 t1i = f2i - f3i;
336 f2r = f2r + f3r;
337 f2i = f2i + f3i;
338 
339 f0r = t0r + f2r;
340 f0i = t0i + f2i;
341 f2r = t0r - f2r;
342 f2i = t0i - f2i;
343 
344 f3r = f1r - t1i;
345 f3i = f1i + t1r;
346 f1r = f1r + t1i;
347 f1i = f1i - t1r;
348 
349 
350 t0r = f4r + f5r;
351 t0i = f4i + f5i;
352 f5r = f4r - f5r;
353 f5i = f4i - f5i;
354 
355 t1r = f6r - f7r;
356 t1i = f6i - f7i;
357 f6r = f6r + f7r;
358 f6i = f6i + f7i;
359 
360 f4r = t0r + f6r;
361 f4i = t0i + f6i;
362 f6r = t0r - f6r;
363 f6i = t0i - f6i;
364 
365 f7r = f5r - t1i;
366 f7i = f5i + t1r;
367 f5r = f5r + t1i;
368 f5i = f5i - t1r;
369 
370 
371 t0r = f0r - f4r;
372 t0i = f0i - f4i;
373 f0r = f0r + f4r;
374 f0i = f0i + f4i;
375 
376 t1r = f2r - f6i;
377 t1i = f2i + f6r;
378 f2r = f2r + f6i;
379 f2i = f2i - f6r;
380 
381 f4r = f1r - f5r * w0r - f5i * w0r;
382 f4i = f1i + f5r * w0r - f5i * w0r;
383 f1r = f1r * Two - f4r;
384 f1i = f1i * Two - f4i;
385 
386 f6r = f3r + f7r * w0r - f7i * w0r;
387 f6i = f3i + f7r * w0r + f7i * w0r;
388 f3r = f3r * Two - f6r;
389 f3i = f3i * Two - f6i;
390 
391 	/* store result */
392 ioptr[0] = f0r;
393 ioptr[1] = f0i;
394 ioptr[2] = f1r;
395 ioptr[3] = f1i;
396 ioptr[4] = f2r;
397 ioptr[5] = f2i;
398 ioptr[6] = f3r;
399 ioptr[7] = f3i;
400 ioptr[8] = t0r;
401 ioptr[9] = t0i;
402 ioptr[10] = f4r;
403 ioptr[11] = f4i;
404 ioptr[12] = t1r;
405 ioptr[13] = t1i;
406 ioptr[14] = f6r;
407 ioptr[15] = f6i;
408 }
409 
410 //inline void bfR2(float *ioptr, long M, long NDiffU);
bfR2(float * ioptr,long M,long NDiffU)411 static inline void bfR2(float *ioptr, long M, long NDiffU){
412 /*** 2nd radix 2 stage ***/
413 unsigned long	pos;
414 unsigned long	posi;
415 unsigned long	pinc;
416 unsigned long	pnext;
417 unsigned long 	NSameU;
418 unsigned long 	SameUCnt;
419 
420 float	*pstrt;
421 float 	*p0r, *p1r, *p2r, *p3r;
422 
423 float f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i;
424 float f4r, f4i, f5r, f5i, f6r, f6i, f7r, f7i;
425 /* UNUSED: const float Two = 2.0; */
426 
427 pinc = NDiffU * 2;		// 2 floats per complex
428 pnext =  pinc * 4;
429 pos = 2;
430 posi = pos+1;
431 NSameU = POW2(M) / 4 /NDiffU;	// 4 Us at a time
432 pstrt = ioptr;
433 p0r = pstrt;
434 p1r = pstrt+pinc;
435 p2r = p1r+pinc;
436 p3r = p2r+pinc;
437 
438 		/* Butterflys		*/
439 		/*
440 		f0	-	-	f4
441 		f1	-  1 -	f5
442 		f2	-	-	f6
443 		f3	-  1 -	f7
444 		*/
445 		/* Butterflys		*/
446 		/*
447 		f0	-	-	f4
448 		f1	-  1 -	f5
449 		f2	-	-	f6
450 		f3	-  1 -	f7
451 		*/
452 
453 for (SameUCnt = NSameU; SameUCnt > 0 ; SameUCnt--){
454 
455 	f0r = *p0r;
456 	f1r = *p1r;
457 	f0i = *(p0r + 1);
458 	f1i = *(p1r + 1);
459 	f2r = *p2r;
460 	f3r = *p3r;
461 	f2i = *(p2r + 1);
462 	f3i = *(p3r + 1);
463 
464 	f4r = f0r + f1r;
465 	f4i = f0i + f1i;
466 	f5r = f0r - f1r;
467 	f5i = f0i - f1i;
468 
469 	f6r = f2r + f3r;
470 	f6i = f2i + f3i;
471 	f7r = f2r - f3r;
472 	f7i = f2i - f3i;
473 
474 	*p0r = f4r;
475 	*(p0r + 1) = f4i;
476 	*p1r = f5r;
477 	*(p1r + 1) = f5i;
478 	*p2r = f6r;
479 	*(p2r + 1) = f6i;
480 	*p3r = f7r;
481 	*(p3r + 1) = f7i;
482 
483 	f0r = *(p0r + pos);
484 	f1i = *(p1r + posi);
485 	f0i = *(p0r + posi);
486 	f1r = *(p1r + pos);
487 	f2r = *(p2r + pos);
488 	f3i = *(p3r + posi);
489 	f2i = *(p2r + posi);
490 	f3r = *(p3r + pos);
491 
492 	f4r = f0r + f1i;
493 	f4i = f0i - f1r;
494 	f5r = f0r - f1i;
495 	f5i = f0i + f1r;
496 
497 	f6r = f2r + f3i;
498 	f6i = f2i - f3r;
499 	f7r = f2r - f3i;
500 	f7i = f2i + f3r;
501 
502 	*(p0r + pos) = f4r;
503 	*(p0r + posi) = f4i;
504 	*(p1r + pos) = f5r;
505 	*(p1r + posi) = f5i;
506 	*(p2r + pos) = f6r;
507 	*(p2r + posi) = f6i;
508 	*(p3r + pos) = f7r;
509 	*(p3r + posi) = f7i;
510 
511 	p0r += pnext;
512 	p1r += pnext;
513 	p2r += pnext;
514 	p3r += pnext;
515 
516 }
517 }
518 
519 //inline void bfR4(float *ioptr, long M, long NDiffU);
bfR4(float * ioptr,long M,long NDiffU)520 static inline void bfR4(float *ioptr, long M, long NDiffU){
521 /*** 1 radix 4 stage ***/
522 unsigned long	pos;
523 unsigned long	posi;
524 unsigned long	pinc;
525 unsigned long	pnext;
526 unsigned long	pnexti;
527 unsigned long 	NSameU;
528 unsigned long 	SameUCnt;
529 
530 float	*pstrt;
531 float 	*p0r, *p1r, *p2r, *p3r;
532 
533 float w1r = 1.0F/MYROOT2; /* cos(pi/4)	*/
534 float f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i;
535 float f4r, f4i, f5r, f5i, f6r, f6i, f7r, f7i;
536 float t1r, t1i;
537 const float Two = 2.0;
538 
539 pinc = NDiffU * 2;		// 2 floats per complex
540 pnext =  pinc * 4;
541 pnexti =  pnext + 1;
542 pos = 2;
543 posi = pos+1;
544 NSameU = POW2(M) / 4 /NDiffU;	// 4 pts per butterfly
545 pstrt = ioptr;
546 p0r = pstrt;
547 p1r = pstrt+pinc;
548 p2r = p1r+pinc;
549 p3r = p2r+pinc;
550 
551 		/* Butterflys		*/
552 		/*
553 		f0	-	-	f0	-	-	f4
554 		f1	-  1 -	f5	-	-	f5
555 		f2	-	-	f6	-  1 -	f6
556 		f3	-  1 -	f3	- -i -	f7
557 		*/
558 		/* Butterflys		*/
559 		/*
560 		f0	-	-	f4	-	-	f4
561 		f1	- -i -	t1	-	-	f5
562 		f2	-	-	f2	- w1 -	f6
563 		f3	- -i -	f7	- iw1-	f7
564 		*/
565 
566 f0r = *p0r;
567 f1r = *p1r;
568 f2r = *p2r;
569 f3r = *p3r;
570 f0i = *(p0r + 1);
571 f1i = *(p1r + 1);
572 f2i = *(p2r + 1);
573 f3i = *(p3r + 1);
574 
575 f5r = f0r - f1r;
576 f5i = f0i - f1i;
577 f0r = f0r + f1r;
578 f0i = f0i + f1i;
579 
580 f6r = f2r + f3r;
581 f6i = f2i + f3i;
582 f3r = f2r - f3r;
583 f3i = f2i - f3i;
584 
585 for (SameUCnt = NSameU-1; SameUCnt > 0 ; SameUCnt--){
586 
587 	f7r = f5r - f3i;
588 	f7i = f5i + f3r;
589 	f5r = f5r + f3i;
590 	f5i = f5i - f3r;
591 
592 	f4r = f0r + f6r;
593 	f4i = f0i + f6i;
594 	f6r = f0r - f6r;
595 	f6i = f0i - f6i;
596 
597 	f2r = *(p2r + pos);
598 	f2i = *(p2r + posi);
599 	f1r = *(p1r + pos);
600 	f1i = *(p1r + posi);
601 	f3i = *(p3r + posi);
602 	f0r = *(p0r + pos);
603 	f3r = *(p3r + pos);
604 	f0i = *(p0r + posi);
605 
606 	*p3r = f7r;
607 	*p0r = f4r;
608 	*(p3r + 1) = f7i;
609 	*(p0r + 1) = f4i;
610 	*p1r = f5r;
611 	*p2r = f6r;
612 	*(p1r + 1) = f5i;
613 	*(p2r + 1) = f6i;
614 
615 	f7r = f2r - f3i;
616 	f7i = f2i + f3r;
617 	f2r = f2r + f3i;
618 	f2i = f2i - f3r;
619 
620 	f4r = f0r + f1i;
621 	f4i = f0i - f1r;
622 	t1r = f0r - f1i;
623 	t1i = f0i + f1r;
624 
625 	f5r = t1r - f7r * w1r + f7i * w1r;
626 	f5i = t1i - f7r * w1r - f7i * w1r;
627 	f7r = t1r * Two - f5r;
628 	f7i = t1i * Two - f5i;
629 
630 	f6r = f4r - f2r * w1r - f2i * w1r;
631 	f6i = f4i + f2r * w1r - f2i * w1r;
632 	f4r = f4r * Two - f6r;
633 	f4i = f4i * Two - f6i;
634 
635 	f3r = *(p3r + pnext);
636 	f0r = *(p0r + pnext);
637 	f3i = *(p3r + pnexti);
638 	f0i = *(p0r + pnexti);
639 	f2r = *(p2r + pnext);
640 	f2i = *(p2r + pnexti);
641 	f1r = *(p1r + pnext);
642 	f1i = *(p1r + pnexti);
643 
644 	*(p2r + pos) = f6r;
645 	*(p1r + pos) = f5r;
646 	*(p2r + posi) = f6i;
647 	*(p1r + posi) = f5i;
648 	*(p3r + pos) = f7r;
649 	*(p0r + pos) = f4r;
650 	*(p3r + posi) = f7i;
651 	*(p0r + posi) = f4i;
652 
653 	f6r = f2r + f3r;
654 	f6i = f2i + f3i;
655 	f3r = f2r - f3r;
656 	f3i = f2i - f3i;
657 
658 	f5r = f0r - f1r;
659 	f5i = f0i - f1i;
660 	f0r = f0r + f1r;
661 	f0i = f0i + f1i;
662 
663 	p3r += pnext;
664 	p0r += pnext;
665 	p1r += pnext;
666 	p2r += pnext;
667 
668 }
669 f7r = f5r - f3i;
670 f7i = f5i + f3r;
671 f5r = f5r + f3i;
672 f5i = f5i - f3r;
673 
674 f4r = f0r + f6r;
675 f4i = f0i + f6i;
676 f6r = f0r - f6r;
677 f6i = f0i - f6i;
678 
679 f2r = *(p2r + pos);
680 f2i = *(p2r + posi);
681 f1r = *(p1r + pos);
682 f1i = *(p1r + posi);
683 f3i = *(p3r + posi);
684 f0r = *(p0r + pos);
685 f3r = *(p3r + pos);
686 f0i = *(p0r + posi);
687 
688 *p3r = f7r;
689 *p0r = f4r;
690 *(p3r + 1) = f7i;
691 *(p0r + 1) = f4i;
692 *p1r = f5r;
693 *p2r = f6r;
694 *(p1r + 1) = f5i;
695 *(p2r + 1) = f6i;
696 
697 f7r = f2r - f3i;
698 f7i = f2i + f3r;
699 f2r = f2r + f3i;
700 f2i = f2i - f3r;
701 
702 f4r = f0r + f1i;
703 f4i = f0i - f1r;
704 t1r = f0r - f1i;
705 t1i = f0i + f1r;
706 
707 f5r = t1r - f7r * w1r + f7i * w1r;
708 f5i = t1i - f7r * w1r - f7i * w1r;
709 f7r = t1r * Two - f5r;
710 f7i = t1i * Two - f5i;
711 
712 f6r = f4r - f2r * w1r - f2i * w1r;
713 f6i = f4i + f2r * w1r - f2i * w1r;
714 f4r = f4r * Two - f6r;
715 f4i = f4i * Two - f6i;
716 
717 *(p2r + pos) = f6r;
718 *(p1r + pos) = f5r;
719 *(p2r + posi) = f6i;
720 *(p1r + posi) = f5i;
721 *(p3r + pos) = f7r;
722 *(p0r + pos) = f4r;
723 *(p3r + posi) = f7i;
724 *(p0r + posi) = f4i;
725 
726 }
727 
728 // inline void bfstages(float *ioptr, long M, float *Utbl, long Ustride, long NDiffU, long StageCnt);
bfstages(float * ioptr,long M,float * Utbl,long Ustride,long NDiffU,long StageCnt)729 static inline void bfstages(float *ioptr, long M, float *Utbl, long Ustride, long NDiffU, long StageCnt){
730 /***   RADIX 8 Stages	***/
731 unsigned long	pos;
732 unsigned long	posi;
733 unsigned long	pinc;
734 unsigned long	pnext;
735 unsigned long 	NSameU;
736 long 	Uinc;
737 long 	Uinc2;
738 long 	Uinc4;
739 unsigned long 	DiffUCnt;
740 unsigned long 	SameUCnt;
741 unsigned long 	U2toU3;
742 
743 float	*pstrt;
744 float 	*p0r, *p1r, *p2r, *p3r;
745 float 	*u0r, *u0i, *u1r, *u1i, *u2r, *u2i;
746 
747 float w0r, w0i, w1r, w1i, w2r, w2i, w3r, w3i;
748 float f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i;
749 float f4r, f4i, f5r, f5i, f6r, f6i, f7r, f7i;
750 float t0r, t0i, t1r, t1i;
751 const float Two = 2.0;
752 
753 pinc = NDiffU * 2;		// 2 floats per complex
754 pnext =  pinc * 8;
755 pos = pinc * 4;
756 posi =  pos + 1;
757 NSameU = POW2(M) / 8 /NDiffU;	// 8 pts per butterfly
758 Uinc = NSameU * Ustride;
759 Uinc2 = Uinc * 2;
760 Uinc4 = Uinc * 4;
761 U2toU3 = (POW2(M) / 8)*Ustride;
762 for (; StageCnt > 0 ; StageCnt--){
763 
764 	u0r = &Utbl[0];
765 	u0i = &Utbl[POW2(M-2)*Ustride];
766 	u1r = u0r;
767 	u1i = u0i;
768 	u2r = u0r;
769 	u2i = u0i;
770 
771 	w0r =  *u0r;
772 	w0i =  *u0i;
773 	w1r =  *u1r;
774 	w1i =  *u1i;
775 	w2r =  *u2r;
776 	w2i =  *u2i;
777 	w3r =  *(u2r+U2toU3);
778 	w3i =  *(u2i-U2toU3);
779 
780 	pstrt = ioptr;
781 
782 	p0r = pstrt;
783 	p1r = pstrt+pinc;
784 	p2r = p1r+pinc;
785 	p3r = p2r+pinc;
786 
787 			/* Butterflys		*/
788 			/*
789 			f0	-	-	t0	-	-	f0	-	-	f0
790 			f1	- w0-	f1	-	-	f1	-	-	f1
791 			f2	-	-	f2	- w1-	f2	-	-	f4
792 			f3	- w0-	t1	- iw1-	f3	-	-	f5
793 
794 			f4	-	-	t0	-	-	f4	- w2-	t0
795 			f5	- w0-	f5	-	-	f5	- w3-	t1
796 			f6	-	-	f6	- w1-	f6	- iw2-	f6
797 			f7	- w0-	t1	- iw1-	f7	- iw3-	f7
798 			*/
799 
800 	for (DiffUCnt = NDiffU; DiffUCnt > 0 ; DiffUCnt--){
801 		f0r = *p0r;
802 		f0i = *(p0r + 1);
803 		f1r = *p1r;
804 		f1i = *(p1r + 1);
805 		for (SameUCnt = NSameU-1; SameUCnt > 0 ; SameUCnt--){
806 			f2r = *p2r;
807 			f2i = *(p2r + 1);
808 			f3r = *p3r;
809 			f3i = *(p3r + 1);
810 
811 			t0r = f0r + f1r * w0r + f1i * w0i;
812 			t0i = f0i - f1r * w0i + f1i * w0r;
813 			f1r = f0r * Two - t0r;
814 			f1i = f0i * Two - t0i;
815 
816 			f4r = *(p0r + pos);
817 			f4i = *(p0r + posi);
818 			f5r = *(p1r + pos);
819 			f5i = *(p1r + posi);
820 
821 			f6r = *(p2r + pos);
822 			f6i = *(p2r + posi);
823 			f7r = *(p3r + pos);
824 			f7i = *(p3r + posi);
825 
826 			t1r = f2r - f3r * w0r - f3i * w0i;
827 			t1i = f2i + f3r * w0i - f3i * w0r;
828 			f2r = f2r * Two - t1r;
829 			f2i = f2i * Two - t1i;
830 
831 			f0r = t0r + f2r * w1r + f2i * w1i;
832 			f0i = t0i - f2r * w1i + f2i * w1r;
833 			f2r = t0r * Two - f0r;
834 			f2i = t0i * Two - f0i;
835 
836 			f3r = f1r + t1r * w1i - t1i * w1r;
837 			f3i = f1i + t1r * w1r + t1i * w1i;
838 			f1r = f1r * Two - f3r;
839 			f1i = f1i * Two - f3i;
840 
841 
842 			t0r = f4r + f5r * w0r + f5i * w0i;
843 			t0i = f4i - f5r * w0i + f5i * w0r;
844 			f5r = f4r * Two - t0r;
845 			f5i = f4i * Two - t0i;
846 
847 			t1r = f6r - f7r * w0r - f7i * w0i;
848 			t1i = f6i + f7r * w0i - f7i * w0r;
849 			f6r = f6r * Two - t1r;
850 			f6i = f6i * Two - t1i;
851 
852 			f4r = t0r + f6r * w1r + f6i * w1i;
853 			f4i = t0i - f6r * w1i + f6i * w1r;
854 			f6r = t0r * Two - f4r;
855 			f6i = t0i * Two - f4i;
856 
857 			f7r = f5r + t1r * w1i - t1i * w1r;
858 			f7i = f5i + t1r * w1r + t1i * w1i;
859 			f5r = f5r * Two - f7r;
860 			f5i = f5i * Two - f7i;
861 
862 			t0r = f0r - f4r * w2r - f4i * w2i;
863 			t0i = f0i + f4r * w2i - f4i * w2r;
864 			f0r = f0r * Two - t0r;
865 			f0i = f0i * Two - t0i;
866 
867 			t1r = f1r - f5r * w3r - f5i * w3i;
868 			t1i = f1i + f5r * w3i - f5i * w3r;
869 			f1r = f1r * Two - t1r;
870 			f1i = f1i * Two - t1i;
871 
872 			*(p0r + pos) = t0r;
873 			*(p1r + pos) = t1r;
874 			*(p0r + posi) = t0i;
875 			*(p1r + posi) = t1i;
876 			*p0r = f0r;
877 			*p1r = f1r;
878 			*(p0r + 1) = f0i;
879 			*(p1r + 1) = f1i;
880 
881 			p0r += pnext;
882 			f0r = *p0r;
883 			f0i = *(p0r + 1);
884 
885 			p1r += pnext;
886 
887 			f1r = *p1r;
888 			f1i = *(p1r + 1);
889 
890 			f4r = f2r - f6r * w2i + f6i * w2r;
891 			f4i = f2i - f6r * w2r - f6i * w2i;
892 			f6r = f2r * Two - f4r;
893 			f6i = f2i * Two - f4i;
894 
895 			f5r = f3r - f7r * w3i + f7i * w3r;
896 			f5i = f3i - f7r * w3r - f7i * w3i;
897 			f7r = f3r * Two - f5r;
898 			f7i = f3i * Two - f5i;
899 
900 			*p2r = f4r;
901 			*p3r = f5r;
902 			*(p2r + 1) = f4i;
903 			*(p3r + 1) = f5i;
904 			*(p2r + pos) = f6r;
905 			*(p3r + pos) = f7r;
906 			*(p2r + posi) = f6i;
907 			*(p3r + posi) = f7i;
908 
909 			p2r += pnext;
910 			p3r += pnext;
911 
912 		}
913 
914 		f2r = *p2r;
915 		f2i = *(p2r + 1);
916 		f3r = *p3r;
917 		f3i = *(p3r + 1);
918 
919 		t0r = f0r + f1r * w0r + f1i * w0i;
920 		t0i = f0i - f1r * w0i + f1i * w0r;
921 		f1r = f0r * Two - t0r;
922 		f1i = f0i * Two - t0i;
923 
924 		f4r = *(p0r + pos);
925 		f4i = *(p0r + posi);
926 		f5r = *(p1r + pos);
927 		f5i = *(p1r + posi);
928 
929 		f6r = *(p2r + pos);
930 		f6i = *(p2r + posi);
931 		f7r = *(p3r + pos);
932 		f7i = *(p3r + posi);
933 
934 		t1r = f2r - f3r * w0r - f3i * w0i;
935 		t1i = f2i + f3r * w0i - f3i * w0r;
936 		f2r = f2r * Two - t1r;
937 		f2i = f2i * Two - t1i;
938 
939 		f0r = t0r + f2r * w1r + f2i * w1i;
940 		f0i = t0i - f2r * w1i + f2i * w1r;
941 		f2r = t0r * Two - f0r;
942 		f2i = t0i * Two - f0i;
943 
944 		f3r = f1r + t1r * w1i - t1i * w1r;
945 		f3i = f1i + t1r * w1r + t1i * w1i;
946 		f1r = f1r * Two - f3r;
947 		f1i = f1i * Two - f3i;
948 
949 		if (DiffUCnt == NDiffU/2)
950 			Uinc4 = -Uinc4;
951 
952 		u0r += Uinc4;
953 		u0i -= Uinc4;
954 		u1r += Uinc2;
955 		u1i -= Uinc2;
956 		u2r += Uinc;
957 		u2i -= Uinc;
958 
959 		pstrt += 2;
960 
961 		t0r = f4r + f5r * w0r + f5i * w0i;
962 		t0i = f4i - f5r * w0i + f5i * w0r;
963 		f5r = f4r * Two - t0r;
964 		f5i = f4i * Two - t0i;
965 
966 		t1r = f6r - f7r * w0r - f7i * w0i;
967 		t1i = f6i + f7r * w0i - f7i * w0r;
968 		f6r = f6r * Two - t1r;
969 		f6i = f6i * Two - t1i;
970 
971 		f4r = t0r + f6r * w1r + f6i * w1i;
972 		f4i = t0i - f6r * w1i + f6i * w1r;
973 		f6r = t0r * Two - f4r;
974 		f6i = t0i * Two - f4i;
975 
976 		f7r = f5r + t1r * w1i - t1i * w1r;
977 		f7i = f5i + t1r * w1r + t1i * w1i;
978 		f5r = f5r * Two - f7r;
979 		f5i = f5i * Two - f7i;
980 
981 		w0r = *u0r;
982 		w0i =  *u0i;
983 		w1r =  *u1r;
984 		w1i =  *u1i;
985 
986 		if ((long) DiffUCnt <= NDiffU/2)
987 			w0r = -w0r;
988 
989 		t0r = f0r - f4r * w2r - f4i * w2i;
990 		t0i = f0i + f4r * w2i - f4i * w2r;
991 		f0r = f0r * Two - t0r;
992 		f0i = f0i * Two - t0i;
993 
994 		f4r = f2r - f6r * w2i + f6i * w2r;
995 		f4i = f2i - f6r * w2r - f6i * w2i;
996 		f6r = f2r * Two - f4r;
997 		f6i = f2i * Two - f4i;
998 
999 		*(p0r + pos) = t0r;
1000 		*p2r = f4r;
1001 		*(p0r + posi) = t0i;
1002 		*(p2r + 1) = f4i;
1003 		w2r =  *u2r;
1004 		w2i =  *u2i;
1005 		*p0r = f0r;
1006 		*(p2r + pos) = f6r;
1007 		*(p0r + 1) = f0i;
1008 		*(p2r + posi) = f6i;
1009 
1010 		p0r = pstrt;
1011 		p2r = pstrt + pinc + pinc;
1012 
1013 		t1r = f1r - f5r * w3r - f5i * w3i;
1014 		t1i = f1i + f5r * w3i - f5i * w3r;
1015 		f1r = f1r * Two - t1r;
1016 		f1i = f1i * Two - t1i;
1017 
1018 		f5r = f3r - f7r * w3i + f7i * w3r;
1019 		f5i = f3i - f7r * w3r - f7i * w3i;
1020 		f7r = f3r * Two - f5r;
1021 		f7i = f3i * Two - f5i;
1022 
1023 		*(p1r + pos) = t1r;
1024 		*p3r = f5r;
1025 		*(p1r + posi) = t1i;
1026 		*(p3r + 1) = f5i;
1027 		w3r =  *(u2r+U2toU3);
1028 		w3i =  *(u2i-U2toU3);
1029 		*p1r = f1r;
1030 		*(p3r + pos) = f7r;
1031 		*(p1r + 1) = f1i;
1032 		*(p3r + posi) = f7i;
1033 
1034 		p1r = pstrt + pinc;
1035 		p3r = p2r + pinc;
1036 	}
1037 	NSameU /= 8;
1038 	Uinc /= 8;
1039 	Uinc2 /= 8;
1040 	Uinc4 = Uinc * 4;
1041 	NDiffU *= 8;
1042 	pinc *= 8;
1043 	pnext *= 8;
1044 	pos *= 8;
1045 	posi =  pos + 1;
1046 }
1047 }
1048 
1049 void fftrecurs(float *ioptr, long M, float *Utbl, long Ustride, long NDiffU, long StageCnt);
fftrecurs(float * ioptr,long M,float * Utbl,long Ustride,long NDiffU,long StageCnt)1050 void fftrecurs(float *ioptr, long M, float *Utbl, long Ustride, long NDiffU, long StageCnt){
1051 /* recursive bfstages calls to maximize on chip cache efficiency */
1052 long i1;
1053 if (M <= MCACHE)  // fits on chip ?
1054 	bfstages(ioptr, M, Utbl, Ustride, NDiffU, StageCnt);		/*  RADIX 8 Stages	*/
1055 else{
1056 	for (i1=0; i1<8; i1++){
1057 		fftrecurs(&ioptr[i1*POW2(M-3)*2], M-3, Utbl, 8*Ustride, NDiffU, StageCnt-1);	/*  RADIX 8 Stages	*/
1058 	}
1059 	bfstages(ioptr, M, Utbl, Ustride, POW2(M-3), 1);	/*  RADIX 8 Stage	*/
1060 }
1061 }
1062 
ffts1(float * ioptr,long M,long Rows,float * Utbl,short * BRLow)1063 void ffts1(float *ioptr, long M, long Rows, float *Utbl, short *BRLow){
1064 /* Compute in-place complex fft on the rows of the input array	*/
1065 /* INPUTS */
1066 /* *ioptr = input data array	*/
1067 /* M = log2 of fft size	(ex M=10 for 1024 point fft) */
1068 /* Rows = number of rows in ioptr array (use Rows of 1 if ioptr is a 1 dimensional array)	*/
1069 /* *Utbl = cosine table	*/
1070 /* *BRLow = bit reversed counter table	*/
1071 /* OUTPUTS */
1072 /* *ioptr = output data array	*/
1073 
1074 long 	StageCnt;
1075 long 	NDiffU;
1076 
1077 switch (M){
1078 case 0:
1079 	break;
1080 case 1:
1081 	for (;Rows>0;Rows--){
1082 		fft2pt(ioptr);				/* a 2 pt fft */
1083 		ioptr += 2*POW2(M);
1084 	}
1085 	break;
1086 case 2:
1087 	for (;Rows>0;Rows--){
1088 		fft4pt(ioptr);				/* a 4 pt fft */
1089 		ioptr += 2*POW2(M);
1090 	}
1091 	break;
1092 case 3:
1093 	for (;Rows>0;Rows--){
1094 		fft8pt(ioptr);				/* an 8 pt fft */
1095 		ioptr += 2*POW2(M);
1096 	}
1097 	break;
1098 default:
1099 	for (;Rows>0;Rows--){
1100 
1101 		bitrevR2(ioptr, M, BRLow);						/* bit reverse and first radix 2 stage */
1102 
1103 		StageCnt = (M-1) / 3;		// number of radix 8 stages
1104 		NDiffU = 2;				// one radix 2 stage already complete
1105 
1106 		if ( (M-1-(StageCnt * 3)) == 1 ){
1107 			bfR2(ioptr, M, NDiffU);				/* 1 radix 2 stage */
1108 			NDiffU *= 2;
1109 		}
1110 
1111 		if ( (M-1-(StageCnt * 3)) == 2 ){
1112 			bfR4(ioptr, M, NDiffU);			/* 1 radix 4 stage */
1113 			NDiffU *= 4;
1114 		}
1115 
1116 		if (M <= MCACHE)
1117 			bfstages(ioptr, M, Utbl, 1, NDiffU, StageCnt);		/*  RADIX 8 Stages	*/
1118 
1119 		else{
1120 			fftrecurs(ioptr, M, Utbl, 1, NDiffU, StageCnt);		/*  RADIX 8 Stages	*/
1121 		}
1122 
1123 		ioptr += 2*POW2(M);
1124 	}
1125 }
1126 }
1127 
1128 /************************************************
1129 parts of iffts1
1130 *************************************************/
1131 
1132 // inline void scbitrevR2(float *ioptr, long M, short *BRLow, float scale);
scbitrevR2(float * ioptr,long M,short * BRLow,float scale)1133 static inline void scbitrevR2(float *ioptr, long M, short *BRLow, float scale){
1134 /*** scaled bit reverse and first radix 2 stage forward or inverse fft ***/
1135 float	f0r;
1136 float	f0i;
1137 float	f1r;
1138 float	f1i;
1139 float	f2r;
1140 float	f2i;
1141 float	f3r;
1142 float	f3i;
1143 float	f4r;
1144 float	f4i;
1145 float	f5r;
1146 float	f5i;
1147 float	f6r;
1148 float	f6i;
1149 float	f7r;
1150 float	f7i;
1151 float	t0r;
1152 float	t0i;
1153 float	t1r;
1154 float	t1i;
1155 float	*p0r;
1156 float	*p1r;
1157 float	*IOP;
1158 float 	*iolimit;
1159 long 	Colstart;
1160 long 	iCol;
1161 unsigned long 	posA;
1162 unsigned long 	posAi;
1163 unsigned long 	posB;
1164 unsigned long 	posBi;
1165 
1166 const unsigned long Nrems2 = POW2((M+3)/2);
1167 const unsigned long Nroot_1_ColInc = POW2(M)-Nrems2;
1168 const unsigned long Nroot_1 = POW2(M/2-1)-1;
1169 const unsigned long ColstartShift = (M+1)/2 +1;
1170 posA = POW2(M);		// 1/2 of POW2(M) complexes
1171 posAi = posA + 1;
1172 posB = posA + 2;
1173 posBi = posB + 1;
1174 
1175 iolimit = ioptr + Nrems2;
1176 for (; ioptr < iolimit; ioptr += (int64_t) POW2(M/2+1)){
1177 	for (Colstart = Nroot_1; Colstart >= 0; Colstart--){
1178 		iCol = Nroot_1;
1179 		p0r = ioptr+ Nroot_1_ColInc + BRLow[Colstart]*4;
1180 		IOP = ioptr + (Colstart << ColstartShift);
1181 		p1r = IOP + BRLow[iCol]*4;
1182 		f0r = *(p0r);
1183 		f0i = *(p0r+1);
1184 		f1r = *(p0r+posA);
1185 		f1i = *(p0r+posAi);
1186 		for (; iCol > Colstart;){
1187 			f2r = *(p0r+2);
1188 			f2i = *(p0r+(2+1));
1189 			f3r = *(p0r+posB);
1190 			f3i = *(p0r+posBi);
1191 			f4r = *(p1r);
1192 			f4i = *(p1r+1);
1193 			f5r = *(p1r+posA);
1194 			f5i = *(p1r+posAi);
1195 			f6r = *(p1r+2);
1196 			f6i = *(p1r+(2+1));
1197 			f7r = *(p1r+posB);
1198 			f7i = *(p1r+posBi);
1199 
1200 			t0r	= f0r + f1r;
1201 			t0i	= f0i + f1i;
1202 			f1r = f0r - f1r;
1203 			f1i = f0i - f1i;
1204 			t1r = f2r + f3r;
1205 			t1i = f2i + f3i;
1206 			f3r = f2r - f3r;
1207 			f3i = f2i - f3i;
1208 			f0r = f4r + f5r;
1209 			f0i = f4i + f5i;
1210 			f5r = f4r - f5r;
1211 			f5i = f4i - f5i;
1212 			f2r = f6r + f7r;
1213 			f2i = f6i + f7i;
1214 			f7r = f6r - f7r;
1215 			f7i = f6i - f7i;
1216 
1217 			*(p1r) = scale*t0r;
1218 			*(p1r+1) = scale*t0i;
1219 			*(p1r+2) = scale*f1r;
1220 			*(p1r+(2+1)) = scale*f1i;
1221 			*(p1r+posA) = scale*t1r;
1222 			*(p1r+posAi) = scale*t1i;
1223 			*(p1r+posB) = scale*f3r;
1224 			*(p1r+posBi) = scale*f3i;
1225 			*(p0r) = scale*f0r;
1226 			*(p0r+1) = scale*f0i;
1227 			*(p0r+2) = scale*f5r;
1228 			*(p0r+(2+1)) = scale*f5i;
1229 			*(p0r+posA) = scale*f2r;
1230 			*(p0r+posAi) = scale*f2i;
1231 			*(p0r+posB) = scale*f7r;
1232 			*(p0r+posBi) = scale*f7i;
1233 
1234 			p0r -= Nrems2;
1235 			f0r = *(p0r);
1236 			f0i = *(p0r+1);
1237 			f1r = *(p0r+posA);
1238 			f1i = *(p0r+posAi);
1239 			iCol -= 1;
1240 			p1r = IOP + BRLow[iCol]*4;
1241 		};
1242 		f2r = *(p0r+2);
1243 		f2i = *(p0r+(2+1));
1244 		f3r = *(p0r+posB);
1245 		f3i = *(p0r+posBi);
1246 
1247 		t0r   = f0r + f1r;
1248 		t0i   = f0i + f1i;
1249 		f1r = f0r - f1r;
1250 		f1i = f0i - f1i;
1251 		t1r   = f2r + f3r;
1252 		t1i   = f2i + f3i;
1253 		f3r = f2r - f3r;
1254 		f3i = f2i - f3i;
1255 
1256 		*(p0r) = scale*t0r;
1257 		*(p0r+1) = scale*t0i;
1258 		*(p0r+2) = scale*f1r;
1259 		*(p0r+(2+1)) = scale*f1i;
1260 		*(p0r+posA) = scale*t1r;
1261 		*(p0r+posAi) = scale*t1i;
1262 		*(p0r+posB) = scale*f3r;
1263 		*(p0r+posBi) = scale*f3i;
1264 
1265 	};
1266 };
1267 }
1268 
1269 //inline void ifft2pt(float *ioptr, float scale);
ifft2pt(float * ioptr,float scale)1270 static inline void ifft2pt(float *ioptr, float scale){
1271 /***   RADIX 2 ifft	***/
1272 float f0r, f0i, f1r, f1i;
1273 float t0r, t0i;
1274 
1275 	/* bit reversed load */
1276 f0r = ioptr[0];
1277 f0i = ioptr[1];
1278 f1r = ioptr[2];
1279 f1i = ioptr[3];
1280 
1281 		/* Butterflys		*/
1282 		/*
1283 		f0	-	-	t0
1284 		f1	-  1 -	f1
1285 		*/
1286 
1287 t0r = f0r + f1r;
1288 t0i = f0i + f1i;
1289 f1r = f0r - f1r;
1290 f1i = f0i - f1i;
1291 
1292 	/* store result */
1293 ioptr[0] = scale*t0r;
1294 ioptr[1] = scale*t0i;
1295 ioptr[2] = scale*f1r;
1296 ioptr[3] = scale*f1i;
1297 }
1298 
1299 // inline void ifft4pt(float *ioptr, float scale);
ifft4pt(float * ioptr,float scale)1300 static inline void ifft4pt(float *ioptr, float scale){
1301 /***   RADIX 4 ifft	***/
1302 float f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i;
1303 float t0r, t0i, t1r, t1i;
1304 
1305 	/* bit reversed load */
1306 f0r = ioptr[0];
1307 f0i = ioptr[1];
1308 f1r = ioptr[4];
1309 f1i = ioptr[5];
1310 f2r = ioptr[2];
1311 f2i = ioptr[3];
1312 f3r = ioptr[6];
1313 f3i = ioptr[7];
1314 
1315 		/* Butterflys		*/
1316 		/*
1317 		f0	-	-	t0	-	-	f0
1318 		f1	-  1 -	f1	-	-	f1
1319 		f2	-	-	f2	-  1 -	f2
1320 		f3	-  1 -	t1	-  i -	f3
1321 		*/
1322 
1323 t0r = f0r + f1r;
1324 t0i = f0i + f1i;
1325 f1r = f0r - f1r;
1326 f1i = f0i - f1i;
1327 
1328 t1r = f2r - f3r;
1329 t1i = f2i - f3i;
1330 f2r = f2r + f3r;
1331 f2i = f2i + f3i;
1332 
1333 f0r = t0r + f2r;
1334 f0i = t0i + f2i;
1335 f2r = t0r - f2r;
1336 f2i = t0i - f2i;
1337 
1338 f3r = f1r + t1i;
1339 f3i = f1i - t1r;
1340 f1r = f1r - t1i;
1341 f1i = f1i + t1r;
1342 
1343 	/* store result */
1344 ioptr[0] = scale*f0r;
1345 ioptr[1] = scale*f0i;
1346 ioptr[2] = scale*f1r;
1347 ioptr[3] = scale*f1i;
1348 ioptr[4] = scale*f2r;
1349 ioptr[5] = scale*f2i;
1350 ioptr[6] = scale*f3r;
1351 ioptr[7] = scale*f3i;
1352 }
1353 
1354 //inline void ifft8pt(float *ioptr, float scale);
ifft8pt(float * ioptr,float scale)1355 static inline void ifft8pt(float *ioptr, float scale){
1356 /***   RADIX 8 ifft	***/
1357 float w0r = 1.0F/MYROOT2; /* cos(pi/4)	*/
1358 float f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i;
1359 float f4r, f4i, f5r, f5i, f6r, f6i, f7r, f7i;
1360 float t0r, t0i, t1r, t1i;
1361 const float Two = 2.0;
1362 
1363 	/* bit reversed load */
1364 f0r = ioptr[0];
1365 f0i = ioptr[1];
1366 f1r = ioptr[8];
1367 f1i = ioptr[9];
1368 f2r = ioptr[4];
1369 f2i = ioptr[5];
1370 f3r = ioptr[12];
1371 f3i = ioptr[13];
1372 f4r = ioptr[2];
1373 f4i = ioptr[3];
1374 f5r = ioptr[10];
1375 f5i = ioptr[11];
1376 f6r = ioptr[6];
1377 f6i = ioptr[7];
1378 f7r = ioptr[14];
1379 f7i = ioptr[15];
1380 
1381 			/* Butterflys		*/
1382 			/*
1383 			f0	-	-	t0	-	-	f0	-	-	f0
1384 			f1	-  1 -	f1	-	-	f1	-	-	f1
1385 			f2	-	-	f2	-  1 -	f2	-	-	f2
1386 			f3	-  1 -	t1	-  i -	f3	-	-	f3
1387 			f4	-	-	t0	-	-	f4	-  1 -	t0
1388 			f5	-  1 -	f5	-	-	f5	- w3 -	f4
1389 			f6	-	-	f6	-  1 -	f6	-  i -	t1
1390 			f7	-  1 -	t1	-  i -	f7	- iw3-	f6
1391 			*/
1392 
1393 t0r = f0r + f1r;
1394 t0i = f0i + f1i;
1395 f1r = f0r - f1r;
1396 f1i = f0i - f1i;
1397 
1398 t1r = f2r - f3r;
1399 t1i = f2i - f3i;
1400 f2r = f2r + f3r;
1401 f2i = f2i + f3i;
1402 
1403 f0r = t0r + f2r;
1404 f0i = t0i + f2i;
1405 f2r = t0r - f2r;
1406 f2i = t0i - f2i;
1407 
1408 f3r = f1r + t1i;
1409 f3i = f1i - t1r;
1410 f1r = f1r - t1i;
1411 f1i = f1i + t1r;
1412 
1413 
1414 t0r = f4r + f5r;
1415 t0i = f4i + f5i;
1416 f5r = f4r - f5r;
1417 f5i = f4i - f5i;
1418 
1419 t1r = f6r - f7r;
1420 t1i = f6i - f7i;
1421 f6r = f6r + f7r;
1422 f6i = f6i + f7i;
1423 
1424 f4r = t0r + f6r;
1425 f4i = t0i + f6i;
1426 f6r = t0r - f6r;
1427 f6i = t0i - f6i;
1428 
1429 f7r = f5r + t1i;
1430 f7i = f5i - t1r;
1431 f5r = f5r - t1i;
1432 f5i = f5i + t1r;
1433 
1434 
1435 t0r = f0r - f4r;
1436 t0i = f0i - f4i;
1437 f0r = f0r + f4r;
1438 f0i = f0i + f4i;
1439 
1440 t1r = f2r + f6i;
1441 t1i = f2i - f6r;
1442 f2r = f2r - f6i;
1443 f2i = f2i + f6r;
1444 
1445 f4r = f1r - f5r * w0r + f5i * w0r;
1446 f4i = f1i - f5r * w0r - f5i * w0r;
1447 f1r = f1r * Two - f4r;
1448 f1i = f1i * Two - f4i;
1449 
1450 f6r = f3r + f7r * w0r + f7i * w0r;
1451 f6i = f3i - f7r * w0r + f7i * w0r;
1452 f3r = f3r * Two - f6r;
1453 f3i = f3i * Two - f6i;
1454 
1455 	/* store result */
1456 ioptr[0] = scale*f0r;
1457 ioptr[1] = scale*f0i;
1458 ioptr[2] = scale*f1r;
1459 ioptr[3] = scale*f1i;
1460 ioptr[4] = scale*f2r;
1461 ioptr[5] = scale*f2i;
1462 ioptr[6] = scale*f3r;
1463 ioptr[7] = scale*f3i;
1464 ioptr[8] = scale*t0r;
1465 ioptr[9] = scale*t0i;
1466 ioptr[10] = scale*f4r;
1467 ioptr[11] = scale*f4i;
1468 ioptr[12] = scale*t1r;
1469 ioptr[13] = scale*t1i;
1470 ioptr[14] = scale*f6r;
1471 ioptr[15] = scale*f6i;
1472 }
1473 
1474 //inline void ibfR2(float *ioptr, long M, long NDiffU);
ibfR2(float * ioptr,long M,long NDiffU)1475 static inline void ibfR2(float *ioptr, long M, long NDiffU){
1476 /*** 2nd radix 2 stage ***/
1477 unsigned long	pos;
1478 unsigned long	posi;
1479 unsigned long	pinc;
1480 unsigned long	pnext;
1481 unsigned long 	NSameU;
1482 unsigned long 	SameUCnt;
1483 
1484 float	*pstrt;
1485 float 	*p0r, *p1r, *p2r, *p3r;
1486 
1487 float f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i;
1488 float f4r, f4i, f5r, f5i, f6r, f6i, f7r, f7i;
1489 /* UNUSED: const float Two = 2.0; */
1490 
1491 pinc = NDiffU * 2;		// 2 floats per complex
1492 pnext =  pinc * 4;
1493 pos = 2;
1494 posi = pos+1;
1495 NSameU = POW2(M) / 4 /NDiffU;	// 4 Us at a time
1496 pstrt = ioptr;
1497 p0r = pstrt;
1498 p1r = pstrt+pinc;
1499 p2r = p1r+pinc;
1500 p3r = p2r+pinc;
1501 
1502 		/* Butterflys		*/
1503 		/*
1504 		f0	-	-	f4
1505 		f1	-  1 -	f5
1506 		f2	-	-	f6
1507 		f3	-  1 -	f7
1508 		*/
1509 		/* Butterflys		*/
1510 		/*
1511 		f0	-	-	f4
1512 		f1	-  1 -	f5
1513 		f2	-	-	f6
1514 		f3	-  1 -	f7
1515 		*/
1516 
1517 for (SameUCnt = NSameU; SameUCnt > 0 ; SameUCnt--){
1518 
1519 	f0r = *p0r;
1520 	f1r = *p1r;
1521 	f0i = *(p0r + 1);
1522 	f1i = *(p1r + 1);
1523 	f2r = *p2r;
1524 	f3r = *p3r;
1525 	f2i = *(p2r + 1);
1526 	f3i = *(p3r + 1);
1527 
1528 	f4r = f0r + f1r;
1529 	f4i = f0i + f1i;
1530 	f5r = f0r - f1r;
1531 	f5i = f0i - f1i;
1532 
1533 	f6r = f2r + f3r;
1534 	f6i = f2i + f3i;
1535 	f7r = f2r - f3r;
1536 	f7i = f2i - f3i;
1537 
1538 	*p0r = f4r;
1539 	*(p0r + 1) = f4i;
1540 	*p1r = f5r;
1541 	*(p1r + 1) = f5i;
1542 	*p2r = f6r;
1543 	*(p2r + 1) = f6i;
1544 	*p3r = f7r;
1545 	*(p3r + 1) = f7i;
1546 
1547 	f0r = *(p0r + pos);
1548 	f1i = *(p1r + posi);
1549 	f0i = *(p0r + posi);
1550 	f1r = *(p1r + pos);
1551 	f2r = *(p2r + pos);
1552 	f3i = *(p3r + posi);
1553 	f2i = *(p2r + posi);
1554 	f3r = *(p3r + pos);
1555 
1556 	f4r = f0r - f1i;
1557 	f4i = f0i + f1r;
1558 	f5r = f0r + f1i;
1559 	f5i = f0i - f1r;
1560 
1561 	f6r = f2r - f3i;
1562 	f6i = f2i + f3r;
1563 	f7r = f2r + f3i;
1564 	f7i = f2i - f3r;
1565 
1566 	*(p0r + pos) = f4r;
1567 	*(p0r + posi) = f4i;
1568 	*(p1r + pos) = f5r;
1569 	*(p1r + posi) = f5i;
1570 	*(p2r + pos) = f6r;
1571 	*(p2r + posi) = f6i;
1572 	*(p3r + pos) = f7r;
1573 	*(p3r + posi) = f7i;
1574 
1575 	p0r += pnext;
1576 	p1r += pnext;
1577 	p2r += pnext;
1578 	p3r += pnext;
1579 
1580 }
1581 }
1582 
1583 //inline void ibfR4(float *ioptr, long M, long NDiffU);
ibfR4(float * ioptr,long M,long NDiffU)1584 static inline void ibfR4(float *ioptr, long M, long NDiffU){
1585 /*** 1 radix 4 stage ***/
1586 unsigned long	pos;
1587 unsigned long	posi;
1588 unsigned long	pinc;
1589 unsigned long	pnext;
1590 unsigned long	pnexti;
1591 unsigned long 	NSameU;
1592 unsigned long 	SameUCnt;
1593 
1594 float	*pstrt;
1595 float 	*p0r, *p1r, *p2r, *p3r;
1596 
1597 float w1r = 1.0F/MYROOT2; /* cos(pi/4)	*/
1598 float f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i;
1599 float f4r, f4i, f5r, f5i, f6r, f6i, f7r, f7i;
1600 float t1r, t1i;
1601 const float Two = 2.0;
1602 
1603 pinc = NDiffU * 2;		// 2 floats per complex
1604 pnext =  pinc * 4;
1605 pnexti =  pnext + 1;
1606 pos = 2;
1607 posi = pos+1;
1608 NSameU = POW2(M) / 4 /NDiffU;	// 4 pts per butterfly
1609 pstrt = ioptr;
1610 p0r = pstrt;
1611 p1r = pstrt+pinc;
1612 p2r = p1r+pinc;
1613 p3r = p2r+pinc;
1614 
1615 		/* Butterflys		*/
1616 		/*
1617 		f0	-	-	f0	-	-	f4
1618 		f1	-  1 -	f5	-	-	f5
1619 		f2	-	-	f6	-  1 -	f6
1620 		f3	-  1 -	f3	- -i -	f7
1621 		*/
1622 		/* Butterflys		*/
1623 		/*
1624 		f0	-	-	f4	-	-	f4
1625 		f1	- -i -	t1	-	-	f5
1626 		f2	-	-	f2	- w1 -	f6
1627 		f3	- -i -	f7	- iw1-	f7
1628 		*/
1629 
1630 f0r = *p0r;
1631 f1r = *p1r;
1632 f2r = *p2r;
1633 f3r = *p3r;
1634 f0i = *(p0r + 1);
1635 f1i = *(p1r + 1);
1636 f2i = *(p2r + 1);
1637 f3i = *(p3r + 1);
1638 
1639 f5r = f0r - f1r;
1640 f5i = f0i - f1i;
1641 f0r = f0r + f1r;
1642 f0i = f0i + f1i;
1643 
1644 f6r = f2r + f3r;
1645 f6i = f2i + f3i;
1646 f3r = f2r - f3r;
1647 f3i = f2i - f3i;
1648 
1649 for (SameUCnt = NSameU-1; SameUCnt > 0 ; SameUCnt--){
1650 
1651 	f7r = f5r + f3i;
1652 	f7i = f5i - f3r;
1653 	f5r = f5r - f3i;
1654 	f5i = f5i + f3r;
1655 
1656 	f4r = f0r + f6r;
1657 	f4i = f0i + f6i;
1658 	f6r = f0r - f6r;
1659 	f6i = f0i - f6i;
1660 
1661 	f2r = *(p2r + pos);
1662 	f2i = *(p2r + posi);
1663 	f1r = *(p1r + pos);
1664 	f1i = *(p1r + posi);
1665 	f3i = *(p3r + posi);
1666 	f0r = *(p0r + pos);
1667 	f3r = *(p3r + pos);
1668 	f0i = *(p0r + posi);
1669 
1670 	*p3r = f7r;
1671 	*p0r = f4r;
1672 	*(p3r + 1) = f7i;
1673 	*(p0r + 1) = f4i;
1674 	*p1r = f5r;
1675 	*p2r = f6r;
1676 	*(p1r + 1) = f5i;
1677 	*(p2r + 1) = f6i;
1678 
1679 	f7r = f2r + f3i;
1680 	f7i = f2i - f3r;
1681 	f2r = f2r - f3i;
1682 	f2i = f2i + f3r;
1683 
1684 	f4r = f0r - f1i;
1685 	f4i = f0i + f1r;
1686 	t1r = f0r + f1i;
1687 	t1i = f0i - f1r;
1688 
1689 	f5r = t1r - f7r * w1r - f7i * w1r;
1690 	f5i = t1i + f7r * w1r - f7i * w1r;
1691 	f7r = t1r * Two - f5r;
1692 	f7i = t1i * Two - f5i;
1693 
1694 	f6r = f4r - f2r * w1r + f2i * w1r;
1695 	f6i = f4i - f2r * w1r - f2i * w1r;
1696 	f4r = f4r * Two - f6r;
1697 	f4i = f4i * Two - f6i;
1698 
1699 	f3r = *(p3r + pnext);
1700 	f0r = *(p0r + pnext);
1701 	f3i = *(p3r + pnexti);
1702 	f0i = *(p0r + pnexti);
1703 	f2r = *(p2r + pnext);
1704 	f2i = *(p2r + pnexti);
1705 	f1r = *(p1r + pnext);
1706 	f1i = *(p1r + pnexti);
1707 
1708 	*(p2r + pos) = f6r;
1709 	*(p1r + pos) = f5r;
1710 	*(p2r + posi) = f6i;
1711 	*(p1r + posi) = f5i;
1712 	*(p3r + pos) = f7r;
1713 	*(p0r + pos) = f4r;
1714 	*(p3r + posi) = f7i;
1715 	*(p0r + posi) = f4i;
1716 
1717 	f6r = f2r + f3r;
1718 	f6i = f2i + f3i;
1719 	f3r = f2r - f3r;
1720 	f3i = f2i - f3i;
1721 
1722 	f5r = f0r - f1r;
1723 	f5i = f0i - f1i;
1724 	f0r = f0r + f1r;
1725 	f0i = f0i + f1i;
1726 
1727 	p3r += pnext;
1728 	p0r += pnext;
1729 	p1r += pnext;
1730 	p2r += pnext;
1731 
1732 }
1733 f7r = f5r + f3i;
1734 f7i = f5i - f3r;
1735 f5r = f5r - f3i;
1736 f5i = f5i + f3r;
1737 
1738 f4r = f0r + f6r;
1739 f4i = f0i + f6i;
1740 f6r = f0r - f6r;
1741 f6i = f0i - f6i;
1742 
1743 f2r = *(p2r + pos);
1744 f2i = *(p2r + posi);
1745 f1r = *(p1r + pos);
1746 f1i = *(p1r + posi);
1747 f3i = *(p3r + posi);
1748 f0r = *(p0r + pos);
1749 f3r = *(p3r + pos);
1750 f0i = *(p0r + posi);
1751 
1752 *p3r = f7r;
1753 *p0r = f4r;
1754 *(p3r + 1) = f7i;
1755 *(p0r + 1) = f4i;
1756 *p1r = f5r;
1757 *p2r = f6r;
1758 *(p1r + 1) = f5i;
1759 *(p2r + 1) = f6i;
1760 
1761 f7r = f2r + f3i;
1762 f7i = f2i - f3r;
1763 f2r = f2r - f3i;
1764 f2i = f2i + f3r;
1765 
1766 f4r = f0r - f1i;
1767 f4i = f0i + f1r;
1768 t1r = f0r + f1i;
1769 t1i = f0i - f1r;
1770 
1771 f5r = t1r - f7r * w1r - f7i * w1r;
1772 f5i = t1i + f7r * w1r - f7i * w1r;
1773 f7r = t1r * Two - f5r;
1774 f7i = t1i * Two - f5i;
1775 
1776 f6r = f4r - f2r * w1r + f2i * w1r;
1777 f6i = f4i - f2r * w1r - f2i * w1r;
1778 f4r = f4r * Two - f6r;
1779 f4i = f4i * Two - f6i;
1780 
1781 *(p2r + pos) = f6r;
1782 *(p1r + pos) = f5r;
1783 *(p2r + posi) = f6i;
1784 *(p1r + posi) = f5i;
1785 *(p3r + pos) = f7r;
1786 *(p0r + pos) = f4r;
1787 *(p3r + posi) = f7i;
1788 *(p0r + posi) = f4i;
1789 
1790 }
1791 
1792 //inline void ibfstages(float *ioptr, long M, float *Utbl, long Ustride, long NDiffU, long StageCnt);
ibfstages(float * ioptr,long M,float * Utbl,long Ustride,long NDiffU,long StageCnt)1793 static inline void ibfstages(float *ioptr, long M, float *Utbl, long Ustride, long NDiffU, long StageCnt){
1794 /***   RADIX 8 Stages	***/
1795 unsigned long	pos;
1796 unsigned long	posi;
1797 unsigned long	pinc;
1798 unsigned long	pnext;
1799 unsigned long 	NSameU;
1800 long 	Uinc;
1801 long 	Uinc2;
1802 long 	Uinc4;
1803 unsigned long 	DiffUCnt;
1804 unsigned long 	SameUCnt;
1805 unsigned long 	U2toU3;
1806 
1807 float	*pstrt;
1808 float 	*p0r, *p1r, *p2r, *p3r;
1809 float 	*u0r, *u0i, *u1r, *u1i, *u2r, *u2i;
1810 
1811 float w0r, w0i, w1r, w1i, w2r, w2i, w3r, w3i;
1812 float f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i;
1813 float f4r, f4i, f5r, f5i, f6r, f6i, f7r, f7i;
1814 float t0r, t0i, t1r, t1i;
1815 const float Two = 2.0;
1816 
1817 pinc = NDiffU * 2;		// 2 floats per complex
1818 pnext =  pinc * 8;
1819 pos = pinc * 4;
1820 posi =  pos + 1;
1821 NSameU = POW2(M) / 8 /NDiffU;	// 8 pts per butterfly
1822 Uinc = NSameU * Ustride;
1823 Uinc2 = Uinc * 2;
1824 Uinc4 = Uinc * 4;
1825 U2toU3 = (POW2(M) / 8)*Ustride;
1826 for (; StageCnt > 0 ; StageCnt--){
1827 
1828 	u0r = &Utbl[0];
1829 	u0i = &Utbl[POW2(M-2)*Ustride];
1830 	u1r = u0r;
1831 	u1i = u0i;
1832 	u2r = u0r;
1833 	u2i = u0i;
1834 
1835 	w0r =  *u0r;
1836 	w0i =  *u0i;
1837 	w1r =  *u1r;
1838 	w1i =  *u1i;
1839 	w2r =  *u2r;
1840 	w2i =  *u2i;
1841 	w3r =  *(u2r+U2toU3);
1842 	w3i =  *(u2i-U2toU3);
1843 
1844 	pstrt = ioptr;
1845 
1846 	p0r = pstrt;
1847 	p1r = pstrt+pinc;
1848 	p2r = p1r+pinc;
1849 	p3r = p2r+pinc;
1850 
1851 			/* Butterflys		*/
1852 			/*
1853 			f0	-	-	t0	-	-	f0	-	-	f0
1854 			f1	- w0-	f1	-	-	f1	-	-	f1
1855 			f2	-	-	f2	- w1-	f2	-	-	f4
1856 			f3	- w0-	t1	- iw1-	f3	-	-	f5
1857 
1858 			f4	-	-	t0	-	-	f4	- w2-	t0
1859 			f5	- w0-	f5	-	-	f5	- w3-	t1
1860 			f6	-	-	f6	- w1-	f6	- iw2-	f6
1861 			f7	- w0-	t1	- iw1-	f7	- iw3-	f7
1862 			*/
1863 
1864 	for (DiffUCnt = NDiffU; DiffUCnt > 0 ; DiffUCnt--){
1865 		f0r = *p0r;
1866 		f0i = *(p0r + 1);
1867 		f1r = *p1r;
1868 		f1i = *(p1r + 1);
1869 		for (SameUCnt = NSameU-1; SameUCnt > 0 ; SameUCnt--){
1870 			f2r = *p2r;
1871 			f2i = *(p2r + 1);
1872 			f3r = *p3r;
1873 			f3i = *(p3r + 1);
1874 
1875 			t0r = f0r + f1r * w0r - f1i * w0i;
1876 			t0i = f0i + f1r * w0i + f1i * w0r;
1877 			f1r = f0r * Two - t0r;
1878 			f1i = f0i * Two - t0i;
1879 
1880 			f4r = *(p0r + pos);
1881 			f4i = *(p0r + posi);
1882 			f5r = *(p1r + pos);
1883 			f5i = *(p1r + posi);
1884 
1885 			f6r = *(p2r + pos);
1886 			f6i = *(p2r + posi);
1887 			f7r = *(p3r + pos);
1888 			f7i = *(p3r + posi);
1889 
1890 			t1r = f2r - f3r * w0r + f3i * w0i;
1891 			t1i = f2i - f3r * w0i - f3i * w0r;
1892 			f2r = f2r * Two - t1r;
1893 			f2i = f2i * Two - t1i;
1894 
1895 			f0r = t0r + f2r * w1r - f2i * w1i;
1896 			f0i = t0i + f2r * w1i + f2i * w1r;
1897 			f2r = t0r * Two - f0r;
1898 			f2i = t0i * Two - f0i;
1899 
1900 			f3r = f1r + t1r * w1i + t1i * w1r;
1901 			f3i = f1i - t1r * w1r + t1i * w1i;
1902 			f1r = f1r * Two - f3r;
1903 			f1i = f1i * Two - f3i;
1904 
1905 
1906 			t0r = f4r + f5r * w0r - f5i * w0i;
1907 			t0i = f4i + f5r * w0i + f5i * w0r;
1908 			f5r = f4r * Two - t0r;
1909 			f5i = f4i * Two - t0i;
1910 
1911 			t1r = f6r - f7r * w0r + f7i * w0i;
1912 			t1i = f6i - f7r * w0i - f7i * w0r;
1913 			f6r = f6r * Two - t1r;
1914 			f6i = f6i * Two - t1i;
1915 
1916 			f4r = t0r + f6r * w1r - f6i * w1i;
1917 			f4i = t0i + f6r * w1i + f6i * w1r;
1918 			f6r = t0r * Two - f4r;
1919 			f6i = t0i * Two - f4i;
1920 
1921 			f7r = f5r + t1r * w1i + t1i * w1r;
1922 			f7i = f5i - t1r * w1r + t1i * w1i;
1923 			f5r = f5r * Two - f7r;
1924 			f5i = f5i * Two - f7i;
1925 
1926 			t0r = f0r - f4r * w2r + f4i * w2i;
1927 			t0i = f0i - f4r * w2i - f4i * w2r;
1928 			f0r = f0r * Two - t0r;
1929 			f0i = f0i * Two - t0i;
1930 
1931 			t1r = f1r - f5r * w3r + f5i * w3i;
1932 			t1i = f1i - f5r * w3i - f5i * w3r;
1933 			f1r = f1r * Two - t1r;
1934 			f1i = f1i * Two - t1i;
1935 
1936 			*(p0r + pos) = t0r;
1937 			*(p0r + posi) = t0i;
1938 			*p0r = f0r;
1939 			*(p0r + 1) = f0i;
1940 
1941 			p0r += pnext;
1942 			f0r = *p0r;
1943 			f0i = *(p0r + 1);
1944 
1945 			*(p1r + pos) = t1r;
1946 			*(p1r + posi) = t1i;
1947 			*p1r = f1r;
1948 			*(p1r + 1) = f1i;
1949 
1950 			p1r += pnext;
1951 
1952 			f1r = *p1r;
1953 			f1i = *(p1r + 1);
1954 
1955 			f4r = f2r - f6r * w2i - f6i * w2r;
1956 			f4i = f2i + f6r * w2r - f6i * w2i;
1957 			f6r = f2r * Two - f4r;
1958 			f6i = f2i * Two - f4i;
1959 
1960 			f5r = f3r - f7r * w3i - f7i * w3r;
1961 			f5i = f3i + f7r * w3r - f7i * w3i;
1962 			f7r = f3r * Two - f5r;
1963 			f7i = f3i * Two - f5i;
1964 
1965 			*p2r = f4r;
1966 			*(p2r + 1) = f4i;
1967 			*(p2r + pos) = f6r;
1968 			*(p2r + posi) = f6i;
1969 
1970 			p2r += pnext;
1971 
1972 			*p3r = f5r;
1973 			*(p3r + 1) = f5i;
1974 			*(p3r + pos) = f7r;
1975 			*(p3r + posi) = f7i;
1976 
1977 			p3r += pnext;
1978 
1979 		}
1980 
1981 		f2r = *p2r;
1982 		f2i = *(p2r + 1);
1983 		f3r = *p3r;
1984 		f3i = *(p3r + 1);
1985 
1986 		t0r = f0r + f1r * w0r - f1i * w0i;
1987 		t0i = f0i + f1r * w0i + f1i * w0r;
1988 		f1r = f0r * Two - t0r;
1989 		f1i = f0i * Two - t0i;
1990 
1991 		f4r = *(p0r + pos);
1992 		f4i = *(p0r + posi);
1993 		f5r = *(p1r + pos);
1994 		f5i = *(p1r + posi);
1995 
1996 		f6r = *(p2r + pos);
1997 		f6i = *(p2r + posi);
1998 		f7r = *(p3r + pos);
1999 		f7i = *(p3r + posi);
2000 
2001 		t1r = f2r - f3r * w0r + f3i * w0i;
2002 		t1i = f2i - f3r * w0i - f3i * w0r;
2003 		f2r = f2r * Two - t1r;
2004 		f2i = f2i * Two - t1i;
2005 
2006 		f0r = t0r + f2r * w1r - f2i * w1i;
2007 		f0i = t0i + f2r * w1i + f2i * w1r;
2008 		f2r = t0r * Two - f0r;
2009 		f2i = t0i * Two - f0i;
2010 
2011 		f3r = f1r + t1r * w1i + t1i * w1r;
2012 		f3i = f1i - t1r * w1r + t1i * w1i;
2013 		f1r = f1r * Two - f3r;
2014 		f1i = f1i * Two - f3i;
2015 
2016 		if (DiffUCnt == NDiffU/2)
2017 			Uinc4 = -Uinc4;
2018 
2019 		u0r += Uinc4;
2020 		u0i -= Uinc4;
2021 		u1r += Uinc2;
2022 		u1i -= Uinc2;
2023 		u2r += Uinc;
2024 		u2i -= Uinc;
2025 
2026 		pstrt += 2;
2027 
2028 		t0r = f4r + f5r * w0r - f5i * w0i;
2029 		t0i = f4i + f5r * w0i + f5i * w0r;
2030 		f5r = f4r * Two - t0r;
2031 		f5i = f4i * Two - t0i;
2032 
2033 		t1r = f6r - f7r * w0r + f7i * w0i;
2034 		t1i = f6i - f7r * w0i - f7i * w0r;
2035 		f6r = f6r * Two - t1r;
2036 		f6i = f6i * Two - t1i;
2037 
2038 		f4r = t0r + f6r * w1r - f6i * w1i;
2039 		f4i = t0i + f6r * w1i + f6i * w1r;
2040 		f6r = t0r * Two - f4r;
2041 		f6i = t0i * Two - f4i;
2042 
2043 		f7r = f5r + t1r * w1i + t1i * w1r;
2044 		f7i = f5i - t1r * w1r + t1i * w1i;
2045 		f5r = f5r * Two - f7r;
2046 		f5i = f5i * Two - f7i;
2047 
2048 		w0r = *u0r;
2049 		w0i =  *u0i;
2050 		w1r =  *u1r;
2051 		w1i =  *u1i;
2052 
2053 		if ((long) DiffUCnt <= NDiffU/2)
2054 			w0r = -w0r;
2055 
2056 		t0r = f0r - f4r * w2r + f4i * w2i;
2057 		t0i = f0i - f4r * w2i - f4i * w2r;
2058 		f0r = f0r * Two - t0r;
2059 		f0i = f0i * Two - t0i;
2060 
2061 		f4r = f2r - f6r * w2i - f6i * w2r;
2062 		f4i = f2i + f6r * w2r - f6i * w2i;
2063 		f6r = f2r * Two - f4r;
2064 		f6i = f2i * Two - f4i;
2065 
2066 		*(p0r + pos) = t0r;
2067 		*p2r = f4r;
2068 		*(p0r + posi) = t0i;
2069 		*(p2r + 1) = f4i;
2070 		w2r =  *u2r;
2071 		w2i =  *u2i;
2072 		*p0r = f0r;
2073 		*(p2r + pos) = f6r;
2074 		*(p0r + 1) = f0i;
2075 		*(p2r + posi) = f6i;
2076 
2077 		p0r = pstrt;
2078 		p2r = pstrt + pinc + pinc;
2079 
2080 		t1r = f1r - f5r * w3r + f5i * w3i;
2081 		t1i = f1i - f5r * w3i - f5i * w3r;
2082 		f1r = f1r * Two - t1r;
2083 		f1i = f1i * Two - t1i;
2084 
2085 		f5r = f3r - f7r * w3i - f7i * w3r;
2086 		f5i = f3i + f7r * w3r - f7i * w3i;
2087 		f7r = f3r * Two - f5r;
2088 		f7i = f3i * Two - f5i;
2089 
2090 		*(p1r + pos) = t1r;
2091 		*p3r = f5r;
2092 		*(p1r + posi) = t1i;
2093 		*(p3r + 1) = f5i;
2094 		w3r =  *(u2r+U2toU3);
2095 		w3i =  *(u2i-U2toU3);
2096 		*p1r = f1r;
2097 		*(p3r + pos) = f7r;
2098 		*(p1r + 1) = f1i;
2099 		*(p3r + posi) = f7i;
2100 
2101 		p1r = pstrt + pinc;
2102 		p3r = p2r + pinc;
2103 	}
2104 	NSameU /= 8;
2105 	Uinc /= 8;
2106 	Uinc2 /= 8;
2107 	Uinc4 = Uinc * 4;
2108 	NDiffU *= 8;
2109 	pinc *= 8;
2110 	pnext *= 8;
2111 	pos *= 8;
2112 	posi =  pos + 1;
2113 }
2114 }
2115 
2116 void ifftrecurs(float *ioptr, long M, float *Utbl, long Ustride, long NDiffU, long StageCnt);
ifftrecurs(float * ioptr,long M,float * Utbl,long Ustride,long NDiffU,long StageCnt)2117 void ifftrecurs(float *ioptr, long M, float *Utbl, long Ustride, long NDiffU, long StageCnt){
2118 /* recursive bfstages calls to maximize on chip cache efficiency */
2119 long i1;
2120 if (M <= MCACHE)
2121 	ibfstages(ioptr, M, Utbl, Ustride, NDiffU, StageCnt);		/*  RADIX 8 Stages	*/
2122 else{
2123 	for (i1=0; i1<8; i1++){
2124 		ifftrecurs(&ioptr[i1*POW2(M-3)*2], M-3, Utbl, 8*Ustride, NDiffU, StageCnt-1);	/*  RADIX 8 Stages	*/
2125 	}
2126 	ibfstages(ioptr, M, Utbl, Ustride, POW2(M-3), 1);	/*  RADIX 8 Stage	*/
2127 }
2128 }
2129 
iffts1(float * ioptr,long M,long Rows,float * Utbl,short * BRLow)2130 void iffts1(float *ioptr, long M, long Rows, float *Utbl, short *BRLow){
2131 /* Compute in-place inverse complex fft on the rows of the input array	*/
2132 /* INPUTS */
2133 /* *ioptr = input data array	*/
2134 /* M = log2 of fft size	*/
2135 /* Rows = number of rows in ioptr array (use Rows of 1 if ioptr is a 1 dimensional array)	*/
2136 /* *Utbl = cosine table	*/
2137 /* *BRLow = bit reversed counter table	*/
2138 /* OUTPUTS */
2139 /* *ioptr = output data array	*/
2140 
2141 long 	StageCnt;
2142 long 	NDiffU;
2143 const float scale = 1.0F/POW2(M);
2144 
2145 switch (M){
2146 case 0:
2147 	break;
2148 case 1:
2149 	for (;Rows>0;Rows--){
2150 		ifft2pt(ioptr, scale);				/* a 2 pt fft */
2151 		ioptr += 2*POW2(M);
2152 	}
2153 	break;
2154 case 2:
2155 	for (;Rows>0;Rows--){
2156 		ifft4pt(ioptr, scale);				/* a 4 pt fft */
2157 		ioptr += 2*POW2(M);
2158 	}
2159 	break;
2160 case 3:
2161 	for (;Rows>0;Rows--){
2162 		ifft8pt(ioptr, scale);				/* an 8 pt fft */
2163 		ioptr += 2*POW2(M);
2164 	}
2165 	break;
2166 default:
2167 	for (;Rows>0;Rows--){
2168 
2169 		scbitrevR2(ioptr, M, BRLow, scale);	/* bit reverse and first radix 2 stage */
2170 
2171 		StageCnt = (M-1) / 3;		// number of radix 8 stages
2172 		NDiffU = 2;				// one radix 2 stage already complete
2173 
2174 		if ( (M-1-(StageCnt * 3)) == 1 ){
2175 			ibfR2(ioptr, M, NDiffU);				/* 1 radix 2 stage */
2176 			NDiffU *= 2;
2177 		}
2178 
2179 		if ( (M-1-(StageCnt * 3)) == 2 ){
2180 			ibfR4(ioptr, M, NDiffU);			/* 1 radix 4 stage */
2181 			NDiffU *= 4;
2182 		}
2183 
2184 		if (M <= MCACHE)
2185 			ibfstages(ioptr, M, Utbl, 1, NDiffU, StageCnt);		/*  RADIX 8 Stages	*/
2186 
2187 		else{
2188 			ifftrecurs(ioptr, M, Utbl, 1, NDiffU, StageCnt);		/*  RADIX 8 Stages	*/
2189 		}
2190 
2191 		ioptr += 2*POW2(M);
2192 	}
2193 }
2194 }
2195 
2196 /************************************************
2197 parts of rffts1
2198 *************************************************/
2199 
2200 //inline void rfft1pt(float *ioptr);
rfft1pt(float * ioptr)2201 static inline void rfft1pt(float *ioptr){
2202 /***   RADIX 2 rfft	***/
2203 float f0r, f0i;
2204 float t0r, t0i;
2205 
2206 	/* bit reversed load */
2207 f0r = ioptr[0];
2208 f0i = ioptr[1];
2209 
2210 	/* finish rfft */
2211 t0r = 	f0r + f0i;
2212 t0i = 	f0r - f0i;
2213 
2214 	/* store result */
2215 ioptr[0] = t0r;
2216 ioptr[1] = t0i;
2217 }
2218 
2219 //inline void rfft2pt(float *ioptr);
rfft2pt(float * ioptr)2220 static inline void rfft2pt(float *ioptr){
2221 /***   RADIX 4 rfft	***/
2222 float f0r, f0i, f1r, f1i;
2223 float t0r, t0i;
2224 
2225 	/* bit reversed load */
2226 f0r = ioptr[0];
2227 f0i = ioptr[1];
2228 f1r = ioptr[2];
2229 f1i = ioptr[3];
2230 
2231 		/* Butterflys		*/
2232 		/*
2233 		f0	-	-	t0
2234 		f1	-  1 -	f1
2235 		*/
2236 
2237 t0r = f0r + f1r;
2238 t0i = f0i + f1i;
2239 f1r = f0r - f1r;
2240 f1i = f1i - f0i;
2241 	/* finish rfft */
2242 f0r = 	t0r + t0i;
2243 f0i = 	t0r - t0i;
2244 
2245 	/* store result */
2246 ioptr[0] = f0r;
2247 ioptr[1] = f0i;
2248 ioptr[2] = f1r;
2249 ioptr[3] = f1i;
2250 }
2251 
2252 //inline void rfft4pt(float *ioptr);
rfft4pt(float * ioptr)2253 static inline void rfft4pt(float *ioptr){
2254 /***   RADIX 8 rfft	***/
2255 float f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i;
2256 float t0r, t0i, t1r, t1i;
2257 float w0r = 1.0F/MYROOT2; /* cos(pi/4)	*/
2258 const float Two = 2.0;
2259 const float scale = 0.5;
2260 
2261 	/* bit reversed load */
2262 f0r = ioptr[0];
2263 f0i = ioptr[1];
2264 f1r = ioptr[4];
2265 f1i = ioptr[5];
2266 f2r = ioptr[2];
2267 f2i = ioptr[3];
2268 f3r = ioptr[6];
2269 f3i = ioptr[7];
2270 
2271 		/* Butterflys		*/
2272 		/*
2273 		f0	-	-	t0	-	-	f0
2274 		f1	-  1 -	f1	-	-	f1
2275 		f2	-	-	f2	-  1 -	f2
2276 		f3	-  1 -	t1	- -i -	f3
2277 		*/
2278 
2279 t0r = f0r + f1r;
2280 t0i = f0i + f1i;
2281 f1r = f0r - f1r;
2282 f1i = f0i - f1i;
2283 
2284 t1r = f2r - f3r;
2285 t1i = f2i - f3i;
2286 f2r = f2r + f3r;
2287 f2i = f2i + f3i;
2288 
2289 f0r = t0r + f2r;
2290 f0i = t0i + f2i;
2291 f2r = t0r - f2r;
2292 f2i = f2i - t0i;	// neg for rfft
2293 
2294 f3r = f1r - t1i;
2295 f3i = f1i + t1r;
2296 f1r = f1r + t1i;
2297 f1i = f1i - t1r;
2298 
2299 	/* finish rfft */
2300 t0r = 	f0r + f0i;    /* compute Re(x[0]) */
2301 t0i = 	f0r - f0i;    /* compute Re(x[N/2]) */
2302 
2303 t1r = f1r + f3r;
2304 t1i = f1i - f3i;
2305 f0r = f1i + f3i;
2306 f0i = f3r - f1r;
2307 
2308 f1r = t1r + w0r * f0r + w0r * f0i;
2309 f1i = t1i - w0r * f0r + w0r * f0i;
2310 f3r = Two * t1r - f1r;
2311 f3i = f1i - Two * t1i;
2312 
2313 	/* store result */
2314 ioptr[4] = f2r;
2315 ioptr[5] = f2i;
2316 ioptr[0] = t0r;
2317 ioptr[1] = t0i;
2318 
2319 ioptr[2] = scale*f1r;
2320 ioptr[3] = scale*f1i;
2321 ioptr[6] = scale*f3r;
2322 ioptr[7] = scale*f3i;
2323 }
2324 
2325 //inline void rfft8pt(float *ioptr);
rfft8pt(float * ioptr)2326 static inline void rfft8pt(float *ioptr){
2327 /***   RADIX 16 rfft	***/
2328 float w0r = 1.0F/MYROOT2; /* cos(pi/4)	*/
2329 float w1r = MYCOSPID8; /* cos(pi/8)	*/
2330 float w1i = MYSINPID8; /* sin(pi/8)	*/
2331 float f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i;
2332 float f4r, f4i, f5r, f5i, f6r, f6i, f7r, f7i;
2333 float t0r, t0i, t1r, t1i;
2334 const float Two = 2.0;
2335 const float scale = 0.5;
2336 
2337 	/* bit reversed load */
2338 f0r = ioptr[0];
2339 f0i = ioptr[1];
2340 f1r = ioptr[8];
2341 f1i = ioptr[9];
2342 f2r = ioptr[4];
2343 f2i = ioptr[5];
2344 f3r = ioptr[12];
2345 f3i = ioptr[13];
2346 f4r = ioptr[2];
2347 f4i = ioptr[3];
2348 f5r = ioptr[10];
2349 f5i = ioptr[11];
2350 f6r = ioptr[6];
2351 f6i = ioptr[7];
2352 f7r = ioptr[14];
2353 f7i = ioptr[15];
2354 			/* Butterflys		*/
2355 			/*
2356 			f0	-	-	t0	-	-	f0	-	-	f0
2357 			f1	-  1 -	f1	-	-	f1	-	-	f1
2358 			f2	-	-	f2	-  1 -	f2	-	-	f2
2359 			f3	-  1 -	t1	- -i -	f3	-	-	f3
2360 			f4	-	-	t0	-	-	f4	-  1 -	t0
2361 			f5	-  1 -	f5	-	-	f5	- w3 -	f4
2362 			f6	-	-	f6	-  1 -	f6	- -i -	t1
2363 			f7	-  1 -	t1	- -i -	f7	- iw3-	f6
2364 			*/
2365 
2366 t0r = f0r + f1r;
2367 t0i = f0i + f1i;
2368 f1r = f0r - f1r;
2369 f1i = f0i - f1i;
2370 
2371 t1r = f2r - f3r;
2372 t1i = f2i - f3i;
2373 f2r = f2r + f3r;
2374 f2i = f2i + f3i;
2375 
2376 f0r = t0r + f2r;
2377 f0i = t0i + f2i;
2378 f2r = t0r - f2r;
2379 f2i = t0i - f2i;
2380 
2381 f3r = f1r - t1i;
2382 f3i = f1i + t1r;
2383 f1r = f1r + t1i;
2384 f1i = f1i - t1r;
2385 
2386 
2387 t0r = f4r + f5r;
2388 t0i = f4i + f5i;
2389 f5r = f4r - f5r;
2390 f5i = f4i - f5i;
2391 
2392 t1r = f6r - f7r;
2393 t1i = f6i - f7i;
2394 f6r = f6r + f7r;
2395 f6i = f6i + f7i;
2396 
2397 f4r = t0r + f6r;
2398 f4i = t0i + f6i;
2399 f6r = t0r - f6r;
2400 f6i = t0i - f6i;
2401 
2402 f7r = f5r - t1i;
2403 f7i = f5i + t1r;
2404 f5r = f5r + t1i;
2405 f5i = f5i - t1r;
2406 
2407 
2408 t0r = f0r - f4r;
2409 t0i = f4i - f0i;	// neg for rfft
2410 f0r = f0r + f4r;
2411 f0i = f0i + f4i;
2412 
2413 t1r = f2r - f6i;
2414 t1i = f2i + f6r;
2415 f2r = f2r + f6i;
2416 f2i = f2i - f6r;
2417 
2418 f4r = f1r - f5r * w0r - f5i * w0r;
2419 f4i = f1i + f5r * w0r - f5i * w0r;
2420 f1r = f1r * Two - f4r;
2421 f1i = f1i * Two - f4i;
2422 
2423 f6r = f3r + f7r * w0r - f7i * w0r;
2424 f6i = f3i + f7r * w0r + f7i * w0r;
2425 f3r = f3r * Two - f6r;
2426 f3i = f3i * Two - f6i;
2427 
2428 	/* finish rfft */
2429 f5r = 	f0r + f0i;    /* compute Re(x[0]) */
2430 f5i = 	f0r - f0i;    /* compute Re(x[N/2]) */
2431 
2432 f0r = f2r + t1r;
2433 f0i = f2i - t1i;
2434 f7r = f2i + t1i;
2435 f7i = t1r - f2r;
2436 
2437 f2r = f0r + w0r * f7r + w0r * f7i;
2438 f2i = f0i - w0r * f7r + w0r * f7i;
2439 t1r = Two * f0r - f2r;
2440 t1i = f2i - Two * f0i;
2441 
2442 
2443 f0r = f1r + f6r;
2444 f0i = f1i - f6i;
2445 f7r = f1i + f6i;
2446 f7i = f6r - f1r;
2447 
2448 f1r = f0r + w1r * f7r + w1i * f7i;
2449 f1i = f0i - w1i * f7r + w1r * f7i;
2450 f6r = Two * f0r - f1r;
2451 f6i = f1i - Two * f0i;
2452 
2453 f0r = f3r + f4r;
2454 f0i = f3i - f4i;
2455 f7r = f3i + f4i;
2456 f7i = f4r - f3r;
2457 
2458 f3r = f0r + w1i * f7r + w1r * f7i;
2459 f3i = f0i - w1r * f7r + w1i * f7i;
2460 f4r = Two * f0r - f3r;
2461 f4i = f3i - Two * f0i;
2462 
2463 	/* store result */
2464 ioptr[8] = t0r;
2465 ioptr[9] = t0i;
2466 ioptr[0] = f5r;
2467 ioptr[1] = f5i;
2468 
2469 ioptr[4] = scale*f2r;
2470 ioptr[5] = scale*f2i;
2471 ioptr[12] = scale*t1r;
2472 ioptr[13] = scale*t1i;
2473 
2474 ioptr[2] = scale*f1r;
2475 ioptr[3] = scale*f1i;
2476 ioptr[6] = scale*f3r;
2477 ioptr[7] = scale*f3i;
2478 ioptr[10] = scale*f4r;
2479 ioptr[11] = scale*f4i;
2480 ioptr[14] = scale*f6r;
2481 ioptr[15] = scale*f6i;
2482 }
2483 
2484 //inline void frstage(float *ioptr, long M, float *Utbl);
frstage(float * ioptr,long M,float * Utbl)2485 static inline void frstage(float *ioptr, long M, float *Utbl){
2486 /*	Finish RFFT		*/
2487 
2488 unsigned long 	pos;
2489 unsigned long 	posi;
2490 unsigned long 	diffUcnt;
2491 
2492 float 	*p0r, *p1r;
2493 float 	*u0r, *u0i;
2494 
2495 float w0r, w0i;
2496 float f0r, f0i, f1r, f1i, f4r, f4i, f5r, f5i;
2497 float t0r, t0i, t1r, t1i;
2498 const float Two = 2.0;
2499 
2500 pos = POW2(M-1);
2501 posi = pos + 1;
2502 
2503 p0r = ioptr;
2504 p1r = ioptr + pos/2;
2505 
2506 u0r = Utbl + (int) POW2(M-3);
2507 
2508 w0r =  *u0r;
2509 
2510 f0r = *(p0r);
2511 f0i = *(p0r + 1);
2512 f4r = *(p0r + pos);
2513 f4i = *(p0r + posi);
2514 f1r = *(p1r);
2515 f1i = *(p1r + 1);
2516 f5r = *(p1r + pos);
2517 f5i = *(p1r + posi);
2518 
2519 	t0r = Two * f0r + Two * f0i;    /* compute Re(x[0]) */
2520 	t0i = Two * f0r - Two * f0i;    /* compute Re(x[N/2]) */
2521 	t1r = f4r + f4r;
2522 	t1i = -f4i - f4i;
2523 
2524 	f0r = f1r + f5r;
2525 	f0i = f1i - f5i;
2526 	f4r = f1i + f5i;
2527 	f4i = f5r - f1r;
2528 
2529 	f1r = f0r + w0r * f4r + w0r * f4i;
2530 	f1i = f0i - w0r * f4r + w0r * f4i;
2531 	f5r = Two * f0r - f1r;
2532 	f5i = f1i - Two * f0i;
2533 
2534 *(p0r) = t0r;
2535 *(p0r + 1) = t0i;
2536 *(p0r + pos) = t1r;
2537 *(p0r + posi) = t1i;
2538 *(p1r) = f1r;
2539 *(p1r + 1) = f1i;
2540 *(p1r + pos) = f5r;
2541 *(p1r + posi) = f5i;
2542 
2543 u0r = Utbl + 1;
2544 u0i = Utbl + (POW2(M-2)-1);
2545 
2546 w0r =  *u0r;
2547 w0i =  *u0i;
2548 
2549 p0r = (ioptr + 2);
2550 p1r = (ioptr + (POW2(M-2)-1)*2);
2551 
2552 				/* Butterflys */
2553 				/*
2554 		f0	-	t0	-	-	f0
2555 		f5	-	t1	- w0	-	f5
2556 
2557 		f1	-	t0	-	-	f1
2558 		f4	-	t1	-iw0 -	f4
2559 				*/
2560 
2561 for (diffUcnt = POW2(M-3)-1; diffUcnt > 0 ; diffUcnt--){
2562 
2563 	f0r = *(p0r);
2564 	f0i = *(p0r + 1);
2565 	f5r = *(p1r + pos);
2566 	f5i = *(p1r + posi);
2567 	f1r = *(p1r);
2568 	f1i = *(p1r + 1);
2569 	f4r = *(p0r + pos);
2570 	f4i = *(p0r + posi);
2571 
2572 	t0r = f0r + f5r;
2573 	t0i = f0i - f5i;
2574 	t1r = f0i + f5i;
2575 	t1i = f5r - f0r;
2576 
2577 	f0r = t0r + w0r * t1r + w0i * t1i;
2578 	f0i = t0i - w0i * t1r + w0r * t1i;
2579 	f5r = Two * t0r - f0r;
2580 	f5i = f0i - Two * t0i;
2581 
2582 	t0r = f1r + f4r;
2583 	t0i = f1i - f4i;
2584 	t1r = f1i + f4i;
2585 	t1i = f4r - f1r;
2586 
2587 	f1r = t0r + w0i * t1r + w0r * t1i;
2588 	f1i = t0i - w0r * t1r + w0i * t1i;
2589 	f4r = Two * t0r - f1r;
2590 	f4i = f1i - Two * t0i;
2591 
2592 	*(p0r) = f0r;
2593 	*(p0r + 1) = f0i;
2594 	*(p1r + pos) = f5r;
2595 	*(p1r + posi) = f5i;
2596 
2597 	w0r = *++u0r;
2598 	w0i = *--u0i;
2599 
2600 	*(p1r) = f1r;
2601 	*(p1r + 1) = f1i;
2602 	*(p0r + pos) = f4r;
2603 	*(p0r + posi) = f4i;
2604 
2605 	p0r += 2;
2606 	p1r -= 2;
2607 };
2608 }
2609 
rffts1(float * ioptr,long M,long Rows,float * Utbl,short * BRLow)2610 void rffts1(float *ioptr, long M, long Rows, float *Utbl, short *BRLow){
2611 /* Compute in-place real fft on the rows of the input array	*/
2612 /* The result is the complex spectra of the positive frequencies */
2613 /* except the location for the first complex number contains the real */
2614 /* values for DC and Nyquest */
2615 /* INPUTS */
2616 /* *ioptr = real input data array	*/
2617 /* M = log2 of fft size	*/
2618 /* Rows = number of rows in ioptr array (use Rows of 1 if ioptr is a 1 dimensional array)	*/
2619 /* *Utbl = cosine table	*/
2620 /* *BRLow = bit reversed counter table	*/
2621 /* OUTPUTS */
2622 /* *ioptr = output data array	in the following order */
2623 /* Re(x[0]), Re(x[N/2]), Re(x[1]), Im(x[1]), Re(x[2]), Im(x[2]), ... Re(x[N/2-1]), Im(x[N/2-1]). */
2624 
2625 float	scale;
2626 long 	StageCnt;
2627 long 	NDiffU;
2628 
2629 M=M-1;
2630 switch (M){
2631 case -1:
2632 	break;
2633 case 0:
2634 	for (;Rows>0;Rows--){
2635 		rfft1pt(ioptr);				/* a 2 pt fft */
2636 		ioptr += 2*POW2(M);
2637 	}
2638 case 1:
2639 	for (;Rows>0;Rows--){
2640 		rfft2pt(ioptr);				/* a 4 pt fft */
2641 		ioptr += 2*POW2(M);
2642 	}
2643 	break;
2644 case 2:
2645 	for (;Rows>0;Rows--){
2646 		rfft4pt(ioptr);				/* an 8 pt fft */
2647 		ioptr += 2*POW2(M);
2648 	}
2649 	break;
2650 case 3:
2651 	for (;Rows>0;Rows--){
2652 		rfft8pt(ioptr);				/* a 16 pt fft */
2653 		ioptr += 2*POW2(M);
2654 	}
2655 	break;
2656 default:
2657 	scale = 0.5;
2658 	for (;Rows>0;Rows--){
2659 
2660 		scbitrevR2(ioptr, M, BRLow, scale);						/* bit reverse and first radix 2 stage */
2661 
2662 		StageCnt = (M-1) / 3;		// number of radix 8 stages
2663 		NDiffU = 2;				// one radix 2 stage already complete
2664 
2665 		if ( (M-1-(StageCnt * 3)) == 1 ){
2666 			bfR2(ioptr, M, NDiffU);			/* 1 radix 2 stage */
2667 			NDiffU *= 2;
2668 		}
2669 
2670 		if ( (M-1-(StageCnt * 3)) == 2 ){
2671 			bfR4(ioptr, M, NDiffU);			/* 1 radix 4 stage */
2672 			NDiffU *= 4;
2673 		}
2674 
2675 		if (M <= MCACHE){
2676 			bfstages(ioptr, M, Utbl, 2, NDiffU, StageCnt);		/*  RADIX 8 Stages	*/
2677 			frstage(ioptr, M+1, Utbl);
2678 		}
2679 
2680 		else{
2681 			fftrecurs(ioptr, M, Utbl, 2, NDiffU, StageCnt);		/*  RADIX 8 Stages	*/
2682 			frstage(ioptr, M+1, Utbl);
2683 		}
2684 
2685 		ioptr += 2*POW2(M);
2686 	}
2687 }
2688 }
2689 
2690 /************************************************
2691 parts of riffts1
2692 *************************************************/
2693 
2694 //inline void rifft1pt(float *ioptr, float scale);
rifft1pt(float * ioptr,float scale)2695 static inline void rifft1pt(float *ioptr, float scale){
2696 /***   RADIX 2 rifft	***/
2697 float f0r, f0i;
2698 float t0r, t0i;
2699 
2700 	/* bit reversed load */
2701 f0r = ioptr[0];
2702 f0i = ioptr[1];
2703 
2704 	/* finish rfft */
2705 t0r = 	f0r + f0i;
2706 t0i = 	f0r - f0i;
2707 
2708 	/* store result */
2709 ioptr[0] = scale*t0r;
2710 ioptr[1] = scale*t0i;
2711 }
2712 
2713 //inline void rifft2pt(float *ioptr, float scale);
rifft2pt(float * ioptr,float scale)2714 static inline void rifft2pt(float *ioptr, float scale){
2715 /***   RADIX 4 rifft	***/
2716 float f0r, f0i, f1r, f1i;
2717 float t0r, t0i;
2718 const float Two = 2.0;
2719 
2720 	/* bit reversed load */
2721 t0r = ioptr[0];
2722 t0i = ioptr[1];
2723 f1r = Two * ioptr[2];
2724 f1i = Two * ioptr[3];
2725 
2726 	/* start rifft */
2727 f0r = 	t0r + t0i;
2728 f0i = 	t0r - t0i;
2729 		/* Butterflys		*/
2730 		/*
2731 		f0	-	-	t0
2732 		f1	-  1 -	f1
2733 		*/
2734 
2735 t0r = f0r + f1r;
2736 t0i = f0i - f1i;
2737 f1r = f0r - f1r;
2738 f1i = f0i + f1i;
2739 
2740 	/* store result */
2741 ioptr[0] = scale*t0r;
2742 ioptr[1] = scale*t0i;
2743 ioptr[2] = scale*f1r;
2744 ioptr[3] = scale*f1i;
2745 }
2746 
2747 //inline void rifft4pt(float *ioptr, float scale);
rifft4pt(float * ioptr,float scale)2748 static inline void rifft4pt(float *ioptr, float scale){
2749 /***   RADIX 8 rifft	***/
2750 float f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i;
2751 float t0r, t0i, t1r, t1i;
2752 float w0r = 1.0F/MYROOT2; /* cos(pi/4)	*/
2753 const float Two = 2.0;
2754 
2755 	/* bit reversed load */
2756 t0r = ioptr[0];
2757 t0i = ioptr[1];
2758 f2r = ioptr[2];
2759 f2i = ioptr[3];
2760 f1r = Two * ioptr[4];
2761 f1i = Two * ioptr[5];
2762 f3r = ioptr[6];
2763 f3i = ioptr[7];
2764 
2765 	/* start rfft */
2766 f0r = 	t0r + t0i;    /* compute Re(x[0]) */
2767 f0i = 	t0r - t0i;    /* compute Re(x[N/2]) */
2768 
2769 t1r = f2r + f3r;
2770 t1i = f2i - f3i;
2771 t0r = f2r - f3r;
2772 t0i = f2i + f3i;
2773 
2774 f2r = t1r - w0r * t0r - w0r * t0i;
2775 f2i = t1i + w0r * t0r - w0r * t0i;
2776 f3r = Two * t1r - f2r;
2777 f3i = f2i - Two * t1i;
2778 
2779 		/* Butterflys		*/
2780 		/*
2781 		f0	-	-	t0	-	-	f0
2782 		f1	-  1 -	f1	-	-	f1
2783 		f2	-	-	f2	-  1 -	f2
2784 		f3	-  1 -	t1	-  i -	f3
2785 		*/
2786 
2787 t0r = f0r + f1r;
2788 t0i = f0i - f1i;
2789 f1r = f0r - f1r;
2790 f1i = f0i + f1i;
2791 
2792 t1r = f2r - f3r;
2793 t1i = f2i - f3i;
2794 f2r = f2r + f3r;
2795 f2i = f2i + f3i;
2796 
2797 f0r = t0r + f2r;
2798 f0i = t0i + f2i;
2799 f2r = t0r - f2r;
2800 f2i = t0i - f2i;
2801 
2802 f3r = f1r + t1i;
2803 f3i = f1i - t1r;
2804 f1r = f1r - t1i;
2805 f1i = f1i + t1r;
2806 
2807 	/* store result */
2808 ioptr[0] = scale*f0r;
2809 ioptr[1] = scale*f0i;
2810 ioptr[2] = scale*f1r;
2811 ioptr[3] = scale*f1i;
2812 ioptr[4] = scale*f2r;
2813 ioptr[5] = scale*f2i;
2814 ioptr[6] = scale*f3r;
2815 ioptr[7] = scale*f3i;
2816 }
2817 
2818 //inline void rifft8pt(float *ioptr, float scale);
rifft8pt(float * ioptr,float scale)2819 static inline void rifft8pt(float *ioptr, float scale){
2820 /***   RADIX 16 rifft	***/
2821 float w0r = 1.0F/MYROOT2; /* cos(pi/4)	*/
2822 float w1r = MYCOSPID8; /* cos(pi/8)	*/
2823 float w1i = MYSINPID8; /* sin(pi/8)	*/
2824 float f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i;
2825 float f4r, f4i, f5r, f5i, f6r, f6i, f7r, f7i;
2826 float t0r, t0i, t1r, t1i;
2827 const float Two = 2.0;
2828 
2829 	/* bit reversed load */
2830 t0r = ioptr[0];
2831 t0i = ioptr[1];
2832 f4r = ioptr[2];
2833 f4i = ioptr[3];
2834 f2r = ioptr[4];
2835 f2i = ioptr[5];
2836 f6r = ioptr[6];
2837 f6i = ioptr[7];
2838 f1r = Two * ioptr[8];
2839 f1i = Two * ioptr[9];
2840 f5r = ioptr[10];
2841 f5i = ioptr[11];
2842 f3r = ioptr[12];
2843 f3i = ioptr[13];
2844 f7r = ioptr[14];
2845 f7i = ioptr[15];
2846 
2847 
2848 	/* start rfft */
2849 f0r = 	t0r + t0i;    /* compute Re(x[0]) */
2850 f0i = 	t0r - t0i;    /* compute Re(x[N/2]) */
2851 
2852 t0r = f2r + f3r;
2853 t0i = f2i - f3i;
2854 t1r = f2r - f3r;
2855 t1i = f2i + f3i;
2856 
2857 f2r = t0r - w0r * t1r - w0r * t1i;
2858 f2i = t0i + w0r * t1r - w0r * t1i;
2859 f3r = Two * t0r - f2r;
2860 f3i = f2i - Two * t0i;
2861 
2862 t0r = f4r + f7r;
2863 t0i = f4i - f7i;
2864 t1r = f4r - f7r;
2865 t1i = f4i + f7i;
2866 
2867 f4r = t0r - w1i * t1r - w1r * t1i;
2868 f4i = t0i + w1r * t1r - w1i * t1i;
2869 f7r = Two * t0r - f4r;
2870 f7i = f4i - Two * t0i;
2871 
2872 t0r = f6r + f5r;
2873 t0i = f6i - f5i;
2874 t1r = f6r - f5r;
2875 t1i = f6i + f5i;
2876 
2877 f6r = t0r - w1r * t1r - w1i * t1i;
2878 f6i = t0i + w1i * t1r - w1r * t1i;
2879 f5r = Two * t0r - f6r;
2880 f5i = f6i - Two * t0i;
2881 
2882 			/* Butterflys		*/
2883 			/*
2884 			f0	-	-	t0	-	-	f0	-	-	f0
2885 			f1*	-  1 -	f1	-	-	f1	-	-	f1
2886 			f2	-	-	f2	-  1 -	f2	-	-	f2
2887 			f3	-  1 -	t1	-  i -	f3	-	-	f3
2888 			f4	-	-	t0	-	-	f4	-  1 -	t0
2889 			f5	-  1 -	f5	-	-	f5	- w3 -	f4
2890 			f6	-	-	f6	-  1 -	f6	-  i -	t1
2891 			f7	-  1 -	t1	-  i -	f7	- iw3-	f6
2892 			*/
2893 
2894 t0r = f0r + f1r;
2895 t0i = f0i - f1i;
2896 f1r = f0r - f1r;
2897 f1i = f0i + f1i;
2898 
2899 t1r = f2r - f3r;
2900 t1i = f2i - f3i;
2901 f2r = f2r + f3r;
2902 f2i = f2i + f3i;
2903 
2904 f0r = t0r + f2r;
2905 f0i = t0i + f2i;
2906 f2r = t0r - f2r;
2907 f2i = t0i - f2i;
2908 
2909 f3r = f1r + t1i;
2910 f3i = f1i - t1r;
2911 f1r = f1r - t1i;
2912 f1i = f1i + t1r;
2913 
2914 
2915 t0r = f4r + f5r;
2916 t0i = f4i + f5i;
2917 f5r = f4r - f5r;
2918 f5i = f4i - f5i;
2919 
2920 t1r = f6r - f7r;
2921 t1i = f6i - f7i;
2922 f6r = f6r + f7r;
2923 f6i = f6i + f7i;
2924 
2925 f4r = t0r + f6r;
2926 f4i = t0i + f6i;
2927 f6r = t0r - f6r;
2928 f6i = t0i - f6i;
2929 
2930 f7r = f5r + t1i;
2931 f7i = f5i - t1r;
2932 f5r = f5r - t1i;
2933 f5i = f5i + t1r;
2934 
2935 
2936 t0r = f0r - f4r;
2937 t0i = f0i - f4i;
2938 f0r = f0r + f4r;
2939 f0i = f0i + f4i;
2940 
2941 t1r = f2r + f6i;
2942 t1i = f2i - f6r;
2943 f2r = f2r - f6i;
2944 f2i = f2i + f6r;
2945 
2946 f4r = f1r - f5r * w0r + f5i * w0r;
2947 f4i = f1i - f5r * w0r - f5i * w0r;
2948 f1r = f1r * Two - f4r;
2949 f1i = f1i * Two - f4i;
2950 
2951 f6r = f3r + f7r * w0r + f7i * w0r;
2952 f6i = f3i - f7r * w0r + f7i * w0r;
2953 f3r = f3r * Two - f6r;
2954 f3i = f3i * Two - f6i;
2955 
2956 	/* store result */
2957 ioptr[0] = scale*f0r;
2958 ioptr[1] = scale*f0i;
2959 ioptr[2] = scale*f1r;
2960 ioptr[3] = scale*f1i;
2961 ioptr[4] = scale*f2r;
2962 ioptr[5] = scale*f2i;
2963 ioptr[6] = scale*f3r;
2964 ioptr[7] = scale*f3i;
2965 ioptr[8] = scale*t0r;
2966 ioptr[9] = scale*t0i;
2967 ioptr[10] = scale*f4r;
2968 ioptr[11] = scale*f4i;
2969 ioptr[12] = scale*t1r;
2970 ioptr[13] = scale*t1i;
2971 ioptr[14] = scale*f6r;
2972 ioptr[15] = scale*f6i;
2973 }
2974 
2975 //inline void ifrstage(float *ioptr, long M, float *Utbl);
ifrstage(float * ioptr,long M,float * Utbl)2976 static inline void ifrstage(float *ioptr, long M, float *Utbl){
2977 /*	Start RIFFT		*/
2978 
2979 unsigned long 	pos;
2980 unsigned long 	posi;
2981 unsigned long 	diffUcnt;
2982 
2983 float 	*p0r, *p1r;
2984 float 	*u0r, *u0i;
2985 
2986 float w0r, w0i;
2987 float f0r, f0i, f1r, f1i, f4r, f4i, f5r, f5i;
2988 float t0r, t0i, t1r, t1i;
2989 const float Two = 2.0;
2990 
2991 pos = POW2(M-1);
2992 posi = pos + 1;
2993 
2994 p0r = ioptr;
2995 p1r = ioptr + pos/2;
2996 
2997 u0r = Utbl + (int) POW2(M-3);
2998 
2999 w0r =  *u0r;
3000 
3001 f0r = *(p0r);
3002 f0i = *(p0r + 1);
3003 f4r = *(p0r + pos);
3004 f4i = *(p0r + posi);
3005 f1r = *(p1r);
3006 f1i = *(p1r + 1);
3007 f5r = *(p1r + pos);
3008 f5i = *(p1r + posi);
3009 
3010 	t0r = f0r + f0i;
3011 	t0i = f0r - f0i;
3012 	t1r = f4r + f4r;
3013 	t1i = -f4i - f4i;
3014 
3015 	f0r = f1r + f5r;
3016 	f0i = f1i - f5i;
3017 	f4r = f1r - f5r;
3018 	f4i = f1i + f5i;
3019 
3020 	f1r = f0r - w0r * f4r - w0r * f4i;
3021 	f1i = f0i + w0r * f4r - w0r * f4i;
3022 	f5r = Two * f0r - f1r;
3023 	f5i = f1i - Two * f0i;
3024 
3025 *(p0r) = t0r;
3026 *(p0r + 1) = t0i;
3027 *(p0r + pos) = t1r;
3028 *(p0r + posi) = t1i;
3029 *(p1r) = f1r;
3030 *(p1r + 1) = f1i;
3031 *(p1r + pos) = f5r;
3032 *(p1r + posi) = f5i;
3033 
3034 u0r = Utbl + 1;
3035 u0i = Utbl + (POW2(M-2)-1);
3036 
3037 w0r =  *u0r;
3038 w0i =  *u0i;
3039 
3040 p0r = (ioptr + 2);
3041 p1r = (ioptr + (POW2(M-2)-1)*2);
3042 
3043 				/* Butterflys */
3044 				/*
3045 		f0	-	 t0		-	f0
3046 		f1	-     t1     -w0-   f1
3047 
3048 		f2	-	 t0		-	f2
3049 		f3	-     t1	   -iw0-  f3
3050 				*/
3051 
3052 for (diffUcnt = POW2(M-3)-1; diffUcnt > 0 ; diffUcnt--){
3053 
3054 	f0r = *(p0r);
3055 	f0i = *(p0r + 1);
3056 	f5r = *(p1r + pos);
3057 	f5i = *(p1r + posi);
3058 	f1r = *(p1r);
3059 	f1i = *(p1r + 1);
3060 	f4r = *(p0r + pos);
3061 	f4i = *(p0r + posi);
3062 
3063 	t0r = f0r + f5r;
3064 	t0i = f0i - f5i;
3065 	t1r = f0r - f5r;
3066 	t1i = f0i + f5i;
3067 
3068 	f0r = t0r - w0i * t1r - w0r * t1i;
3069 	f0i = t0i + w0r * t1r - w0i * t1i;
3070 	f5r = Two * t0r - f0r;
3071 	f5i = f0i - Two * t0i;
3072 
3073 	t0r = f1r + f4r;
3074 	t0i = f1i - f4i;
3075 	t1r = f1r - f4r;
3076 	t1i = f1i + f4i;
3077 
3078 	f1r = t0r - w0r * t1r - w0i * t1i;
3079 	f1i = t0i + w0i * t1r - w0r * t1i;
3080 	f4r = Two * t0r - f1r;
3081 	f4i = f1i - Two * t0i;
3082 
3083 	*(p0r) = f0r;
3084 	*(p0r + 1) = f0i;
3085 	*(p1r + pos) = f5r;
3086 	*(p1r + posi) = f5i;
3087 
3088 	w0r = *++u0r;
3089 	w0i = *--u0i;
3090 
3091 	*(p1r) = f1r;
3092 	*(p1r + 1) = f1i;
3093 	*(p0r + pos) = f4r;
3094 	*(p0r + posi) = f4i;
3095 
3096 	p0r += 2;
3097 	p1r -= 2;
3098 };
3099 }
3100 
riffts1(float * ioptr,long M,long Rows,float * Utbl,short * BRLow)3101 void riffts1(float *ioptr, long M, long Rows, float *Utbl, short *BRLow){
3102 /* Compute in-place real ifft on the rows of the input array	*/
3103 /* data order as from rffts1 */
3104 /* INPUTS */
3105 /* *ioptr = input data array in the following order	*/
3106 /* M = log2 of fft size	*/
3107 /* Re(x[0]), Re(x[N/2]), Re(x[1]), Im(x[1]), Re(x[2]), Im(x[2]), ... Re(x[N/2-1]), Im(x[N/2-1]). */
3108 /* Rows = number of rows in ioptr array (use Rows of 1 if ioptr is a 1 dimensional array)	*/
3109 /* *Utbl = cosine table	*/
3110 /* *BRLow = bit reversed counter table	*/
3111 /* OUTPUTS */
3112 /* *ioptr = real output data array	*/
3113 
3114 float	scale;
3115 long 	StageCnt;
3116 long 	NDiffU;
3117 
3118 scale = 1.0F/POW2(M);
3119 M=M-1;
3120 switch (M){
3121 case -1:
3122 	break;
3123 case 0:
3124 	for (;Rows>0;Rows--){
3125 		rifft1pt(ioptr, scale);				/* a 2 pt fft */
3126 		ioptr += 2*POW2(M);
3127 	}
3128 case 1:
3129 	for (;Rows>0;Rows--){
3130 		rifft2pt(ioptr, scale);				/* a 4 pt fft */
3131 		ioptr += 2*POW2(M);
3132 	}
3133 	break;
3134 case 2:
3135 	for (;Rows>0;Rows--){
3136 		rifft4pt(ioptr, scale);				/* an 8 pt fft */
3137 		ioptr += 2*POW2(M);
3138 	}
3139 	break;
3140 case 3:
3141 	for (;Rows>0;Rows--){
3142 		rifft8pt(ioptr, scale);				/* a 16 pt fft */
3143 		ioptr += 2*POW2(M);
3144 	}
3145 	break;
3146 default:
3147 	for (;Rows>0;Rows--){
3148 
3149 		ifrstage(ioptr, M+1, Utbl);
3150 
3151 		scbitrevR2(ioptr, M, BRLow, scale);						/* bit reverse and first radix 2 stage */
3152 
3153 		StageCnt = (M-1) / 3;		// number of radix 8 stages
3154 		NDiffU = 2;				// one radix 2 stage already complete
3155 
3156 		if ( (M-1-(StageCnt * 3)) == 1 ){
3157 			ibfR2(ioptr, M, NDiffU);			/* 1 radix 2 stage */
3158 			NDiffU *= 2;
3159 		}
3160 
3161 		if ( (M-1-(StageCnt * 3)) == 2 ){
3162 			ibfR4(ioptr, M, NDiffU);			/* 1 radix 4 stage */
3163 			NDiffU *= 4;
3164 		}
3165 
3166 		if (M <= MCACHE){
3167 			ibfstages(ioptr, M, Utbl, 2, NDiffU, StageCnt);		/*  RADIX 8 Stages	*/
3168 		}
3169 
3170 		else{
3171 			ifftrecurs(ioptr, M, Utbl, 2, NDiffU, StageCnt);		/*  RADIX 8 Stages	*/
3172 		}
3173 
3174 		ioptr += 2*POW2(M);
3175 	}
3176 }
3177 }
3178 
3179