1 /* 2 FFT library 3 based on public domain code by John Green <green_jt@vsdec.npt.nuwc.navy.mil> 4 original version is available at 5 http://hyperarchive.lcs.mit.edu/ 6 /HyperArchive/Archive/dev/src/ffts-for-risc-2-c.hqx 7 ported to Csound by Istvan Varga, 2005 8 */ 9 10 #include <stdint.h> 11 #include <stdlib.h> 12 #include <math.h> 13 #include "base.h" 14 15 #ifndef M_PI 16 #define M_PI 3.14159265358979323846 17 #endif 18 19 #define POW2(m) ((uint32_t) 1 << (m)) /* integer power of 2 for m<32 */ 20 21 /* fft's with M bigger than this bust primary cache */ 22 #define MCACHE (11 - (sizeof(SPFLOAT) / 8)) 23 24 /* some math constants to 40 decimal places */ 25 #define MYPI 3.141592653589793238462643383279502884197 /* pi */ 26 #define MYROOT2 1.414213562373095048801688724209698078569 /* sqrt(2) */ 27 #define MYCOSPID8 0.9238795325112867561281831893967882868224 /* cos(pi/8) */ 28 #define MYSINPID8 0.3826834323650897717284599840303988667613 /* sin(pi/8) */ 29 30 /***************************************************** 31 * routines to initialize tables used by fft routines * 32 *****************************************************/ 33 34 static void fftCosInit(int M, SPFLOAT *Utbl) 35 { 36 /* Compute Utbl, the cosine table for ffts */ 37 /* of size (pow(2,M)/4 +1) */ 38 /* INPUTS */ 39 /* M = log2 of fft size */ 40 /* OUTPUTS */ 41 /* *Utbl = cosine table */ 42 unsigned int fftN = POW2(M); 43 unsigned int i1; 44 45 Utbl[0] = 1.0; 46 for (i1 = 1; i1 < fftN/4; i1++) 47 Utbl[i1] = cos((2.0 * M_PI * (SPFLOAT)i1) / (SPFLOAT)fftN); 48 Utbl[fftN/4] = 0.0; 49 } 50 51 void fftBRInit(int M, int16_t *BRLow) 52 { 53 /* Compute BRLow, the bit reversed table for ffts */ 54 /* of size pow(2,M/2 -1) */ 55 /* INPUTS */ 56 /* M = log2 of fft size */ 57 /* OUTPUTS */ 58 /* *BRLow = bit reversed counter table */ 59 int Mroot_1 = M / 2 - 1; 60 int Nroot_1 = POW2(Mroot_1); 61 int i1; 62 int bitsum; 63 int bitmask; 64 int bit; 65 66 for (i1 = 0; i1 < Nroot_1; i1++) { 67 bitsum = 0; 68 bitmask = 1; 69 for (bit = 1; bit <= Mroot_1; bitmask <<= 1, bit++) 70 if (i1 & bitmask) 71 bitsum = bitsum + (Nroot_1 >> bit); 72 BRLow[i1] = bitsum; 73 } 74 } 75 76 /***************** 77 * parts of ffts1 * 78 *****************/ 79 80 static void bitrevR2(SPFLOAT *ioptr, int M, int16_t *BRLow) 81 { 82 /*** bit reverse and first radix 2 stage of forward or inverse fft ***/ 83 SPFLOAT f0r; 84 SPFLOAT f0i; 85 SPFLOAT f1r; 86 SPFLOAT f1i; 87 SPFLOAT f2r; 88 SPFLOAT f2i; 89 SPFLOAT f3r; 90 SPFLOAT f3i; 91 SPFLOAT f4r; 92 SPFLOAT f4i; 93 SPFLOAT f5r; 94 SPFLOAT f5i; 95 SPFLOAT f6r; 96 SPFLOAT f6i; 97 SPFLOAT f7r; 98 SPFLOAT f7i; 99 SPFLOAT t0r; 100 SPFLOAT t0i; 101 SPFLOAT t1r; 102 SPFLOAT t1i; 103 SPFLOAT *p0r; 104 SPFLOAT *p1r; 105 SPFLOAT *IOP; 106 SPFLOAT *iolimit; 107 int Colstart; 108 int iCol; 109 unsigned int posA; 110 unsigned int posAi; 111 unsigned int posB; 112 unsigned int posBi; 113 114 const unsigned int Nrems2 = POW2((M + 3) / 2); 115 const unsigned int Nroot_1_ColInc = POW2(M) - Nrems2; 116 const unsigned int Nroot_1 = POW2(M / 2 - 1) - 1; 117 const unsigned int ColstartShift = (M + 1) / 2 + 1; 118 119 posA = POW2(M); /* 1/2 of POW2(M) complexes */ 120 posAi = posA + 1; 121 posB = posA + 2; 122 posBi = posB + 1; 123 124 iolimit = ioptr + Nrems2; 125 for (; ioptr < iolimit; ioptr += POW2(M / 2 + 1)) { 126 for (Colstart = Nroot_1; Colstart >= 0; Colstart--) { 127 iCol = Nroot_1; 128 p0r = ioptr + Nroot_1_ColInc + BRLow[Colstart] * 4; 129 IOP = ioptr + (Colstart << ColstartShift); 130 p1r = IOP + BRLow[iCol] * 4; 131 f0r = *(p0r); 132 f0i = *(p0r + 1); 133 f1r = *(p0r + posA); 134 f1i = *(p0r + posAi); 135 for (; iCol > Colstart;) { 136 f2r = *(p0r + 2); 137 f2i = *(p0r + (2 + 1)); 138 f3r = *(p0r + posB); 139 f3i = *(p0r + posBi); 140 f4r = *(p1r); 141 f4i = *(p1r + 1); 142 f5r = *(p1r + posA); 143 f5i = *(p1r + posAi); 144 f6r = *(p1r + 2); 145 f6i = *(p1r + (2 + 1)); 146 f7r = *(p1r + posB); 147 f7i = *(p1r + posBi); 148 149 t0r = f0r + f1r; 150 t0i = f0i + f1i; 151 f1r = f0r - f1r; 152 f1i = f0i - f1i; 153 t1r = f2r + f3r; 154 t1i = f2i + f3i; 155 f3r = f2r - f3r; 156 f3i = f2i - f3i; 157 f0r = f4r + f5r; 158 f0i = f4i + f5i; 159 f5r = f4r - f5r; 160 f5i = f4i - f5i; 161 f2r = f6r + f7r; 162 f2i = f6i + f7i; 163 f7r = f6r - f7r; 164 f7i = f6i - f7i; 165 166 *(p1r) = t0r; 167 *(p1r + 1) = t0i; 168 *(p1r + 2) = f1r; 169 *(p1r + (2 + 1)) = f1i; 170 *(p1r + posA) = t1r; 171 *(p1r + posAi) = t1i; 172 *(p1r + posB) = f3r; 173 *(p1r + posBi) = f3i; 174 *(p0r) = f0r; 175 *(p0r + 1) = f0i; 176 *(p0r + 2) = f5r; 177 *(p0r + (2 + 1)) = f5i; 178 *(p0r + posA) = f2r; 179 *(p0r + posAi) = f2i; 180 *(p0r + posB) = f7r; 181 *(p0r + posBi) = f7i; 182 183 p0r -= Nrems2; 184 f0r = *(p0r); 185 f0i = *(p0r + 1); 186 f1r = *(p0r + posA); 187 f1i = *(p0r + posAi); 188 iCol -= 1; 189 p1r = IOP + BRLow[iCol] * 4; 190 } 191 f2r = *(p0r + 2); 192 f2i = *(p0r + (2 + 1)); 193 f3r = *(p0r + posB); 194 f3i = *(p0r + posBi); 195 196 t0r = f0r + f1r; 197 t0i = f0i + f1i; 198 f1r = f0r - f1r; 199 f1i = f0i - f1i; 200 t1r = f2r + f3r; 201 t1i = f2i + f3i; 202 f3r = f2r - f3r; 203 f3i = f2i - f3i; 204 205 *(p0r) = t0r; 206 *(p0r + 1) = t0i; 207 *(p0r + 2) = f1r; 208 *(p0r + (2 + 1)) = f1i; 209 *(p0r + posA) = t1r; 210 *(p0r + posAi) = t1i; 211 *(p0r + posB) = f3r; 212 *(p0r + posBi) = f3i; 213 } 214 } 215 } 216 217 static void fft2pt(SPFLOAT *ioptr) 218 { 219 /*** RADIX 2 fft ***/ 220 SPFLOAT f0r, f0i, f1r, f1i; 221 SPFLOAT t0r, t0i; 222 223 /* bit reversed load */ 224 f0r = ioptr[0]; 225 f0i = ioptr[1]; 226 f1r = ioptr[2]; 227 f1i = ioptr[3]; 228 229 /* Butterflys */ 230 /* 231 f0 - - t0 232 f1 - 1 - f1 233 */ 234 235 t0r = f0r + f1r; 236 t0i = f0i + f1i; 237 f1r = f0r - f1r; 238 f1i = f0i - f1i; 239 240 /* store result */ 241 ioptr[0] = t0r; 242 ioptr[1] = t0i; 243 ioptr[2] = f1r; 244 ioptr[3] = f1i; 245 } 246 247 static void fft4pt(SPFLOAT *ioptr) 248 { 249 /*** RADIX 4 fft ***/ 250 SPFLOAT f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i; 251 SPFLOAT t0r, t0i, t1r, t1i; 252 253 /* bit reversed load */ 254 f0r = ioptr[0]; 255 f0i = ioptr[1]; 256 f1r = ioptr[4]; 257 f1i = ioptr[5]; 258 f2r = ioptr[2]; 259 f2i = ioptr[3]; 260 f3r = ioptr[6]; 261 f3i = ioptr[7]; 262 263 /* Butterflys */ 264 /* 265 f0 - - t0 - - f0 266 f1 - 1 - f1 - - f1 267 f2 - - f2 - 1 - f2 268 f3 - 1 - t1 - -i - f3 269 */ 270 271 t0r = f0r + f1r; 272 t0i = f0i + f1i; 273 f1r = f0r - f1r; 274 f1i = f0i - f1i; 275 276 t1r = f2r - f3r; 277 t1i = f2i - f3i; 278 f2r = f2r + f3r; 279 f2i = f2i + f3i; 280 281 f0r = t0r + f2r; 282 f0i = t0i + f2i; 283 f2r = t0r - f2r; 284 f2i = t0i - f2i; 285 286 f3r = f1r - t1i; 287 f3i = f1i + t1r; 288 f1r = f1r + t1i; 289 f1i = f1i - t1r; 290 291 /* store result */ 292 ioptr[0] = f0r; 293 ioptr[1] = f0i; 294 ioptr[2] = f1r; 295 ioptr[3] = f1i; 296 ioptr[4] = f2r; 297 ioptr[5] = f2i; 298 ioptr[6] = f3r; 299 ioptr[7] = f3i; 300 } 301 302 static void fft8pt(SPFLOAT *ioptr) 303 { 304 /*** RADIX 8 fft ***/ 305 SPFLOAT w0r = (SPFLOAT)(1.0 / MYROOT2); /* cos(pi/4) */ 306 SPFLOAT f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i; 307 SPFLOAT f4r, f4i, f5r, f5i, f6r, f6i, f7r, f7i; 308 SPFLOAT t0r, t0i, t1r, t1i; 309 const SPFLOAT Two = 2.0; 310 311 /* bit reversed load */ 312 f0r = ioptr[0]; 313 f0i = ioptr[1]; 314 f1r = ioptr[8]; 315 f1i = ioptr[9]; 316 f2r = ioptr[4]; 317 f2i = ioptr[5]; 318 f3r = ioptr[12]; 319 f3i = ioptr[13]; 320 f4r = ioptr[2]; 321 f4i = ioptr[3]; 322 f5r = ioptr[10]; 323 f5i = ioptr[11]; 324 f6r = ioptr[6]; 325 f6i = ioptr[7]; 326 f7r = ioptr[14]; 327 f7i = ioptr[15]; 328 /* Butterflys */ 329 /* 330 f0 - - t0 - - f0 - - f0 331 f1 - 1 - f1 - - f1 - - f1 332 f2 - - f2 - 1 - f2 - - f2 333 f3 - 1 - t1 - -i - f3 - - f3 334 f4 - - t0 - - f4 - 1 - t0 335 f5 - 1 - f5 - - f5 - w3 - f4 336 f6 - - f6 - 1 - f6 - -i - t1 337 f7 - 1 - t1 - -i - f7 - iw3- f6 338 */ 339 340 t0r = f0r + f1r; 341 t0i = f0i + f1i; 342 f1r = f0r - f1r; 343 f1i = f0i - f1i; 344 345 t1r = f2r - f3r; 346 t1i = f2i - f3i; 347 f2r = f2r + f3r; 348 f2i = f2i + f3i; 349 350 f0r = t0r + f2r; 351 f0i = t0i + f2i; 352 f2r = t0r - f2r; 353 f2i = t0i - f2i; 354 355 f3r = f1r - t1i; 356 f3i = f1i + t1r; 357 f1r = f1r + t1i; 358 f1i = f1i - t1r; 359 360 t0r = f4r + f5r; 361 t0i = f4i + f5i; 362 f5r = f4r - f5r; 363 f5i = f4i - f5i; 364 365 t1r = f6r - f7r; 366 t1i = f6i - f7i; 367 f6r = f6r + f7r; 368 f6i = f6i + f7i; 369 370 f4r = t0r + f6r; 371 f4i = t0i + f6i; 372 f6r = t0r - f6r; 373 f6i = t0i - f6i; 374 375 f7r = f5r - t1i; 376 f7i = f5i + t1r; 377 f5r = f5r + t1i; 378 f5i = f5i - t1r; 379 380 t0r = f0r - f4r; 381 t0i = f0i - f4i; 382 f0r = f0r + f4r; 383 f0i = f0i + f4i; 384 385 t1r = f2r - f6i; 386 t1i = f2i + f6r; 387 f2r = f2r + f6i; 388 f2i = f2i - f6r; 389 390 f4r = f1r - f5r * w0r - f5i * w0r; 391 f4i = f1i + f5r * w0r - f5i * w0r; 392 f1r = f1r * Two - f4r; 393 f1i = f1i * Two - f4i; 394 395 f6r = f3r + f7r * w0r - f7i * w0r; 396 f6i = f3i + f7r * w0r + f7i * w0r; 397 f3r = f3r * Two - f6r; 398 f3i = f3i * Two - f6i; 399 400 /* store result */ 401 ioptr[0] = f0r; 402 ioptr[1] = f0i; 403 ioptr[2] = f1r; 404 ioptr[3] = f1i; 405 ioptr[4] = f2r; 406 ioptr[5] = f2i; 407 ioptr[6] = f3r; 408 ioptr[7] = f3i; 409 ioptr[8] = t0r; 410 ioptr[9] = t0i; 411 ioptr[10] = f4r; 412 ioptr[11] = f4i; 413 ioptr[12] = t1r; 414 ioptr[13] = t1i; 415 ioptr[14] = f6r; 416 ioptr[15] = f6i; 417 } 418 419 static void bfR2(SPFLOAT *ioptr, int M, int NDiffU) 420 { 421 /*** 2nd radix 2 stage ***/ 422 unsigned int pos; 423 unsigned int posi; 424 unsigned int pinc; 425 unsigned int pnext; 426 unsigned int NSameU; 427 unsigned int SameUCnt; 428 429 SPFLOAT *pstrt; 430 SPFLOAT *p0r, *p1r, *p2r, *p3r; 431 432 SPFLOAT f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i; 433 SPFLOAT f4r, f4i, f5r, f5i, f6r, f6i, f7r, f7i; 434 435 pinc = NDiffU * 2; /* 2 floats per complex */ 436 pnext = pinc * 4; 437 pos = 2; 438 posi = pos + 1; 439 NSameU = POW2(M) / 4 / NDiffU; /* 4 Us at a time */ 440 pstrt = ioptr; 441 p0r = pstrt; 442 p1r = pstrt + pinc; 443 p2r = p1r + pinc; 444 p3r = p2r + pinc; 445 446 /* Butterflys */ 447 /* 448 f0 - - f4 449 f1 - 1 - f5 450 f2 - - f6 451 f3 - 1 - f7 452 */ 453 /* Butterflys */ 454 /* 455 f0 - - f4 456 f1 - 1 - f5 457 f2 - - f6 458 f3 - 1 - f7 459 */ 460 461 for (SameUCnt = NSameU; SameUCnt > 0; SameUCnt--) { 462 463 f0r = *p0r; 464 f1r = *p1r; 465 f0i = *(p0r + 1); 466 f1i = *(p1r + 1); 467 f2r = *p2r; 468 f3r = *p3r; 469 f2i = *(p2r + 1); 470 f3i = *(p3r + 1); 471 472 f4r = f0r + f1r; 473 f4i = f0i + f1i; 474 f5r = f0r - f1r; 475 f5i = f0i - f1i; 476 477 f6r = f2r + f3r; 478 f6i = f2i + f3i; 479 f7r = f2r - f3r; 480 f7i = f2i - f3i; 481 482 *p0r = f4r; 483 *(p0r + 1) = f4i; 484 *p1r = f5r; 485 *(p1r + 1) = f5i; 486 *p2r = f6r; 487 *(p2r + 1) = f6i; 488 *p3r = f7r; 489 *(p3r + 1) = f7i; 490 491 f0r = *(p0r + pos); 492 f1i = *(p1r + posi); 493 f0i = *(p0r + posi); 494 f1r = *(p1r + pos); 495 f2r = *(p2r + pos); 496 f3i = *(p3r + posi); 497 f2i = *(p2r + posi); 498 f3r = *(p3r + pos); 499 500 f4r = f0r + f1i; 501 f4i = f0i - f1r; 502 f5r = f0r - f1i; 503 f5i = f0i + f1r; 504 505 f6r = f2r + f3i; 506 f6i = f2i - f3r; 507 f7r = f2r - f3i; 508 f7i = f2i + f3r; 509 510 *(p0r + pos) = f4r; 511 *(p0r + posi) = f4i; 512 *(p1r + pos) = f5r; 513 *(p1r + posi) = f5i; 514 *(p2r + pos) = f6r; 515 *(p2r + posi) = f6i; 516 *(p3r + pos) = f7r; 517 *(p3r + posi) = f7i; 518 519 p0r += pnext; 520 p1r += pnext; 521 p2r += pnext; 522 p3r += pnext; 523 } 524 } 525 526 static void bfR4(SPFLOAT *ioptr, int M, int NDiffU) 527 { 528 /*** 1 radix 4 stage ***/ 529 unsigned int pos; 530 unsigned int posi; 531 unsigned int pinc; 532 unsigned int pnext; 533 unsigned int pnexti; 534 unsigned int NSameU; 535 unsigned int SameUCnt; 536 537 SPFLOAT *pstrt; 538 SPFLOAT *p0r, *p1r, *p2r, *p3r; 539 540 SPFLOAT w1r = 1.0 / MYROOT2; /* cos(pi/4) */ 541 SPFLOAT f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i; 542 SPFLOAT f4r, f4i, f5r, f5i, f6r, f6i, f7r, f7i; 543 SPFLOAT t1r, t1i; 544 const SPFLOAT Two = 2.0; 545 546 pinc = NDiffU * 2; /* 2 floats per complex */ 547 pnext = pinc * 4; 548 pnexti = pnext + 1; 549 pos = 2; 550 posi = pos + 1; 551 NSameU = POW2(M) / 4 / NDiffU; /* 4 pts per butterfly */ 552 pstrt = ioptr; 553 p0r = pstrt; 554 p1r = pstrt + pinc; 555 p2r = p1r + pinc; 556 p3r = p2r + pinc; 557 558 /* Butterflys */ 559 /* 560 f0 - - f0 - - f4 561 f1 - 1 - f5 - - f5 562 f2 - - f6 - 1 - f6 563 f3 - 1 - f3 - -i - f7 564 */ 565 /* Butterflys */ 566 /* 567 f0 - - f4 - - f4 568 f1 - -i - t1 - - f5 569 f2 - - f2 - w1 - f6 570 f3 - -i - f7 - iw1- f7 571 */ 572 573 f0r = *p0r; 574 f1r = *p1r; 575 f2r = *p2r; 576 f3r = *p3r; 577 f0i = *(p0r + 1); 578 f1i = *(p1r + 1); 579 f2i = *(p2r + 1); 580 f3i = *(p3r + 1); 581 582 f5r = f0r - f1r; 583 f5i = f0i - f1i; 584 f0r = f0r + f1r; 585 f0i = f0i + f1i; 586 587 f6r = f2r + f3r; 588 f6i = f2i + f3i; 589 f3r = f2r - f3r; 590 f3i = f2i - f3i; 591 592 for (SameUCnt = NSameU - 1; SameUCnt > 0; SameUCnt--) { 593 594 f7r = f5r - f3i; 595 f7i = f5i + f3r; 596 f5r = f5r + f3i; 597 f5i = f5i - f3r; 598 599 f4r = f0r + f6r; 600 f4i = f0i + f6i; 601 f6r = f0r - f6r; 602 f6i = f0i - f6i; 603 604 f2r = *(p2r + pos); 605 f2i = *(p2r + posi); 606 f1r = *(p1r + pos); 607 f1i = *(p1r + posi); 608 f3i = *(p3r + posi); 609 f0r = *(p0r + pos); 610 f3r = *(p3r + pos); 611 f0i = *(p0r + posi); 612 613 *p3r = f7r; 614 *p0r = f4r; 615 *(p3r + 1) = f7i; 616 *(p0r + 1) = f4i; 617 *p1r = f5r; 618 *p2r = f6r; 619 *(p1r + 1) = f5i; 620 *(p2r + 1) = f6i; 621 622 f7r = f2r - f3i; 623 f7i = f2i + f3r; 624 f2r = f2r + f3i; 625 f2i = f2i - f3r; 626 627 f4r = f0r + f1i; 628 f4i = f0i - f1r; 629 t1r = f0r - f1i; 630 t1i = f0i + f1r; 631 632 f5r = t1r - f7r * w1r + f7i * w1r; 633 f5i = t1i - f7r * w1r - f7i * w1r; 634 f7r = t1r * Two - f5r; 635 f7i = t1i * Two - f5i; 636 637 f6r = f4r - f2r * w1r - f2i * w1r; 638 f6i = f4i + f2r * w1r - f2i * w1r; 639 f4r = f4r * Two - f6r; 640 f4i = f4i * Two - f6i; 641 642 f3r = *(p3r + pnext); 643 f0r = *(p0r + pnext); 644 f3i = *(p3r + pnexti); 645 f0i = *(p0r + pnexti); 646 f2r = *(p2r + pnext); 647 f2i = *(p2r + pnexti); 648 f1r = *(p1r + pnext); 649 f1i = *(p1r + pnexti); 650 651 *(p2r + pos) = f6r; 652 *(p1r + pos) = f5r; 653 *(p2r + posi) = f6i; 654 *(p1r + posi) = f5i; 655 *(p3r + pos) = f7r; 656 *(p0r + pos) = f4r; 657 *(p3r + posi) = f7i; 658 *(p0r + posi) = f4i; 659 660 f6r = f2r + f3r; 661 f6i = f2i + f3i; 662 f3r = f2r - f3r; 663 f3i = f2i - f3i; 664 665 f5r = f0r - f1r; 666 f5i = f0i - f1i; 667 f0r = f0r + f1r; 668 f0i = f0i + f1i; 669 670 p3r += pnext; 671 p0r += pnext; 672 p1r += pnext; 673 p2r += pnext; 674 } 675 f7r = f5r - f3i; 676 f7i = f5i + f3r; 677 f5r = f5r + f3i; 678 f5i = f5i - f3r; 679 680 f4r = f0r + f6r; 681 f4i = f0i + f6i; 682 f6r = f0r - f6r; 683 f6i = f0i - f6i; 684 685 f2r = *(p2r + pos); 686 f2i = *(p2r + posi); 687 f1r = *(p1r + pos); 688 f1i = *(p1r + posi); 689 f3i = *(p3r + posi); 690 f0r = *(p0r + pos); 691 f3r = *(p3r + pos); 692 f0i = *(p0r + posi); 693 694 *p3r = f7r; 695 *p0r = f4r; 696 *(p3r + 1) = f7i; 697 *(p0r + 1) = f4i; 698 *p1r = f5r; 699 *p2r = f6r; 700 *(p1r + 1) = f5i; 701 *(p2r + 1) = f6i; 702 703 f7r = f2r - f3i; 704 f7i = f2i + f3r; 705 f2r = f2r + f3i; 706 f2i = f2i - f3r; 707 708 f4r = f0r + f1i; 709 f4i = f0i - f1r; 710 t1r = f0r - f1i; 711 t1i = f0i + f1r; 712 713 f5r = t1r - f7r * w1r + f7i * w1r; 714 f5i = t1i - f7r * w1r - f7i * w1r; 715 f7r = t1r * Two - f5r; 716 f7i = t1i * Two - f5i; 717 718 f6r = f4r - f2r * w1r - f2i * w1r; 719 f6i = f4i + f2r * w1r - f2i * w1r; 720 f4r = f4r * Two - f6r; 721 f4i = f4i * Two - f6i; 722 723 *(p2r + pos) = f6r; 724 *(p1r + pos) = f5r; 725 *(p2r + posi) = f6i; 726 *(p1r + posi) = f5i; 727 *(p3r + pos) = f7r; 728 *(p0r + pos) = f4r; 729 *(p3r + posi) = f7i; 730 *(p0r + posi) = f4i; 731 } 732 733 static void bfstages(SPFLOAT *ioptr, int M, SPFLOAT *Utbl, int Ustride, 734 int NDiffU, int StageCnt) 735 { 736 /*** RADIX 8 Stages ***/ 737 unsigned int pos; 738 unsigned int posi; 739 unsigned int pinc; 740 unsigned int pnext; 741 unsigned int NSameU; 742 int Uinc; 743 int Uinc2; 744 int Uinc4; 745 unsigned int DiffUCnt; 746 unsigned int SameUCnt; 747 unsigned int U2toU3; 748 749 SPFLOAT *pstrt; 750 SPFLOAT *p0r, *p1r, *p2r, *p3r; 751 SPFLOAT *u0r, *u0i, *u1r, *u1i, *u2r, *u2i; 752 753 SPFLOAT w0r, w0i, w1r, w1i, w2r, w2i, w3r, w3i; 754 SPFLOAT f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i; 755 SPFLOAT f4r, f4i, f5r, f5i, f6r, f6i, f7r, f7i; 756 SPFLOAT t0r, t0i, t1r, t1i; 757 const SPFLOAT Two = 2.0; 758 759 pinc = NDiffU * 2; /* 2 floats per complex */ 760 pnext = pinc * 8; 761 pos = pinc * 4; 762 posi = pos + 1; 763 NSameU = POW2(M) / 8 / NDiffU; /* 8 pts per butterfly */ 764 Uinc = (int) NSameU * Ustride; 765 Uinc2 = Uinc * 2; 766 Uinc4 = Uinc * 4; 767 U2toU3 = (POW2(M) / 8) * Ustride; 768 for (; StageCnt > 0; StageCnt--) { 769 770 u0r = &Utbl[0]; 771 u0i = &Utbl[POW2(M - 2) * Ustride]; 772 u1r = u0r; 773 u1i = u0i; 774 u2r = u0r; 775 u2i = u0i; 776 777 w0r = *u0r; 778 w0i = *u0i; 779 w1r = *u1r; 780 w1i = *u1i; 781 w2r = *u2r; 782 w2i = *u2i; 783 w3r = *(u2r + U2toU3); 784 w3i = *(u2i - U2toU3); 785 786 pstrt = ioptr; 787 788 p0r = pstrt; 789 p1r = pstrt + pinc; 790 p2r = p1r + pinc; 791 p3r = p2r + pinc; 792 793 /* Butterflys */ 794 /* 795 f0 - - t0 - - f0 - - f0 796 f1 - w0- f1 - - f1 - - f1 797 f2 - - f2 - w1- f2 - - f4 798 f3 - w0- t1 - iw1- f3 - - f5 799 800 f4 - - t0 - - f4 - w2- t0 801 f5 - w0- f5 - - f5 - w3- t1 802 f6 - - f6 - w1- f6 - iw2- f6 803 f7 - w0- t1 - iw1- f7 - iw3- f7 804 */ 805 806 for (DiffUCnt = NDiffU; DiffUCnt > 0; DiffUCnt--) { 807 f0r = *p0r; 808 f0i = *(p0r + 1); 809 f1r = *p1r; 810 f1i = *(p1r + 1); 811 for (SameUCnt = NSameU - 1; SameUCnt > 0; SameUCnt--) { 812 f2r = *p2r; 813 f2i = *(p2r + 1); 814 f3r = *p3r; 815 f3i = *(p3r + 1); 816 817 t0r = f0r + f1r * w0r + f1i * w0i; 818 t0i = f0i - f1r * w0i + f1i * w0r; 819 f1r = f0r * Two - t0r; 820 f1i = f0i * Two - t0i; 821 822 f4r = *(p0r + pos); 823 f4i = *(p0r + posi); 824 f5r = *(p1r + pos); 825 f5i = *(p1r + posi); 826 827 f6r = *(p2r + pos); 828 f6i = *(p2r + posi); 829 f7r = *(p3r + pos); 830 f7i = *(p3r + posi); 831 832 t1r = f2r - f3r * w0r - f3i * w0i; 833 t1i = f2i + f3r * w0i - f3i * w0r; 834 f2r = f2r * Two - t1r; 835 f2i = f2i * Two - t1i; 836 837 f0r = t0r + f2r * w1r + f2i * w1i; 838 f0i = t0i - f2r * w1i + f2i * w1r; 839 f2r = t0r * Two - f0r; 840 f2i = t0i * Two - f0i; 841 842 f3r = f1r + t1r * w1i - t1i * w1r; 843 f3i = f1i + t1r * w1r + t1i * w1i; 844 f1r = f1r * Two - f3r; 845 f1i = f1i * Two - f3i; 846 847 t0r = f4r + f5r * w0r + f5i * w0i; 848 t0i = f4i - f5r * w0i + f5i * w0r; 849 f5r = f4r * Two - t0r; 850 f5i = f4i * Two - t0i; 851 852 t1r = f6r - f7r * w0r - f7i * w0i; 853 t1i = f6i + f7r * w0i - f7i * w0r; 854 f6r = f6r * Two - t1r; 855 f6i = f6i * Two - t1i; 856 857 f4r = t0r + f6r * w1r + f6i * w1i; 858 f4i = t0i - f6r * w1i + f6i * w1r; 859 f6r = t0r * Two - f4r; 860 f6i = t0i * Two - f4i; 861 862 f7r = f5r + t1r * w1i - t1i * w1r; 863 f7i = f5i + t1r * w1r + t1i * w1i; 864 f5r = f5r * Two - f7r; 865 f5i = f5i * Two - f7i; 866 867 t0r = f0r - f4r * w2r - f4i * w2i; 868 t0i = f0i + f4r * w2i - f4i * w2r; 869 f0r = f0r * Two - t0r; 870 f0i = f0i * Two - t0i; 871 872 t1r = f1r - f5r * w3r - f5i * w3i; 873 t1i = f1i + f5r * w3i - f5i * w3r; 874 f1r = f1r * Two - t1r; 875 f1i = f1i * Two - t1i; 876 877 *(p0r + pos) = t0r; 878 *(p1r + pos) = t1r; 879 *(p0r + posi) = t0i; 880 *(p1r + posi) = t1i; 881 *p0r = f0r; 882 *p1r = f1r; 883 *(p0r + 1) = f0i; 884 *(p1r + 1) = f1i; 885 886 p0r += pnext; 887 f0r = *p0r; 888 f0i = *(p0r + 1); 889 890 p1r += pnext; 891 892 f1r = *p1r; 893 f1i = *(p1r + 1); 894 895 f4r = f2r - f6r * w2i + f6i * w2r; 896 f4i = f2i - f6r * w2r - f6i * w2i; 897 f6r = f2r * Two - f4r; 898 f6i = f2i * Two - f4i; 899 900 f5r = f3r - f7r * w3i + f7i * w3r; 901 f5i = f3i - f7r * w3r - f7i * w3i; 902 f7r = f3r * Two - f5r; 903 f7i = f3i * Two - f5i; 904 905 *p2r = f4r; 906 *p3r = f5r; 907 *(p2r + 1) = f4i; 908 *(p3r + 1) = f5i; 909 *(p2r + pos) = f6r; 910 *(p3r + pos) = f7r; 911 *(p2r + posi) = f6i; 912 *(p3r + posi) = f7i; 913 914 p2r += pnext; 915 p3r += pnext; 916 } 917 918 f2r = *p2r; 919 f2i = *(p2r + 1); 920 f3r = *p3r; 921 f3i = *(p3r + 1); 922 923 t0r = f0r + f1r * w0r + f1i * w0i; 924 t0i = f0i - f1r * w0i + f1i * w0r; 925 f1r = f0r * Two - t0r; 926 f1i = f0i * Two - t0i; 927 928 f4r = *(p0r + pos); 929 f4i = *(p0r + posi); 930 f5r = *(p1r + pos); 931 f5i = *(p1r + posi); 932 933 f6r = *(p2r + pos); 934 f6i = *(p2r + posi); 935 f7r = *(p3r + pos); 936 f7i = *(p3r + posi); 937 938 t1r = f2r - f3r * w0r - f3i * w0i; 939 t1i = f2i + f3r * w0i - f3i * w0r; 940 f2r = f2r * Two - t1r; 941 f2i = f2i * Two - t1i; 942 943 f0r = t0r + f2r * w1r + f2i * w1i; 944 f0i = t0i - f2r * w1i + f2i * w1r; 945 f2r = t0r * Two - f0r; 946 f2i = t0i * Two - f0i; 947 948 f3r = f1r + t1r * w1i - t1i * w1r; 949 f3i = f1i + t1r * w1r + t1i * w1i; 950 f1r = f1r * Two - f3r; 951 f1i = f1i * Two - f3i; 952 953 if ((int) DiffUCnt == NDiffU / 2) 954 Uinc4 = -Uinc4; 955 956 u0r += Uinc4; 957 u0i -= Uinc4; 958 u1r += Uinc2; 959 u1i -= Uinc2; 960 u2r += Uinc; 961 u2i -= Uinc; 962 963 pstrt += 2; 964 965 t0r = f4r + f5r * w0r + f5i * w0i; 966 t0i = f4i - f5r * w0i + f5i * w0r; 967 f5r = f4r * Two - t0r; 968 f5i = f4i * Two - t0i; 969 970 t1r = f6r - f7r * w0r - f7i * w0i; 971 t1i = f6i + f7r * w0i - f7i * w0r; 972 f6r = f6r * Two - t1r; 973 f6i = f6i * Two - t1i; 974 975 f4r = t0r + f6r * w1r + f6i * w1i; 976 f4i = t0i - f6r * w1i + f6i * w1r; 977 f6r = t0r * Two - f4r; 978 f6i = t0i * Two - f4i; 979 980 f7r = f5r + t1r * w1i - t1i * w1r; 981 f7i = f5i + t1r * w1r + t1i * w1i; 982 f5r = f5r * Two - f7r; 983 f5i = f5i * Two - f7i; 984 985 w0r = *u0r; 986 w0i = *u0i; 987 w1r = *u1r; 988 w1i = *u1i; 989 990 if ((int) DiffUCnt <= NDiffU / 2) 991 w0r = -w0r; 992 993 t0r = f0r - f4r * w2r - f4i * w2i; 994 t0i = f0i + f4r * w2i - f4i * w2r; 995 f0r = f0r * Two - t0r; 996 f0i = f0i * Two - t0i; 997 998 f4r = f2r - f6r * w2i + f6i * w2r; 999 f4i = f2i - f6r * w2r - f6i * w2i; 1000 f6r = f2r * Two - f4r; 1001 f6i = f2i * Two - f4i; 1002 1003 *(p0r + pos) = t0r; 1004 *p2r = f4r; 1005 *(p0r + posi) = t0i; 1006 *(p2r + 1) = f4i; 1007 w2r = *u2r; 1008 w2i = *u2i; 1009 *p0r = f0r; 1010 *(p2r + pos) = f6r; 1011 *(p0r + 1) = f0i; 1012 *(p2r + posi) = f6i; 1013 1014 p0r = pstrt; 1015 p2r = pstrt + pinc + pinc; 1016 1017 t1r = f1r - f5r * w3r - f5i * w3i; 1018 t1i = f1i + f5r * w3i - f5i * w3r; 1019 f1r = f1r * Two - t1r; 1020 f1i = f1i * Two - t1i; 1021 1022 f5r = f3r - f7r * w3i + f7i * w3r; 1023 f5i = f3i - f7r * w3r - f7i * w3i; 1024 f7r = f3r * Two - f5r; 1025 f7i = f3i * Two - f5i; 1026 1027 *(p1r + pos) = t1r; 1028 *p3r = f5r; 1029 *(p1r + posi) = t1i; 1030 *(p3r + 1) = f5i; 1031 w3r = *(u2r + U2toU3); 1032 w3i = *(u2i - U2toU3); 1033 *p1r = f1r; 1034 *(p3r + pos) = f7r; 1035 *(p1r + 1) = f1i; 1036 *(p3r + posi) = f7i; 1037 1038 p1r = pstrt + pinc; 1039 p3r = p2r + pinc; 1040 } 1041 NSameU /= 8; 1042 Uinc /= 8; 1043 Uinc2 /= 8; 1044 Uinc4 = Uinc * 4; 1045 NDiffU *= 8; 1046 pinc *= 8; 1047 pnext *= 8; 1048 pos *= 8; 1049 posi = pos + 1; 1050 } 1051 } 1052 1053 static void fftrecurs(SPFLOAT *ioptr, int M, SPFLOAT *Utbl, int Ustride, int NDiffU, 1054 int StageCnt) 1055 { 1056 /* recursive bfstages calls to maximize on chip cache efficiency */ 1057 int i1; 1058 1059 if (M <= (int) MCACHE) /* fits on chip ? */ 1060 bfstages(ioptr, M, Utbl, Ustride, NDiffU, StageCnt); /* RADIX 8 Stages */ 1061 else { 1062 for (i1 = 0; i1 < 8; i1++) { 1063 fftrecurs(&ioptr[i1 * POW2(M - 3) * 2], M - 3, Utbl, 8 * Ustride, 1064 NDiffU, StageCnt - 1); /* RADIX 8 Stages */ 1065 } 1066 bfstages(ioptr, M, Utbl, Ustride, POW2(M - 3), 1); /* RADIX 8 Stage */ 1067 } 1068 } 1069 1070 static void ffts1(SPFLOAT *ioptr, int M, SPFLOAT *Utbl, int16_t *BRLow) 1071 { 1072 /* Compute in-place complex fft on the rows of the input array */ 1073 /* INPUTS */ 1074 /* *ioptr = input data array */ 1075 /* M = log2 of fft size (ex M=10 for 1024 point fft) */ 1076 /* *Utbl = cosine table */ 1077 /* *BRLow = bit reversed counter table */ 1078 /* OUTPUTS */ 1079 /* *ioptr = output data array */ 1080 1081 int StageCnt; 1082 int NDiffU; 1083 1084 switch (M) { 1085 case 0: 1086 break; 1087 case 1: 1088 fft2pt(ioptr); /* a 2 pt fft */ 1089 break; 1090 case 2: 1091 fft4pt(ioptr); /* a 4 pt fft */ 1092 break; 1093 case 3: 1094 fft8pt(ioptr); /* an 8 pt fft */ 1095 break; 1096 default: 1097 bitrevR2(ioptr, M, BRLow); /* bit reverse and first radix 2 stage */ 1098 StageCnt = (M - 1) / 3; /* number of radix 8 stages */ 1099 NDiffU = 2; /* one radix 2 stage already complete */ 1100 if ((M - 1 - (StageCnt * 3)) == 1) { 1101 bfR2(ioptr, M, NDiffU); /* 1 radix 2 stage */ 1102 NDiffU *= 2; 1103 } 1104 if ((M - 1 - (StageCnt * 3)) == 2) { 1105 bfR4(ioptr, M, NDiffU); /* 1 radix 4 stage */ 1106 NDiffU *= 4; 1107 } 1108 if (M <= (int) MCACHE) 1109 bfstages(ioptr, M, Utbl, 1, NDiffU, StageCnt); /* RADIX 8 Stages */ 1110 else 1111 fftrecurs(ioptr, M, Utbl, 1, NDiffU, StageCnt); /* RADIX 8 Stages */ 1112 } 1113 } 1114 1115 /****************** 1116 * parts of iffts1 * 1117 ******************/ 1118 1119 static void scbitrevR2(SPFLOAT *ioptr, int M, int16_t *BRLow, SPFLOAT scale) 1120 { 1121 /*** scaled bit reverse and first radix 2 stage forward or inverse fft ***/ 1122 SPFLOAT f0r; 1123 SPFLOAT f0i; 1124 SPFLOAT f1r; 1125 SPFLOAT f1i; 1126 SPFLOAT f2r; 1127 SPFLOAT f2i; 1128 SPFLOAT f3r; 1129 SPFLOAT f3i; 1130 SPFLOAT f4r; 1131 SPFLOAT f4i; 1132 SPFLOAT f5r; 1133 SPFLOAT f5i; 1134 SPFLOAT f6r; 1135 SPFLOAT f6i; 1136 SPFLOAT f7r; 1137 SPFLOAT f7i; 1138 SPFLOAT t0r; 1139 SPFLOAT t0i; 1140 SPFLOAT t1r; 1141 SPFLOAT t1i; 1142 SPFLOAT *p0r; 1143 SPFLOAT *p1r; 1144 SPFLOAT *IOP; 1145 SPFLOAT *iolimit; 1146 int Colstart; 1147 int iCol; 1148 unsigned int posA; 1149 unsigned int posAi; 1150 unsigned int posB; 1151 unsigned int posBi; 1152 1153 const unsigned int Nrems2 = POW2((M + 3) / 2); 1154 const unsigned int Nroot_1_ColInc = POW2(M) - Nrems2; 1155 const unsigned int Nroot_1 = POW2(M / 2 - 1) - 1; 1156 const unsigned int ColstartShift = (M + 1) / 2 + 1; 1157 1158 posA = POW2(M); /* 1/2 of POW2(M) complexes */ 1159 posAi = posA + 1; 1160 posB = posA + 2; 1161 posBi = posB + 1; 1162 1163 iolimit = ioptr + Nrems2; 1164 for (; ioptr < iolimit; ioptr += POW2(M / 2 + 1)) { 1165 for (Colstart = Nroot_1; Colstart >= 0; Colstart--) { 1166 iCol = Nroot_1; 1167 p0r = ioptr + Nroot_1_ColInc + BRLow[Colstart] * 4; 1168 IOP = ioptr + (Colstart << ColstartShift); 1169 p1r = IOP + BRLow[iCol] * 4; 1170 f0r = *(p0r); 1171 f0i = *(p0r + 1); 1172 f1r = *(p0r + posA); 1173 f1i = *(p0r + posAi); 1174 for (; iCol > Colstart;) { 1175 f2r = *(p0r + 2); 1176 f2i = *(p0r + (2 + 1)); 1177 f3r = *(p0r + posB); 1178 f3i = *(p0r + posBi); 1179 f4r = *(p1r); 1180 f4i = *(p1r + 1); 1181 f5r = *(p1r + posA); 1182 f5i = *(p1r + posAi); 1183 f6r = *(p1r + 2); 1184 f6i = *(p1r + (2 + 1)); 1185 f7r = *(p1r + posB); 1186 f7i = *(p1r + posBi); 1187 1188 t0r = f0r + f1r; 1189 t0i = f0i + f1i; 1190 f1r = f0r - f1r; 1191 f1i = f0i - f1i; 1192 t1r = f2r + f3r; 1193 t1i = f2i + f3i; 1194 f3r = f2r - f3r; 1195 f3i = f2i - f3i; 1196 f0r = f4r + f5r; 1197 f0i = f4i + f5i; 1198 f5r = f4r - f5r; 1199 f5i = f4i - f5i; 1200 f2r = f6r + f7r; 1201 f2i = f6i + f7i; 1202 f7r = f6r - f7r; 1203 f7i = f6i - f7i; 1204 1205 *(p1r) = scale * t0r; 1206 *(p1r + 1) = scale * t0i; 1207 *(p1r + 2) = scale * f1r; 1208 *(p1r + (2 + 1)) = scale * f1i; 1209 *(p1r + posA) = scale * t1r; 1210 *(p1r + posAi) = scale * t1i; 1211 *(p1r + posB) = scale * f3r; 1212 *(p1r + posBi) = scale * f3i; 1213 *(p0r) = scale * f0r; 1214 *(p0r + 1) = scale * f0i; 1215 *(p0r + 2) = scale * f5r; 1216 *(p0r + (2 + 1)) = scale * f5i; 1217 *(p0r + posA) = scale * f2r; 1218 *(p0r + posAi) = scale * f2i; 1219 *(p0r + posB) = scale * f7r; 1220 *(p0r + posBi) = scale * f7i; 1221 1222 p0r -= Nrems2; 1223 f0r = *(p0r); 1224 f0i = *(p0r + 1); 1225 f1r = *(p0r + posA); 1226 f1i = *(p0r + posAi); 1227 iCol -= 1; 1228 p1r = IOP + BRLow[iCol] * 4; 1229 } 1230 f2r = *(p0r + 2); 1231 f2i = *(p0r + (2 + 1)); 1232 f3r = *(p0r + posB); 1233 f3i = *(p0r + posBi); 1234 1235 t0r = f0r + f1r; 1236 t0i = f0i + f1i; 1237 f1r = f0r - f1r; 1238 f1i = f0i - f1i; 1239 t1r = f2r + f3r; 1240 t1i = f2i + f3i; 1241 f3r = f2r - f3r; 1242 f3i = f2i - f3i; 1243 1244 *(p0r) = scale * t0r; 1245 *(p0r + 1) = scale * t0i; 1246 *(p0r + 2) = scale * f1r; 1247 *(p0r + (2 + 1)) = scale * f1i; 1248 *(p0r + posA) = scale * t1r; 1249 *(p0r + posAi) = scale * t1i; 1250 *(p0r + posB) = scale * f3r; 1251 *(p0r + posBi) = scale * f3i; 1252 } 1253 } 1254 } 1255 1256 static void ifft2pt(SPFLOAT *ioptr, SPFLOAT scale) 1257 { 1258 /*** RADIX 2 ifft ***/ 1259 SPFLOAT f0r, f0i, f1r, f1i; 1260 SPFLOAT t0r, t0i; 1261 1262 /* bit reversed load */ 1263 f0r = ioptr[0]; 1264 f0i = ioptr[1]; 1265 f1r = ioptr[2]; 1266 f1i = ioptr[3]; 1267 1268 /* Butterflys */ 1269 /* 1270 f0 - - t0 1271 f1 - 1 - f1 1272 */ 1273 1274 t0r = f0r + f1r; 1275 t0i = f0i + f1i; 1276 f1r = f0r - f1r; 1277 f1i = f0i - f1i; 1278 1279 /* store result */ 1280 ioptr[0] = scale * t0r; 1281 ioptr[1] = scale * t0i; 1282 ioptr[2] = scale * f1r; 1283 ioptr[3] = scale * f1i; 1284 } 1285 1286 static void ifft4pt(SPFLOAT *ioptr, SPFLOAT scale) 1287 { 1288 /*** RADIX 4 ifft ***/ 1289 SPFLOAT f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i; 1290 SPFLOAT t0r, t0i, t1r, t1i; 1291 1292 /* bit reversed load */ 1293 f0r = ioptr[0]; 1294 f0i = ioptr[1]; 1295 f1r = ioptr[4]; 1296 f1i = ioptr[5]; 1297 f2r = ioptr[2]; 1298 f2i = ioptr[3]; 1299 f3r = ioptr[6]; 1300 f3i = ioptr[7]; 1301 1302 /* Butterflys */ 1303 /* 1304 f0 - - t0 - - f0 1305 f1 - 1 - f1 - - f1 1306 f2 - - f2 - 1 - f2 1307 f3 - 1 - t1 - i - f3 1308 */ 1309 1310 t0r = f0r + f1r; 1311 t0i = f0i + f1i; 1312 f1r = f0r - f1r; 1313 f1i = f0i - f1i; 1314 1315 t1r = f2r - f3r; 1316 t1i = f2i - f3i; 1317 f2r = f2r + f3r; 1318 f2i = f2i + f3i; 1319 1320 f0r = t0r + f2r; 1321 f0i = t0i + f2i; 1322 f2r = t0r - f2r; 1323 f2i = t0i - f2i; 1324 1325 f3r = f1r + t1i; 1326 f3i = f1i - t1r; 1327 f1r = f1r - t1i; 1328 f1i = f1i + t1r; 1329 1330 /* store result */ 1331 ioptr[0] = scale * f0r; 1332 ioptr[1] = scale * f0i; 1333 ioptr[2] = scale * f1r; 1334 ioptr[3] = scale * f1i; 1335 ioptr[4] = scale * f2r; 1336 ioptr[5] = scale * f2i; 1337 ioptr[6] = scale * f3r; 1338 ioptr[7] = scale * f3i; 1339 } 1340 1341 static void ifft8pt(SPFLOAT *ioptr, SPFLOAT scale) 1342 { 1343 /*** RADIX 8 ifft ***/ 1344 SPFLOAT w0r = 1.0 / MYROOT2; /* cos(pi/4) */ 1345 SPFLOAT f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i; 1346 SPFLOAT f4r, f4i, f5r, f5i, f6r, f6i, f7r, f7i; 1347 SPFLOAT t0r, t0i, t1r, t1i; 1348 const SPFLOAT Two = 2.0; 1349 1350 /* bit reversed load */ 1351 f0r = ioptr[0]; 1352 f0i = ioptr[1]; 1353 f1r = ioptr[8]; 1354 f1i = ioptr[9]; 1355 f2r = ioptr[4]; 1356 f2i = ioptr[5]; 1357 f3r = ioptr[12]; 1358 f3i = ioptr[13]; 1359 f4r = ioptr[2]; 1360 f4i = ioptr[3]; 1361 f5r = ioptr[10]; 1362 f5i = ioptr[11]; 1363 f6r = ioptr[6]; 1364 f6i = ioptr[7]; 1365 f7r = ioptr[14]; 1366 f7i = ioptr[15]; 1367 1368 /* Butterflys */ 1369 /* 1370 f0 - - t0 - - f0 - - f0 1371 f1 - 1 - f1 - - f1 - - f1 1372 f2 - - f2 - 1 - f2 - - f2 1373 f3 - 1 - t1 - i - f3 - - f3 1374 f4 - - t0 - - f4 - 1 - t0 1375 f5 - 1 - f5 - - f5 - w3 - f4 1376 f6 - - f6 - 1 - f6 - i - t1 1377 f7 - 1 - t1 - i - f7 - iw3- f6 1378 */ 1379 1380 t0r = f0r + f1r; 1381 t0i = f0i + f1i; 1382 f1r = f0r - f1r; 1383 f1i = f0i - f1i; 1384 1385 t1r = f2r - f3r; 1386 t1i = f2i - f3i; 1387 f2r = f2r + f3r; 1388 f2i = f2i + f3i; 1389 1390 f0r = t0r + f2r; 1391 f0i = t0i + f2i; 1392 f2r = t0r - f2r; 1393 f2i = t0i - f2i; 1394 1395 f3r = f1r + t1i; 1396 f3i = f1i - t1r; 1397 f1r = f1r - t1i; 1398 f1i = f1i + t1r; 1399 1400 t0r = f4r + f5r; 1401 t0i = f4i + f5i; 1402 f5r = f4r - f5r; 1403 f5i = f4i - f5i; 1404 1405 t1r = f6r - f7r; 1406 t1i = f6i - f7i; 1407 f6r = f6r + f7r; 1408 f6i = f6i + f7i; 1409 1410 f4r = t0r + f6r; 1411 f4i = t0i + f6i; 1412 f6r = t0r - f6r; 1413 f6i = t0i - f6i; 1414 1415 f7r = f5r + t1i; 1416 f7i = f5i - t1r; 1417 f5r = f5r - t1i; 1418 f5i = f5i + t1r; 1419 1420 t0r = f0r - f4r; 1421 t0i = f0i - f4i; 1422 f0r = f0r + f4r; 1423 f0i = f0i + f4i; 1424 1425 t1r = f2r + f6i; 1426 t1i = f2i - f6r; 1427 f2r = f2r - f6i; 1428 f2i = f2i + f6r; 1429 1430 f4r = f1r - f5r * w0r + f5i * w0r; 1431 f4i = f1i - f5r * w0r - f5i * w0r; 1432 f1r = f1r * Two - f4r; 1433 f1i = f1i * Two - f4i; 1434 1435 f6r = f3r + f7r * w0r + f7i * w0r; 1436 f6i = f3i - f7r * w0r + f7i * w0r; 1437 f3r = f3r * Two - f6r; 1438 f3i = f3i * Two - f6i; 1439 1440 /* store result */ 1441 ioptr[0] = scale * f0r; 1442 ioptr[1] = scale * f0i; 1443 ioptr[2] = scale * f1r; 1444 ioptr[3] = scale * f1i; 1445 ioptr[4] = scale * f2r; 1446 ioptr[5] = scale * f2i; 1447 ioptr[6] = scale * f3r; 1448 ioptr[7] = scale * f3i; 1449 ioptr[8] = scale * t0r; 1450 ioptr[9] = scale * t0i; 1451 ioptr[10] = scale * f4r; 1452 ioptr[11] = scale * f4i; 1453 ioptr[12] = scale * t1r; 1454 ioptr[13] = scale * t1i; 1455 ioptr[14] = scale * f6r; 1456 ioptr[15] = scale * f6i; 1457 } 1458 1459 static void ibfR2(SPFLOAT *ioptr, int M, int NDiffU) 1460 { 1461 /*** 2nd radix 2 stage ***/ 1462 unsigned int pos; 1463 unsigned int posi; 1464 unsigned int pinc; 1465 unsigned int pnext; 1466 unsigned int NSameU; 1467 unsigned int SameUCnt; 1468 1469 SPFLOAT *pstrt; 1470 SPFLOAT *p0r, *p1r, *p2r, *p3r; 1471 1472 SPFLOAT f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i; 1473 SPFLOAT f4r, f4i, f5r, f5i, f6r, f6i, f7r, f7i; 1474 1475 pinc = NDiffU * 2; /* 2 floats per complex */ 1476 pnext = pinc * 4; 1477 pos = 2; 1478 posi = pos + 1; 1479 NSameU = POW2(M) / 4 / NDiffU; /* 4 Us at a time */ 1480 pstrt = ioptr; 1481 p0r = pstrt; 1482 p1r = pstrt + pinc; 1483 p2r = p1r + pinc; 1484 p3r = p2r + pinc; 1485 1486 /* Butterflys */ 1487 /* 1488 f0 - - f4 1489 f1 - 1 - f5 1490 f2 - - f6 1491 f3 - 1 - f7 1492 */ 1493 /* Butterflys */ 1494 /* 1495 f0 - - f4 1496 f1 - 1 - f5 1497 f2 - - f6 1498 f3 - 1 - f7 1499 */ 1500 1501 for (SameUCnt = NSameU; SameUCnt > 0; SameUCnt--) { 1502 1503 f0r = *p0r; 1504 f1r = *p1r; 1505 f0i = *(p0r + 1); 1506 f1i = *(p1r + 1); 1507 f2r = *p2r; 1508 f3r = *p3r; 1509 f2i = *(p2r + 1); 1510 f3i = *(p3r + 1); 1511 1512 f4r = f0r + f1r; 1513 f4i = f0i + f1i; 1514 f5r = f0r - f1r; 1515 f5i = f0i - f1i; 1516 1517 f6r = f2r + f3r; 1518 f6i = f2i + f3i; 1519 f7r = f2r - f3r; 1520 f7i = f2i - f3i; 1521 1522 *p0r = f4r; 1523 *(p0r + 1) = f4i; 1524 *p1r = f5r; 1525 *(p1r + 1) = f5i; 1526 *p2r = f6r; 1527 *(p2r + 1) = f6i; 1528 *p3r = f7r; 1529 *(p3r + 1) = f7i; 1530 1531 f0r = *(p0r + pos); 1532 f1i = *(p1r + posi); 1533 f0i = *(p0r + posi); 1534 f1r = *(p1r + pos); 1535 f2r = *(p2r + pos); 1536 f3i = *(p3r + posi); 1537 f2i = *(p2r + posi); 1538 f3r = *(p3r + pos); 1539 1540 f4r = f0r - f1i; 1541 f4i = f0i + f1r; 1542 f5r = f0r + f1i; 1543 f5i = f0i - f1r; 1544 1545 f6r = f2r - f3i; 1546 f6i = f2i + f3r; 1547 f7r = f2r + f3i; 1548 f7i = f2i - f3r; 1549 1550 *(p0r + pos) = f4r; 1551 *(p0r + posi) = f4i; 1552 *(p1r + pos) = f5r; 1553 *(p1r + posi) = f5i; 1554 *(p2r + pos) = f6r; 1555 *(p2r + posi) = f6i; 1556 *(p3r + pos) = f7r; 1557 *(p3r + posi) = f7i; 1558 1559 p0r += pnext; 1560 p1r += pnext; 1561 p2r += pnext; 1562 p3r += pnext; 1563 } 1564 } 1565 1566 static void ibfR4(SPFLOAT *ioptr, int M, int NDiffU) 1567 { 1568 /*** 1 radix 4 stage ***/ 1569 unsigned int pos; 1570 unsigned int posi; 1571 unsigned int pinc; 1572 unsigned int pnext; 1573 unsigned int pnexti; 1574 unsigned int NSameU; 1575 unsigned int SameUCnt; 1576 1577 SPFLOAT *pstrt; 1578 SPFLOAT *p0r, *p1r, *p2r, *p3r; 1579 1580 SPFLOAT w1r = 1.0 / MYROOT2; /* cos(pi/4) */ 1581 SPFLOAT f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i; 1582 SPFLOAT f4r, f4i, f5r, f5i, f6r, f6i, f7r, f7i; 1583 SPFLOAT t1r, t1i; 1584 const SPFLOAT Two = 2.0; 1585 1586 pinc = NDiffU * 2; /* 2 floats per complex */ 1587 pnext = pinc * 4; 1588 pnexti = pnext + 1; 1589 pos = 2; 1590 posi = pos + 1; 1591 NSameU = POW2(M) / 4 / NDiffU; /* 4 pts per butterfly */ 1592 pstrt = ioptr; 1593 p0r = pstrt; 1594 p1r = pstrt + pinc; 1595 p2r = p1r + pinc; 1596 p3r = p2r + pinc; 1597 1598 /* Butterflys */ 1599 /* 1600 f0 - - f0 - - f4 1601 f1 - 1 - f5 - - f5 1602 f2 - - f6 - 1 - f6 1603 f3 - 1 - f3 - -i - f7 1604 */ 1605 /* Butterflys */ 1606 /* 1607 f0 - - f4 - - f4 1608 f1 - -i - t1 - - f5 1609 f2 - - f2 - w1 - f6 1610 f3 - -i - f7 - iw1- f7 1611 */ 1612 1613 f0r = *p0r; 1614 f1r = *p1r; 1615 f2r = *p2r; 1616 f3r = *p3r; 1617 f0i = *(p0r + 1); 1618 f1i = *(p1r + 1); 1619 f2i = *(p2r + 1); 1620 f3i = *(p3r + 1); 1621 1622 f5r = f0r - f1r; 1623 f5i = f0i - f1i; 1624 f0r = f0r + f1r; 1625 f0i = f0i + f1i; 1626 1627 f6r = f2r + f3r; 1628 f6i = f2i + f3i; 1629 f3r = f2r - f3r; 1630 f3i = f2i - f3i; 1631 1632 for (SameUCnt = NSameU - 1; SameUCnt > 0; SameUCnt--) { 1633 1634 f7r = f5r + f3i; 1635 f7i = f5i - f3r; 1636 f5r = f5r - f3i; 1637 f5i = f5i + f3r; 1638 1639 f4r = f0r + f6r; 1640 f4i = f0i + f6i; 1641 f6r = f0r - f6r; 1642 f6i = f0i - f6i; 1643 1644 f2r = *(p2r + pos); 1645 f2i = *(p2r + posi); 1646 f1r = *(p1r + pos); 1647 f1i = *(p1r + posi); 1648 f3i = *(p3r + posi); 1649 f0r = *(p0r + pos); 1650 f3r = *(p3r + pos); 1651 f0i = *(p0r + posi); 1652 1653 *p3r = f7r; 1654 *p0r = f4r; 1655 *(p3r + 1) = f7i; 1656 *(p0r + 1) = f4i; 1657 *p1r = f5r; 1658 *p2r = f6r; 1659 *(p1r + 1) = f5i; 1660 *(p2r + 1) = f6i; 1661 1662 f7r = f2r + f3i; 1663 f7i = f2i - f3r; 1664 f2r = f2r - f3i; 1665 f2i = f2i + f3r; 1666 1667 f4r = f0r - f1i; 1668 f4i = f0i + f1r; 1669 t1r = f0r + f1i; 1670 t1i = f0i - f1r; 1671 1672 f5r = t1r - f7r * w1r - f7i * w1r; 1673 f5i = t1i + f7r * w1r - f7i * w1r; 1674 f7r = t1r * Two - f5r; 1675 f7i = t1i * Two - f5i; 1676 1677 f6r = f4r - f2r * w1r + f2i * w1r; 1678 f6i = f4i - f2r * w1r - f2i * w1r; 1679 f4r = f4r * Two - f6r; 1680 f4i = f4i * Two - f6i; 1681 1682 f3r = *(p3r + pnext); 1683 f0r = *(p0r + pnext); 1684 f3i = *(p3r + pnexti); 1685 f0i = *(p0r + pnexti); 1686 f2r = *(p2r + pnext); 1687 f2i = *(p2r + pnexti); 1688 f1r = *(p1r + pnext); 1689 f1i = *(p1r + pnexti); 1690 1691 *(p2r + pos) = f6r; 1692 *(p1r + pos) = f5r; 1693 *(p2r + posi) = f6i; 1694 *(p1r + posi) = f5i; 1695 *(p3r + pos) = f7r; 1696 *(p0r + pos) = f4r; 1697 *(p3r + posi) = f7i; 1698 *(p0r + posi) = f4i; 1699 1700 f6r = f2r + f3r; 1701 f6i = f2i + f3i; 1702 f3r = f2r - f3r; 1703 f3i = f2i - f3i; 1704 1705 f5r = f0r - f1r; 1706 f5i = f0i - f1i; 1707 f0r = f0r + f1r; 1708 f0i = f0i + f1i; 1709 1710 p3r += pnext; 1711 p0r += pnext; 1712 p1r += pnext; 1713 p2r += pnext; 1714 } 1715 f7r = f5r + f3i; 1716 f7i = f5i - f3r; 1717 f5r = f5r - f3i; 1718 f5i = f5i + f3r; 1719 1720 f4r = f0r + f6r; 1721 f4i = f0i + f6i; 1722 f6r = f0r - f6r; 1723 f6i = f0i - f6i; 1724 1725 f2r = *(p2r + pos); 1726 f2i = *(p2r + posi); 1727 f1r = *(p1r + pos); 1728 f1i = *(p1r + posi); 1729 f3i = *(p3r + posi); 1730 f0r = *(p0r + pos); 1731 f3r = *(p3r + pos); 1732 f0i = *(p0r + posi); 1733 1734 *p3r = f7r; 1735 *p0r = f4r; 1736 *(p3r + 1) = f7i; 1737 *(p0r + 1) = f4i; 1738 *p1r = f5r; 1739 *p2r = f6r; 1740 *(p1r + 1) = f5i; 1741 *(p2r + 1) = f6i; 1742 1743 f7r = f2r + f3i; 1744 f7i = f2i - f3r; 1745 f2r = f2r - f3i; 1746 f2i = f2i + f3r; 1747 1748 f4r = f0r - f1i; 1749 f4i = f0i + f1r; 1750 t1r = f0r + f1i; 1751 t1i = f0i - f1r; 1752 1753 f5r = t1r - f7r * w1r - f7i * w1r; 1754 f5i = t1i + f7r * w1r - f7i * w1r; 1755 f7r = t1r * Two - f5r; 1756 f7i = t1i * Two - f5i; 1757 1758 f6r = f4r - f2r * w1r + f2i * w1r; 1759 f6i = f4i - f2r * w1r - f2i * w1r; 1760 f4r = f4r * Two - f6r; 1761 f4i = f4i * Two - f6i; 1762 1763 *(p2r + pos) = f6r; 1764 *(p1r + pos) = f5r; 1765 *(p2r + posi) = f6i; 1766 *(p1r + posi) = f5i; 1767 *(p3r + pos) = f7r; 1768 *(p0r + pos) = f4r; 1769 *(p3r + posi) = f7i; 1770 *(p0r + posi) = f4i; 1771 } 1772 1773 static void ibfstages(SPFLOAT *ioptr, int M, SPFLOAT *Utbl, int Ustride, 1774 int NDiffU, int StageCnt) 1775 { 1776 /*** RADIX 8 Stages ***/ 1777 unsigned int pos; 1778 unsigned int posi; 1779 unsigned int pinc; 1780 unsigned int pnext; 1781 unsigned int NSameU; 1782 int Uinc; 1783 int Uinc2; 1784 int Uinc4; 1785 unsigned int DiffUCnt; 1786 unsigned int SameUCnt; 1787 unsigned int U2toU3; 1788 1789 SPFLOAT *pstrt; 1790 SPFLOAT *p0r, *p1r, *p2r, *p3r; 1791 SPFLOAT *u0r, *u0i, *u1r, *u1i, *u2r, *u2i; 1792 1793 SPFLOAT w0r, w0i, w1r, w1i, w2r, w2i, w3r, w3i; 1794 SPFLOAT f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i; 1795 SPFLOAT f4r, f4i, f5r, f5i, f6r, f6i, f7r, f7i; 1796 SPFLOAT t0r, t0i, t1r, t1i; 1797 const SPFLOAT Two = 2.0; 1798 1799 pinc = NDiffU * 2; /* 2 floats per complex */ 1800 pnext = pinc * 8; 1801 pos = pinc * 4; 1802 posi = pos + 1; 1803 NSameU = POW2(M) / 8 / NDiffU; /* 8 pts per butterfly */ 1804 Uinc = (int) NSameU * Ustride; 1805 Uinc2 = Uinc * 2; 1806 Uinc4 = Uinc * 4; 1807 U2toU3 = (POW2(M) / 8) * Ustride; 1808 for (; StageCnt > 0; StageCnt--) { 1809 1810 u0r = &Utbl[0]; 1811 u0i = &Utbl[POW2(M - 2) * Ustride]; 1812 u1r = u0r; 1813 u1i = u0i; 1814 u2r = u0r; 1815 u2i = u0i; 1816 1817 w0r = *u0r; 1818 w0i = *u0i; 1819 w1r = *u1r; 1820 w1i = *u1i; 1821 w2r = *u2r; 1822 w2i = *u2i; 1823 w3r = *(u2r + U2toU3); 1824 w3i = *(u2i - U2toU3); 1825 1826 pstrt = ioptr; 1827 1828 p0r = pstrt; 1829 p1r = pstrt + pinc; 1830 p2r = p1r + pinc; 1831 p3r = p2r + pinc; 1832 1833 /* Butterflys */ 1834 /* 1835 f0 - - t0 - - f0 - - f0 1836 f1 - w0- f1 - - f1 - - f1 1837 f2 - - f2 - w1- f2 - - f4 1838 f3 - w0- t1 - iw1- f3 - - f5 1839 1840 f4 - - t0 - - f4 - w2- t0 1841 f5 - w0- f5 - - f5 - w3- t1 1842 f6 - - f6 - w1- f6 - iw2- f6 1843 f7 - w0- t1 - iw1- f7 - iw3- f7 1844 */ 1845 1846 for (DiffUCnt = NDiffU; DiffUCnt > 0; DiffUCnt--) { 1847 f0r = *p0r; 1848 f0i = *(p0r + 1); 1849 f1r = *p1r; 1850 f1i = *(p1r + 1); 1851 for (SameUCnt = NSameU - 1; SameUCnt > 0; SameUCnt--) { 1852 f2r = *p2r; 1853 f2i = *(p2r + 1); 1854 f3r = *p3r; 1855 f3i = *(p3r + 1); 1856 1857 t0r = f0r + f1r * w0r - f1i * w0i; 1858 t0i = f0i + f1r * w0i + f1i * w0r; 1859 f1r = f0r * Two - t0r; 1860 f1i = f0i * Two - t0i; 1861 1862 f4r = *(p0r + pos); 1863 f4i = *(p0r + posi); 1864 f5r = *(p1r + pos); 1865 f5i = *(p1r + posi); 1866 1867 f6r = *(p2r + pos); 1868 f6i = *(p2r + posi); 1869 f7r = *(p3r + pos); 1870 f7i = *(p3r + posi); 1871 1872 t1r = f2r - f3r * w0r + f3i * w0i; 1873 t1i = f2i - f3r * w0i - f3i * w0r; 1874 f2r = f2r * Two - t1r; 1875 f2i = f2i * Two - t1i; 1876 1877 f0r = t0r + f2r * w1r - f2i * w1i; 1878 f0i = t0i + f2r * w1i + f2i * w1r; 1879 f2r = t0r * Two - f0r; 1880 f2i = t0i * Two - f0i; 1881 1882 f3r = f1r + t1r * w1i + t1i * w1r; 1883 f3i = f1i - t1r * w1r + t1i * w1i; 1884 f1r = f1r * Two - f3r; 1885 f1i = f1i * Two - f3i; 1886 1887 t0r = f4r + f5r * w0r - f5i * w0i; 1888 t0i = f4i + f5r * w0i + f5i * w0r; 1889 f5r = f4r * Two - t0r; 1890 f5i = f4i * Two - t0i; 1891 1892 t1r = f6r - f7r * w0r + f7i * w0i; 1893 t1i = f6i - f7r * w0i - f7i * w0r; 1894 f6r = f6r * Two - t1r; 1895 f6i = f6i * Two - t1i; 1896 1897 f4r = t0r + f6r * w1r - f6i * w1i; 1898 f4i = t0i + f6r * w1i + f6i * w1r; 1899 f6r = t0r * Two - f4r; 1900 f6i = t0i * Two - f4i; 1901 1902 f7r = f5r + t1r * w1i + t1i * w1r; 1903 f7i = f5i - t1r * w1r + t1i * w1i; 1904 f5r = f5r * Two - f7r; 1905 f5i = f5i * Two - f7i; 1906 1907 t0r = f0r - f4r * w2r + f4i * w2i; 1908 t0i = f0i - f4r * w2i - f4i * w2r; 1909 f0r = f0r * Two - t0r; 1910 f0i = f0i * Two - t0i; 1911 1912 t1r = f1r - f5r * w3r + f5i * w3i; 1913 t1i = f1i - f5r * w3i - f5i * w3r; 1914 f1r = f1r * Two - t1r; 1915 f1i = f1i * Two - t1i; 1916 1917 *(p0r + pos) = t0r; 1918 *(p0r + posi) = t0i; 1919 *p0r = f0r; 1920 *(p0r + 1) = f0i; 1921 1922 p0r += pnext; 1923 f0r = *p0r; 1924 f0i = *(p0r + 1); 1925 1926 *(p1r + pos) = t1r; 1927 *(p1r + posi) = t1i; 1928 *p1r = f1r; 1929 *(p1r + 1) = f1i; 1930 1931 p1r += pnext; 1932 1933 f1r = *p1r; 1934 f1i = *(p1r + 1); 1935 1936 f4r = f2r - f6r * w2i - f6i * w2r; 1937 f4i = f2i + f6r * w2r - f6i * w2i; 1938 f6r = f2r * Two - f4r; 1939 f6i = f2i * Two - f4i; 1940 1941 f5r = f3r - f7r * w3i - f7i * w3r; 1942 f5i = f3i + f7r * w3r - f7i * w3i; 1943 f7r = f3r * Two - f5r; 1944 f7i = f3i * Two - f5i; 1945 1946 *p2r = f4r; 1947 *(p2r + 1) = f4i; 1948 *(p2r + pos) = f6r; 1949 *(p2r + posi) = f6i; 1950 1951 p2r += pnext; 1952 1953 *p3r = f5r; 1954 *(p3r + 1) = f5i; 1955 *(p3r + pos) = f7r; 1956 *(p3r + posi) = f7i; 1957 1958 p3r += pnext; 1959 } 1960 1961 f2r = *p2r; 1962 f2i = *(p2r + 1); 1963 f3r = *p3r; 1964 f3i = *(p3r + 1); 1965 1966 t0r = f0r + f1r * w0r - f1i * w0i; 1967 t0i = f0i + f1r * w0i + f1i * w0r; 1968 f1r = f0r * Two - t0r; 1969 f1i = f0i * Two - t0i; 1970 1971 f4r = *(p0r + pos); 1972 f4i = *(p0r + posi); 1973 f5r = *(p1r + pos); 1974 f5i = *(p1r + posi); 1975 1976 f6r = *(p2r + pos); 1977 f6i = *(p2r + posi); 1978 f7r = *(p3r + pos); 1979 f7i = *(p3r + posi); 1980 1981 t1r = f2r - f3r * w0r + f3i * w0i; 1982 t1i = f2i - f3r * w0i - f3i * w0r; 1983 f2r = f2r * Two - t1r; 1984 f2i = f2i * Two - t1i; 1985 1986 f0r = t0r + f2r * w1r - f2i * w1i; 1987 f0i = t0i + f2r * w1i + f2i * w1r; 1988 f2r = t0r * Two - f0r; 1989 f2i = t0i * Two - f0i; 1990 1991 f3r = f1r + t1r * w1i + t1i * w1r; 1992 f3i = f1i - t1r * w1r + t1i * w1i; 1993 f1r = f1r * Two - f3r; 1994 f1i = f1i * Two - f3i; 1995 1996 if ((int) DiffUCnt == NDiffU / 2) 1997 Uinc4 = -Uinc4; 1998 1999 u0r += Uinc4; 2000 u0i -= Uinc4; 2001 u1r += Uinc2; 2002 u1i -= Uinc2; 2003 u2r += Uinc; 2004 u2i -= Uinc; 2005 2006 pstrt += 2; 2007 2008 t0r = f4r + f5r * w0r - f5i * w0i; 2009 t0i = f4i + f5r * w0i + f5i * w0r; 2010 f5r = f4r * Two - t0r; 2011 f5i = f4i * Two - t0i; 2012 2013 t1r = f6r - f7r * w0r + f7i * w0i; 2014 t1i = f6i - f7r * w0i - f7i * w0r; 2015 f6r = f6r * Two - t1r; 2016 f6i = f6i * Two - t1i; 2017 2018 f4r = t0r + f6r * w1r - f6i * w1i; 2019 f4i = t0i + f6r * w1i + f6i * w1r; 2020 f6r = t0r * Two - f4r; 2021 f6i = t0i * Two - f4i; 2022 2023 f7r = f5r + t1r * w1i + t1i * w1r; 2024 f7i = f5i - t1r * w1r + t1i * w1i; 2025 f5r = f5r * Two - f7r; 2026 f5i = f5i * Two - f7i; 2027 2028 w0r = *u0r; 2029 w0i = *u0i; 2030 w1r = *u1r; 2031 w1i = *u1i; 2032 2033 if ((int) DiffUCnt <= NDiffU / 2) 2034 w0r = -w0r; 2035 2036 t0r = f0r - f4r * w2r + f4i * w2i; 2037 t0i = f0i - f4r * w2i - f4i * w2r; 2038 f0r = f0r * Two - t0r; 2039 f0i = f0i * Two - t0i; 2040 2041 f4r = f2r - f6r * w2i - f6i * w2r; 2042 f4i = f2i + f6r * w2r - f6i * w2i; 2043 f6r = f2r * Two - f4r; 2044 f6i = f2i * Two - f4i; 2045 2046 *(p0r + pos) = t0r; 2047 *p2r = f4r; 2048 *(p0r + posi) = t0i; 2049 *(p2r + 1) = f4i; 2050 w2r = *u2r; 2051 w2i = *u2i; 2052 *p0r = f0r; 2053 *(p2r + pos) = f6r; 2054 *(p0r + 1) = f0i; 2055 *(p2r + posi) = f6i; 2056 2057 p0r = pstrt; 2058 p2r = pstrt + pinc + pinc; 2059 2060 t1r = f1r - f5r * w3r + f5i * w3i; 2061 t1i = f1i - f5r * w3i - f5i * w3r; 2062 f1r = f1r * Two - t1r; 2063 f1i = f1i * Two - t1i; 2064 2065 f5r = f3r - f7r * w3i - f7i * w3r; 2066 f5i = f3i + f7r * w3r - f7i * w3i; 2067 f7r = f3r * Two - f5r; 2068 f7i = f3i * Two - f5i; 2069 2070 *(p1r + pos) = t1r; 2071 *p3r = f5r; 2072 *(p1r + posi) = t1i; 2073 *(p3r + 1) = f5i; 2074 w3r = *(u2r + U2toU3); 2075 w3i = *(u2i - U2toU3); 2076 *p1r = f1r; 2077 *(p3r + pos) = f7r; 2078 *(p1r + 1) = f1i; 2079 *(p3r + posi) = f7i; 2080 2081 p1r = pstrt + pinc; 2082 p3r = p2r + pinc; 2083 } 2084 NSameU /= 8; 2085 Uinc /= 8; 2086 Uinc2 /= 8; 2087 Uinc4 = Uinc * 4; 2088 NDiffU *= 8; 2089 pinc *= 8; 2090 pnext *= 8; 2091 pos *= 8; 2092 posi = pos + 1; 2093 } 2094 } 2095 2096 static void ifftrecurs(SPFLOAT *ioptr, int M, SPFLOAT *Utbl, int Ustride, 2097 int NDiffU, int StageCnt) 2098 { 2099 /* recursive bfstages calls to maximize on chip cache efficiency */ 2100 int i1; 2101 2102 if (M <= (int) MCACHE) 2103 ibfstages(ioptr, M, Utbl, Ustride, NDiffU, StageCnt); /* RADIX 8 Stages */ 2104 else { 2105 for (i1 = 0; i1 < 8; i1++) { 2106 ifftrecurs(&ioptr[i1 * POW2(M - 3) * 2], M - 3, Utbl, 8 * Ustride, 2107 NDiffU, StageCnt - 1); /* RADIX 8 Stages */ 2108 } 2109 ibfstages(ioptr, M, Utbl, Ustride, POW2(M - 3), 1); /* RADIX 8 Stage */ 2110 } 2111 } 2112 2113 static void iffts1(SPFLOAT *ioptr, int M, SPFLOAT *Utbl, int16_t *BRLow) 2114 { 2115 /* Compute in-place inverse complex fft on the rows of the input array */ 2116 /* INPUTS */ 2117 /* *ioptr = input data array */ 2118 /* M = log2 of fft size */ 2119 /* *Utbl = cosine table */ 2120 /* *BRLow = bit reversed counter table */ 2121 /* OUTPUTS */ 2122 /* *ioptr = output data array */ 2123 2124 int StageCnt; 2125 int NDiffU; 2126 const SPFLOAT scale = 1.0 / POW2(M); 2127 2128 switch (M) { 2129 case 0: 2130 break; 2131 case 1: 2132 ifft2pt(ioptr, scale); /* a 2 pt fft */ 2133 break; 2134 case 2: 2135 ifft4pt(ioptr, scale); /* a 4 pt fft */ 2136 break; 2137 case 3: 2138 ifft8pt(ioptr, scale); /* an 8 pt fft */ 2139 break; 2140 default: 2141 /* bit reverse and first radix 2 stage */ 2142 scbitrevR2(ioptr, M, BRLow, scale); 2143 StageCnt = (M - 1) / 3; /* number of radix 8 stages */ 2144 NDiffU = 2; /* one radix 2 stage already complete */ 2145 if ((M - 1 - (StageCnt * 3)) == 1) { 2146 ibfR2(ioptr, M, NDiffU); /* 1 radix 2 stage */ 2147 NDiffU *= 2; 2148 } 2149 if ((M - 1 - (StageCnt * 3)) == 2) { 2150 ibfR4(ioptr, M, NDiffU); /* 1 radix 4 stage */ 2151 NDiffU *= 4; 2152 } 2153 if (M <= (int) MCACHE) 2154 ibfstages(ioptr, M, Utbl, 1, NDiffU, StageCnt); /* RADIX 8 Stages */ 2155 else 2156 ifftrecurs(ioptr, M, Utbl, 1, NDiffU, StageCnt); /* RADIX 8 Stages */ 2157 } 2158 } 2159 2160 /****************** 2161 * parts of rffts1 * 2162 ******************/ 2163 2164 static void rfft1pt(SPFLOAT *ioptr) 2165 { 2166 /*** RADIX 2 rfft ***/ 2167 SPFLOAT f0r, f0i; 2168 SPFLOAT t0r, t0i; 2169 2170 /* bit reversed load */ 2171 f0r = ioptr[0]; 2172 f0i = ioptr[1]; 2173 2174 /* finish rfft */ 2175 t0r = f0r + f0i; 2176 t0i = f0r - f0i; 2177 2178 /* store result */ 2179 ioptr[0] = t0r; 2180 ioptr[1] = t0i; 2181 } 2182 2183 static void rfft2pt(SPFLOAT *ioptr) 2184 { 2185 /*** RADIX 4 rfft ***/ 2186 SPFLOAT f0r, f0i, f1r, f1i; 2187 SPFLOAT t0r, t0i; 2188 2189 /* bit reversed load */ 2190 f0r = ioptr[0]; 2191 f0i = ioptr[1]; 2192 f1r = ioptr[2]; 2193 f1i = ioptr[3]; 2194 2195 /* Butterflys */ 2196 /* 2197 f0 - - t0 2198 f1 - 1 - f1 2199 */ 2200 2201 t0r = f0r + f1r; 2202 t0i = f0i + f1i; 2203 f1r = f0r - f1r; 2204 f1i = f1i - f0i; 2205 /* finish rfft */ 2206 f0r = t0r + t0i; 2207 f0i = t0r - t0i; 2208 2209 /* store result */ 2210 ioptr[0] = f0r; 2211 ioptr[1] = f0i; 2212 ioptr[2] = f1r; 2213 ioptr[3] = f1i; 2214 } 2215 2216 static void rfft4pt(SPFLOAT *ioptr) 2217 { 2218 /*** RADIX 8 rfft ***/ 2219 SPFLOAT f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i; 2220 SPFLOAT t0r, t0i, t1r, t1i; 2221 SPFLOAT w0r = 1.0 / MYROOT2; /* cos(pi/4) */ 2222 const SPFLOAT Two = 2.0; 2223 const SPFLOAT scale = 0.5; 2224 2225 /* bit reversed load */ 2226 f0r = ioptr[0]; 2227 f0i = ioptr[1]; 2228 f1r = ioptr[4]; 2229 f1i = ioptr[5]; 2230 f2r = ioptr[2]; 2231 f2i = ioptr[3]; 2232 f3r = ioptr[6]; 2233 f3i = ioptr[7]; 2234 2235 /* Butterflys */ 2236 /* 2237 f0 - - t0 - - f0 2238 f1 - 1 - f1 - - f1 2239 f2 - - f2 - 1 - f2 2240 f3 - 1 - t1 - -i - f3 2241 */ 2242 2243 t0r = f0r + f1r; 2244 t0i = f0i + f1i; 2245 f1r = f0r - f1r; 2246 f1i = f0i - f1i; 2247 2248 t1r = f2r - f3r; 2249 t1i = f2i - f3i; 2250 f2r = f2r + f3r; 2251 f2i = f2i + f3i; 2252 2253 f0r = t0r + f2r; 2254 f0i = t0i + f2i; 2255 f2r = t0r - f2r; 2256 f2i = f2i - t0i; /* neg for rfft */ 2257 2258 f3r = f1r - t1i; 2259 f3i = f1i + t1r; 2260 f1r = f1r + t1i; 2261 f1i = f1i - t1r; 2262 2263 /* finish rfft */ 2264 t0r = f0r + f0i; /* compute Re(x[0]) */ 2265 t0i = f0r - f0i; /* compute Re(x[N/2]) */ 2266 2267 t1r = f1r + f3r; 2268 t1i = f1i - f3i; 2269 f0r = f1i + f3i; 2270 f0i = f3r - f1r; 2271 2272 f1r = t1r + w0r * f0r + w0r * f0i; 2273 f1i = t1i - w0r * f0r + w0r * f0i; 2274 f3r = Two * t1r - f1r; 2275 f3i = f1i - Two * t1i; 2276 2277 /* store result */ 2278 ioptr[4] = f2r; 2279 ioptr[5] = f2i; 2280 ioptr[0] = t0r; 2281 ioptr[1] = t0i; 2282 2283 ioptr[2] = scale * f1r; 2284 ioptr[3] = scale * f1i; 2285 ioptr[6] = scale * f3r; 2286 ioptr[7] = scale * f3i; 2287 } 2288 2289 static void rfft8pt(SPFLOAT *ioptr) 2290 { 2291 /*** RADIX 16 rfft ***/ 2292 SPFLOAT w0r = 1.0 / MYROOT2; /* cos(pi/4) */ 2293 SPFLOAT w1r = MYCOSPID8; /* cos(pi/8) */ 2294 SPFLOAT w1i = MYSINPID8; /* sin(pi/8) */ 2295 SPFLOAT f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i; 2296 SPFLOAT f4r, f4i, f5r, f5i, f6r, f6i, f7r, f7i; 2297 SPFLOAT t0r, t0i, t1r, t1i; 2298 const SPFLOAT Two = 2.0; 2299 const SPFLOAT scale = 0.5; 2300 2301 /* bit reversed load */ 2302 f0r = ioptr[0]; 2303 f0i = ioptr[1]; 2304 f1r = ioptr[8]; 2305 f1i = ioptr[9]; 2306 f2r = ioptr[4]; 2307 f2i = ioptr[5]; 2308 f3r = ioptr[12]; 2309 f3i = ioptr[13]; 2310 f4r = ioptr[2]; 2311 f4i = ioptr[3]; 2312 f5r = ioptr[10]; 2313 f5i = ioptr[11]; 2314 f6r = ioptr[6]; 2315 f6i = ioptr[7]; 2316 f7r = ioptr[14]; 2317 f7i = ioptr[15]; 2318 /* Butterflys */ 2319 /* 2320 f0 - - t0 - - f0 - - f0 2321 f1 - 1 - f1 - - f1 - - f1 2322 f2 - - f2 - 1 - f2 - - f2 2323 f3 - 1 - t1 - -i - f3 - - f3 2324 f4 - - t0 - - f4 - 1 - t0 2325 f5 - 1 - f5 - - f5 - w3 - f4 2326 f6 - - f6 - 1 - f6 - -i - t1 2327 f7 - 1 - t1 - -i - f7 - iw3- f6 2328 */ 2329 2330 t0r = f0r + f1r; 2331 t0i = f0i + f1i; 2332 f1r = f0r - f1r; 2333 f1i = f0i - f1i; 2334 2335 t1r = f2r - f3r; 2336 t1i = f2i - f3i; 2337 f2r = f2r + f3r; 2338 f2i = f2i + f3i; 2339 2340 f0r = t0r + f2r; 2341 f0i = t0i + f2i; 2342 f2r = t0r - f2r; 2343 f2i = t0i - f2i; 2344 2345 f3r = f1r - t1i; 2346 f3i = f1i + t1r; 2347 f1r = f1r + t1i; 2348 f1i = f1i - t1r; 2349 2350 t0r = f4r + f5r; 2351 t0i = f4i + f5i; 2352 f5r = f4r - f5r; 2353 f5i = f4i - f5i; 2354 2355 t1r = f6r - f7r; 2356 t1i = f6i - f7i; 2357 f6r = f6r + f7r; 2358 f6i = f6i + f7i; 2359 2360 f4r = t0r + f6r; 2361 f4i = t0i + f6i; 2362 f6r = t0r - f6r; 2363 f6i = t0i - f6i; 2364 2365 f7r = f5r - t1i; 2366 f7i = f5i + t1r; 2367 f5r = f5r + t1i; 2368 f5i = f5i - t1r; 2369 2370 t0r = f0r - f4r; 2371 t0i = f4i - f0i; /* neg for rfft */ 2372 f0r = f0r + f4r; 2373 f0i = f0i + f4i; 2374 2375 t1r = f2r - f6i; 2376 t1i = f2i + f6r; 2377 f2r = f2r + f6i; 2378 f2i = f2i - f6r; 2379 2380 f4r = f1r - f5r * w0r - f5i * w0r; 2381 f4i = f1i + f5r * w0r - f5i * w0r; 2382 f1r = f1r * Two - f4r; 2383 f1i = f1i * Two - f4i; 2384 2385 f6r = f3r + f7r * w0r - f7i * w0r; 2386 f6i = f3i + f7r * w0r + f7i * w0r; 2387 f3r = f3r * Two - f6r; 2388 f3i = f3i * Two - f6i; 2389 2390 /* finish rfft */ 2391 f5r = f0r + f0i; /* compute Re(x[0]) */ 2392 f5i = f0r - f0i; /* compute Re(x[N/2]) */ 2393 2394 f0r = f2r + t1r; 2395 f0i = f2i - t1i; 2396 f7r = f2i + t1i; 2397 f7i = t1r - f2r; 2398 2399 f2r = f0r + w0r * f7r + w0r * f7i; 2400 f2i = f0i - w0r * f7r + w0r * f7i; 2401 t1r = Two * f0r - f2r; 2402 t1i = f2i - Two * f0i; 2403 2404 f0r = f1r + f6r; 2405 f0i = f1i - f6i; 2406 f7r = f1i + f6i; 2407 f7i = f6r - f1r; 2408 2409 f1r = f0r + w1r * f7r + w1i * f7i; 2410 f1i = f0i - w1i * f7r + w1r * f7i; 2411 f6r = Two * f0r - f1r; 2412 f6i = f1i - Two * f0i; 2413 2414 f0r = f3r + f4r; 2415 f0i = f3i - f4i; 2416 f7r = f3i + f4i; 2417 f7i = f4r - f3r; 2418 2419 f3r = f0r + w1i * f7r + w1r * f7i; 2420 f3i = f0i - w1r * f7r + w1i * f7i; 2421 f4r = Two * f0r - f3r; 2422 f4i = f3i - Two * f0i; 2423 2424 /* store result */ 2425 ioptr[8] = t0r; 2426 ioptr[9] = t0i; 2427 ioptr[0] = f5r; 2428 ioptr[1] = f5i; 2429 2430 ioptr[4] = scale * f2r; 2431 ioptr[5] = scale * f2i; 2432 ioptr[12] = scale * t1r; 2433 ioptr[13] = scale * t1i; 2434 2435 ioptr[2] = scale * f1r; 2436 ioptr[3] = scale * f1i; 2437 ioptr[6] = scale * f3r; 2438 ioptr[7] = scale * f3i; 2439 ioptr[10] = scale * f4r; 2440 ioptr[11] = scale * f4i; 2441 ioptr[14] = scale * f6r; 2442 ioptr[15] = scale * f6i; 2443 } 2444 2445 static void frstage(SPFLOAT *ioptr, int M, SPFLOAT *Utbl) 2446 { 2447 /* Finish RFFT */ 2448 2449 unsigned int pos; 2450 unsigned int posi; 2451 unsigned int diffUcnt; 2452 2453 SPFLOAT *p0r, *p1r; 2454 SPFLOAT *u0r, *u0i; 2455 2456 SPFLOAT w0r, w0i; 2457 SPFLOAT f0r, f0i, f1r, f1i, f4r, f4i, f5r, f5i; 2458 SPFLOAT t0r, t0i, t1r, t1i; 2459 const SPFLOAT Two = 2.0; 2460 2461 pos = POW2(M - 1); 2462 posi = pos + 1; 2463 2464 p0r = ioptr; 2465 p1r = ioptr + pos / 2; 2466 2467 u0r = Utbl + POW2(M - 3); 2468 2469 w0r = *u0r; f0r = *(p0r); 2470 f0i = *(p0r + 1); 2471 f4r = *(p0r + pos); 2472 f4i = *(p0r + posi); 2473 f1r = *(p1r); 2474 f1i = *(p1r + 1); 2475 f5r = *(p1r + pos); 2476 f5i = *(p1r + posi); 2477 2478 t0r = Two * f0r + Two * f0i; /* compute Re(x[0]) */ 2479 t0i = Two * f0r - Two * f0i; /* compute Re(x[N/2]) */ 2480 t1r = f4r + f4r; 2481 t1i = -f4i - f4i; 2482 2483 f0r = f1r + f5r; 2484 f0i = f1i - f5i; 2485 f4r = f1i + f5i; 2486 f4i = f5r - f1r; 2487 2488 f1r = f0r + w0r * f4r + w0r * f4i; 2489 f1i = f0i - w0r * f4r + w0r * f4i; 2490 f5r = Two * f0r - f1r; 2491 f5i = f1i - Two * f0i; 2492 2493 *(p0r) = t0r; 2494 *(p0r + 1) = t0i; 2495 *(p0r + pos) = t1r; 2496 *(p0r + posi) = t1i; 2497 *(p1r) = f1r; 2498 *(p1r + 1) = f1i; 2499 *(p1r + pos) = f5r; 2500 *(p1r + posi) = f5i; 2501 2502 u0r = Utbl + 1; 2503 u0i = Utbl + (POW2(M - 2) - 1); 2504 2505 w0r = *u0r; w0i = *u0i; 2506 2507 p0r = (ioptr + 2); 2508 p1r = (ioptr + (POW2(M - 2) - 1) * 2); 2509 2510 /* Butterflys */ 2511 /* 2512 f0 - t0 - - f0 2513 f5 - t1 - w0 - f5 2514 2515 f1 - t0 - - f1 2516 f4 - t1 -iw0 - f4 2517 */ 2518 2519 for (diffUcnt = POW2(M - 3) - 1; diffUcnt > 0; diffUcnt--) { 2520 2521 f0r = *(p0r); 2522 f0i = *(p0r + 1); 2523 f5r = *(p1r + pos); 2524 f5i = *(p1r + posi); 2525 f1r = *(p1r); 2526 f1i = *(p1r + 1); 2527 f4r = *(p0r + pos); 2528 f4i = *(p0r + posi); 2529 2530 t0r = f0r + f5r; 2531 t0i = f0i - f5i; 2532 t1r = f0i + f5i; 2533 t1i = f5r - f0r; 2534 2535 f0r = t0r + w0r * t1r + w0i * t1i; 2536 f0i = t0i - w0i * t1r + w0r * t1i; 2537 f5r = Two * t0r - f0r; 2538 f5i = f0i - Two * t0i; 2539 2540 t0r = f1r + f4r; 2541 t0i = f1i - f4i; 2542 t1r = f1i + f4i; 2543 t1i = f4r - f1r; 2544 2545 f1r = t0r + w0i * t1r + w0r * t1i; 2546 f1i = t0i - w0r * t1r + w0i * t1i; 2547 f4r = Two * t0r - f1r; 2548 f4i = f1i - Two * t0i; 2549 2550 *(p0r) = f0r; 2551 *(p0r + 1) = f0i; 2552 *(p1r + pos) = f5r; 2553 *(p1r + posi) = f5i; 2554 2555 w0r = *++u0r; 2556 w0i = *--u0i; 2557 2558 *(p1r) = f1r; 2559 *(p1r + 1) = f1i; 2560 *(p0r + pos) = f4r; 2561 *(p0r + posi) = f4i; 2562 2563 p0r += 2; 2564 p1r -= 2; 2565 } 2566 } 2567 2568 static void rffts1(SPFLOAT *ioptr, int M, SPFLOAT *Utbl, int16_t *BRLow) 2569 { 2570 /* Compute in-place real fft on the rows of the input array */ 2571 /* The result is the complex spectra of the positive frequencies */ 2572 /* except the location for the first complex number contains the real */ 2573 /* values for DC and Nyquest */ 2574 /* INPUTS */ 2575 /* *ioptr = real input data array */ 2576 /* M = log2 of fft size */ 2577 /* *Utbl = cosine table */ 2578 /* *BRLow = bit reversed counter table */ 2579 /* OUTPUTS */ 2580 /* *ioptr = output data array in the following order */ 2581 /* Re(x[0]), Re(x[N/2]), Re(x[1]), Im(x[1]), Re(x[2]), Im(x[2]), */ 2582 /* ... Re(x[N/2-1]), Im(x[N/2-1]). */ 2583 2584 SPFLOAT scale; 2585 int StageCnt; 2586 int NDiffU; 2587 2588 M = M - 1; 2589 switch (M) { 2590 case -1: 2591 break; 2592 case 0: 2593 rfft1pt(ioptr); /* a 2 pt fft */ 2594 break; 2595 case 1: 2596 rfft2pt(ioptr); /* a 4 pt fft */ 2597 break; 2598 case 2: 2599 rfft4pt(ioptr); /* an 8 pt fft */ 2600 break; 2601 case 3: 2602 rfft8pt(ioptr); /* a 16 pt fft */ 2603 break; 2604 default: 2605 scale = 0.5; 2606 /* bit reverse and first radix 2 stage */ 2607 scbitrevR2(ioptr, M, BRLow, scale); 2608 StageCnt = (M - 1) / 3; /* number of radix 8 stages */ 2609 NDiffU = 2; /* one radix 2 stage already complete */ 2610 if ((M - 1 - (StageCnt * 3)) == 1) { 2611 bfR2(ioptr, M, NDiffU); /* 1 radix 2 stage */ 2612 NDiffU *= 2; 2613 } 2614 if ((M - 1 - (StageCnt * 3)) == 2) { 2615 bfR4(ioptr, M, NDiffU); /* 1 radix 4 stage */ 2616 NDiffU *= 4; 2617 } 2618 if (M <= (int) MCACHE) 2619 bfstages(ioptr, M, Utbl, 2, NDiffU, StageCnt); /* RADIX 8 Stages */ 2620 else 2621 fftrecurs(ioptr, M, Utbl, 2, NDiffU, StageCnt); /* RADIX 8 Stages */ 2622 frstage(ioptr, M + 1, Utbl); 2623 } 2624 } 2625 2626 /******************* 2627 * parts of riffts1 * 2628 *******************/ 2629 2630 static void rifft1pt(SPFLOAT *ioptr, SPFLOAT scale) 2631 { 2632 /*** RADIX 2 rifft ***/ 2633 SPFLOAT f0r, f0i; 2634 SPFLOAT t0r, t0i; 2635 2636 /* bit reversed load */ 2637 f0r = ioptr[0]; 2638 f0i = ioptr[1]; 2639 2640 /* finish rfft */ 2641 t0r = f0r + f0i; 2642 t0i = f0r - f0i; 2643 2644 /* store result */ 2645 ioptr[0] = scale * t0r; 2646 ioptr[1] = scale * t0i; 2647 } 2648 2649 static void rifft2pt(SPFLOAT *ioptr, SPFLOAT scale) 2650 { 2651 /*** RADIX 4 rifft ***/ 2652 SPFLOAT f0r, f0i, f1r, f1i; 2653 SPFLOAT t0r, t0i; 2654 const SPFLOAT Two = 2.0; 2655 2656 /* bit reversed load */ 2657 t0r = ioptr[0]; 2658 t0i = ioptr[1]; 2659 f1r = Two * ioptr[2]; 2660 f1i = Two * ioptr[3]; 2661 2662 /* start rifft */ 2663 f0r = t0r + t0i; 2664 f0i = t0r - t0i; 2665 /* Butterflys */ 2666 /* 2667 f0 - - t0 2668 f1 - 1 - f1 2669 */ 2670 2671 t0r = f0r + f1r; 2672 t0i = f0i - f1i; 2673 f1r = f0r - f1r; 2674 f1i = f0i + f1i; 2675 2676 /* store result */ 2677 ioptr[0] = scale * t0r; 2678 ioptr[1] = scale * t0i; 2679 ioptr[2] = scale * f1r; 2680 ioptr[3] = scale * f1i; 2681 } 2682 2683 static void rifft4pt(SPFLOAT *ioptr, SPFLOAT scale) 2684 { 2685 /*** RADIX 8 rifft ***/ 2686 SPFLOAT f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i; 2687 SPFLOAT t0r, t0i, t1r, t1i; 2688 SPFLOAT w0r = 1.0 / MYROOT2; /* cos(pi/4) */ 2689 const SPFLOAT Two = 2.0; 2690 2691 /* bit reversed load */ 2692 t0r = ioptr[0]; 2693 t0i = ioptr[1]; 2694 f2r = ioptr[2]; 2695 f2i = ioptr[3]; 2696 f1r = Two * ioptr[4]; 2697 f1i = Two * ioptr[5]; 2698 f3r = ioptr[6]; 2699 f3i = ioptr[7]; 2700 2701 /* start rfft */ 2702 f0r = t0r + t0i; /* compute Re(x[0]) */ 2703 f0i = t0r - t0i; /* compute Re(x[N/2]) */ 2704 2705 t1r = f2r + f3r; 2706 t1i = f2i - f3i; 2707 t0r = f2r - f3r; 2708 t0i = f2i + f3i; 2709 2710 f2r = t1r - w0r * t0r - w0r * t0i; 2711 f2i = t1i + w0r * t0r - w0r * t0i; 2712 f3r = Two * t1r - f2r; 2713 f3i = f2i - Two * t1i; 2714 2715 /* Butterflys */ 2716 /* 2717 f0 - - t0 - - f0 2718 f1 - 1 - f1 - - f1 2719 f2 - - f2 - 1 - f2 2720 f3 - 1 - t1 - i - f3 2721 */ 2722 2723 t0r = f0r + f1r; 2724 t0i = f0i - f1i; 2725 f1r = f0r - f1r; 2726 f1i = f0i + f1i; 2727 2728 t1r = f2r - f3r; 2729 t1i = f2i - f3i; 2730 f2r = f2r + f3r; 2731 f2i = f2i + f3i; 2732 2733 f0r = t0r + f2r; 2734 f0i = t0i + f2i; 2735 f2r = t0r - f2r; 2736 f2i = t0i - f2i; 2737 2738 f3r = f1r + t1i; 2739 f3i = f1i - t1r; 2740 f1r = f1r - t1i; 2741 f1i = f1i + t1r; 2742 2743 /* store result */ 2744 ioptr[0] = scale * f0r; 2745 ioptr[1] = scale * f0i; 2746 ioptr[2] = scale * f1r; 2747 ioptr[3] = scale * f1i; 2748 ioptr[4] = scale * f2r; 2749 ioptr[5] = scale * f2i; 2750 ioptr[6] = scale * f3r; 2751 ioptr[7] = scale * f3i; 2752 } 2753 2754 static void rifft8pt(SPFLOAT *ioptr, SPFLOAT scale) 2755 { 2756 /*** RADIX 16 rifft ***/ 2757 SPFLOAT w0r = (SPFLOAT) (1.0 / MYROOT2); /* cos(pi/4) */ 2758 SPFLOAT w1r = MYCOSPID8; /* cos(pi/8) */ 2759 SPFLOAT w1i = MYSINPID8; /* sin(pi/8) */ 2760 SPFLOAT f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i; 2761 SPFLOAT f4r, f4i, f5r, f5i, f6r, f6i, f7r, f7i; 2762 SPFLOAT t0r, t0i, t1r, t1i; 2763 const SPFLOAT Two = 2.0; 2764 2765 /* bit reversed load */ 2766 t0r = ioptr[0]; 2767 t0i = ioptr[1]; 2768 f4r = ioptr[2]; 2769 f4i = ioptr[3]; 2770 f2r = ioptr[4]; 2771 f2i = ioptr[5]; 2772 f6r = ioptr[6]; 2773 f6i = ioptr[7]; 2774 f1r = Two * ioptr[8]; 2775 f1i = Two * ioptr[9]; 2776 f5r = ioptr[10]; 2777 f5i = ioptr[11]; 2778 f3r = ioptr[12]; 2779 f3i = ioptr[13]; 2780 f7r = ioptr[14]; 2781 f7i = ioptr[15]; 2782 2783 /* start rfft */ 2784 f0r = t0r + t0i; /* compute Re(x[0]) */ 2785 f0i = t0r - t0i; /* compute Re(x[N/2]) */ 2786 2787 t0r = f2r + f3r; 2788 t0i = f2i - f3i; 2789 t1r = f2r - f3r; 2790 t1i = f2i + f3i; 2791 2792 f2r = t0r - w0r * t1r - w0r * t1i; 2793 f2i = t0i + w0r * t1r - w0r * t1i; 2794 f3r = Two * t0r - f2r; 2795 f3i = f2i - Two * t0i; 2796 2797 t0r = f4r + f7r; 2798 t0i = f4i - f7i; 2799 t1r = f4r - f7r; 2800 t1i = f4i + f7i; 2801 2802 f4r = t0r - w1i * t1r - w1r * t1i; 2803 f4i = t0i + w1r * t1r - w1i * t1i; 2804 f7r = Two * t0r - f4r; 2805 f7i = f4i - Two * t0i; 2806 2807 t0r = f6r + f5r; 2808 t0i = f6i - f5i; 2809 t1r = f6r - f5r; 2810 t1i = f6i + f5i; 2811 2812 f6r = t0r - w1r * t1r - w1i * t1i; 2813 f6i = t0i + w1i * t1r - w1r * t1i; 2814 f5r = Two * t0r - f6r; 2815 f5i = f6i - Two * t0i; 2816 2817 /* Butterflys */ 2818 /* 2819 f0 - - t0 - - f0 - - f0 2820 f1* - 1 - f1 - - f1 - - f1 2821 f2 - - f2 - 1 - f2 - - f2 2822 f3 - 1 - t1 - i - f3 - - f3 2823 f4 - - t0 - - f4 - 1 - t0 2824 f5 - 1 - f5 - - f5 - w3 - f4 2825 f6 - - f6 - 1 - f6 - i - t1 2826 f7 - 1 - t1 - i - f7 - iw3- f6 2827 */ 2828 2829 t0r = f0r + f1r; 2830 t0i = f0i - f1i; 2831 f1r = f0r - f1r; 2832 f1i = f0i + f1i; 2833 2834 t1r = f2r - f3r; 2835 t1i = f2i - f3i; 2836 f2r = f2r + f3r; 2837 f2i = f2i + f3i; 2838 2839 f0r = t0r + f2r; 2840 f0i = t0i + f2i; 2841 f2r = t0r - f2r; 2842 f2i = t0i - f2i; 2843 2844 f3r = f1r + t1i; 2845 f3i = f1i - t1r; 2846 f1r = f1r - t1i; 2847 f1i = f1i + t1r; 2848 2849 t0r = f4r + f5r; 2850 t0i = f4i + f5i; 2851 f5r = f4r - f5r; 2852 f5i = f4i - f5i; 2853 2854 t1r = f6r - f7r; 2855 t1i = f6i - f7i; 2856 f6r = f6r + f7r; 2857 f6i = f6i + f7i; 2858 2859 f4r = t0r + f6r; 2860 f4i = t0i + f6i; 2861 f6r = t0r - f6r; 2862 f6i = t0i - f6i; 2863 2864 f7r = f5r + t1i; 2865 f7i = f5i - t1r; 2866 f5r = f5r - t1i; 2867 f5i = f5i + t1r; 2868 2869 t0r = f0r - f4r; 2870 t0i = f0i - f4i; 2871 f0r = f0r + f4r; 2872 f0i = f0i + f4i; 2873 2874 t1r = f2r + f6i; 2875 t1i = f2i - f6r; 2876 f2r = f2r - f6i; 2877 f2i = f2i + f6r; 2878 2879 f4r = f1r - f5r * w0r + f5i * w0r; 2880 f4i = f1i - f5r * w0r - f5i * w0r; 2881 f1r = f1r * Two - f4r; 2882 f1i = f1i * Two - f4i; 2883 2884 f6r = f3r + f7r * w0r + f7i * w0r; 2885 f6i = f3i - f7r * w0r + f7i * w0r; 2886 f3r = f3r * Two - f6r; 2887 f3i = f3i * Two - f6i; 2888 2889 /* store result */ 2890 ioptr[0] = scale * f0r; 2891 ioptr[1] = scale * f0i; 2892 ioptr[2] = scale * f1r; 2893 ioptr[3] = scale * f1i; 2894 ioptr[4] = scale * f2r; 2895 ioptr[5] = scale * f2i; 2896 ioptr[6] = scale * f3r; 2897 ioptr[7] = scale * f3i; 2898 ioptr[8] = scale * t0r; 2899 ioptr[9] = scale * t0i; 2900 ioptr[10] = scale * f4r; 2901 ioptr[11] = scale * f4i; 2902 ioptr[12] = scale * t1r; 2903 ioptr[13] = scale * t1i; 2904 ioptr[14] = scale * f6r; 2905 ioptr[15] = scale * f6i; 2906 } 2907 2908 static void ifrstage(SPFLOAT *ioptr, int M, SPFLOAT *Utbl) 2909 { 2910 /* Start RIFFT */ 2911 2912 unsigned int pos; 2913 unsigned int posi; 2914 unsigned int diffUcnt; 2915 2916 SPFLOAT *p0r, *p1r; 2917 SPFLOAT *u0r, *u0i; 2918 2919 SPFLOAT w0r, w0i; 2920 SPFLOAT f0r, f0i, f1r, f1i, f4r, f4i, f5r, f5i; 2921 SPFLOAT t0r, t0i, t1r, t1i; 2922 const SPFLOAT Two = 2.0; 2923 2924 pos = POW2(M - 1); 2925 posi = pos + 1; 2926 2927 p0r = ioptr; 2928 p1r = ioptr + pos / 2; 2929 2930 u0r = Utbl + POW2(M - 3); 2931 2932 w0r = *u0r; f0r = *(p0r); 2933 f0i = *(p0r + 1); 2934 f4r = *(p0r + pos); 2935 f4i = *(p0r + posi); 2936 f1r = *(p1r); 2937 f1i = *(p1r + 1); 2938 f5r = *(p1r + pos); 2939 f5i = *(p1r + posi); 2940 2941 t0r = f0r + f0i; 2942 t0i = f0r - f0i; 2943 t1r = f4r + f4r; 2944 t1i = -f4i - f4i; 2945 2946 f0r = f1r + f5r; 2947 f0i = f1i - f5i; 2948 f4r = f1r - f5r; 2949 f4i = f1i + f5i; 2950 2951 f1r = f0r - w0r * f4r - w0r * f4i; 2952 f1i = f0i + w0r * f4r - w0r * f4i; 2953 f5r = Two * f0r - f1r; 2954 f5i = f1i - Two * f0i; 2955 2956 *(p0r) = t0r; 2957 *(p0r + 1) = t0i; 2958 *(p0r + pos) = t1r; 2959 *(p0r + posi) = t1i; 2960 *(p1r) = f1r; 2961 *(p1r + 1) = f1i; 2962 *(p1r + pos) = f5r; 2963 *(p1r + posi) = f5i; 2964 2965 u0r = Utbl + 1; 2966 u0i = Utbl + (POW2(M - 2) - 1); 2967 2968 w0r = *u0r; w0i = *u0i; 2969 2970 p0r = (ioptr + 2); 2971 p1r = (ioptr + (POW2(M - 2) - 1) * 2); 2972 2973 /* Butterflys */ 2974 /* 2975 f0 - t0 - f0 2976 f1 - t1 -w0- f1 2977 2978 f2 - t0 - f2 2979 f3 - t1 -iw0- f3 2980 */ 2981 2982 for (diffUcnt = POW2(M - 3) - 1; diffUcnt > 0; diffUcnt--) { 2983 2984 f0r = *(p0r); 2985 f0i = *(p0r + 1); 2986 f5r = *(p1r + pos); 2987 f5i = *(p1r + posi); 2988 f1r = *(p1r); 2989 f1i = *(p1r + 1); 2990 f4r = *(p0r + pos); 2991 f4i = *(p0r + posi); 2992 2993 t0r = f0r + f5r; 2994 t0i = f0i - f5i; 2995 t1r = f0r - f5r; 2996 t1i = f0i + f5i; 2997 2998 f0r = t0r - w0i * t1r - w0r * t1i; 2999 f0i = t0i + w0r * t1r - w0i * t1i; 3000 f5r = Two * t0r - f0r; 3001 f5i = f0i - Two * t0i; 3002 3003 t0r = f1r + f4r; 3004 t0i = f1i - f4i; 3005 t1r = f1r - f4r; 3006 t1i = f1i + f4i; 3007 3008 f1r = t0r - w0r * t1r - w0i * t1i; 3009 f1i = t0i + w0i * t1r - w0r * t1i; 3010 f4r = Two * t0r - f1r; 3011 f4i = f1i - Two * t0i; 3012 3013 *(p0r) = f0r; 3014 *(p0r + 1) = f0i; 3015 *(p1r + pos) = f5r; 3016 *(p1r + posi) = f5i; 3017 3018 w0r = *++u0r; 3019 w0i = *--u0i; 3020 3021 *(p1r) = f1r; 3022 *(p1r + 1) = f1i; 3023 *(p0r + pos) = f4r; 3024 *(p0r + posi) = f4i; 3025 3026 p0r += 2; 3027 p1r -= 2; 3028 } 3029 } 3030 3031 static void riffts1(SPFLOAT *ioptr, int M, SPFLOAT *Utbl, int16_t *BRLow) 3032 { 3033 /* Compute in-place real ifft on the rows of the input array */ 3034 /* data order as from rffts1 */ 3035 /* INPUTS */ 3036 /* *ioptr = input data array in the following order */ 3037 /* M = log2 of fft size */ 3038 /* Re(x[0]), Re(x[N/2]), Re(x[1]), Im(x[1]), */ 3039 /* Re(x[2]), Im(x[2]), ... Re(x[N/2-1]), Im(x[N/2-1]). */ 3040 /* *Utbl = cosine table */ 3041 /* *BRLow = bit reversed counter table */ 3042 /* OUTPUTS */ 3043 /* *ioptr = real output data array */ 3044 3045 SPFLOAT scale; 3046 int StageCnt; 3047 int NDiffU; 3048 3049 scale = (SPFLOAT)(1.0 / (double)((int)POW2(M))); 3050 M = M - 1; 3051 switch (M) { 3052 case -1: 3053 break; 3054 case 0: 3055 rifft1pt(ioptr, scale); /* a 2 pt fft */ 3056 break; 3057 case 1: 3058 rifft2pt(ioptr, scale); /* a 4 pt fft */ 3059 break; 3060 case 2: 3061 rifft4pt(ioptr, scale); /* an 8 pt fft */ 3062 break; 3063 case 3: 3064 rifft8pt(ioptr, scale); /* a 16 pt fft */ 3065 break; 3066 default: 3067 ifrstage(ioptr, M + 1, Utbl); 3068 /* bit reverse and first radix 2 stage */ 3069 scbitrevR2(ioptr, M, BRLow, scale); 3070 StageCnt = (M - 1) / 3; /* number of radix 8 stages */ 3071 NDiffU = 2; /* one radix 2 stage already complete */ 3072 if ((M - 1 - (StageCnt * 3)) == 1) { 3073 ibfR2(ioptr, M, NDiffU); /* 1 radix 2 stage */ 3074 NDiffU *= 2; 3075 } 3076 if ((M - 1 - (StageCnt * 3)) == 2) { 3077 ibfR4(ioptr, M, NDiffU); /* 1 radix 4 stage */ 3078 NDiffU *= 4; 3079 } 3080 if (M <= (int) MCACHE) 3081 ibfstages(ioptr, M, Utbl, 2, NDiffU, StageCnt); /* RADIX 8 Stages */ 3082 else 3083 ifftrecurs(ioptr, M, Utbl, 2, NDiffU, StageCnt); /* RADIX 8 Stages */ 3084 } 3085 } 3086