1 /*
2 Copyright (c) 2003-2004, Mark Borgerding
3 
4 All rights reserved.
5 
6 Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
7 
8     * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
9     * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
10     * Neither the author nor the names of any contributors may be used to endorse or promote products derived from this software without specific prior written permission.
11 
12 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
13 */
14 #ifdef HAVE_CONFIG_H
15 #include "config.h"
16 #endif
17 
18 #include "_kiss_fft_guts_f32.h"
19 /* The guts header contains all the multiplication and addition macros that are defined for
20  fixed or floating point complex numbers.  It also delares the kf_ internal functions.
21  */
22 
23 static kiss_fft_f32_cpx *scratchbuf = NULL;
24 static size_t nscratchbuf = 0;
25 static kiss_fft_f32_cpx *tmpbuf = NULL;
26 static size_t ntmpbuf = 0;
27 
28 #define CHECKBUF(buf,nbuf,n) \
29     do { \
30         if ( nbuf < (size_t)(n) ) {\
31             free(buf); \
32             buf = (kiss_fft_f32_cpx*)KISS_FFT_F32_MALLOC(sizeof(kiss_fft_f32_cpx)*(n)); \
33             nbuf = (size_t)(n); \
34         } \
35    }while(0)
36 
37 
38 static void
kf_bfly2(kiss_fft_f32_cpx * Fout,const size_t fstride,const kiss_fft_f32_cfg st,int m)39 kf_bfly2 (kiss_fft_f32_cpx * Fout,
40     const size_t fstride, const kiss_fft_f32_cfg st, int m)
41 {
42   kiss_fft_f32_cpx *Fout2;
43   kiss_fft_f32_cpx *tw1 = st->twiddles;
44   kiss_fft_f32_cpx t;
45 
46   Fout2 = Fout + m;
47   do {
48     C_FIXDIV (*Fout, 2);
49     C_FIXDIV (*Fout2, 2);
50 
51     C_MUL (t, *Fout2, *tw1);
52     tw1 += fstride;
53     C_SUB (*Fout2, *Fout, t);
54     C_ADDTO (*Fout, t);
55     ++Fout2;
56     ++Fout;
57   } while (--m);
58 }
59 
60 static void
kf_bfly4(kiss_fft_f32_cpx * Fout,const size_t fstride,const kiss_fft_f32_cfg st,const size_t m)61 kf_bfly4 (kiss_fft_f32_cpx * Fout,
62     const size_t fstride, const kiss_fft_f32_cfg st, const size_t m)
63 {
64   kiss_fft_f32_cpx *tw1, *tw2, *tw3;
65   kiss_fft_f32_cpx scratch[6];
66   size_t k = m;
67   const size_t m2 = 2 * m;
68   const size_t m3 = 3 * m;
69 
70   tw3 = tw2 = tw1 = st->twiddles;
71 
72   do {
73     C_FIXDIV (*Fout, 4);
74     C_FIXDIV (Fout[m], 4);
75     C_FIXDIV (Fout[m2], 4);
76     C_FIXDIV (Fout[m3], 4);
77 
78     C_MUL (scratch[0], Fout[m], *tw1);
79     C_MUL (scratch[1], Fout[m2], *tw2);
80     C_MUL (scratch[2], Fout[m3], *tw3);
81 
82     C_SUB (scratch[5], *Fout, scratch[1]);
83     C_ADDTO (*Fout, scratch[1]);
84     C_ADD (scratch[3], scratch[0], scratch[2]);
85     C_SUB (scratch[4], scratch[0], scratch[2]);
86     C_SUB (Fout[m2], *Fout, scratch[3]);
87     tw1 += fstride;
88     tw2 += fstride * 2;
89     tw3 += fstride * 3;
90     C_ADDTO (*Fout, scratch[3]);
91 
92     if (st->inverse) {
93       Fout[m].r = scratch[5].r - scratch[4].i;
94       Fout[m].i = scratch[5].i + scratch[4].r;
95       Fout[m3].r = scratch[5].r + scratch[4].i;
96       Fout[m3].i = scratch[5].i - scratch[4].r;
97     } else {
98       Fout[m].r = scratch[5].r + scratch[4].i;
99       Fout[m].i = scratch[5].i - scratch[4].r;
100       Fout[m3].r = scratch[5].r - scratch[4].i;
101       Fout[m3].i = scratch[5].i + scratch[4].r;
102     }
103     ++Fout;
104   } while (--k);
105 }
106 
107 static void
kf_bfly3(kiss_fft_f32_cpx * Fout,const size_t fstride,const kiss_fft_f32_cfg st,size_t m)108 kf_bfly3 (kiss_fft_f32_cpx * Fout,
109     const size_t fstride, const kiss_fft_f32_cfg st, size_t m)
110 {
111   size_t k = m;
112   const size_t m2 = 2 * m;
113   kiss_fft_f32_cpx *tw1, *tw2;
114   kiss_fft_f32_cpx scratch[5];
115   kiss_fft_f32_cpx epi3;
116 
117   epi3 = st->twiddles[fstride * m];
118 
119   tw1 = tw2 = st->twiddles;
120 
121   do {
122     C_FIXDIV (*Fout, 3);
123     C_FIXDIV (Fout[m], 3);
124     C_FIXDIV (Fout[m2], 3);
125 
126     C_MUL (scratch[1], Fout[m], *tw1);
127     C_MUL (scratch[2], Fout[m2], *tw2);
128 
129     C_ADD (scratch[3], scratch[1], scratch[2]);
130     C_SUB (scratch[0], scratch[1], scratch[2]);
131     tw1 += fstride;
132     tw2 += fstride * 2;
133 
134     Fout[m].r = Fout->r - HALF_OF (scratch[3].r);
135     Fout[m].i = Fout->i - HALF_OF (scratch[3].i);
136 
137     C_MULBYSCALAR (scratch[0], epi3.i);
138 
139     C_ADDTO (*Fout, scratch[3]);
140 
141     Fout[m2].r = Fout[m].r + scratch[0].i;
142     Fout[m2].i = Fout[m].i - scratch[0].r;
143 
144     Fout[m].r -= scratch[0].i;
145     Fout[m].i += scratch[0].r;
146 
147     ++Fout;
148   } while (--k);
149 }
150 
151 static void
kf_bfly5(kiss_fft_f32_cpx * Fout,const size_t fstride,const kiss_fft_f32_cfg st,int m)152 kf_bfly5 (kiss_fft_f32_cpx * Fout,
153     const size_t fstride, const kiss_fft_f32_cfg st, int m)
154 {
155   kiss_fft_f32_cpx *Fout0, *Fout1, *Fout2, *Fout3, *Fout4;
156   int u;
157   kiss_fft_f32_cpx scratch[13];
158   kiss_fft_f32_cpx *twiddles = st->twiddles;
159   kiss_fft_f32_cpx *tw;
160   kiss_fft_f32_cpx ya, yb;
161 
162   ya = twiddles[fstride * m];
163   yb = twiddles[fstride * 2 * m];
164 
165   Fout0 = Fout;
166   Fout1 = Fout0 + m;
167   Fout2 = Fout0 + 2 * m;
168   Fout3 = Fout0 + 3 * m;
169   Fout4 = Fout0 + 4 * m;
170 
171   tw = st->twiddles;
172   for (u = 0; u < m; ++u) {
173     C_FIXDIV (*Fout0, 5);
174     C_FIXDIV (*Fout1, 5);
175     C_FIXDIV (*Fout2, 5);
176     C_FIXDIV (*Fout3, 5);
177     C_FIXDIV (*Fout4, 5);
178     scratch[0] = *Fout0;
179 
180     C_MUL (scratch[1], *Fout1, tw[u * fstride]);
181     C_MUL (scratch[2], *Fout2, tw[2 * u * fstride]);
182     C_MUL (scratch[3], *Fout3, tw[3 * u * fstride]);
183     C_MUL (scratch[4], *Fout4, tw[4 * u * fstride]);
184 
185     C_ADD (scratch[7], scratch[1], scratch[4]);
186     C_SUB (scratch[10], scratch[1], scratch[4]);
187     C_ADD (scratch[8], scratch[2], scratch[3]);
188     C_SUB (scratch[9], scratch[2], scratch[3]);
189 
190     Fout0->r += scratch[7].r + scratch[8].r;
191     Fout0->i += scratch[7].i + scratch[8].i;
192 
193     scratch[5].r =
194         scratch[0].r + S_MUL (scratch[7].r, ya.r) + S_MUL (scratch[8].r, yb.r);
195     scratch[5].i =
196         scratch[0].i + S_MUL (scratch[7].i, ya.r) + S_MUL (scratch[8].i, yb.r);
197 
198     scratch[6].r = S_MUL (scratch[10].i, ya.i) + S_MUL (scratch[9].i, yb.i);
199     scratch[6].i = -S_MUL (scratch[10].r, ya.i) - S_MUL (scratch[9].r, yb.i);
200 
201     C_SUB (*Fout1, scratch[5], scratch[6]);
202     C_ADD (*Fout4, scratch[5], scratch[6]);
203 
204     scratch[11].r =
205         scratch[0].r + S_MUL (scratch[7].r, yb.r) + S_MUL (scratch[8].r, ya.r);
206     scratch[11].i =
207         scratch[0].i + S_MUL (scratch[7].i, yb.r) + S_MUL (scratch[8].i, ya.r);
208     scratch[12].r = -S_MUL (scratch[10].i, yb.i) + S_MUL (scratch[9].i, ya.i);
209     scratch[12].i = S_MUL (scratch[10].r, yb.i) - S_MUL (scratch[9].r, ya.i);
210 
211     C_ADD (*Fout2, scratch[11], scratch[12]);
212     C_SUB (*Fout3, scratch[11], scratch[12]);
213 
214     ++Fout0;
215     ++Fout1;
216     ++Fout2;
217     ++Fout3;
218     ++Fout4;
219   }
220 }
221 
222 /* perform the butterfly for one stage of a mixed radix FFT */
223 static void
kf_bfly_generic(kiss_fft_f32_cpx * Fout,const size_t fstride,const kiss_fft_f32_cfg st,int m,int p)224 kf_bfly_generic (kiss_fft_f32_cpx * Fout,
225     const size_t fstride, const kiss_fft_f32_cfg st, int m, int p)
226 {
227   int u, k, q1, q;
228   kiss_fft_f32_cpx *twiddles = st->twiddles;
229   kiss_fft_f32_cpx t;
230   int Norig = st->nfft;
231 
232   CHECKBUF (scratchbuf, nscratchbuf, p);
233 
234   for (u = 0; u < m; ++u) {
235     k = u;
236     for (q1 = 0; q1 < p; ++q1) {
237       scratchbuf[q1] = Fout[k];
238       C_FIXDIV (scratchbuf[q1], p);
239       k += m;
240     }
241 
242     k = u;
243     for (q1 = 0; q1 < p; ++q1) {
244       int twidx = 0;
245 
246       Fout[k] = scratchbuf[0];
247       for (q = 1; q < p; ++q) {
248         twidx += fstride * k;
249         if (twidx >= Norig)
250           twidx -= Norig;
251         C_MUL (t, scratchbuf[q], twiddles[twidx]);
252         C_ADDTO (Fout[k], t);
253       }
254       k += m;
255     }
256   }
257 }
258 
259 static void
kf_work(kiss_fft_f32_cpx * Fout,const kiss_fft_f32_cpx * f,const size_t fstride,int in_stride,int * factors,const kiss_fft_f32_cfg st)260 kf_work (kiss_fft_f32_cpx * Fout,
261     const kiss_fft_f32_cpx * f,
262     const size_t fstride,
263     int in_stride, int *factors, const kiss_fft_f32_cfg st)
264 {
265   kiss_fft_f32_cpx *Fout_beg = Fout;
266   const int p = *factors++;     /* the radix  */
267   const int m = *factors++;     /* stage's fft length/p */
268   const kiss_fft_f32_cpx *Fout_end = Fout + p * m;
269 
270 #ifdef _OPENMP
271   // use openmp extensions at the
272   // top-level (not recursive)
273   if (fstride == 1) {
274     int k;
275 
276     // execute the p different work units in different threads
277 #       pragma omp parallel for
278     for (k = 0; k < p; ++k)
279       kf_work (Fout + k * m, f + fstride * in_stride * k, fstride * p,
280           in_stride, factors, st);
281     // all threads have joined by this point
282 
283     switch (p) {
284       case 2:
285         kf_bfly2 (Fout, fstride, st, m);
286         break;
287       case 3:
288         kf_bfly3 (Fout, fstride, st, m);
289         break;
290       case 4:
291         kf_bfly4 (Fout, fstride, st, m);
292         break;
293       case 5:
294         kf_bfly5 (Fout, fstride, st, m);
295         break;
296       default:
297         kf_bfly_generic (Fout, fstride, st, m, p);
298         break;
299     }
300     return;
301   }
302 #endif
303 
304   if (m == 1) {
305     do {
306       *Fout = *f;
307       f += fstride * in_stride;
308     } while (++Fout != Fout_end);
309   } else {
310     do {
311       // recursive call:
312       // DFT of size m*p performed by doing
313       // p instances of smaller DFTs of size m,
314       // each one takes a decimated version of the input
315       kf_work (Fout, f, fstride * p, in_stride, factors, st);
316       f += fstride * in_stride;
317     } while ((Fout += m) != Fout_end);
318   }
319 
320   Fout = Fout_beg;
321 
322   // recombine the p smaller DFTs
323   switch (p) {
324     case 2:
325       kf_bfly2 (Fout, fstride, st, m);
326       break;
327     case 3:
328       kf_bfly3 (Fout, fstride, st, m);
329       break;
330     case 4:
331       kf_bfly4 (Fout, fstride, st, m);
332       break;
333     case 5:
334       kf_bfly5 (Fout, fstride, st, m);
335       break;
336     default:
337       kf_bfly_generic (Fout, fstride, st, m, p);
338       break;
339   }
340 }
341 
342 /*  facbuf is populated by p1,m1,p2,m2, ...
343     where
344     p[i] * m[i] = m[i-1]
345     m0 = n                  */
346 static void
kf_factor(int n,int * facbuf)347 kf_factor (int n, int *facbuf)
348 {
349   int p = 4;
350   double floor_sqrt;
351 
352   floor_sqrt = floor (sqrt ((double) n));
353 
354   /*factor out powers of 4, powers of 2, then any remaining primes */
355   do {
356     while (n % p) {
357       switch (p) {
358         case 4:
359           p = 2;
360           break;
361         case 2:
362           p = 3;
363           break;
364         default:
365           p += 2;
366           break;
367       }
368       if (p > floor_sqrt)
369         p = n;                  /* no more factors, skip to end */
370     }
371     n /= p;
372     *facbuf++ = p;
373     *facbuf++ = n;
374   } while (n > 1);
375 }
376 
377 /*
378  *
379  * User-callable function to allocate all necessary storage space for the fft.
380  *
381  * The return value is a contiguous block of memory, allocated with malloc.  As such,
382  * It can be freed with free(), rather than a kiss_fft-specific function.
383  * */
384 kiss_fft_f32_cfg
kiss_fft_f32_alloc(int nfft,int inverse_fft,void * mem,size_t * lenmem)385 kiss_fft_f32_alloc (int nfft, int inverse_fft, void *mem, size_t * lenmem)
386 {
387   kiss_fft_f32_cfg st = NULL;
388   size_t memneeded = sizeof (struct kiss_fft_f32_state)
389       + sizeof (kiss_fft_f32_cpx) * (nfft - 1); /* twiddle factors */
390 
391   if (lenmem == NULL) {
392     st = (kiss_fft_f32_cfg) KISS_FFT_F32_MALLOC (memneeded);
393   } else {
394     if (mem != NULL && *lenmem >= memneeded)
395       st = (kiss_fft_f32_cfg) mem;
396     *lenmem = memneeded;
397   }
398   if (st) {
399     int i;
400 
401     st->nfft = nfft;
402     st->inverse = inverse_fft;
403 
404     for (i = 0; i < nfft; ++i) {
405       const double pi =
406           3.141592653589793238462643383279502884197169399375105820974944;
407       double phase = -2 * pi * i / nfft;
408 
409       if (st->inverse)
410         phase *= -1;
411       kf_cexp (st->twiddles + i, phase);
412     }
413 
414     kf_factor (nfft, st->factors);
415   }
416   return st;
417 }
418 
419 
420 
421 
422 void
kiss_fft_f32_stride(kiss_fft_f32_cfg st,const kiss_fft_f32_cpx * fin,kiss_fft_f32_cpx * fout,int in_stride)423 kiss_fft_f32_stride (kiss_fft_f32_cfg st, const kiss_fft_f32_cpx * fin,
424     kiss_fft_f32_cpx * fout, int in_stride)
425 {
426   if (fin == fout) {
427     CHECKBUF (tmpbuf, ntmpbuf, st->nfft);
428     kf_work (tmpbuf, fin, 1, in_stride, st->factors, st);
429     memcpy (fout, tmpbuf, sizeof (kiss_fft_f32_cpx) * st->nfft);
430   } else {
431     kf_work (fout, fin, 1, in_stride, st->factors, st);
432   }
433 }
434 
435 void
kiss_fft_f32(kiss_fft_f32_cfg cfg,const kiss_fft_f32_cpx * fin,kiss_fft_f32_cpx * fout)436 kiss_fft_f32 (kiss_fft_f32_cfg cfg, const kiss_fft_f32_cpx * fin,
437     kiss_fft_f32_cpx * fout)
438 {
439   kiss_fft_f32_stride (cfg, fin, fout, 1);
440 }
441 
442 
443 /* not really necessary to call, but if someone is doing in-place ffts, they may want to free the
444    buffers from CHECKBUF
445  */
446 void
kiss_fft_f32_cleanup(void)447 kiss_fft_f32_cleanup (void)
448 {
449   free (scratchbuf);
450   scratchbuf = NULL;
451   nscratchbuf = 0;
452   free (tmpbuf);
453   tmpbuf = NULL;
454   ntmpbuf = 0;
455 }
456 
457 int
kiss_fft_f32_next_fast_size(int n)458 kiss_fft_f32_next_fast_size (int n)
459 {
460   while (1) {
461     int m = n;
462 
463     while ((m % 2) == 0)
464       m /= 2;
465     while ((m % 3) == 0)
466       m /= 3;
467     while ((m % 5) == 0)
468       m /= 5;
469     if (m <= 1)
470       break;                    /* n is completely factorable by twos, threes, and fives */
471     n++;
472   }
473   return n;
474 }
475