1 /*
2 Copyright (c) 2003-2004, Mark Borgerding
3
4 All rights reserved.
5
6 Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
7
8 * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
9 * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
10 * Neither the author nor the names of any contributors may be used to endorse or promote products derived from this software without specific prior written permission.
11
12 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
13 */
14 #ifdef HAVE_CONFIG_H
15 #include "config.h"
16 #endif
17
18 #include "_kiss_fft_guts_s32.h"
19 /* The guts header contains all the multiplication and addition macros that are defined for
20 fixed or floating point complex numbers. It also delares the kf_ internal functions.
21 */
22
23 static kiss_fft_s32_cpx *scratchbuf = NULL;
24 static size_t nscratchbuf = 0;
25 static kiss_fft_s32_cpx *tmpbuf = NULL;
26 static size_t ntmpbuf = 0;
27
28 #define CHECKBUF(buf,nbuf,n) \
29 do { \
30 if ( nbuf < (size_t)(n) ) {\
31 free(buf); \
32 buf = (kiss_fft_s32_cpx*)KISS_FFT_S32_MALLOC(sizeof(kiss_fft_s32_cpx)*(n)); \
33 nbuf = (size_t)(n); \
34 } \
35 }while(0)
36
37
38 static void
kf_bfly2(kiss_fft_s32_cpx * Fout,const size_t fstride,const kiss_fft_s32_cfg st,int m)39 kf_bfly2 (kiss_fft_s32_cpx * Fout,
40 const size_t fstride, const kiss_fft_s32_cfg st, int m)
41 {
42 kiss_fft_s32_cpx *Fout2;
43 kiss_fft_s32_cpx *tw1 = st->twiddles;
44 kiss_fft_s32_cpx t;
45
46 Fout2 = Fout + m;
47 do {
48 C_FIXDIV (*Fout, 2);
49 C_FIXDIV (*Fout2, 2);
50
51 C_MUL (t, *Fout2, *tw1);
52 tw1 += fstride;
53 C_SUB (*Fout2, *Fout, t);
54 C_ADDTO (*Fout, t);
55 ++Fout2;
56 ++Fout;
57 } while (--m);
58 }
59
60 static void
kf_bfly4(kiss_fft_s32_cpx * Fout,const size_t fstride,const kiss_fft_s32_cfg st,const size_t m)61 kf_bfly4 (kiss_fft_s32_cpx * Fout,
62 const size_t fstride, const kiss_fft_s32_cfg st, const size_t m)
63 {
64 kiss_fft_s32_cpx *tw1, *tw2, *tw3;
65 kiss_fft_s32_cpx scratch[6];
66 size_t k = m;
67 const size_t m2 = 2 * m;
68 const size_t m3 = 3 * m;
69
70 tw3 = tw2 = tw1 = st->twiddles;
71
72 do {
73 C_FIXDIV (*Fout, 4);
74 C_FIXDIV (Fout[m], 4);
75 C_FIXDIV (Fout[m2], 4);
76 C_FIXDIV (Fout[m3], 4);
77
78 C_MUL (scratch[0], Fout[m], *tw1);
79 C_MUL (scratch[1], Fout[m2], *tw2);
80 C_MUL (scratch[2], Fout[m3], *tw3);
81
82 C_SUB (scratch[5], *Fout, scratch[1]);
83 C_ADDTO (*Fout, scratch[1]);
84 C_ADD (scratch[3], scratch[0], scratch[2]);
85 C_SUB (scratch[4], scratch[0], scratch[2]);
86 C_SUB (Fout[m2], *Fout, scratch[3]);
87 tw1 += fstride;
88 tw2 += fstride * 2;
89 tw3 += fstride * 3;
90 C_ADDTO (*Fout, scratch[3]);
91
92 if (st->inverse) {
93 Fout[m].r = scratch[5].r - scratch[4].i;
94 Fout[m].i = scratch[5].i + scratch[4].r;
95 Fout[m3].r = scratch[5].r + scratch[4].i;
96 Fout[m3].i = scratch[5].i - scratch[4].r;
97 } else {
98 Fout[m].r = scratch[5].r + scratch[4].i;
99 Fout[m].i = scratch[5].i - scratch[4].r;
100 Fout[m3].r = scratch[5].r - scratch[4].i;
101 Fout[m3].i = scratch[5].i + scratch[4].r;
102 }
103 ++Fout;
104 } while (--k);
105 }
106
107 static void
kf_bfly3(kiss_fft_s32_cpx * Fout,const size_t fstride,const kiss_fft_s32_cfg st,size_t m)108 kf_bfly3 (kiss_fft_s32_cpx * Fout,
109 const size_t fstride, const kiss_fft_s32_cfg st, size_t m)
110 {
111 size_t k = m;
112 const size_t m2 = 2 * m;
113 kiss_fft_s32_cpx *tw1, *tw2;
114 kiss_fft_s32_cpx scratch[5];
115 kiss_fft_s32_cpx epi3;
116
117 epi3 = st->twiddles[fstride * m];
118
119 tw1 = tw2 = st->twiddles;
120
121 do {
122 C_FIXDIV (*Fout, 3);
123 C_FIXDIV (Fout[m], 3);
124 C_FIXDIV (Fout[m2], 3);
125
126 C_MUL (scratch[1], Fout[m], *tw1);
127 C_MUL (scratch[2], Fout[m2], *tw2);
128
129 C_ADD (scratch[3], scratch[1], scratch[2]);
130 C_SUB (scratch[0], scratch[1], scratch[2]);
131 tw1 += fstride;
132 tw2 += fstride * 2;
133
134 Fout[m].r = Fout->r - HALF_OF (scratch[3].r);
135 Fout[m].i = Fout->i - HALF_OF (scratch[3].i);
136
137 C_MULBYSCALAR (scratch[0], epi3.i);
138
139 C_ADDTO (*Fout, scratch[3]);
140
141 Fout[m2].r = Fout[m].r + scratch[0].i;
142 Fout[m2].i = Fout[m].i - scratch[0].r;
143
144 Fout[m].r -= scratch[0].i;
145 Fout[m].i += scratch[0].r;
146
147 ++Fout;
148 } while (--k);
149 }
150
151 static void
kf_bfly5(kiss_fft_s32_cpx * Fout,const size_t fstride,const kiss_fft_s32_cfg st,int m)152 kf_bfly5 (kiss_fft_s32_cpx * Fout,
153 const size_t fstride, const kiss_fft_s32_cfg st, int m)
154 {
155 kiss_fft_s32_cpx *Fout0, *Fout1, *Fout2, *Fout3, *Fout4;
156 int u;
157 kiss_fft_s32_cpx scratch[13];
158 kiss_fft_s32_cpx *twiddles = st->twiddles;
159 kiss_fft_s32_cpx *tw;
160 kiss_fft_s32_cpx ya, yb;
161
162 ya = twiddles[fstride * m];
163 yb = twiddles[fstride * 2 * m];
164
165 Fout0 = Fout;
166 Fout1 = Fout0 + m;
167 Fout2 = Fout0 + 2 * m;
168 Fout3 = Fout0 + 3 * m;
169 Fout4 = Fout0 + 4 * m;
170
171 tw = st->twiddles;
172 for (u = 0; u < m; ++u) {
173 C_FIXDIV (*Fout0, 5);
174 C_FIXDIV (*Fout1, 5);
175 C_FIXDIV (*Fout2, 5);
176 C_FIXDIV (*Fout3, 5);
177 C_FIXDIV (*Fout4, 5);
178 scratch[0] = *Fout0;
179
180 C_MUL (scratch[1], *Fout1, tw[u * fstride]);
181 C_MUL (scratch[2], *Fout2, tw[2 * u * fstride]);
182 C_MUL (scratch[3], *Fout3, tw[3 * u * fstride]);
183 C_MUL (scratch[4], *Fout4, tw[4 * u * fstride]);
184
185 C_ADD (scratch[7], scratch[1], scratch[4]);
186 C_SUB (scratch[10], scratch[1], scratch[4]);
187 C_ADD (scratch[8], scratch[2], scratch[3]);
188 C_SUB (scratch[9], scratch[2], scratch[3]);
189
190 Fout0->r += scratch[7].r + scratch[8].r;
191 Fout0->i += scratch[7].i + scratch[8].i;
192
193 scratch[5].r =
194 scratch[0].r + S_MUL (scratch[7].r, ya.r) + S_MUL (scratch[8].r, yb.r);
195 scratch[5].i =
196 scratch[0].i + S_MUL (scratch[7].i, ya.r) + S_MUL (scratch[8].i, yb.r);
197
198 scratch[6].r = S_MUL (scratch[10].i, ya.i) + S_MUL (scratch[9].i, yb.i);
199 scratch[6].i = -S_MUL (scratch[10].r, ya.i) - S_MUL (scratch[9].r, yb.i);
200
201 C_SUB (*Fout1, scratch[5], scratch[6]);
202 C_ADD (*Fout4, scratch[5], scratch[6]);
203
204 scratch[11].r =
205 scratch[0].r + S_MUL (scratch[7].r, yb.r) + S_MUL (scratch[8].r, ya.r);
206 scratch[11].i =
207 scratch[0].i + S_MUL (scratch[7].i, yb.r) + S_MUL (scratch[8].i, ya.r);
208 scratch[12].r = -S_MUL (scratch[10].i, yb.i) + S_MUL (scratch[9].i, ya.i);
209 scratch[12].i = S_MUL (scratch[10].r, yb.i) - S_MUL (scratch[9].r, ya.i);
210
211 C_ADD (*Fout2, scratch[11], scratch[12]);
212 C_SUB (*Fout3, scratch[11], scratch[12]);
213
214 ++Fout0;
215 ++Fout1;
216 ++Fout2;
217 ++Fout3;
218 ++Fout4;
219 }
220 }
221
222 /* perform the butterfly for one stage of a mixed radix FFT */
223 static void
kf_bfly_generic(kiss_fft_s32_cpx * Fout,const size_t fstride,const kiss_fft_s32_cfg st,int m,int p)224 kf_bfly_generic (kiss_fft_s32_cpx * Fout,
225 const size_t fstride, const kiss_fft_s32_cfg st, int m, int p)
226 {
227 int u, k, q1, q;
228 kiss_fft_s32_cpx *twiddles = st->twiddles;
229 kiss_fft_s32_cpx t;
230 int Norig = st->nfft;
231
232 CHECKBUF (scratchbuf, nscratchbuf, p);
233
234 for (u = 0; u < m; ++u) {
235 k = u;
236 for (q1 = 0; q1 < p; ++q1) {
237 scratchbuf[q1] = Fout[k];
238 C_FIXDIV (scratchbuf[q1], p);
239 k += m;
240 }
241
242 k = u;
243 for (q1 = 0; q1 < p; ++q1) {
244 int twidx = 0;
245
246 Fout[k] = scratchbuf[0];
247 for (q = 1; q < p; ++q) {
248 twidx += fstride * k;
249 if (twidx >= Norig)
250 twidx -= Norig;
251 C_MUL (t, scratchbuf[q], twiddles[twidx]);
252 C_ADDTO (Fout[k], t);
253 }
254 k += m;
255 }
256 }
257 }
258
259 static void
kf_work(kiss_fft_s32_cpx * Fout,const kiss_fft_s32_cpx * f,const size_t fstride,int in_stride,int * factors,const kiss_fft_s32_cfg st)260 kf_work (kiss_fft_s32_cpx * Fout,
261 const kiss_fft_s32_cpx * f,
262 const size_t fstride,
263 int in_stride, int *factors, const kiss_fft_s32_cfg st)
264 {
265 kiss_fft_s32_cpx *Fout_beg = Fout;
266 const int p = *factors++; /* the radix */
267 const int m = *factors++; /* stage's fft length/p */
268 const kiss_fft_s32_cpx *Fout_end = Fout + p * m;
269
270 #ifdef _OPENMP
271 // use openmp extensions at the
272 // top-level (not recursive)
273 if (fstride == 1) {
274 int k;
275
276 // execute the p different work units in different threads
277 # pragma omp parallel for
278 for (k = 0; k < p; ++k)
279 kf_work (Fout + k * m, f + fstride * in_stride * k, fstride * p,
280 in_stride, factors, st);
281 // all threads have joined by this point
282
283 switch (p) {
284 case 2:
285 kf_bfly2 (Fout, fstride, st, m);
286 break;
287 case 3:
288 kf_bfly3 (Fout, fstride, st, m);
289 break;
290 case 4:
291 kf_bfly4 (Fout, fstride, st, m);
292 break;
293 case 5:
294 kf_bfly5 (Fout, fstride, st, m);
295 break;
296 default:
297 kf_bfly_generic (Fout, fstride, st, m, p);
298 break;
299 }
300 return;
301 }
302 #endif
303
304 if (m == 1) {
305 do {
306 *Fout = *f;
307 f += fstride * in_stride;
308 } while (++Fout != Fout_end);
309 } else {
310 do {
311 // recursive call:
312 // DFT of size m*p performed by doing
313 // p instances of smaller DFTs of size m,
314 // each one takes a decimated version of the input
315 kf_work (Fout, f, fstride * p, in_stride, factors, st);
316 f += fstride * in_stride;
317 } while ((Fout += m) != Fout_end);
318 }
319
320 Fout = Fout_beg;
321
322 // recombine the p smaller DFTs
323 switch (p) {
324 case 2:
325 kf_bfly2 (Fout, fstride, st, m);
326 break;
327 case 3:
328 kf_bfly3 (Fout, fstride, st, m);
329 break;
330 case 4:
331 kf_bfly4 (Fout, fstride, st, m);
332 break;
333 case 5:
334 kf_bfly5 (Fout, fstride, st, m);
335 break;
336 default:
337 kf_bfly_generic (Fout, fstride, st, m, p);
338 break;
339 }
340 }
341
342 /* facbuf is populated by p1,m1,p2,m2, ...
343 where
344 p[i] * m[i] = m[i-1]
345 m0 = n */
346 static void
kf_factor(int n,int * facbuf)347 kf_factor (int n, int *facbuf)
348 {
349 int p = 4;
350 double floor_sqrt;
351
352 floor_sqrt = floor (sqrt ((double) n));
353
354 /*factor out powers of 4, powers of 2, then any remaining primes */
355 do {
356 while (n % p) {
357 switch (p) {
358 case 4:
359 p = 2;
360 break;
361 case 2:
362 p = 3;
363 break;
364 default:
365 p += 2;
366 break;
367 }
368 if (p > floor_sqrt)
369 p = n; /* no more factors, skip to end */
370 }
371 n /= p;
372 *facbuf++ = p;
373 *facbuf++ = n;
374 } while (n > 1);
375 }
376
377 /*
378 *
379 * User-callable function to allocate all necessary storage space for the fft.
380 *
381 * The return value is a contiguous block of memory, allocated with malloc. As such,
382 * It can be freed with free(), rather than a kiss_fft-specific function.
383 * */
384 kiss_fft_s32_cfg
kiss_fft_s32_alloc(int nfft,int inverse_fft,void * mem,size_t * lenmem)385 kiss_fft_s32_alloc (int nfft, int inverse_fft, void *mem, size_t * lenmem)
386 {
387 kiss_fft_s32_cfg st = NULL;
388 size_t memneeded = sizeof (struct kiss_fft_s32_state)
389 + sizeof (kiss_fft_s32_cpx) * (nfft - 1); /* twiddle factors */
390
391 if (lenmem == NULL) {
392 st = (kiss_fft_s32_cfg) KISS_FFT_S32_MALLOC (memneeded);
393 } else {
394 if (mem != NULL && *lenmem >= memneeded)
395 st = (kiss_fft_s32_cfg) mem;
396 *lenmem = memneeded;
397 }
398 if (st) {
399 int i;
400
401 st->nfft = nfft;
402 st->inverse = inverse_fft;
403
404 for (i = 0; i < nfft; ++i) {
405 const double pi =
406 3.141592653589793238462643383279502884197169399375105820974944;
407 double phase = -2 * pi * i / nfft;
408
409 if (st->inverse)
410 phase *= -1;
411 kf_cexp (st->twiddles + i, phase);
412 }
413
414 kf_factor (nfft, st->factors);
415 }
416 return st;
417 }
418
419
420
421
422 void
kiss_fft_s32_stride(kiss_fft_s32_cfg st,const kiss_fft_s32_cpx * fin,kiss_fft_s32_cpx * fout,int in_stride)423 kiss_fft_s32_stride (kiss_fft_s32_cfg st, const kiss_fft_s32_cpx * fin,
424 kiss_fft_s32_cpx * fout, int in_stride)
425 {
426 if (fin == fout) {
427 CHECKBUF (tmpbuf, ntmpbuf, st->nfft);
428 kf_work (tmpbuf, fin, 1, in_stride, st->factors, st);
429 memcpy (fout, tmpbuf, sizeof (kiss_fft_s32_cpx) * st->nfft);
430 } else {
431 kf_work (fout, fin, 1, in_stride, st->factors, st);
432 }
433 }
434
435 void
kiss_fft_s32(kiss_fft_s32_cfg cfg,const kiss_fft_s32_cpx * fin,kiss_fft_s32_cpx * fout)436 kiss_fft_s32 (kiss_fft_s32_cfg cfg, const kiss_fft_s32_cpx * fin,
437 kiss_fft_s32_cpx * fout)
438 {
439 kiss_fft_s32_stride (cfg, fin, fout, 1);
440 }
441
442
443 /* not really necessary to call, but if someone is doing in-place ffts, they may want to free the
444 buffers from CHECKBUF
445 */
446 void
kiss_fft_s32_cleanup(void)447 kiss_fft_s32_cleanup (void)
448 {
449 free (scratchbuf);
450 scratchbuf = NULL;
451 nscratchbuf = 0;
452 free (tmpbuf);
453 tmpbuf = NULL;
454 ntmpbuf = 0;
455 }
456
457 int
kiss_fft_s32_next_fast_size(int n)458 kiss_fft_s32_next_fast_size (int n)
459 {
460 while (1) {
461 int m = n;
462
463 while ((m % 2) == 0)
464 m /= 2;
465 while ((m % 3) == 0)
466 m /= 3;
467 while ((m % 5) == 0)
468 m /= 5;
469 if (m <= 1)
470 break; /* n is completely factorable by twos, threes, and fives */
471 n++;
472 }
473 return n;
474 }
475