1 /*
2 * This file is part of pocketfft.
3 * Licensed under a 3-clause BSD style license - see LICENSE.md
4 */
5
6 /*
7 * Main implementation file.
8 *
9 * Copyright (C) 2004-2018 Max-Planck-Society
10 * \author Martin Reinecke
11 */
12
13 #include <math.h>
14 #include <string.h>
15
16 #include "pocketfft.h"
17
18 #define RALLOC(type,num) \
19 ((type *)malloc((num)*sizeof(type)))
20 #define DEALLOC(ptr) \
21 do { free(ptr); (ptr)=NULL; } while(0)
22
23 #define SWAP(a,b,type) \
24 do { type tmp_=(a); (a)=(b); (b)=tmp_; } while(0)
25
26 #ifdef __GNUC__
27 #define NOINLINE __attribute__((noinline))
28 #define WARN_UNUSED_RESULT __attribute__ ((warn_unused_result))
29 #else
30 #define NOINLINE
31 #define WARN_UNUSED_RESULT
32 #endif
33
34 // adapted from https://stackoverflow.com/questions/42792939/
35 // CAUTION: this function only works for arguments in the range [-0.25; 0.25]!
my_sincosm1pi(double a,double * restrict res)36 static void my_sincosm1pi (double a, double *restrict res)
37 {
38 double s = a * a;
39 /* Approximate cos(pi*x)-1 for x in [-0.25,0.25] */
40 double r = -1.0369917389758117e-4;
41 r = fma (r, s, 1.9294935641298806e-3);
42 r = fma (r, s, -2.5806887942825395e-2);
43 r = fma (r, s, 2.3533063028328211e-1);
44 r = fma (r, s, -1.3352627688538006e+0);
45 r = fma (r, s, 4.0587121264167623e+0);
46 r = fma (r, s, -4.9348022005446790e+0);
47 double c = r*s;
48 /* Approximate sin(pi*x) for x in [-0.25,0.25] */
49 r = 4.6151442520157035e-4;
50 r = fma (r, s, -7.3700183130883555e-3);
51 r = fma (r, s, 8.2145868949323936e-2);
52 r = fma (r, s, -5.9926452893214921e-1);
53 r = fma (r, s, 2.5501640398732688e+0);
54 r = fma (r, s, -5.1677127800499516e+0);
55 s = s * a;
56 r = r * s;
57 s = fma (a, 3.1415926535897931e+0, r);
58 res[0] = c;
59 res[1] = s;
60 }
61
calc_first_octant(size_t den,double * restrict res)62 NOINLINE static void calc_first_octant(size_t den, double * restrict res)
63 {
64 size_t n = (den+4)>>3;
65 if (n==0) return;
66 res[0]=1.; res[1]=0.;
67 if (n==1) return;
68 size_t l1=(size_t)sqrt(n);
69 for (size_t i=1; i<l1; ++i)
70 my_sincosm1pi((2.*i)/den,&res[2*i]);
71 size_t start=l1;
72 while(start<n)
73 {
74 double cs[2];
75 my_sincosm1pi((2.*start)/den,cs);
76 res[2*start] = cs[0]+1.;
77 res[2*start+1] = cs[1];
78 size_t end = l1;
79 if (start+end>n) end = n-start;
80 for (size_t i=1; i<end; ++i)
81 {
82 double csx[2]={res[2*i], res[2*i+1]};
83 res[2*(start+i)] = ((cs[0]*csx[0] - cs[1]*csx[1] + cs[0]) + csx[0]) + 1.;
84 res[2*(start+i)+1] = (cs[0]*csx[1] + cs[1]*csx[0]) + cs[1] + csx[1];
85 }
86 start += l1;
87 }
88 for (size_t i=1; i<l1; ++i)
89 res[2*i] += 1.;
90 }
91
calc_first_quadrant(size_t n,double * restrict res)92 NOINLINE static void calc_first_quadrant(size_t n, double * restrict res)
93 {
94 double * restrict p = res+n;
95 calc_first_octant(n<<1, p);
96 size_t ndone=(n+2)>>2;
97 size_t i=0, idx1=0, idx2=2*ndone-2;
98 for (; i+1<ndone; i+=2, idx1+=2, idx2-=2)
99 {
100 res[idx1] = p[2*i];
101 res[idx1+1] = p[2*i+1];
102 res[idx2] = p[2*i+3];
103 res[idx2+1] = p[2*i+2];
104 }
105 if (i!=ndone)
106 {
107 res[idx1 ] = p[2*i];
108 res[idx1+1] = p[2*i+1];
109 }
110 }
111
calc_first_half(size_t n,double * restrict res)112 NOINLINE static void calc_first_half(size_t n, double * restrict res)
113 {
114 int ndone=(n+1)>>1;
115 double * p = res+n-1;
116 calc_first_octant(n<<2, p);
117 int i4=0, in=n, i=0;
118 for (; i4<=in-i4; ++i, i4+=4) // octant 0
119 {
120 res[2*i] = p[2*i4]; res[2*i+1] = p[2*i4+1];
121 }
122 for (; i4-in <= 0; ++i, i4+=4) // octant 1
123 {
124 int xm = in-i4;
125 res[2*i] = p[2*xm+1]; res[2*i+1] = p[2*xm];
126 }
127 for (; i4<=3*in-i4; ++i, i4+=4) // octant 2
128 {
129 int xm = i4-in;
130 res[2*i] = -p[2*xm+1]; res[2*i+1] = p[2*xm];
131 }
132 for (; i<ndone; ++i, i4+=4) // octant 3
133 {
134 int xm = 2*in-i4;
135 res[2*i] = -p[2*xm]; res[2*i+1] = p[2*xm+1];
136 }
137 }
138
fill_first_quadrant(size_t n,double * restrict res)139 NOINLINE static void fill_first_quadrant(size_t n, double * restrict res)
140 {
141 const double hsqt2 = 0.707106781186547524400844362104849;
142 size_t quart = n>>2;
143 if ((n&7)==0)
144 res[quart] = res[quart+1] = hsqt2;
145 for (size_t i=2, j=2*quart-2; i<quart; i+=2, j-=2)
146 {
147 res[j ] = res[i+1];
148 res[j+1] = res[i ];
149 }
150 }
151
fill_first_half(size_t n,double * restrict res)152 NOINLINE static void fill_first_half(size_t n, double * restrict res)
153 {
154 size_t half = n>>1;
155 if ((n&3)==0)
156 for (size_t i=0; i<half; i+=2)
157 {
158 res[i+half] = -res[i+1];
159 res[i+half+1] = res[i ];
160 }
161 else
162 for (size_t i=2, j=2*half-2; i<half; i+=2, j-=2)
163 {
164 res[j ] = -res[i ];
165 res[j+1] = res[i+1];
166 }
167 }
168
fill_second_half(size_t n,double * restrict res)169 NOINLINE static void fill_second_half(size_t n, double * restrict res)
170 {
171 if ((n&1)==0)
172 for (size_t i=0; i<n; ++i)
173 res[i+n] = -res[i];
174 else
175 for (size_t i=2, j=2*n-2; i<n; i+=2, j-=2)
176 {
177 res[j ] = res[i ];
178 res[j+1] = -res[i+1];
179 }
180 }
181
sincos_2pibyn_half(size_t n,double * restrict res)182 NOINLINE static void sincos_2pibyn_half(size_t n, double * restrict res)
183 {
184 if ((n&3)==0)
185 {
186 calc_first_octant(n, res);
187 fill_first_quadrant(n, res);
188 fill_first_half(n, res);
189 }
190 else if ((n&1)==0)
191 {
192 calc_first_quadrant(n, res);
193 fill_first_half(n, res);
194 }
195 else
196 calc_first_half(n, res);
197 }
198
sincos_2pibyn(size_t n,double * restrict res)199 NOINLINE static void sincos_2pibyn(size_t n, double * restrict res)
200 {
201 sincos_2pibyn_half(n, res);
202 fill_second_half(n, res);
203 }
204
largest_prime_factor(size_t n)205 NOINLINE static size_t largest_prime_factor (size_t n)
206 {
207 size_t res=1;
208 size_t tmp;
209 while (((tmp=(n>>1))<<1)==n)
210 { res=2; n=tmp; }
211
212 size_t limit=(size_t)sqrt(n+0.01);
213 for (size_t x=3; x<=limit; x+=2)
214 while (((tmp=(n/x))*x)==n)
215 {
216 res=x;
217 n=tmp;
218 limit=(size_t)sqrt(n+0.01);
219 }
220 if (n>1) res=n;
221
222 return res;
223 }
224
cost_guess(size_t n)225 NOINLINE static double cost_guess (size_t n)
226 {
227 const double lfp=1.1; // penalty for non-hardcoded larger factors
228 size_t ni=n;
229 double result=0.;
230 size_t tmp;
231 while (((tmp=(n>>1))<<1)==n)
232 { result+=2; n=tmp; }
233
234 size_t limit=(size_t)sqrt(n+0.01);
235 for (size_t x=3; x<=limit; x+=2)
236 while ((tmp=(n/x))*x==n)
237 {
238 result+= (x<=5) ? x : lfp*x; // penalize larger prime factors
239 n=tmp;
240 limit=(size_t)sqrt(n+0.01);
241 }
242 if (n>1) result+=(n<=5) ? n : lfp*n;
243
244 return result*ni;
245 }
246
247 /* returns the smallest composite of 2, 3, 5, 7 and 11 which is >= n */
good_size(size_t n)248 NOINLINE static size_t good_size(size_t n)
249 {
250 if (n<=6) return n;
251
252 size_t bestfac=2*n;
253 for (size_t f2=1; f2<bestfac; f2*=2)
254 for (size_t f23=f2; f23<bestfac; f23*=3)
255 for (size_t f235=f23; f235<bestfac; f235*=5)
256 for (size_t f2357=f235; f2357<bestfac; f2357*=7)
257 for (size_t f235711=f2357; f235711<bestfac; f235711*=11)
258 if (f235711>=n) bestfac=f235711;
259 return bestfac;
260 }
261
262 typedef struct cmplx {
263 double r,i;
264 } cmplx;
265
266 #define NFCT 25
267 typedef struct cfftp_fctdata
268 {
269 size_t fct;
270 cmplx *tw, *tws;
271 } cfftp_fctdata;
272
273 typedef struct cfftp_plan_i
274 {
275 size_t length, nfct;
276 cmplx *mem;
277 cfftp_fctdata fct[NFCT];
278 } cfftp_plan_i;
279 typedef struct cfftp_plan_i * cfftp_plan;
280
281 #define PMC(a,b,c,d) { a.r=c.r+d.r; a.i=c.i+d.i; b.r=c.r-d.r; b.i=c.i-d.i; }
282 #define ADDC(a,b,c) { a.r=b.r+c.r; a.i=b.i+c.i; }
283 #define SCALEC(a,b) { a.r*=b; a.i*=b; }
284 #define ROT90(a) { double tmp_=a.r; a.r=-a.i; a.i=tmp_; }
285 #define ROTM90(a) { double tmp_=-a.r; a.r=a.i; a.i=tmp_; }
286 #define CH(a,b,c) ch[(a)+ido*((b)+l1*(c))]
287 #define CC(a,b,c) cc[(a)+ido*((b)+cdim*(c))]
288 #define WA(x,i) wa[(i)-1+(x)*(ido-1)]
289 /* a = b*c */
290 #define A_EQ_B_MUL_C(a,b,c) { a.r=b.r*c.r-b.i*c.i; a.i=b.r*c.i+b.i*c.r; }
291 /* a = conj(b)*c*/
292 #define A_EQ_CB_MUL_C(a,b,c) { a.r=b.r*c.r+b.i*c.i; a.i=b.r*c.i-b.i*c.r; }
293
294 #define PMSIGNC(a,b,c,d) { a.r=c.r+sign*d.r; a.i=c.i+sign*d.i; b.r=c.r-sign*d.r; b.i=c.i-sign*d.i; }
295 /* a = b*c */
296 #define MULPMSIGNC(a,b,c) { a.r=b.r*c.r-sign*b.i*c.i; a.i=b.r*c.i+sign*b.i*c.r; }
297 /* a *= b */
298 #define MULPMSIGNCEQ(a,b) { double xtmp=a.r; a.r=b.r*a.r-sign*b.i*a.i; a.i=b.r*a.i+sign*b.i*xtmp; }
299
pass2b(size_t ido,size_t l1,const cmplx * restrict cc,cmplx * restrict ch,const cmplx * restrict wa)300 NOINLINE static void pass2b (size_t ido, size_t l1, const cmplx * restrict cc,
301 cmplx * restrict ch, const cmplx * restrict wa)
302 {
303 const size_t cdim=2;
304
305 if (ido==1)
306 for (size_t k=0; k<l1; ++k)
307 PMC (CH(0,k,0),CH(0,k,1),CC(0,0,k),CC(0,1,k))
308 else
309 for (size_t k=0; k<l1; ++k)
310 {
311 PMC (CH(0,k,0),CH(0,k,1),CC(0,0,k),CC(0,1,k))
312 for (size_t i=1; i<ido; ++i)
313 {
314 cmplx t;
315 PMC (CH(i,k,0),t,CC(i,0,k),CC(i,1,k))
316 A_EQ_B_MUL_C (CH(i,k,1),WA(0,i),t)
317 }
318 }
319 }
320
pass2f(size_t ido,size_t l1,const cmplx * restrict cc,cmplx * restrict ch,const cmplx * restrict wa)321 NOINLINE static void pass2f (size_t ido, size_t l1, const cmplx * restrict cc,
322 cmplx * restrict ch, const cmplx * restrict wa)
323 {
324 const size_t cdim=2;
325
326 if (ido==1)
327 for (size_t k=0; k<l1; ++k)
328 PMC (CH(0,k,0),CH(0,k,1),CC(0,0,k),CC(0,1,k))
329 else
330 for (size_t k=0; k<l1; ++k)
331 {
332 PMC (CH(0,k,0),CH(0,k,1),CC(0,0,k),CC(0,1,k))
333 for (size_t i=1; i<ido; ++i)
334 {
335 cmplx t;
336 PMC (CH(i,k,0),t,CC(i,0,k),CC(i,1,k))
337 A_EQ_CB_MUL_C (CH(i,k,1),WA(0,i),t)
338 }
339 }
340 }
341
342 #define PREP3(idx) \
343 cmplx t0 = CC(idx,0,k), t1, t2; \
344 PMC (t1,t2,CC(idx,1,k),CC(idx,2,k)) \
345 CH(idx,k,0).r=t0.r+t1.r; \
346 CH(idx,k,0).i=t0.i+t1.i;
347 #define PARTSTEP3a(u1,u2,twr,twi) \
348 { \
349 cmplx ca,cb; \
350 ca.r=t0.r+twr*t1.r; \
351 ca.i=t0.i+twr*t1.i; \
352 cb.i=twi*t2.r; \
353 cb.r=-(twi*t2.i); \
354 PMC(CH(0,k,u1),CH(0,k,u2),ca,cb) \
355 }
356
357 #define PARTSTEP3b(u1,u2,twr,twi) \
358 { \
359 cmplx ca,cb,da,db; \
360 ca.r=t0.r+twr*t1.r; \
361 ca.i=t0.i+twr*t1.i; \
362 cb.i=twi*t2.r; \
363 cb.r=-(twi*t2.i); \
364 PMC(da,db,ca,cb) \
365 A_EQ_B_MUL_C (CH(i,k,u1),WA(u1-1,i),da) \
366 A_EQ_B_MUL_C (CH(i,k,u2),WA(u2-1,i),db) \
367 }
pass3b(size_t ido,size_t l1,const cmplx * restrict cc,cmplx * restrict ch,const cmplx * restrict wa)368 NOINLINE static void pass3b (size_t ido, size_t l1, const cmplx * restrict cc,
369 cmplx * restrict ch, const cmplx * restrict wa)
370 {
371 const size_t cdim=3;
372 const double tw1r=-0.5, tw1i= 0.86602540378443864676;
373
374 if (ido==1)
375 for (size_t k=0; k<l1; ++k)
376 {
377 PREP3(0)
378 PARTSTEP3a(1,2,tw1r,tw1i)
379 }
380 else
381 for (size_t k=0; k<l1; ++k)
382 {
383 {
384 PREP3(0)
385 PARTSTEP3a(1,2,tw1r,tw1i)
386 }
387 for (size_t i=1; i<ido; ++i)
388 {
389 PREP3(i)
390 PARTSTEP3b(1,2,tw1r,tw1i)
391 }
392 }
393 }
394 #define PARTSTEP3f(u1,u2,twr,twi) \
395 { \
396 cmplx ca,cb,da,db; \
397 ca.r=t0.r+twr*t1.r; \
398 ca.i=t0.i+twr*t1.i; \
399 cb.i=twi*t2.r; \
400 cb.r=-(twi*t2.i); \
401 PMC(da,db,ca,cb) \
402 A_EQ_CB_MUL_C (CH(i,k,u1),WA(u1-1,i),da) \
403 A_EQ_CB_MUL_C (CH(i,k,u2),WA(u2-1,i),db) \
404 }
pass3f(size_t ido,size_t l1,const cmplx * restrict cc,cmplx * restrict ch,const cmplx * restrict wa)405 NOINLINE static void pass3f (size_t ido, size_t l1, const cmplx * restrict cc,
406 cmplx * restrict ch, const cmplx * restrict wa)
407 {
408 const size_t cdim=3;
409 const double tw1r=-0.5, tw1i= -0.86602540378443864676;
410
411 if (ido==1)
412 for (size_t k=0; k<l1; ++k)
413 {
414 PREP3(0)
415 PARTSTEP3a(1,2,tw1r,tw1i)
416 }
417 else
418 for (size_t k=0; k<l1; ++k)
419 {
420 {
421 PREP3(0)
422 PARTSTEP3a(1,2,tw1r,tw1i)
423 }
424 for (size_t i=1; i<ido; ++i)
425 {
426 PREP3(i)
427 PARTSTEP3f(1,2,tw1r,tw1i)
428 }
429 }
430 }
431
pass4b(size_t ido,size_t l1,const cmplx * restrict cc,cmplx * restrict ch,const cmplx * restrict wa)432 NOINLINE static void pass4b (size_t ido, size_t l1, const cmplx * restrict cc,
433 cmplx * restrict ch, const cmplx * restrict wa)
434 {
435 const size_t cdim=4;
436
437 if (ido==1)
438 for (size_t k=0; k<l1; ++k)
439 {
440 cmplx t1, t2, t3, t4;
441 PMC(t2,t1,CC(0,0,k),CC(0,2,k))
442 PMC(t3,t4,CC(0,1,k),CC(0,3,k))
443 ROT90(t4)
444 PMC(CH(0,k,0),CH(0,k,2),t2,t3)
445 PMC(CH(0,k,1),CH(0,k,3),t1,t4)
446 }
447 else
448 for (size_t k=0; k<l1; ++k)
449 {
450 {
451 cmplx t1, t2, t3, t4;
452 PMC(t2,t1,CC(0,0,k),CC(0,2,k))
453 PMC(t3,t4,CC(0,1,k),CC(0,3,k))
454 ROT90(t4)
455 PMC(CH(0,k,0),CH(0,k,2),t2,t3)
456 PMC(CH(0,k,1),CH(0,k,3),t1,t4)
457 }
458 for (size_t i=1; i<ido; ++i)
459 {
460 cmplx c2, c3, c4, t1, t2, t3, t4;
461 cmplx cc0=CC(i,0,k), cc1=CC(i,1,k),cc2=CC(i,2,k),cc3=CC(i,3,k);
462 PMC(t2,t1,cc0,cc2)
463 PMC(t3,t4,cc1,cc3)
464 ROT90(t4)
465 cmplx wa0=WA(0,i), wa1=WA(1,i),wa2=WA(2,i);
466 PMC(CH(i,k,0),c3,t2,t3)
467 PMC(c2,c4,t1,t4)
468 A_EQ_B_MUL_C (CH(i,k,1),wa0,c2)
469 A_EQ_B_MUL_C (CH(i,k,2),wa1,c3)
470 A_EQ_B_MUL_C (CH(i,k,3),wa2,c4)
471 }
472 }
473 }
pass4f(size_t ido,size_t l1,const cmplx * restrict cc,cmplx * restrict ch,const cmplx * restrict wa)474 NOINLINE static void pass4f (size_t ido, size_t l1, const cmplx * restrict cc,
475 cmplx * restrict ch, const cmplx * restrict wa)
476 {
477 const size_t cdim=4;
478
479 if (ido==1)
480 for (size_t k=0; k<l1; ++k)
481 {
482 cmplx t1, t2, t3, t4;
483 PMC(t2,t1,CC(0,0,k),CC(0,2,k))
484 PMC(t3,t4,CC(0,1,k),CC(0,3,k))
485 ROTM90(t4)
486 PMC(CH(0,k,0),CH(0,k,2),t2,t3)
487 PMC(CH(0,k,1),CH(0,k,3),t1,t4)
488 }
489 else
490 for (size_t k=0; k<l1; ++k)
491 {
492 {
493 cmplx t1, t2, t3, t4;
494 PMC(t2,t1,CC(0,0,k),CC(0,2,k))
495 PMC(t3,t4,CC(0,1,k),CC(0,3,k))
496 ROTM90(t4)
497 PMC(CH(0,k,0),CH(0,k,2),t2,t3)
498 PMC (CH(0,k,1),CH(0,k,3),t1,t4)
499 }
500 for (size_t i=1; i<ido; ++i)
501 {
502 cmplx c2, c3, c4, t1, t2, t3, t4;
503 cmplx cc0=CC(i,0,k), cc1=CC(i,1,k),cc2=CC(i,2,k),cc3=CC(i,3,k);
504 PMC(t2,t1,cc0,cc2)
505 PMC(t3,t4,cc1,cc3)
506 ROTM90(t4)
507 cmplx wa0=WA(0,i), wa1=WA(1,i),wa2=WA(2,i);
508 PMC(CH(i,k,0),c3,t2,t3)
509 PMC(c2,c4,t1,t4)
510 A_EQ_CB_MUL_C (CH(i,k,1),wa0,c2)
511 A_EQ_CB_MUL_C (CH(i,k,2),wa1,c3)
512 A_EQ_CB_MUL_C (CH(i,k,3),wa2,c4)
513 }
514 }
515 }
516
517 #define PREP5(idx) \
518 cmplx t0 = CC(idx,0,k), t1, t2, t3, t4; \
519 PMC (t1,t4,CC(idx,1,k),CC(idx,4,k)) \
520 PMC (t2,t3,CC(idx,2,k),CC(idx,3,k)) \
521 CH(idx,k,0).r=t0.r+t1.r+t2.r; \
522 CH(idx,k,0).i=t0.i+t1.i+t2.i;
523
524 #define PARTSTEP5a(u1,u2,twar,twbr,twai,twbi) \
525 { \
526 cmplx ca,cb; \
527 ca.r=t0.r+twar*t1.r+twbr*t2.r; \
528 ca.i=t0.i+twar*t1.i+twbr*t2.i; \
529 cb.i=twai*t4.r twbi*t3.r; \
530 cb.r=-(twai*t4.i twbi*t3.i); \
531 PMC(CH(0,k,u1),CH(0,k,u2),ca,cb) \
532 }
533
534 #define PARTSTEP5b(u1,u2,twar,twbr,twai,twbi) \
535 { \
536 cmplx ca,cb,da,db; \
537 ca.r=t0.r+twar*t1.r+twbr*t2.r; \
538 ca.i=t0.i+twar*t1.i+twbr*t2.i; \
539 cb.i=twai*t4.r twbi*t3.r; \
540 cb.r=-(twai*t4.i twbi*t3.i); \
541 PMC(da,db,ca,cb) \
542 A_EQ_B_MUL_C (CH(i,k,u1),WA(u1-1,i),da) \
543 A_EQ_B_MUL_C (CH(i,k,u2),WA(u2-1,i),db) \
544 }
pass5b(size_t ido,size_t l1,const cmplx * restrict cc,cmplx * restrict ch,const cmplx * restrict wa)545 NOINLINE static void pass5b (size_t ido, size_t l1, const cmplx * restrict cc,
546 cmplx * restrict ch, const cmplx * restrict wa)
547 {
548 const size_t cdim=5;
549 const double tw1r= 0.3090169943749474241,
550 tw1i= 0.95105651629515357212,
551 tw2r= -0.8090169943749474241,
552 tw2i= 0.58778525229247312917;
553
554 if (ido==1)
555 for (size_t k=0; k<l1; ++k)
556 {
557 PREP5(0)
558 PARTSTEP5a(1,4,tw1r,tw2r,+tw1i,+tw2i)
559 PARTSTEP5a(2,3,tw2r,tw1r,+tw2i,-tw1i)
560 }
561 else
562 for (size_t k=0; k<l1; ++k)
563 {
564 {
565 PREP5(0)
566 PARTSTEP5a(1,4,tw1r,tw2r,+tw1i,+tw2i)
567 PARTSTEP5a(2,3,tw2r,tw1r,+tw2i,-tw1i)
568 }
569 for (size_t i=1; i<ido; ++i)
570 {
571 PREP5(i)
572 PARTSTEP5b(1,4,tw1r,tw2r,+tw1i,+tw2i)
573 PARTSTEP5b(2,3,tw2r,tw1r,+tw2i,-tw1i)
574 }
575 }
576 }
577 #define PARTSTEP5f(u1,u2,twar,twbr,twai,twbi) \
578 { \
579 cmplx ca,cb,da,db; \
580 ca.r=t0.r+twar*t1.r+twbr*t2.r; \
581 ca.i=t0.i+twar*t1.i+twbr*t2.i; \
582 cb.i=twai*t4.r twbi*t3.r; \
583 cb.r=-(twai*t4.i twbi*t3.i); \
584 PMC(da,db,ca,cb) \
585 A_EQ_CB_MUL_C (CH(i,k,u1),WA(u1-1,i),da) \
586 A_EQ_CB_MUL_C (CH(i,k,u2),WA(u2-1,i),db) \
587 }
pass5f(size_t ido,size_t l1,const cmplx * restrict cc,cmplx * restrict ch,const cmplx * restrict wa)588 NOINLINE static void pass5f (size_t ido, size_t l1, const cmplx * restrict cc,
589 cmplx * restrict ch, const cmplx * restrict wa)
590 {
591 const size_t cdim=5;
592 const double tw1r= 0.3090169943749474241,
593 tw1i= -0.95105651629515357212,
594 tw2r= -0.8090169943749474241,
595 tw2i= -0.58778525229247312917;
596
597 if (ido==1)
598 for (size_t k=0; k<l1; ++k)
599 {
600 PREP5(0)
601 PARTSTEP5a(1,4,tw1r,tw2r,+tw1i,+tw2i)
602 PARTSTEP5a(2,3,tw2r,tw1r,+tw2i,-tw1i)
603 }
604 else
605 for (size_t k=0; k<l1; ++k)
606 {
607 {
608 PREP5(0)
609 PARTSTEP5a(1,4,tw1r,tw2r,+tw1i,+tw2i)
610 PARTSTEP5a(2,3,tw2r,tw1r,+tw2i,-tw1i)
611 }
612 for (size_t i=1; i<ido; ++i)
613 {
614 PREP5(i)
615 PARTSTEP5f(1,4,tw1r,tw2r,+tw1i,+tw2i)
616 PARTSTEP5f(2,3,tw2r,tw1r,+tw2i,-tw1i)
617 }
618 }
619 }
620
621 #define PREP7(idx) \
622 cmplx t1 = CC(idx,0,k), t2, t3, t4, t5, t6, t7; \
623 PMC (t2,t7,CC(idx,1,k),CC(idx,6,k)) \
624 PMC (t3,t6,CC(idx,2,k),CC(idx,5,k)) \
625 PMC (t4,t5,CC(idx,3,k),CC(idx,4,k)) \
626 CH(idx,k,0).r=t1.r+t2.r+t3.r+t4.r; \
627 CH(idx,k,0).i=t1.i+t2.i+t3.i+t4.i;
628
629 #define PARTSTEP7a0(u1,u2,x1,x2,x3,y1,y2,y3,out1,out2) \
630 { \
631 cmplx ca,cb; \
632 ca.r=t1.r+x1*t2.r+x2*t3.r+x3*t4.r; \
633 ca.i=t1.i+x1*t2.i+x2*t3.i+x3*t4.i; \
634 cb.i=y1*t7.r y2*t6.r y3*t5.r; \
635 cb.r=-(y1*t7.i y2*t6.i y3*t5.i); \
636 PMC(out1,out2,ca,cb) \
637 }
638 #define PARTSTEP7a(u1,u2,x1,x2,x3,y1,y2,y3) \
639 PARTSTEP7a0(u1,u2,x1,x2,x3,y1,y2,y3,CH(0,k,u1),CH(0,k,u2))
640 #define PARTSTEP7(u1,u2,x1,x2,x3,y1,y2,y3) \
641 { \
642 cmplx da,db; \
643 PARTSTEP7a0(u1,u2,x1,x2,x3,y1,y2,y3,da,db) \
644 MULPMSIGNC (CH(i,k,u1),WA(u1-1,i),da) \
645 MULPMSIGNC (CH(i,k,u2),WA(u2-1,i),db) \
646 }
647
pass7(size_t ido,size_t l1,const cmplx * restrict cc,cmplx * restrict ch,const cmplx * restrict wa,const int sign)648 NOINLINE static void pass7(size_t ido, size_t l1, const cmplx * restrict cc,
649 cmplx * restrict ch, const cmplx * restrict wa, const int sign)
650 {
651 const size_t cdim=7;
652 const double tw1r= 0.623489801858733530525,
653 tw1i= sign * 0.7818314824680298087084,
654 tw2r= -0.222520933956314404289,
655 tw2i= sign * 0.9749279121818236070181,
656 tw3r= -0.9009688679024191262361,
657 tw3i= sign * 0.4338837391175581204758;
658
659 if (ido==1)
660 for (size_t k=0; k<l1; ++k)
661 {
662 PREP7(0)
663 PARTSTEP7a(1,6,tw1r,tw2r,tw3r,+tw1i,+tw2i,+tw3i)
664 PARTSTEP7a(2,5,tw2r,tw3r,tw1r,+tw2i,-tw3i,-tw1i)
665 PARTSTEP7a(3,4,tw3r,tw1r,tw2r,+tw3i,-tw1i,+tw2i)
666 }
667 else
668 for (size_t k=0; k<l1; ++k)
669 {
670 {
671 PREP7(0)
672 PARTSTEP7a(1,6,tw1r,tw2r,tw3r,+tw1i,+tw2i,+tw3i)
673 PARTSTEP7a(2,5,tw2r,tw3r,tw1r,+tw2i,-tw3i,-tw1i)
674 PARTSTEP7a(3,4,tw3r,tw1r,tw2r,+tw3i,-tw1i,+tw2i)
675 }
676 for (size_t i=1; i<ido; ++i)
677 {
678 PREP7(i)
679 PARTSTEP7(1,6,tw1r,tw2r,tw3r,+tw1i,+tw2i,+tw3i)
680 PARTSTEP7(2,5,tw2r,tw3r,tw1r,+tw2i,-tw3i,-tw1i)
681 PARTSTEP7(3,4,tw3r,tw1r,tw2r,+tw3i,-tw1i,+tw2i)
682 }
683 }
684 }
685
686 #define PREP11(idx) \
687 cmplx t1 = CC(idx,0,k), t2, t3, t4, t5, t6, t7, t8, t9, t10, t11; \
688 PMC (t2,t11,CC(idx,1,k),CC(idx,10,k)) \
689 PMC (t3,t10,CC(idx,2,k),CC(idx, 9,k)) \
690 PMC (t4,t9 ,CC(idx,3,k),CC(idx, 8,k)) \
691 PMC (t5,t8 ,CC(idx,4,k),CC(idx, 7,k)) \
692 PMC (t6,t7 ,CC(idx,5,k),CC(idx, 6,k)) \
693 CH(idx,k,0).r=t1.r+t2.r+t3.r+t4.r+t5.r+t6.r; \
694 CH(idx,k,0).i=t1.i+t2.i+t3.i+t4.i+t5.i+t6.i;
695
696 #define PARTSTEP11a0(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5,out1,out2) \
697 { \
698 cmplx ca,cb; \
699 ca.r=t1.r+x1*t2.r+x2*t3.r+x3*t4.r+x4*t5.r+x5*t6.r; \
700 ca.i=t1.i+x1*t2.i+x2*t3.i+x3*t4.i+x4*t5.i+x5*t6.i; \
701 cb.i=y1*t11.r y2*t10.r y3*t9.r y4*t8.r y5*t7.r; \
702 cb.r=-(y1*t11.i y2*t10.i y3*t9.i y4*t8.i y5*t7.i ); \
703 PMC(out1,out2,ca,cb) \
704 }
705 #define PARTSTEP11a(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5) \
706 PARTSTEP11a0(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5,CH(0,k,u1),CH(0,k,u2))
707 #define PARTSTEP11(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5) \
708 { \
709 cmplx da,db; \
710 PARTSTEP11a0(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5,da,db) \
711 MULPMSIGNC (CH(i,k,u1),WA(u1-1,i),da) \
712 MULPMSIGNC (CH(i,k,u2),WA(u2-1,i),db) \
713 }
714
pass11(size_t ido,size_t l1,const cmplx * restrict cc,cmplx * restrict ch,const cmplx * restrict wa,const int sign)715 NOINLINE static void pass11 (size_t ido, size_t l1, const cmplx * restrict cc,
716 cmplx * restrict ch, const cmplx * restrict wa, const int sign)
717 {
718 const size_t cdim=11;
719 const double tw1r = 0.8412535328311811688618,
720 tw1i = sign * 0.5406408174555975821076,
721 tw2r = 0.4154150130018864255293,
722 tw2i = sign * 0.9096319953545183714117,
723 tw3r = -0.1423148382732851404438,
724 tw3i = sign * 0.9898214418809327323761,
725 tw4r = -0.6548607339452850640569,
726 tw4i = sign * 0.755749574354258283774,
727 tw5r = -0.9594929736144973898904,
728 tw5i = sign * 0.2817325568414296977114;
729
730 if (ido==1)
731 for (size_t k=0; k<l1; ++k)
732 {
733 PREP11(0)
734 PARTSTEP11a(1,10,tw1r,tw2r,tw3r,tw4r,tw5r,+tw1i,+tw2i,+tw3i,+tw4i,+tw5i)
735 PARTSTEP11a(2, 9,tw2r,tw4r,tw5r,tw3r,tw1r,+tw2i,+tw4i,-tw5i,-tw3i,-tw1i)
736 PARTSTEP11a(3, 8,tw3r,tw5r,tw2r,tw1r,tw4r,+tw3i,-tw5i,-tw2i,+tw1i,+tw4i)
737 PARTSTEP11a(4, 7,tw4r,tw3r,tw1r,tw5r,tw2r,+tw4i,-tw3i,+tw1i,+tw5i,-tw2i)
738 PARTSTEP11a(5, 6,tw5r,tw1r,tw4r,tw2r,tw3r,+tw5i,-tw1i,+tw4i,-tw2i,+tw3i)
739 }
740 else
741 for (size_t k=0; k<l1; ++k)
742 {
743 {
744 PREP11(0)
745 PARTSTEP11a(1,10,tw1r,tw2r,tw3r,tw4r,tw5r,+tw1i,+tw2i,+tw3i,+tw4i,+tw5i)
746 PARTSTEP11a(2, 9,tw2r,tw4r,tw5r,tw3r,tw1r,+tw2i,+tw4i,-tw5i,-tw3i,-tw1i)
747 PARTSTEP11a(3, 8,tw3r,tw5r,tw2r,tw1r,tw4r,+tw3i,-tw5i,-tw2i,+tw1i,+tw4i)
748 PARTSTEP11a(4, 7,tw4r,tw3r,tw1r,tw5r,tw2r,+tw4i,-tw3i,+tw1i,+tw5i,-tw2i)
749 PARTSTEP11a(5, 6,tw5r,tw1r,tw4r,tw2r,tw3r,+tw5i,-tw1i,+tw4i,-tw2i,+tw3i)
750 }
751 for (size_t i=1; i<ido; ++i)
752 {
753 PREP11(i)
754 PARTSTEP11(1,10,tw1r,tw2r,tw3r,tw4r,tw5r,+tw1i,+tw2i,+tw3i,+tw4i,+tw5i)
755 PARTSTEP11(2, 9,tw2r,tw4r,tw5r,tw3r,tw1r,+tw2i,+tw4i,-tw5i,-tw3i,-tw1i)
756 PARTSTEP11(3, 8,tw3r,tw5r,tw2r,tw1r,tw4r,+tw3i,-tw5i,-tw2i,+tw1i,+tw4i)
757 PARTSTEP11(4, 7,tw4r,tw3r,tw1r,tw5r,tw2r,+tw4i,-tw3i,+tw1i,+tw5i,-tw2i)
758 PARTSTEP11(5, 6,tw5r,tw1r,tw4r,tw2r,tw3r,+tw5i,-tw1i,+tw4i,-tw2i,+tw3i)
759 }
760 }
761 }
762
763 #define CX(a,b,c) cc[(a)+ido*((b)+l1*(c))]
764 #define CX2(a,b) cc[(a)+idl1*(b)]
765 #define CH2(a,b) ch[(a)+idl1*(b)]
766
passg(size_t ido,size_t ip,size_t l1,cmplx * restrict cc,cmplx * restrict ch,const cmplx * restrict wa,const cmplx * restrict csarr,const int sign)767 NOINLINE static int passg (size_t ido, size_t ip, size_t l1,
768 cmplx * restrict cc, cmplx * restrict ch, const cmplx * restrict wa,
769 const cmplx * restrict csarr, const int sign)
770 {
771 const size_t cdim=ip;
772 size_t ipph = (ip+1)/2;
773 size_t idl1 = ido*l1;
774
775 cmplx * restrict wal=RALLOC(cmplx,ip);
776 if (!wal) return -1;
777 wal[0]=(cmplx){1.,0.};
778 for (size_t i=1; i<ip; ++i)
779 wal[i]=(cmplx){csarr[i].r,sign*csarr[i].i};
780
781 for (size_t k=0; k<l1; ++k)
782 for (size_t i=0; i<ido; ++i)
783 CH(i,k,0) = CC(i,0,k);
784 for (size_t j=1, jc=ip-1; j<ipph; ++j, --jc)
785 for (size_t k=0; k<l1; ++k)
786 for (size_t i=0; i<ido; ++i)
787 PMC(CH(i,k,j),CH(i,k,jc),CC(i,j,k),CC(i,jc,k))
788 for (size_t k=0; k<l1; ++k)
789 for (size_t i=0; i<ido; ++i)
790 {
791 cmplx tmp = CH(i,k,0);
792 for (size_t j=1; j<ipph; ++j)
793 ADDC(tmp,tmp,CH(i,k,j))
794 CX(i,k,0) = tmp;
795 }
796 for (size_t l=1, lc=ip-1; l<ipph; ++l, --lc)
797 {
798 // j=0
799 for (size_t ik=0; ik<idl1; ++ik)
800 {
801 CX2(ik,l).r = CH2(ik,0).r+wal[l].r*CH2(ik,1).r+wal[2*l].r*CH2(ik,2).r;
802 CX2(ik,l).i = CH2(ik,0).i+wal[l].r*CH2(ik,1).i+wal[2*l].r*CH2(ik,2).i;
803 CX2(ik,lc).r=-wal[l].i*CH2(ik,ip-1).i-wal[2*l].i*CH2(ik,ip-2).i;
804 CX2(ik,lc).i=wal[l].i*CH2(ik,ip-1).r+wal[2*l].i*CH2(ik,ip-2).r;
805 }
806
807 size_t iwal=2*l;
808 size_t j=3, jc=ip-3;
809 for (; j<ipph-1; j+=2, jc-=2)
810 {
811 iwal+=l; if (iwal>ip) iwal-=ip;
812 cmplx xwal=wal[iwal];
813 iwal+=l; if (iwal>ip) iwal-=ip;
814 cmplx xwal2=wal[iwal];
815 for (size_t ik=0; ik<idl1; ++ik)
816 {
817 CX2(ik,l).r += CH2(ik,j).r*xwal.r+CH2(ik,j+1).r*xwal2.r;
818 CX2(ik,l).i += CH2(ik,j).i*xwal.r+CH2(ik,j+1).i*xwal2.r;
819 CX2(ik,lc).r -= CH2(ik,jc).i*xwal.i+CH2(ik,jc-1).i*xwal2.i;
820 CX2(ik,lc).i += CH2(ik,jc).r*xwal.i+CH2(ik,jc-1).r*xwal2.i;
821 }
822 }
823 for (; j<ipph; ++j, --jc)
824 {
825 iwal+=l; if (iwal>ip) iwal-=ip;
826 cmplx xwal=wal[iwal];
827 for (size_t ik=0; ik<idl1; ++ik)
828 {
829 CX2(ik,l).r += CH2(ik,j).r*xwal.r;
830 CX2(ik,l).i += CH2(ik,j).i*xwal.r;
831 CX2(ik,lc).r -= CH2(ik,jc).i*xwal.i;
832 CX2(ik,lc).i += CH2(ik,jc).r*xwal.i;
833 }
834 }
835 }
836 DEALLOC(wal);
837
838 // shuffling and twiddling
839 if (ido==1)
840 for (size_t j=1, jc=ip-1; j<ipph; ++j, --jc)
841 for (size_t ik=0; ik<idl1; ++ik)
842 {
843 cmplx t1=CX2(ik,j), t2=CX2(ik,jc);
844 PMC(CX2(ik,j),CX2(ik,jc),t1,t2)
845 }
846 else
847 {
848 for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc)
849 for (size_t k=0; k<l1; ++k)
850 {
851 cmplx t1=CX(0,k,j), t2=CX(0,k,jc);
852 PMC(CX(0,k,j),CX(0,k,jc),t1,t2)
853 for (size_t i=1; i<ido; ++i)
854 {
855 cmplx x1, x2;
856 PMC(x1,x2,CX(i,k,j),CX(i,k,jc))
857 size_t idij=(j-1)*(ido-1)+i-1;
858 MULPMSIGNC (CX(i,k,j),wa[idij],x1)
859 idij=(jc-1)*(ido-1)+i-1;
860 MULPMSIGNC (CX(i,k,jc),wa[idij],x2)
861 }
862 }
863 }
864 return 0;
865 }
866
867 #undef CH2
868 #undef CX2
869 #undef CX
870
pass_all(cfftp_plan plan,cmplx c[],double fct,const int sign)871 NOINLINE WARN_UNUSED_RESULT static int pass_all(cfftp_plan plan, cmplx c[], double fct,
872 const int sign)
873 {
874 if (plan->length==1) return 0;
875 size_t len=plan->length;
876 size_t l1=1, nf=plan->nfct;
877 cmplx *ch = RALLOC(cmplx, len);
878 if (!ch) return -1;
879 cmplx *p1=c, *p2=ch;
880
881 for(size_t k1=0; k1<nf; k1++)
882 {
883 size_t ip=plan->fct[k1].fct;
884 size_t l2=ip*l1;
885 size_t ido = len/l2;
886 if (ip==4)
887 sign>0 ? pass4b (ido, l1, p1, p2, plan->fct[k1].tw)
888 : pass4f (ido, l1, p1, p2, plan->fct[k1].tw);
889 else if(ip==2)
890 sign>0 ? pass2b (ido, l1, p1, p2, plan->fct[k1].tw)
891 : pass2f (ido, l1, p1, p2, plan->fct[k1].tw);
892 else if(ip==3)
893 sign>0 ? pass3b (ido, l1, p1, p2, plan->fct[k1].tw)
894 : pass3f (ido, l1, p1, p2, plan->fct[k1].tw);
895 else if(ip==5)
896 sign>0 ? pass5b (ido, l1, p1, p2, plan->fct[k1].tw)
897 : pass5f (ido, l1, p1, p2, plan->fct[k1].tw);
898 else if(ip==7) pass7 (ido, l1, p1, p2, plan->fct[k1].tw, sign);
899 else if(ip==11) pass11(ido, l1, p1, p2, plan->fct[k1].tw, sign);
900 else
901 {
902 if (passg(ido, ip, l1, p1, p2, plan->fct[k1].tw, plan->fct[k1].tws, sign))
903 { DEALLOC(ch); return -1; }
904 SWAP(p1,p2,cmplx *);
905 }
906 SWAP(p1,p2,cmplx *);
907 l1=l2;
908 }
909 if (p1!=c)
910 {
911 if (fct!=1.)
912 for (size_t i=0; i<len; ++i)
913 {
914 c[i].r = ch[i].r*fct;
915 c[i].i = ch[i].i*fct;
916 }
917 else
918 memcpy (c,p1,len*sizeof(cmplx));
919 }
920 else
921 if (fct!=1.)
922 for (size_t i=0; i<len; ++i)
923 {
924 c[i].r *= fct;
925 c[i].i *= fct;
926 }
927 DEALLOC(ch);
928 return 0;
929 }
930
931 #undef PMSIGNC
932 #undef A_EQ_B_MUL_C
933 #undef A_EQ_CB_MUL_C
934 #undef MULPMSIGNC
935 #undef MULPMSIGNCEQ
936
937 #undef WA
938 #undef CC
939 #undef CH
940 #undef ROT90
941 #undef SCALEC
942 #undef ADDC
943 #undef PMC
944
945 NOINLINE WARN_UNUSED_RESULT
cfftp_forward(cfftp_plan plan,double c[],double fct)946 static int cfftp_forward(cfftp_plan plan, double c[], double fct)
947 { return pass_all(plan,(cmplx *)c, fct, -1); }
948
949 NOINLINE WARN_UNUSED_RESULT
cfftp_backward(cfftp_plan plan,double c[],double fct)950 static int cfftp_backward(cfftp_plan plan, double c[], double fct)
951 { return pass_all(plan,(cmplx *)c, fct, 1); }
952
953 NOINLINE WARN_UNUSED_RESULT
cfftp_factorize(cfftp_plan plan)954 static int cfftp_factorize (cfftp_plan plan)
955 {
956 size_t length=plan->length;
957 size_t nfct=0;
958 while ((length%4)==0)
959 { if (nfct>=NFCT) return -1; plan->fct[nfct++].fct=4; length>>=2; }
960 if ((length%2)==0)
961 {
962 length>>=1;
963 // factor 2 should be at the front of the factor list
964 if (nfct>=NFCT) return -1;
965 plan->fct[nfct++].fct=2;
966 SWAP(plan->fct[0].fct, plan->fct[nfct-1].fct,size_t);
967 }
968 size_t maxl=(size_t)(sqrt((double)length))+1;
969 for (size_t divisor=3; (length>1)&&(divisor<maxl); divisor+=2)
970 if ((length%divisor)==0)
971 {
972 while ((length%divisor)==0)
973 {
974 if (nfct>=NFCT) return -1;
975 plan->fct[nfct++].fct=divisor;
976 length/=divisor;
977 }
978 maxl=(size_t)(sqrt((double)length))+1;
979 }
980 if (length>1) plan->fct[nfct++].fct=length;
981 plan->nfct=nfct;
982 return 0;
983 }
984
cfftp_twsize(cfftp_plan plan)985 NOINLINE static size_t cfftp_twsize (cfftp_plan plan)
986 {
987 size_t twsize=0, l1=1;
988 for (size_t k=0; k<plan->nfct; ++k)
989 {
990 size_t ip=plan->fct[k].fct, ido= plan->length/(l1*ip);
991 twsize+=(ip-1)*(ido-1);
992 if (ip>11)
993 twsize+=ip;
994 l1*=ip;
995 }
996 return twsize;
997 }
998
cfftp_comp_twiddle(cfftp_plan plan)999 NOINLINE WARN_UNUSED_RESULT static int cfftp_comp_twiddle (cfftp_plan plan)
1000 {
1001 size_t length=plan->length;
1002 double *twid = RALLOC(double, 2*length);
1003 if (!twid) return -1;
1004 sincos_2pibyn(length, twid);
1005 size_t l1=1;
1006 size_t memofs=0;
1007 for (size_t k=0; k<plan->nfct; ++k)
1008 {
1009 size_t ip=plan->fct[k].fct, ido= length/(l1*ip);
1010 plan->fct[k].tw=plan->mem+memofs;
1011 memofs+=(ip-1)*(ido-1);
1012 for (size_t j=1; j<ip; ++j)
1013 for (size_t i=1; i<ido; ++i)
1014 {
1015 plan->fct[k].tw[(j-1)*(ido-1)+i-1].r = twid[2*j*l1*i];
1016 plan->fct[k].tw[(j-1)*(ido-1)+i-1].i = twid[2*j*l1*i+1];
1017 }
1018 if (ip>11)
1019 {
1020 plan->fct[k].tws=plan->mem+memofs;
1021 memofs+=ip;
1022 for (size_t j=0; j<ip; ++j)
1023 {
1024 plan->fct[k].tws[j].r = twid[2*j*l1*ido];
1025 plan->fct[k].tws[j].i = twid[2*j*l1*ido+1];
1026 }
1027 }
1028 l1*=ip;
1029 }
1030 DEALLOC(twid);
1031 return 0;
1032 }
1033
make_cfftp_plan(size_t length)1034 static cfftp_plan make_cfftp_plan (size_t length)
1035 {
1036 if (length==0) return NULL;
1037 cfftp_plan plan = RALLOC(cfftp_plan_i,1);
1038 if (!plan) return NULL;
1039 plan->length=length;
1040 plan->nfct=0;
1041 for (size_t i=0; i<NFCT; ++i)
1042 plan->fct[i]=(cfftp_fctdata){0,0,0};
1043 plan->mem=0;
1044 if (length==1) return plan;
1045 if (cfftp_factorize(plan)!=0) { DEALLOC(plan); return NULL; }
1046 size_t tws=cfftp_twsize(plan);
1047 plan->mem=RALLOC(cmplx,tws);
1048 if (!plan->mem) { DEALLOC(plan); return NULL; }
1049 if (cfftp_comp_twiddle(plan)!=0)
1050 { DEALLOC(plan->mem); DEALLOC(plan); return NULL; }
1051 return plan;
1052 }
1053
destroy_cfftp_plan(cfftp_plan plan)1054 static void destroy_cfftp_plan (cfftp_plan plan)
1055 {
1056 DEALLOC(plan->mem);
1057 DEALLOC(plan);
1058 }
1059
1060 typedef struct rfftp_fctdata
1061 {
1062 size_t fct;
1063 double *tw, *tws;
1064 } rfftp_fctdata;
1065
1066 typedef struct rfftp_plan_i
1067 {
1068 size_t length, nfct;
1069 double *mem;
1070 rfftp_fctdata fct[NFCT];
1071 } rfftp_plan_i;
1072 typedef struct rfftp_plan_i * rfftp_plan;
1073
1074 #define WA(x,i) wa[(i)+(x)*(ido-1)]
1075 #define PM(a,b,c,d) { a=c+d; b=c-d; }
1076 /* (a+ib) = conj(c+id) * (e+if) */
1077 #define MULPM(a,b,c,d,e,f) { a=c*e+d*f; b=c*f-d*e; }
1078
1079 #define CC(a,b,c) cc[(a)+ido*((b)+l1*(c))]
1080 #define CH(a,b,c) ch[(a)+ido*((b)+cdim*(c))]
1081
radf2(size_t ido,size_t l1,const double * restrict cc,double * restrict ch,const double * restrict wa)1082 NOINLINE static void radf2 (size_t ido, size_t l1, const double * restrict cc,
1083 double * restrict ch, const double * restrict wa)
1084 {
1085 const size_t cdim=2;
1086
1087 for (size_t k=0; k<l1; k++)
1088 PM (CH(0,0,k),CH(ido-1,1,k),CC(0,k,0),CC(0,k,1))
1089 if ((ido&1)==0)
1090 for (size_t k=0; k<l1; k++)
1091 {
1092 CH( 0,1,k) = -CC(ido-1,k,1);
1093 CH(ido-1,0,k) = CC(ido-1,k,0);
1094 }
1095 if (ido<=2) return;
1096 for (size_t k=0; k<l1; k++)
1097 for (size_t i=2; i<ido; i+=2)
1098 {
1099 size_t ic=ido-i;
1100 double tr2, ti2;
1101 MULPM (tr2,ti2,WA(0,i-2),WA(0,i-1),CC(i-1,k,1),CC(i,k,1))
1102 PM (CH(i-1,0,k),CH(ic-1,1,k),CC(i-1,k,0),tr2)
1103 PM (CH(i ,0,k),CH(ic ,1,k),ti2,CC(i ,k,0))
1104 }
1105 }
1106
radf3(size_t ido,size_t l1,const double * restrict cc,double * restrict ch,const double * restrict wa)1107 NOINLINE static void radf3(size_t ido, size_t l1, const double * restrict cc,
1108 double * restrict ch, const double * restrict wa)
1109 {
1110 const size_t cdim=3;
1111 static const double taur=-0.5, taui=0.86602540378443864676;
1112
1113 for (size_t k=0; k<l1; k++)
1114 {
1115 double cr2=CC(0,k,1)+CC(0,k,2);
1116 CH(0,0,k) = CC(0,k,0)+cr2;
1117 CH(0,2,k) = taui*(CC(0,k,2)-CC(0,k,1));
1118 CH(ido-1,1,k) = CC(0,k,0)+taur*cr2;
1119 }
1120 if (ido==1) return;
1121 for (size_t k=0; k<l1; k++)
1122 for (size_t i=2; i<ido; i+=2)
1123 {
1124 size_t ic=ido-i;
1125 double di2, di3, dr2, dr3;
1126 MULPM (dr2,di2,WA(0,i-2),WA(0,i-1),CC(i-1,k,1),CC(i,k,1)) // d2=conj(WA0)*CC1
1127 MULPM (dr3,di3,WA(1,i-2),WA(1,i-1),CC(i-1,k,2),CC(i,k,2)) // d3=conj(WA1)*CC2
1128 double cr2=dr2+dr3; // c add
1129 double ci2=di2+di3;
1130 CH(i-1,0,k) = CC(i-1,k,0)+cr2; // c add
1131 CH(i ,0,k) = CC(i ,k,0)+ci2;
1132 double tr2 = CC(i-1,k,0)+taur*cr2; // c add
1133 double ti2 = CC(i ,k,0)+taur*ci2;
1134 double tr3 = taui*(di2-di3); // t3 = taui*i*(d3-d2)?
1135 double ti3 = taui*(dr3-dr2);
1136 PM(CH(i-1,2,k),CH(ic-1,1,k),tr2,tr3) // PM(i) = t2+t3
1137 PM(CH(i ,2,k),CH(ic ,1,k),ti3,ti2) // PM(ic) = conj(t2-t3)
1138 }
1139 }
1140
radf4(size_t ido,size_t l1,const double * restrict cc,double * restrict ch,const double * restrict wa)1141 NOINLINE static void radf4(size_t ido, size_t l1, const double * restrict cc,
1142 double * restrict ch, const double * restrict wa)
1143 {
1144 const size_t cdim=4;
1145 static const double hsqt2=0.70710678118654752440;
1146
1147 for (size_t k=0; k<l1; k++)
1148 {
1149 double tr1,tr2;
1150 PM (tr1,CH(0,2,k),CC(0,k,3),CC(0,k,1))
1151 PM (tr2,CH(ido-1,1,k),CC(0,k,0),CC(0,k,2))
1152 PM (CH(0,0,k),CH(ido-1,3,k),tr2,tr1)
1153 }
1154 if ((ido&1)==0)
1155 for (size_t k=0; k<l1; k++)
1156 {
1157 double ti1=-hsqt2*(CC(ido-1,k,1)+CC(ido-1,k,3));
1158 double tr1= hsqt2*(CC(ido-1,k,1)-CC(ido-1,k,3));
1159 PM (CH(ido-1,0,k),CH(ido-1,2,k),CC(ido-1,k,0),tr1)
1160 PM (CH( 0,3,k),CH( 0,1,k),ti1,CC(ido-1,k,2))
1161 }
1162 if (ido<=2) return;
1163 for (size_t k=0; k<l1; k++)
1164 for (size_t i=2; i<ido; i+=2)
1165 {
1166 size_t ic=ido-i;
1167 double ci2, ci3, ci4, cr2, cr3, cr4, ti1, ti2, ti3, ti4, tr1, tr2, tr3, tr4;
1168 MULPM(cr2,ci2,WA(0,i-2),WA(0,i-1),CC(i-1,k,1),CC(i,k,1))
1169 MULPM(cr3,ci3,WA(1,i-2),WA(1,i-1),CC(i-1,k,2),CC(i,k,2))
1170 MULPM(cr4,ci4,WA(2,i-2),WA(2,i-1),CC(i-1,k,3),CC(i,k,3))
1171 PM(tr1,tr4,cr4,cr2)
1172 PM(ti1,ti4,ci2,ci4)
1173 PM(tr2,tr3,CC(i-1,k,0),cr3)
1174 PM(ti2,ti3,CC(i ,k,0),ci3)
1175 PM(CH(i-1,0,k),CH(ic-1,3,k),tr2,tr1)
1176 PM(CH(i ,0,k),CH(ic ,3,k),ti1,ti2)
1177 PM(CH(i-1,2,k),CH(ic-1,1,k),tr3,ti4)
1178 PM(CH(i ,2,k),CH(ic ,1,k),tr4,ti3)
1179 }
1180 }
1181
radf5(size_t ido,size_t l1,const double * restrict cc,double * restrict ch,const double * restrict wa)1182 NOINLINE static void radf5(size_t ido, size_t l1, const double * restrict cc,
1183 double * restrict ch, const double * restrict wa)
1184 {
1185 const size_t cdim=5;
1186 static const double tr11= 0.3090169943749474241, ti11=0.95105651629515357212,
1187 tr12=-0.8090169943749474241, ti12=0.58778525229247312917;
1188
1189 for (size_t k=0; k<l1; k++)
1190 {
1191 double cr2, cr3, ci4, ci5;
1192 PM (cr2,ci5,CC(0,k,4),CC(0,k,1))
1193 PM (cr3,ci4,CC(0,k,3),CC(0,k,2))
1194 CH(0,0,k)=CC(0,k,0)+cr2+cr3;
1195 CH(ido-1,1,k)=CC(0,k,0)+tr11*cr2+tr12*cr3;
1196 CH(0,2,k)=ti11*ci5+ti12*ci4;
1197 CH(ido-1,3,k)=CC(0,k,0)+tr12*cr2+tr11*cr3;
1198 CH(0,4,k)=ti12*ci5-ti11*ci4;
1199 }
1200 if (ido==1) return;
1201 for (size_t k=0; k<l1;++k)
1202 for (size_t i=2; i<ido; i+=2)
1203 {
1204 double ci2, di2, ci4, ci5, di3, di4, di5, ci3, cr2, cr3, dr2, dr3,
1205 dr4, dr5, cr5, cr4, ti2, ti3, ti5, ti4, tr2, tr3, tr4, tr5;
1206 size_t ic=ido-i;
1207 MULPM (dr2,di2,WA(0,i-2),WA(0,i-1),CC(i-1,k,1),CC(i,k,1))
1208 MULPM (dr3,di3,WA(1,i-2),WA(1,i-1),CC(i-1,k,2),CC(i,k,2))
1209 MULPM (dr4,di4,WA(2,i-2),WA(2,i-1),CC(i-1,k,3),CC(i,k,3))
1210 MULPM (dr5,di5,WA(3,i-2),WA(3,i-1),CC(i-1,k,4),CC(i,k,4))
1211 PM(cr2,ci5,dr5,dr2)
1212 PM(ci2,cr5,di2,di5)
1213 PM(cr3,ci4,dr4,dr3)
1214 PM(ci3,cr4,di3,di4)
1215 CH(i-1,0,k)=CC(i-1,k,0)+cr2+cr3;
1216 CH(i ,0,k)=CC(i ,k,0)+ci2+ci3;
1217 tr2=CC(i-1,k,0)+tr11*cr2+tr12*cr3;
1218 ti2=CC(i ,k,0)+tr11*ci2+tr12*ci3;
1219 tr3=CC(i-1,k,0)+tr12*cr2+tr11*cr3;
1220 ti3=CC(i ,k,0)+tr12*ci2+tr11*ci3;
1221 MULPM(tr5,tr4,cr5,cr4,ti11,ti12)
1222 MULPM(ti5,ti4,ci5,ci4,ti11,ti12)
1223 PM(CH(i-1,2,k),CH(ic-1,1,k),tr2,tr5)
1224 PM(CH(i ,2,k),CH(ic ,1,k),ti5,ti2)
1225 PM(CH(i-1,4,k),CH(ic-1,3,k),tr3,tr4)
1226 PM(CH(i ,4,k),CH(ic ,3,k),ti4,ti3)
1227 }
1228 }
1229
1230 #undef CC
1231 #undef CH
1232 #define C1(a,b,c) cc[(a)+ido*((b)+l1*(c))]
1233 #define C2(a,b) cc[(a)+idl1*(b)]
1234 #define CH2(a,b) ch[(a)+idl1*(b)]
1235 #define CC(a,b,c) cc[(a)+ido*((b)+cdim*(c))]
1236 #define CH(a,b,c) ch[(a)+ido*((b)+l1*(c))]
radfg(size_t ido,size_t ip,size_t l1,double * restrict cc,double * restrict ch,const double * restrict wa,const double * restrict csarr)1237 NOINLINE static void radfg(size_t ido, size_t ip, size_t l1,
1238 double * restrict cc, double * restrict ch, const double * restrict wa,
1239 const double * restrict csarr)
1240 {
1241 const size_t cdim=ip;
1242 size_t ipph=(ip+1)/2;
1243 size_t idl1 = ido*l1;
1244
1245 if (ido>1)
1246 {
1247 for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc) // 114
1248 {
1249 size_t is=(j-1)*(ido-1),
1250 is2=(jc-1)*(ido-1);
1251 for (size_t k=0; k<l1; ++k) // 113
1252 {
1253 size_t idij=is;
1254 size_t idij2=is2;
1255 for (size_t i=1; i<=ido-2; i+=2) // 112
1256 {
1257 double t1=C1(i,k,j ), t2=C1(i+1,k,j ),
1258 t3=C1(i,k,jc), t4=C1(i+1,k,jc);
1259 double x1=wa[idij]*t1 + wa[idij+1]*t2,
1260 x2=wa[idij]*t2 - wa[idij+1]*t1,
1261 x3=wa[idij2]*t3 + wa[idij2+1]*t4,
1262 x4=wa[idij2]*t4 - wa[idij2+1]*t3;
1263 C1(i ,k,j ) = x1+x3;
1264 C1(i ,k,jc) = x2-x4;
1265 C1(i+1,k,j ) = x2+x4;
1266 C1(i+1,k,jc) = x3-x1;
1267 idij+=2;
1268 idij2+=2;
1269 }
1270 }
1271 }
1272 }
1273
1274 for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc) // 123
1275 for (size_t k=0; k<l1; ++k) // 122
1276 {
1277 double t1=C1(0,k,j), t2=C1(0,k,jc);
1278 C1(0,k,j ) = t1+t2;
1279 C1(0,k,jc) = t2-t1;
1280 }
1281
1282 //everything in C
1283 //memset(ch,0,ip*l1*ido*sizeof(double));
1284
1285 for (size_t l=1,lc=ip-1; l<ipph; ++l,--lc) // 127
1286 {
1287 for (size_t ik=0; ik<idl1; ++ik) // 124
1288 {
1289 CH2(ik,l ) = C2(ik,0)+csarr[2*l]*C2(ik,1)+csarr[4*l]*C2(ik,2);
1290 CH2(ik,lc) = csarr[2*l+1]*C2(ik,ip-1)+csarr[4*l+1]*C2(ik,ip-2);
1291 }
1292 size_t iang = 2*l;
1293 size_t j=3, jc=ip-3;
1294 for (; j<ipph-3; j+=4,jc-=4) // 126
1295 {
1296 iang+=l; if (iang>=ip) iang-=ip;
1297 double ar1=csarr[2*iang], ai1=csarr[2*iang+1];
1298 iang+=l; if (iang>=ip) iang-=ip;
1299 double ar2=csarr[2*iang], ai2=csarr[2*iang+1];
1300 iang+=l; if (iang>=ip) iang-=ip;
1301 double ar3=csarr[2*iang], ai3=csarr[2*iang+1];
1302 iang+=l; if (iang>=ip) iang-=ip;
1303 double ar4=csarr[2*iang], ai4=csarr[2*iang+1];
1304 for (size_t ik=0; ik<idl1; ++ik) // 125
1305 {
1306 CH2(ik,l ) += ar1*C2(ik,j )+ar2*C2(ik,j +1)
1307 +ar3*C2(ik,j +2)+ar4*C2(ik,j +3);
1308 CH2(ik,lc) += ai1*C2(ik,jc)+ai2*C2(ik,jc-1)
1309 +ai3*C2(ik,jc-2)+ai4*C2(ik,jc-3);
1310 }
1311 }
1312 for (; j<ipph-1; j+=2,jc-=2) // 126
1313 {
1314 iang+=l; if (iang>=ip) iang-=ip;
1315 double ar1=csarr[2*iang], ai1=csarr[2*iang+1];
1316 iang+=l; if (iang>=ip) iang-=ip;
1317 double ar2=csarr[2*iang], ai2=csarr[2*iang+1];
1318 for (size_t ik=0; ik<idl1; ++ik) // 125
1319 {
1320 CH2(ik,l ) += ar1*C2(ik,j )+ar2*C2(ik,j +1);
1321 CH2(ik,lc) += ai1*C2(ik,jc)+ai2*C2(ik,jc-1);
1322 }
1323 }
1324 for (; j<ipph; ++j,--jc) // 126
1325 {
1326 iang+=l; if (iang>=ip) iang-=ip;
1327 double ar=csarr[2*iang], ai=csarr[2*iang+1];
1328 for (size_t ik=0; ik<idl1; ++ik) // 125
1329 {
1330 CH2(ik,l ) += ar*C2(ik,j );
1331 CH2(ik,lc) += ai*C2(ik,jc);
1332 }
1333 }
1334 }
1335 for (size_t ik=0; ik<idl1; ++ik) // 101
1336 CH2(ik,0) = C2(ik,0);
1337 for (size_t j=1; j<ipph; ++j) // 129
1338 for (size_t ik=0; ik<idl1; ++ik) // 128
1339 CH2(ik,0) += C2(ik,j);
1340
1341 // everything in CH at this point!
1342 //memset(cc,0,ip*l1*ido*sizeof(double));
1343
1344 for (size_t k=0; k<l1; ++k) // 131
1345 for (size_t i=0; i<ido; ++i) // 130
1346 CC(i,0,k) = CH(i,k,0);
1347
1348 for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc) // 137
1349 {
1350 size_t j2=2*j-1;
1351 for (size_t k=0; k<l1; ++k) // 136
1352 {
1353 CC(ido-1,j2,k) = CH(0,k,j);
1354 CC(0,j2+1,k) = CH(0,k,jc);
1355 }
1356 }
1357
1358 if (ido==1) return;
1359
1360 for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc) // 140
1361 {
1362 size_t j2=2*j-1;
1363 for(size_t k=0; k<l1; ++k) // 139
1364 for(size_t i=1, ic=ido-i-2; i<=ido-2; i+=2, ic-=2) // 138
1365 {
1366 CC(i ,j2+1,k) = CH(i ,k,j )+CH(i ,k,jc);
1367 CC(ic ,j2 ,k) = CH(i ,k,j )-CH(i ,k,jc);
1368 CC(i+1 ,j2+1,k) = CH(i+1,k,j )+CH(i+1,k,jc);
1369 CC(ic+1,j2 ,k) = CH(i+1,k,jc)-CH(i+1,k,j );
1370 }
1371 }
1372 }
1373 #undef C1
1374 #undef C2
1375 #undef CH2
1376
1377 #undef CH
1378 #undef CC
1379 #define CH(a,b,c) ch[(a)+ido*((b)+l1*(c))]
1380 #define CC(a,b,c) cc[(a)+ido*((b)+cdim*(c))]
1381
radb2(size_t ido,size_t l1,const double * restrict cc,double * restrict ch,const double * restrict wa)1382 NOINLINE static void radb2(size_t ido, size_t l1, const double * restrict cc,
1383 double * restrict ch, const double * restrict wa)
1384 {
1385 const size_t cdim=2;
1386
1387 for (size_t k=0; k<l1; k++)
1388 PM (CH(0,k,0),CH(0,k,1),CC(0,0,k),CC(ido-1,1,k))
1389 if ((ido&1)==0)
1390 for (size_t k=0; k<l1; k++)
1391 {
1392 CH(ido-1,k,0) = 2.*CC(ido-1,0,k);
1393 CH(ido-1,k,1) =-2.*CC(0 ,1,k);
1394 }
1395 if (ido<=2) return;
1396 for (size_t k=0; k<l1;++k)
1397 for (size_t i=2; i<ido; i+=2)
1398 {
1399 size_t ic=ido-i;
1400 double ti2, tr2;
1401 PM (CH(i-1,k,0),tr2,CC(i-1,0,k),CC(ic-1,1,k))
1402 PM (ti2,CH(i ,k,0),CC(i ,0,k),CC(ic ,1,k))
1403 MULPM (CH(i,k,1),CH(i-1,k,1),WA(0,i-2),WA(0,i-1),ti2,tr2)
1404 }
1405 }
1406
radb3(size_t ido,size_t l1,const double * restrict cc,double * restrict ch,const double * restrict wa)1407 NOINLINE static void radb3(size_t ido, size_t l1, const double * restrict cc,
1408 double * restrict ch, const double * restrict wa)
1409 {
1410 const size_t cdim=3;
1411 static const double taur=-0.5, taui=0.86602540378443864676;
1412
1413 for (size_t k=0; k<l1; k++)
1414 {
1415 double tr2=2.*CC(ido-1,1,k);
1416 double cr2=CC(0,0,k)+taur*tr2;
1417 CH(0,k,0)=CC(0,0,k)+tr2;
1418 double ci3=2.*taui*CC(0,2,k);
1419 PM (CH(0,k,2),CH(0,k,1),cr2,ci3);
1420 }
1421 if (ido==1) return;
1422 for (size_t k=0; k<l1; k++)
1423 for (size_t i=2; i<ido; i+=2)
1424 {
1425 size_t ic=ido-i;
1426 double tr2=CC(i-1,2,k)+CC(ic-1,1,k); // t2=CC(I) + conj(CC(ic))
1427 double ti2=CC(i ,2,k)-CC(ic ,1,k);
1428 double cr2=CC(i-1,0,k)+taur*tr2; // c2=CC +taur*t2
1429 double ci2=CC(i ,0,k)+taur*ti2;
1430 CH(i-1,k,0)=CC(i-1,0,k)+tr2; // CH=CC+t2
1431 CH(i ,k,0)=CC(i ,0,k)+ti2;
1432 double cr3=taui*(CC(i-1,2,k)-CC(ic-1,1,k));// c3=taui*(CC(i)-conj(CC(ic)))
1433 double ci3=taui*(CC(i ,2,k)+CC(ic ,1,k));
1434 double di2, di3, dr2, dr3;
1435 PM(dr3,dr2,cr2,ci3) // d2= (cr2-ci3, ci2+cr3) = c2+i*c3
1436 PM(di2,di3,ci2,cr3) // d3= (cr2+ci3, ci2-cr3) = c2-i*c3
1437 MULPM(CH(i,k,1),CH(i-1,k,1),WA(0,i-2),WA(0,i-1),di2,dr2) // ch = WA*d2
1438 MULPM(CH(i,k,2),CH(i-1,k,2),WA(1,i-2),WA(1,i-1),di3,dr3)
1439 }
1440 }
1441
radb4(size_t ido,size_t l1,const double * restrict cc,double * restrict ch,const double * restrict wa)1442 NOINLINE static void radb4(size_t ido, size_t l1, const double * restrict cc,
1443 double * restrict ch, const double * restrict wa)
1444 {
1445 const size_t cdim=4;
1446 static const double sqrt2=1.41421356237309504880;
1447
1448 for (size_t k=0; k<l1; k++)
1449 {
1450 double tr1, tr2;
1451 PM (tr2,tr1,CC(0,0,k),CC(ido-1,3,k))
1452 double tr3=2.*CC(ido-1,1,k);
1453 double tr4=2.*CC(0,2,k);
1454 PM (CH(0,k,0),CH(0,k,2),tr2,tr3)
1455 PM (CH(0,k,3),CH(0,k,1),tr1,tr4)
1456 }
1457 if ((ido&1)==0)
1458 for (size_t k=0; k<l1; k++)
1459 {
1460 double tr1,tr2,ti1,ti2;
1461 PM (ti1,ti2,CC(0 ,3,k),CC(0 ,1,k))
1462 PM (tr2,tr1,CC(ido-1,0,k),CC(ido-1,2,k))
1463 CH(ido-1,k,0)=tr2+tr2;
1464 CH(ido-1,k,1)=sqrt2*(tr1-ti1);
1465 CH(ido-1,k,2)=ti2+ti2;
1466 CH(ido-1,k,3)=-sqrt2*(tr1+ti1);
1467 }
1468 if (ido<=2) return;
1469 for (size_t k=0; k<l1;++k)
1470 for (size_t i=2; i<ido; i+=2)
1471 {
1472 double ci2, ci3, ci4, cr2, cr3, cr4, ti1, ti2, ti3, ti4, tr1, tr2, tr3, tr4;
1473 size_t ic=ido-i;
1474 PM (tr2,tr1,CC(i-1,0,k),CC(ic-1,3,k))
1475 PM (ti1,ti2,CC(i ,0,k),CC(ic ,3,k))
1476 PM (tr4,ti3,CC(i ,2,k),CC(ic ,1,k))
1477 PM (tr3,ti4,CC(i-1,2,k),CC(ic-1,1,k))
1478 PM (CH(i-1,k,0),cr3,tr2,tr3)
1479 PM (CH(i ,k,0),ci3,ti2,ti3)
1480 PM (cr4,cr2,tr1,tr4)
1481 PM (ci2,ci4,ti1,ti4)
1482 MULPM (CH(i,k,1),CH(i-1,k,1),WA(0,i-2),WA(0,i-1),ci2,cr2)
1483 MULPM (CH(i,k,2),CH(i-1,k,2),WA(1,i-2),WA(1,i-1),ci3,cr3)
1484 MULPM (CH(i,k,3),CH(i-1,k,3),WA(2,i-2),WA(2,i-1),ci4,cr4)
1485 }
1486 }
1487
radb5(size_t ido,size_t l1,const double * restrict cc,double * restrict ch,const double * restrict wa)1488 NOINLINE static void radb5(size_t ido, size_t l1, const double * restrict cc,
1489 double * restrict ch, const double * restrict wa)
1490 {
1491 const size_t cdim=5;
1492 static const double tr11= 0.3090169943749474241, ti11=0.95105651629515357212,
1493 tr12=-0.8090169943749474241, ti12=0.58778525229247312917;
1494
1495 for (size_t k=0; k<l1; k++)
1496 {
1497 double ti5=CC(0,2,k)+CC(0,2,k);
1498 double ti4=CC(0,4,k)+CC(0,4,k);
1499 double tr2=CC(ido-1,1,k)+CC(ido-1,1,k);
1500 double tr3=CC(ido-1,3,k)+CC(ido-1,3,k);
1501 CH(0,k,0)=CC(0,0,k)+tr2+tr3;
1502 double cr2=CC(0,0,k)+tr11*tr2+tr12*tr3;
1503 double cr3=CC(0,0,k)+tr12*tr2+tr11*tr3;
1504 double ci4, ci5;
1505 MULPM(ci5,ci4,ti5,ti4,ti11,ti12)
1506 PM(CH(0,k,4),CH(0,k,1),cr2,ci5)
1507 PM(CH(0,k,3),CH(0,k,2),cr3,ci4)
1508 }
1509 if (ido==1) return;
1510 for (size_t k=0; k<l1;++k)
1511 for (size_t i=2; i<ido; i+=2)
1512 {
1513 size_t ic=ido-i;
1514 double tr2, tr3, tr4, tr5, ti2, ti3, ti4, ti5;
1515 PM(tr2,tr5,CC(i-1,2,k),CC(ic-1,1,k))
1516 PM(ti5,ti2,CC(i ,2,k),CC(ic ,1,k))
1517 PM(tr3,tr4,CC(i-1,4,k),CC(ic-1,3,k))
1518 PM(ti4,ti3,CC(i ,4,k),CC(ic ,3,k))
1519 CH(i-1,k,0)=CC(i-1,0,k)+tr2+tr3;
1520 CH(i ,k,0)=CC(i ,0,k)+ti2+ti3;
1521 double cr2=CC(i-1,0,k)+tr11*tr2+tr12*tr3;
1522 double ci2=CC(i ,0,k)+tr11*ti2+tr12*ti3;
1523 double cr3=CC(i-1,0,k)+tr12*tr2+tr11*tr3;
1524 double ci3=CC(i ,0,k)+tr12*ti2+tr11*ti3;
1525 double ci4, ci5, cr5, cr4;
1526 MULPM(cr5,cr4,tr5,tr4,ti11,ti12)
1527 MULPM(ci5,ci4,ti5,ti4,ti11,ti12)
1528 double dr2, dr3, dr4, dr5, di2, di3, di4, di5;
1529 PM(dr4,dr3,cr3,ci4)
1530 PM(di3,di4,ci3,cr4)
1531 PM(dr5,dr2,cr2,ci5)
1532 PM(di2,di5,ci2,cr5)
1533 MULPM(CH(i,k,1),CH(i-1,k,1),WA(0,i-2),WA(0,i-1),di2,dr2)
1534 MULPM(CH(i,k,2),CH(i-1,k,2),WA(1,i-2),WA(1,i-1),di3,dr3)
1535 MULPM(CH(i,k,3),CH(i-1,k,3),WA(2,i-2),WA(2,i-1),di4,dr4)
1536 MULPM(CH(i,k,4),CH(i-1,k,4),WA(3,i-2),WA(3,i-1),di5,dr5)
1537 }
1538 }
1539
1540 #undef CC
1541 #undef CH
1542 #define CC(a,b,c) cc[(a)+ido*((b)+cdim*(c))]
1543 #define CH(a,b,c) ch[(a)+ido*((b)+l1*(c))]
1544 #define C1(a,b,c) cc[(a)+ido*((b)+l1*(c))]
1545 #define C2(a,b) cc[(a)+idl1*(b)]
1546 #define CH2(a,b) ch[(a)+idl1*(b)]
1547
radbg(size_t ido,size_t ip,size_t l1,double * restrict cc,double * restrict ch,const double * restrict wa,const double * restrict csarr)1548 NOINLINE static void radbg(size_t ido, size_t ip, size_t l1,
1549 double * restrict cc, double * restrict ch, const double * restrict wa,
1550 const double * restrict csarr)
1551 {
1552 const size_t cdim=ip;
1553 size_t ipph=(ip+1)/ 2;
1554 size_t idl1 = ido*l1;
1555
1556 for (size_t k=0; k<l1; ++k) // 102
1557 for (size_t i=0; i<ido; ++i) // 101
1558 CH(i,k,0) = CC(i,0,k);
1559 for (size_t j=1, jc=ip-1; j<ipph; ++j, --jc) // 108
1560 {
1561 size_t j2=2*j-1;
1562 for (size_t k=0; k<l1; ++k)
1563 {
1564 CH(0,k,j ) = 2*CC(ido-1,j2,k);
1565 CH(0,k,jc) = 2*CC(0,j2+1,k);
1566 }
1567 }
1568
1569 if (ido!=1)
1570 {
1571 for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc) // 111
1572 {
1573 size_t j2=2*j-1;
1574 for (size_t k=0; k<l1; ++k)
1575 for (size_t i=1, ic=ido-i-2; i<=ido-2; i+=2, ic-=2) // 109
1576 {
1577 CH(i ,k,j ) = CC(i ,j2+1,k)+CC(ic ,j2,k);
1578 CH(i ,k,jc) = CC(i ,j2+1,k)-CC(ic ,j2,k);
1579 CH(i+1,k,j ) = CC(i+1,j2+1,k)-CC(ic+1,j2,k);
1580 CH(i+1,k,jc) = CC(i+1,j2+1,k)+CC(ic+1,j2,k);
1581 }
1582 }
1583 }
1584 for (size_t l=1,lc=ip-1; l<ipph; ++l,--lc)
1585 {
1586 for (size_t ik=0; ik<idl1; ++ik)
1587 {
1588 C2(ik,l ) = CH2(ik,0)+csarr[2*l]*CH2(ik,1)+csarr[4*l]*CH2(ik,2);
1589 C2(ik,lc) = csarr[2*l+1]*CH2(ik,ip-1)+csarr[4*l+1]*CH2(ik,ip-2);
1590 }
1591 size_t iang=2*l;
1592 size_t j=3,jc=ip-3;
1593 for(; j<ipph-3; j+=4,jc-=4)
1594 {
1595 iang+=l; if(iang>ip) iang-=ip;
1596 double ar1=csarr[2*iang], ai1=csarr[2*iang+1];
1597 iang+=l; if(iang>ip) iang-=ip;
1598 double ar2=csarr[2*iang], ai2=csarr[2*iang+1];
1599 iang+=l; if(iang>ip) iang-=ip;
1600 double ar3=csarr[2*iang], ai3=csarr[2*iang+1];
1601 iang+=l; if(iang>ip) iang-=ip;
1602 double ar4=csarr[2*iang], ai4=csarr[2*iang+1];
1603 for (size_t ik=0; ik<idl1; ++ik)
1604 {
1605 C2(ik,l ) += ar1*CH2(ik,j )+ar2*CH2(ik,j +1)
1606 +ar3*CH2(ik,j +2)+ar4*CH2(ik,j +3);
1607 C2(ik,lc) += ai1*CH2(ik,jc)+ai2*CH2(ik,jc-1)
1608 +ai3*CH2(ik,jc-2)+ai4*CH2(ik,jc-3);
1609 }
1610 }
1611 for(; j<ipph-1; j+=2,jc-=2)
1612 {
1613 iang+=l; if(iang>ip) iang-=ip;
1614 double ar1=csarr[2*iang], ai1=csarr[2*iang+1];
1615 iang+=l; if(iang>ip) iang-=ip;
1616 double ar2=csarr[2*iang], ai2=csarr[2*iang+1];
1617 for (size_t ik=0; ik<idl1; ++ik)
1618 {
1619 C2(ik,l ) += ar1*CH2(ik,j )+ar2*CH2(ik,j +1);
1620 C2(ik,lc) += ai1*CH2(ik,jc)+ai2*CH2(ik,jc-1);
1621 }
1622 }
1623 for(; j<ipph; ++j,--jc)
1624 {
1625 iang+=l; if(iang>ip) iang-=ip;
1626 double war=csarr[2*iang], wai=csarr[2*iang+1];
1627 for (size_t ik=0; ik<idl1; ++ik)
1628 {
1629 C2(ik,l ) += war*CH2(ik,j );
1630 C2(ik,lc) += wai*CH2(ik,jc);
1631 }
1632 }
1633 }
1634 for (size_t j=1; j<ipph; ++j)
1635 for (size_t ik=0; ik<idl1; ++ik)
1636 CH2(ik,0) += CH2(ik,j);
1637 for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc) // 124
1638 for (size_t k=0; k<l1; ++k)
1639 {
1640 CH(0,k,j ) = C1(0,k,j)-C1(0,k,jc);
1641 CH(0,k,jc) = C1(0,k,j)+C1(0,k,jc);
1642 }
1643
1644 if (ido==1) return;
1645
1646 for (size_t j=1, jc=ip-1; j<ipph; ++j, --jc) // 127
1647 for (size_t k=0; k<l1; ++k)
1648 for (size_t i=1; i<=ido-2; i+=2)
1649 {
1650 CH(i ,k,j ) = C1(i ,k,j)-C1(i+1,k,jc);
1651 CH(i ,k,jc) = C1(i ,k,j)+C1(i+1,k,jc);
1652 CH(i+1,k,j ) = C1(i+1,k,j)+C1(i ,k,jc);
1653 CH(i+1,k,jc) = C1(i+1,k,j)-C1(i ,k,jc);
1654 }
1655
1656 // All in CH
1657
1658 for (size_t j=1; j<ip; ++j)
1659 {
1660 size_t is = (j-1)*(ido-1);
1661 for (size_t k=0; k<l1; ++k)
1662 {
1663 size_t idij = is;
1664 for (size_t i=1; i<=ido-2; i+=2)
1665 {
1666 double t1=CH(i,k,j), t2=CH(i+1,k,j);
1667 CH(i ,k,j) = wa[idij]*t1-wa[idij+1]*t2;
1668 CH(i+1,k,j) = wa[idij]*t2+wa[idij+1]*t1;
1669 idij+=2;
1670 }
1671 }
1672 }
1673 }
1674 #undef C1
1675 #undef C2
1676 #undef CH2
1677
1678 #undef CC
1679 #undef CH
1680 #undef PM
1681 #undef MULPM
1682 #undef WA
1683
copy_and_norm(double * c,double * p1,size_t n,double fct)1684 static void copy_and_norm(double *c, double *p1, size_t n, double fct)
1685 {
1686 if (p1!=c)
1687 {
1688 if (fct!=1.)
1689 for (size_t i=0; i<n; ++i)
1690 c[i] = fct*p1[i];
1691 else
1692 memcpy (c,p1,n*sizeof(double));
1693 }
1694 else
1695 if (fct!=1.)
1696 for (size_t i=0; i<n; ++i)
1697 c[i] *= fct;
1698 }
1699
1700 WARN_UNUSED_RESULT
rfftp_forward(rfftp_plan plan,double c[],double fct)1701 static int rfftp_forward(rfftp_plan plan, double c[], double fct)
1702 {
1703 if (plan->length==1) return 0;
1704 size_t n=plan->length;
1705 size_t l1=n, nf=plan->nfct;
1706 double *ch = RALLOC(double, n);
1707 if (!ch) return -1;
1708 double *p1=c, *p2=ch;
1709
1710 for(size_t k1=0; k1<nf;++k1)
1711 {
1712 size_t k=nf-k1-1;
1713 size_t ip=plan->fct[k].fct;
1714 size_t ido=n / l1;
1715 l1 /= ip;
1716 if(ip==4)
1717 radf4(ido, l1, p1, p2, plan->fct[k].tw);
1718 else if(ip==2)
1719 radf2(ido, l1, p1, p2, plan->fct[k].tw);
1720 else if(ip==3)
1721 radf3(ido, l1, p1, p2, plan->fct[k].tw);
1722 else if(ip==5)
1723 radf5(ido, l1, p1, p2, plan->fct[k].tw);
1724 else
1725 {
1726 radfg(ido, ip, l1, p1, p2, plan->fct[k].tw, plan->fct[k].tws);
1727 SWAP (p1,p2,double *);
1728 }
1729 SWAP (p1,p2,double *);
1730 }
1731 copy_and_norm(c,p1,n,fct);
1732 DEALLOC(ch);
1733 return 0;
1734 }
1735
1736 WARN_UNUSED_RESULT
rfftp_backward(rfftp_plan plan,double c[],double fct)1737 static int rfftp_backward(rfftp_plan plan, double c[], double fct)
1738 {
1739 if (plan->length==1) return 0;
1740 size_t n=plan->length;
1741 size_t l1=1, nf=plan->nfct;
1742 double *ch = RALLOC(double, n);
1743 if (!ch) return -1;
1744 double *p1=c, *p2=ch;
1745
1746 for(size_t k=0; k<nf; k++)
1747 {
1748 size_t ip = plan->fct[k].fct,
1749 ido= n/(ip*l1);
1750 if(ip==4)
1751 radb4(ido, l1, p1, p2, plan->fct[k].tw);
1752 else if(ip==2)
1753 radb2(ido, l1, p1, p2, plan->fct[k].tw);
1754 else if(ip==3)
1755 radb3(ido, l1, p1, p2, plan->fct[k].tw);
1756 else if(ip==5)
1757 radb5(ido, l1, p1, p2, plan->fct[k].tw);
1758 else
1759 radbg(ido, ip, l1, p1, p2, plan->fct[k].tw, plan->fct[k].tws);
1760 SWAP (p1,p2,double *);
1761 l1*=ip;
1762 }
1763 copy_and_norm(c,p1,n,fct);
1764 DEALLOC(ch);
1765 return 0;
1766 }
1767
1768 WARN_UNUSED_RESULT
rfftp_factorize(rfftp_plan plan)1769 static int rfftp_factorize (rfftp_plan plan)
1770 {
1771 size_t length=plan->length;
1772 size_t nfct=0;
1773 while ((length%4)==0)
1774 { if (nfct>=NFCT) return -1; plan->fct[nfct++].fct=4; length>>=2; }
1775 if ((length%2)==0)
1776 {
1777 length>>=1;
1778 // factor 2 should be at the front of the factor list
1779 if (nfct>=NFCT) return -1;
1780 plan->fct[nfct++].fct=2;
1781 SWAP(plan->fct[0].fct, plan->fct[nfct-1].fct,size_t);
1782 }
1783 size_t maxl=(size_t)(sqrt((double)length))+1;
1784 for (size_t divisor=3; (length>1)&&(divisor<maxl); divisor+=2)
1785 if ((length%divisor)==0)
1786 {
1787 while ((length%divisor)==0)
1788 {
1789 if (nfct>=NFCT) return -1;
1790 plan->fct[nfct++].fct=divisor;
1791 length/=divisor;
1792 }
1793 maxl=(size_t)(sqrt((double)length))+1;
1794 }
1795 if (length>1) plan->fct[nfct++].fct=length;
1796 plan->nfct=nfct;
1797 return 0;
1798 }
1799
rfftp_twsize(rfftp_plan plan)1800 static size_t rfftp_twsize(rfftp_plan plan)
1801 {
1802 size_t twsize=0, l1=1;
1803 for (size_t k=0; k<plan->nfct; ++k)
1804 {
1805 size_t ip=plan->fct[k].fct, ido= plan->length/(l1*ip);
1806 twsize+=(ip-1)*(ido-1);
1807 if (ip>5) twsize+=2*ip;
1808 l1*=ip;
1809 }
1810 return twsize;
1811 return 0;
1812 }
1813
rfftp_comp_twiddle(rfftp_plan plan)1814 WARN_UNUSED_RESULT NOINLINE static int rfftp_comp_twiddle (rfftp_plan plan)
1815 {
1816 size_t length=plan->length;
1817 double *twid = RALLOC(double, 2*length);
1818 if (!twid) return -1;
1819 sincos_2pibyn_half(length, twid);
1820 size_t l1=1;
1821 double *ptr=plan->mem;
1822 for (size_t k=0; k<plan->nfct; ++k)
1823 {
1824 size_t ip=plan->fct[k].fct, ido=length/(l1*ip);
1825 if (k<plan->nfct-1) // last factor doesn't need twiddles
1826 {
1827 plan->fct[k].tw=ptr; ptr+=(ip-1)*(ido-1);
1828 for (size_t j=1; j<ip; ++j)
1829 for (size_t i=1; i<=(ido-1)/2; ++i)
1830 {
1831 plan->fct[k].tw[(j-1)*(ido-1)+2*i-2] = twid[2*j*l1*i];
1832 plan->fct[k].tw[(j-1)*(ido-1)+2*i-1] = twid[2*j*l1*i+1];
1833 }
1834 }
1835 if (ip>5) // special factors required by *g functions
1836 {
1837 plan->fct[k].tws=ptr; ptr+=2*ip;
1838 plan->fct[k].tws[0] = 1.;
1839 plan->fct[k].tws[1] = 0.;
1840 for (size_t i=1; i<=(ip>>1); ++i)
1841 {
1842 plan->fct[k].tws[2*i ] = twid[2*i*(length/ip)];
1843 plan->fct[k].tws[2*i+1] = twid[2*i*(length/ip)+1];
1844 plan->fct[k].tws[2*(ip-i) ] = twid[2*i*(length/ip)];
1845 plan->fct[k].tws[2*(ip-i)+1] = -twid[2*i*(length/ip)+1];
1846 }
1847 }
1848 l1*=ip;
1849 }
1850 DEALLOC(twid);
1851 return 0;
1852 }
1853
make_rfftp_plan(size_t length)1854 NOINLINE static rfftp_plan make_rfftp_plan (size_t length)
1855 {
1856 if (length==0) return NULL;
1857 rfftp_plan plan = RALLOC(rfftp_plan_i,1);
1858 if (!plan) return NULL;
1859 plan->length=length;
1860 plan->nfct=0;
1861 plan->mem=NULL;
1862 for (size_t i=0; i<NFCT; ++i)
1863 plan->fct[i]=(rfftp_fctdata){0,0,0};
1864 if (length==1) return plan;
1865 if (rfftp_factorize(plan)!=0) { DEALLOC(plan); return NULL; }
1866 size_t tws=rfftp_twsize(plan);
1867 plan->mem=RALLOC(double,tws);
1868 if (!plan->mem) { DEALLOC(plan); return NULL; }
1869 if (rfftp_comp_twiddle(plan)!=0)
1870 { DEALLOC(plan->mem); DEALLOC(plan); return NULL; }
1871 return plan;
1872 }
1873
destroy_rfftp_plan(rfftp_plan plan)1874 NOINLINE static void destroy_rfftp_plan (rfftp_plan plan)
1875 {
1876 DEALLOC(plan->mem);
1877 DEALLOC(plan);
1878 }
1879
1880 typedef struct fftblue_plan_i
1881 {
1882 size_t n, n2;
1883 cfftp_plan plan;
1884 double *mem;
1885 double *bk, *bkf;
1886 } fftblue_plan_i;
1887 typedef struct fftblue_plan_i * fftblue_plan;
1888
make_fftblue_plan(size_t length)1889 NOINLINE static fftblue_plan make_fftblue_plan (size_t length)
1890 {
1891 fftblue_plan plan = RALLOC(fftblue_plan_i,1);
1892 if (!plan) return NULL;
1893 plan->n = length;
1894 plan->n2 = good_size(plan->n*2-1);
1895 plan->mem = RALLOC(double, 2*plan->n+2*plan->n2);
1896 if (!plan->mem) { DEALLOC(plan); return NULL; }
1897 plan->bk = plan->mem;
1898 plan->bkf = plan->bk+2*plan->n;
1899
1900 /* initialize b_k */
1901 double *tmp = RALLOC(double,4*plan->n);
1902 if (!tmp) { DEALLOC(plan->mem); DEALLOC(plan); return NULL; }
1903 sincos_2pibyn(2*plan->n,tmp);
1904 plan->bk[0] = 1;
1905 plan->bk[1] = 0;
1906
1907 size_t coeff=0;
1908 for (size_t m=1; m<plan->n; ++m)
1909 {
1910 coeff+=2*m-1;
1911 if (coeff>=2*plan->n) coeff-=2*plan->n;
1912 plan->bk[2*m ] = tmp[2*coeff ];
1913 plan->bk[2*m+1] = tmp[2*coeff+1];
1914 }
1915
1916 /* initialize the zero-padded, Fourier transformed b_k. Add normalisation. */
1917 double xn2 = 1./plan->n2;
1918 plan->bkf[0] = plan->bk[0]*xn2;
1919 plan->bkf[1] = plan->bk[1]*xn2;
1920 for (size_t m=2; m<2*plan->n; m+=2)
1921 {
1922 plan->bkf[m] = plan->bkf[2*plan->n2-m] = plan->bk[m] *xn2;
1923 plan->bkf[m+1] = plan->bkf[2*plan->n2-m+1] = plan->bk[m+1] *xn2;
1924 }
1925 for (size_t m=2*plan->n;m<=(2*plan->n2-2*plan->n+1);++m)
1926 plan->bkf[m]=0.;
1927 plan->plan=make_cfftp_plan(plan->n2);
1928 if (!plan->plan)
1929 { DEALLOC(tmp); DEALLOC(plan->mem); DEALLOC(plan); return NULL; }
1930 if (cfftp_forward(plan->plan,plan->bkf,1.)!=0)
1931 { DEALLOC(tmp); DEALLOC(plan->mem); DEALLOC(plan); return NULL; }
1932 DEALLOC(tmp);
1933
1934 return plan;
1935 }
1936
destroy_fftblue_plan(fftblue_plan plan)1937 NOINLINE static void destroy_fftblue_plan (fftblue_plan plan)
1938 {
1939 DEALLOC(plan->mem);
1940 destroy_cfftp_plan(plan->plan);
1941 DEALLOC(plan);
1942 }
1943
1944 NOINLINE WARN_UNUSED_RESULT
fftblue_fft(fftblue_plan plan,double c[],int isign,double fct)1945 static int fftblue_fft(fftblue_plan plan, double c[], int isign, double fct)
1946 {
1947 size_t n=plan->n;
1948 size_t n2=plan->n2;
1949 double *bk = plan->bk;
1950 double *bkf = plan->bkf;
1951 double *akf = RALLOC(double, 2*n2);
1952 if (!akf) return -1;
1953
1954 /* initialize a_k and FFT it */
1955 if (isign>0)
1956 for (size_t m=0; m<2*n; m+=2)
1957 {
1958 akf[m] = c[m]*bk[m] - c[m+1]*bk[m+1];
1959 akf[m+1] = c[m]*bk[m+1] + c[m+1]*bk[m];
1960 }
1961 else
1962 for (size_t m=0; m<2*n; m+=2)
1963 {
1964 akf[m] = c[m]*bk[m] + c[m+1]*bk[m+1];
1965 akf[m+1] =-c[m]*bk[m+1] + c[m+1]*bk[m];
1966 }
1967 for (size_t m=2*n; m<2*n2; ++m)
1968 akf[m]=0;
1969
1970 if (cfftp_forward (plan->plan,akf,fct)!=0)
1971 { DEALLOC(akf); return -1; }
1972
1973 /* do the convolution */
1974 if (isign>0)
1975 for (size_t m=0; m<2*n2; m+=2)
1976 {
1977 double im = -akf[m]*bkf[m+1] + akf[m+1]*bkf[m];
1978 akf[m ] = akf[m]*bkf[m] + akf[m+1]*bkf[m+1];
1979 akf[m+1] = im;
1980 }
1981 else
1982 for (size_t m=0; m<2*n2; m+=2)
1983 {
1984 double im = akf[m]*bkf[m+1] + akf[m+1]*bkf[m];
1985 akf[m ] = akf[m]*bkf[m] - akf[m+1]*bkf[m+1];
1986 akf[m+1] = im;
1987 }
1988
1989 /* inverse FFT */
1990 if (cfftp_backward (plan->plan,akf,1.)!=0)
1991 { DEALLOC(akf); return -1; }
1992
1993 /* multiply by b_k */
1994 if (isign>0)
1995 for (size_t m=0; m<2*n; m+=2)
1996 {
1997 c[m] = bk[m] *akf[m] - bk[m+1]*akf[m+1];
1998 c[m+1] = bk[m+1]*akf[m] + bk[m] *akf[m+1];
1999 }
2000 else
2001 for (size_t m=0; m<2*n; m+=2)
2002 {
2003 c[m] = bk[m] *akf[m] + bk[m+1]*akf[m+1];
2004 c[m+1] =-bk[m+1]*akf[m] + bk[m] *akf[m+1];
2005 }
2006 DEALLOC(akf);
2007 return 0;
2008 }
2009
2010 WARN_UNUSED_RESULT
cfftblue_backward(fftblue_plan plan,double c[],double fct)2011 static int cfftblue_backward(fftblue_plan plan, double c[], double fct)
2012 { return fftblue_fft(plan,c,1,fct); }
2013
2014 WARN_UNUSED_RESULT
cfftblue_forward(fftblue_plan plan,double c[],double fct)2015 static int cfftblue_forward(fftblue_plan plan, double c[], double fct)
2016 { return fftblue_fft(plan,c,-1,fct); }
2017
2018 WARN_UNUSED_RESULT
rfftblue_backward(fftblue_plan plan,double c[],double fct)2019 static int rfftblue_backward(fftblue_plan plan, double c[], double fct)
2020 {
2021 size_t n=plan->n;
2022 double *tmp = RALLOC(double,2*n);
2023 if (!tmp) return -1;
2024 tmp[0]=c[0];
2025 tmp[1]=0.;
2026 memcpy (tmp+2,c+1, (n-1)*sizeof(double));
2027 if ((n&1)==0) tmp[n+1]=0.;
2028 for (size_t m=2; m<n; m+=2)
2029 {
2030 tmp[2*n-m]=tmp[m];
2031 tmp[2*n-m+1]=-tmp[m+1];
2032 }
2033 if (fftblue_fft(plan,tmp,1,fct)!=0)
2034 { DEALLOC(tmp); return -1; }
2035 for (size_t m=0; m<n; ++m)
2036 c[m] = tmp[2*m];
2037 DEALLOC(tmp);
2038 return 0;
2039 }
2040
2041 WARN_UNUSED_RESULT
rfftblue_forward(fftblue_plan plan,double c[],double fct)2042 static int rfftblue_forward(fftblue_plan plan, double c[], double fct)
2043 {
2044 size_t n=plan->n;
2045 double *tmp = RALLOC(double,2*n);
2046 if (!tmp) return -1;
2047 for (size_t m=0; m<n; ++m)
2048 {
2049 tmp[2*m] = c[m];
2050 tmp[2*m+1] = 0.;
2051 }
2052 if (fftblue_fft(plan,tmp,-1,fct)!=0)
2053 { DEALLOC(tmp); return -1; }
2054 c[0] = tmp[0];
2055 memcpy (c+1, tmp+2, (n-1)*sizeof(double));
2056 DEALLOC(tmp);
2057 return 0;
2058 }
2059
2060 typedef struct cfft_plan_i
2061 {
2062 cfftp_plan packplan;
2063 fftblue_plan blueplan;
2064 } cfft_plan_i;
2065
make_cfft_plan(size_t length)2066 cfft_plan make_cfft_plan (size_t length)
2067 {
2068 if (length==0) return NULL;
2069 cfft_plan plan = RALLOC(cfft_plan_i,1);
2070 if (!plan) return NULL;
2071 plan->blueplan=0;
2072 plan->packplan=0;
2073 if ((length<50) || (largest_prime_factor(length)<=sqrt(length)))
2074 {
2075 plan->packplan=make_cfftp_plan(length);
2076 if (!plan->packplan) { DEALLOC(plan); return NULL; }
2077 return plan;
2078 }
2079 double comp1 = cost_guess(length);
2080 double comp2 = 2*cost_guess(good_size(2*length-1));
2081 comp2*=1.5; /* fudge factor that appears to give good overall performance */
2082 if (comp2<comp1) // use Bluestein
2083 {
2084 plan->blueplan=make_fftblue_plan(length);
2085 if (!plan->blueplan) { DEALLOC(plan); return NULL; }
2086 }
2087 else
2088 {
2089 plan->packplan=make_cfftp_plan(length);
2090 if (!plan->packplan) { DEALLOC(plan); return NULL; }
2091 }
2092 return plan;
2093 }
2094
destroy_cfft_plan(cfft_plan plan)2095 void destroy_cfft_plan (cfft_plan plan)
2096 {
2097 if (plan->blueplan)
2098 destroy_fftblue_plan(plan->blueplan);
2099 if (plan->packplan)
2100 destroy_cfftp_plan(plan->packplan);
2101 DEALLOC(plan);
2102 }
2103
cfft_backward(cfft_plan plan,double c[],double fct)2104 WARN_UNUSED_RESULT int cfft_backward(cfft_plan plan, double c[], double fct)
2105 {
2106 if (plan->packplan)
2107 return cfftp_backward(plan->packplan,c,fct);
2108 // if (plan->blueplan)
2109 return cfftblue_backward(plan->blueplan,c,fct);
2110 }
2111
cfft_forward(cfft_plan plan,double c[],double fct)2112 WARN_UNUSED_RESULT int cfft_forward(cfft_plan plan, double c[], double fct)
2113 {
2114 if (plan->packplan)
2115 return cfftp_forward(plan->packplan,c,fct);
2116 // if (plan->blueplan)
2117 return cfftblue_forward(plan->blueplan,c,fct);
2118 }
2119
2120 typedef struct rfft_plan_i
2121 {
2122 rfftp_plan packplan;
2123 fftblue_plan blueplan;
2124 } rfft_plan_i;
2125
make_rfft_plan(size_t length)2126 rfft_plan make_rfft_plan (size_t length)
2127 {
2128 if (length==0) return NULL;
2129 rfft_plan plan = RALLOC(rfft_plan_i,1);
2130 if (!plan) return NULL;
2131 plan->blueplan=0;
2132 plan->packplan=0;
2133 if ((length<50) || (largest_prime_factor(length)<=sqrt(length)))
2134 {
2135 plan->packplan=make_rfftp_plan(length);
2136 if (!plan->packplan) { DEALLOC(plan); return NULL; }
2137 return plan;
2138 }
2139 double comp1 = 0.5*cost_guess(length);
2140 double comp2 = 2*cost_guess(good_size(2*length-1));
2141 comp2*=1.5; /* fudge factor that appears to give good overall performance */
2142 if (comp2<comp1) // use Bluestein
2143 {
2144 plan->blueplan=make_fftblue_plan(length);
2145 if (!plan->blueplan) { DEALLOC(plan); return NULL; }
2146 }
2147 else
2148 {
2149 plan->packplan=make_rfftp_plan(length);
2150 if (!plan->packplan) { DEALLOC(plan); return NULL; }
2151 }
2152 return plan;
2153 }
2154
destroy_rfft_plan(rfft_plan plan)2155 void destroy_rfft_plan (rfft_plan plan)
2156 {
2157 if (plan->blueplan)
2158 destroy_fftblue_plan(plan->blueplan);
2159 if (plan->packplan)
2160 destroy_rfftp_plan(plan->packplan);
2161 DEALLOC(plan);
2162 }
2163
rfft_length(rfft_plan plan)2164 size_t rfft_length(rfft_plan plan)
2165 {
2166 if (plan->packplan) return plan->packplan->length;
2167 return plan->blueplan->n;
2168 }
2169
cfft_length(cfft_plan plan)2170 size_t cfft_length(cfft_plan plan)
2171 {
2172 if (plan->packplan) return plan->packplan->length;
2173 return plan->blueplan->n;
2174 }
2175
rfft_backward(rfft_plan plan,double c[],double fct)2176 WARN_UNUSED_RESULT int rfft_backward(rfft_plan plan, double c[], double fct)
2177 {
2178 if (plan->packplan)
2179 return rfftp_backward(plan->packplan,c,fct);
2180 else // if (plan->blueplan)
2181 return rfftblue_backward(plan->blueplan,c,fct);
2182 }
2183
rfft_forward(rfft_plan plan,double c[],double fct)2184 WARN_UNUSED_RESULT int rfft_forward(rfft_plan plan, double c[], double fct)
2185 {
2186 if (plan->packplan)
2187 return rfftp_forward(plan->packplan,c,fct);
2188 else // if (plan->blueplan)
2189 return rfftblue_forward(plan->blueplan,c,fct);
2190 }
2191