1 /*
2  * This file is part of pocketfft.
3  * Licensed under a 3-clause BSD style license - see LICENSE.md
4  */
5 
6 /*
7  *  Main implementation file.
8  *
9  *  Copyright (C) 2004-2018 Max-Planck-Society
10  *  \author Martin Reinecke
11  */
12 
13 #define NPY_NO_DEPRECATED_API NPY_API_VERSION
14 
15 #include "Python.h"
16 #include "numpy/arrayobject.h"
17 
18 #include <math.h>
19 #include <string.h>
20 #include <stdlib.h>
21 
22 #include "npy_config.h"
23 #define restrict NPY_RESTRICT
24 
25 #define RALLOC(type,num) \
26   ((type *)malloc((num)*sizeof(type)))
27 #define DEALLOC(ptr) \
28   do { free(ptr); (ptr)=NULL; } while(0)
29 
30 #define SWAP(a,b,type) \
31   do { type tmp_=(a); (a)=(b); (b)=tmp_; } while(0)
32 
33 #ifdef __GNUC__
34 #define NOINLINE __attribute__((noinline))
35 #define WARN_UNUSED_RESULT __attribute__ ((warn_unused_result))
36 #else
37 #define NOINLINE
38 #define WARN_UNUSED_RESULT
39 #endif
40 
41 struct cfft_plan_i;
42 typedef struct cfft_plan_i * cfft_plan;
43 struct rfft_plan_i;
44 typedef struct rfft_plan_i * rfft_plan;
45 
46 // adapted from https://stackoverflow.com/questions/42792939/
47 // CAUTION: this function only works for arguments in the range [-0.25; 0.25]!
my_sincosm1pi(double a,double * restrict res)48 static void my_sincosm1pi (double a, double *restrict res)
49   {
50   double s = a * a;
51   /* Approximate cos(pi*x)-1 for x in [-0.25,0.25] */
52   double r =     -1.0369917389758117e-4;
53   r = fma (r, s,  1.9294935641298806e-3);
54   r = fma (r, s, -2.5806887942825395e-2);
55   r = fma (r, s,  2.3533063028328211e-1);
56   r = fma (r, s, -1.3352627688538006e+0);
57   r = fma (r, s,  4.0587121264167623e+0);
58   r = fma (r, s, -4.9348022005446790e+0);
59   double c = r*s;
60   /* Approximate sin(pi*x) for x in [-0.25,0.25] */
61   r =             4.6151442520157035e-4;
62   r = fma (r, s, -7.3700183130883555e-3);
63   r = fma (r, s,  8.2145868949323936e-2);
64   r = fma (r, s, -5.9926452893214921e-1);
65   r = fma (r, s,  2.5501640398732688e+0);
66   r = fma (r, s, -5.1677127800499516e+0);
67   s = s * a;
68   r = r * s;
69   s = fma (a, 3.1415926535897931e+0, r);
70   res[0] = c;
71   res[1] = s;
72   }
73 
calc_first_octant(size_t den,double * restrict res)74 NOINLINE static void calc_first_octant(size_t den, double * restrict res)
75   {
76   size_t n = (den+4)>>3;
77   if (n==0) return;
78   res[0]=1.; res[1]=0.;
79   if (n==1) return;
80   size_t l1=(size_t)sqrt(n);
81   for (size_t i=1; i<l1; ++i)
82     my_sincosm1pi((2.*i)/den,&res[2*i]);
83   size_t start=l1;
84   while(start<n)
85     {
86     double cs[2];
87     my_sincosm1pi((2.*start)/den,cs);
88     res[2*start] = cs[0]+1.;
89     res[2*start+1] = cs[1];
90     size_t end = l1;
91     if (start+end>n) end = n-start;
92     for (size_t i=1; i<end; ++i)
93       {
94       double csx[2]={res[2*i], res[2*i+1]};
95       res[2*(start+i)] = ((cs[0]*csx[0] - cs[1]*csx[1] + cs[0]) + csx[0]) + 1.;
96       res[2*(start+i)+1] = (cs[0]*csx[1] + cs[1]*csx[0]) + cs[1] + csx[1];
97       }
98     start += l1;
99     }
100   for (size_t i=1; i<l1; ++i)
101     res[2*i] += 1.;
102   }
103 
calc_first_quadrant(size_t n,double * restrict res)104 NOINLINE static void calc_first_quadrant(size_t n, double * restrict res)
105   {
106   double * restrict p = res+n;
107   calc_first_octant(n<<1, p);
108   size_t ndone=(n+2)>>2;
109   size_t i=0, idx1=0, idx2=2*ndone-2;
110   for (; i+1<ndone; i+=2, idx1+=2, idx2-=2)
111     {
112     res[idx1]   = p[2*i];
113     res[idx1+1] = p[2*i+1];
114     res[idx2]   = p[2*i+3];
115     res[idx2+1] = p[2*i+2];
116     }
117   if (i!=ndone)
118     {
119     res[idx1  ] = p[2*i];
120     res[idx1+1] = p[2*i+1];
121     }
122   }
123 
calc_first_half(size_t n,double * restrict res)124 NOINLINE static void calc_first_half(size_t n, double * restrict res)
125   {
126   int ndone=(n+1)>>1;
127   double * p = res+n-1;
128   calc_first_octant(n<<2, p);
129   int i4=0, in=n, i=0;
130   for (; i4<=in-i4; ++i, i4+=4) // octant 0
131     {
132     res[2*i] = p[2*i4]; res[2*i+1] = p[2*i4+1];
133     }
134   for (; i4-in <= 0; ++i, i4+=4) // octant 1
135     {
136     int xm = in-i4;
137     res[2*i] = p[2*xm+1]; res[2*i+1] = p[2*xm];
138     }
139   for (; i4<=3*in-i4; ++i, i4+=4) // octant 2
140     {
141     int xm = i4-in;
142     res[2*i] = -p[2*xm+1]; res[2*i+1] = p[2*xm];
143     }
144   for (; i<ndone; ++i, i4+=4) // octant 3
145     {
146     int xm = 2*in-i4;
147     res[2*i] = -p[2*xm]; res[2*i+1] = p[2*xm+1];
148     }
149   }
150 
fill_first_quadrant(size_t n,double * restrict res)151 NOINLINE static void fill_first_quadrant(size_t n, double * restrict res)
152   {
153   const double hsqt2 = 0.707106781186547524400844362104849;
154   size_t quart = n>>2;
155   if ((n&7)==0)
156     res[quart] = res[quart+1] = hsqt2;
157   for (size_t i=2, j=2*quart-2; i<quart; i+=2, j-=2)
158     {
159     res[j  ] = res[i+1];
160     res[j+1] = res[i  ];
161     }
162   }
163 
fill_first_half(size_t n,double * restrict res)164 NOINLINE static void fill_first_half(size_t n, double * restrict res)
165   {
166   size_t half = n>>1;
167   if ((n&3)==0)
168     for (size_t i=0; i<half; i+=2)
169       {
170       res[i+half]   = -res[i+1];
171       res[i+half+1] =  res[i  ];
172       }
173   else
174     for (size_t i=2, j=2*half-2; i<half; i+=2, j-=2)
175       {
176       res[j  ] = -res[i  ];
177       res[j+1] =  res[i+1];
178       }
179   }
180 
fill_second_half(size_t n,double * restrict res)181 NOINLINE static void fill_second_half(size_t n, double * restrict res)
182   {
183   if ((n&1)==0)
184     for (size_t i=0; i<n; ++i)
185       res[i+n] = -res[i];
186   else
187     for (size_t i=2, j=2*n-2; i<n; i+=2, j-=2)
188       {
189       res[j  ] =  res[i  ];
190       res[j+1] = -res[i+1];
191       }
192   }
193 
sincos_2pibyn_half(size_t n,double * restrict res)194 NOINLINE static void sincos_2pibyn_half(size_t n, double * restrict res)
195   {
196   if ((n&3)==0)
197     {
198     calc_first_octant(n, res);
199     fill_first_quadrant(n, res);
200     fill_first_half(n, res);
201     }
202   else if ((n&1)==0)
203     {
204     calc_first_quadrant(n, res);
205     fill_first_half(n, res);
206     }
207   else
208     calc_first_half(n, res);
209   }
210 
sincos_2pibyn(size_t n,double * restrict res)211 NOINLINE static void sincos_2pibyn(size_t n, double * restrict res)
212   {
213   sincos_2pibyn_half(n, res);
214   fill_second_half(n, res);
215   }
216 
largest_prime_factor(size_t n)217 NOINLINE static size_t largest_prime_factor (size_t n)
218   {
219   size_t res=1;
220   size_t tmp;
221   while (((tmp=(n>>1))<<1)==n)
222     { res=2; n=tmp; }
223 
224   size_t limit=(size_t)sqrt(n+0.01);
225   for (size_t x=3; x<=limit; x+=2)
226   while (((tmp=(n/x))*x)==n)
227     {
228     res=x;
229     n=tmp;
230     limit=(size_t)sqrt(n+0.01);
231     }
232   if (n>1) res=n;
233 
234   return res;
235   }
236 
cost_guess(size_t n)237 NOINLINE static double cost_guess (size_t n)
238   {
239   const double lfp=1.1; // penalty for non-hardcoded larger factors
240   size_t ni=n;
241   double result=0.;
242   size_t tmp;
243   while (((tmp=(n>>1))<<1)==n)
244     { result+=2; n=tmp; }
245 
246   size_t limit=(size_t)sqrt(n+0.01);
247   for (size_t x=3; x<=limit; x+=2)
248   while ((tmp=(n/x))*x==n)
249     {
250     result+= (x<=5) ? x : lfp*x; // penalize larger prime factors
251     n=tmp;
252     limit=(size_t)sqrt(n+0.01);
253     }
254   if (n>1) result+=(n<=5) ? n : lfp*n;
255 
256   return result*ni;
257   }
258 
259 /* returns the smallest composite of 2, 3, 5, 7 and 11 which is >= n */
good_size(size_t n)260 NOINLINE static size_t good_size(size_t n)
261   {
262   if (n<=6) return n;
263 
264   size_t bestfac=2*n;
265   for (size_t f2=1; f2<bestfac; f2*=2)
266     for (size_t f23=f2; f23<bestfac; f23*=3)
267       for (size_t f235=f23; f235<bestfac; f235*=5)
268         for (size_t f2357=f235; f2357<bestfac; f2357*=7)
269           for (size_t f235711=f2357; f235711<bestfac; f235711*=11)
270             if (f235711>=n) bestfac=f235711;
271   return bestfac;
272   }
273 
274 typedef struct cmplx {
275   double r,i;
276 } cmplx;
277 
278 #define NFCT 25
279 typedef struct cfftp_fctdata
280   {
281   size_t fct;
282   cmplx *tw, *tws;
283   } cfftp_fctdata;
284 
285 typedef struct cfftp_plan_i
286   {
287   size_t length, nfct;
288   cmplx *mem;
289   cfftp_fctdata fct[NFCT];
290   } cfftp_plan_i;
291 typedef struct cfftp_plan_i * cfftp_plan;
292 
293 #define PMC(a,b,c,d) { a.r=c.r+d.r; a.i=c.i+d.i; b.r=c.r-d.r; b.i=c.i-d.i; }
294 #define ADDC(a,b,c) { a.r=b.r+c.r; a.i=b.i+c.i; }
295 #define SCALEC(a,b) { a.r*=b; a.i*=b; }
296 #define ROT90(a) { double tmp_=a.r; a.r=-a.i; a.i=tmp_; }
297 #define ROTM90(a) { double tmp_=-a.r; a.r=a.i; a.i=tmp_; }
298 #define CH(a,b,c) ch[(a)+ido*((b)+l1*(c))]
299 #define CC(a,b,c) cc[(a)+ido*((b)+cdim*(c))]
300 #define WA(x,i) wa[(i)-1+(x)*(ido-1)]
301 /* a = b*c */
302 #define A_EQ_B_MUL_C(a,b,c) { a.r=b.r*c.r-b.i*c.i; a.i=b.r*c.i+b.i*c.r; }
303 /* a = conj(b)*c*/
304 #define A_EQ_CB_MUL_C(a,b,c) { a.r=b.r*c.r+b.i*c.i; a.i=b.r*c.i-b.i*c.r; }
305 
306 #define PMSIGNC(a,b,c,d) { a.r=c.r+sign*d.r; a.i=c.i+sign*d.i; b.r=c.r-sign*d.r; b.i=c.i-sign*d.i; }
307 /* a = b*c */
308 #define MULPMSIGNC(a,b,c) { a.r=b.r*c.r-sign*b.i*c.i; a.i=b.r*c.i+sign*b.i*c.r; }
309 /* a *= b */
310 #define MULPMSIGNCEQ(a,b) { double xtmp=a.r; a.r=b.r*a.r-sign*b.i*a.i; a.i=b.r*a.i+sign*b.i*xtmp; }
311 
pass2b(size_t ido,size_t l1,const cmplx * restrict cc,cmplx * restrict ch,const cmplx * restrict wa)312 NOINLINE static void pass2b (size_t ido, size_t l1, const cmplx * restrict cc,
313   cmplx * restrict ch, const cmplx * restrict wa)
314   {
315   const size_t cdim=2;
316 
317   if (ido==1)
318     for (size_t k=0; k<l1; ++k)
319       PMC (CH(0,k,0),CH(0,k,1),CC(0,0,k),CC(0,1,k))
320   else
321     for (size_t k=0; k<l1; ++k)
322       {
323       PMC (CH(0,k,0),CH(0,k,1),CC(0,0,k),CC(0,1,k))
324       for (size_t i=1; i<ido; ++i)
325         {
326         cmplx t;
327         PMC (CH(i,k,0),t,CC(i,0,k),CC(i,1,k))
328         A_EQ_B_MUL_C (CH(i,k,1),WA(0,i),t)
329         }
330       }
331   }
332 
pass2f(size_t ido,size_t l1,const cmplx * restrict cc,cmplx * restrict ch,const cmplx * restrict wa)333 NOINLINE static void pass2f (size_t ido, size_t l1, const cmplx * restrict cc,
334   cmplx * restrict ch, const cmplx * restrict wa)
335   {
336   const size_t cdim=2;
337 
338   if (ido==1)
339     for (size_t k=0; k<l1; ++k)
340       PMC (CH(0,k,0),CH(0,k,1),CC(0,0,k),CC(0,1,k))
341   else
342     for (size_t k=0; k<l1; ++k)
343       {
344       PMC (CH(0,k,0),CH(0,k,1),CC(0,0,k),CC(0,1,k))
345       for (size_t i=1; i<ido; ++i)
346         {
347         cmplx t;
348         PMC (CH(i,k,0),t,CC(i,0,k),CC(i,1,k))
349         A_EQ_CB_MUL_C (CH(i,k,1),WA(0,i),t)
350         }
351       }
352   }
353 
354 #define PREP3(idx) \
355         cmplx t0 = CC(idx,0,k), t1, t2; \
356         PMC (t1,t2,CC(idx,1,k),CC(idx,2,k)) \
357         CH(idx,k,0).r=t0.r+t1.r; \
358         CH(idx,k,0).i=t0.i+t1.i;
359 #define PARTSTEP3a(u1,u2,twr,twi) \
360         { \
361         cmplx ca,cb; \
362         ca.r=t0.r+twr*t1.r; \
363         ca.i=t0.i+twr*t1.i; \
364         cb.i=twi*t2.r; \
365         cb.r=-(twi*t2.i); \
366         PMC(CH(0,k,u1),CH(0,k,u2),ca,cb) \
367         }
368 
369 #define PARTSTEP3b(u1,u2,twr,twi) \
370         { \
371         cmplx ca,cb,da,db; \
372         ca.r=t0.r+twr*t1.r; \
373         ca.i=t0.i+twr*t1.i; \
374         cb.i=twi*t2.r; \
375         cb.r=-(twi*t2.i); \
376         PMC(da,db,ca,cb) \
377         A_EQ_B_MUL_C (CH(i,k,u1),WA(u1-1,i),da) \
378         A_EQ_B_MUL_C (CH(i,k,u2),WA(u2-1,i),db) \
379         }
pass3b(size_t ido,size_t l1,const cmplx * restrict cc,cmplx * restrict ch,const cmplx * restrict wa)380 NOINLINE static void pass3b (size_t ido, size_t l1, const cmplx * restrict cc,
381   cmplx * restrict ch, const cmplx * restrict wa)
382   {
383   const size_t cdim=3;
384   const double tw1r=-0.5, tw1i= 0.86602540378443864676;
385 
386   if (ido==1)
387     for (size_t k=0; k<l1; ++k)
388       {
389       PREP3(0)
390       PARTSTEP3a(1,2,tw1r,tw1i)
391       }
392   else
393     for (size_t k=0; k<l1; ++k)
394       {
395       {
396       PREP3(0)
397       PARTSTEP3a(1,2,tw1r,tw1i)
398       }
399       for (size_t i=1; i<ido; ++i)
400         {
401         PREP3(i)
402         PARTSTEP3b(1,2,tw1r,tw1i)
403         }
404       }
405   }
406 #define PARTSTEP3f(u1,u2,twr,twi) \
407         { \
408         cmplx ca,cb,da,db; \
409         ca.r=t0.r+twr*t1.r; \
410         ca.i=t0.i+twr*t1.i; \
411         cb.i=twi*t2.r; \
412         cb.r=-(twi*t2.i); \
413         PMC(da,db,ca,cb) \
414         A_EQ_CB_MUL_C (CH(i,k,u1),WA(u1-1,i),da) \
415         A_EQ_CB_MUL_C (CH(i,k,u2),WA(u2-1,i),db) \
416         }
pass3f(size_t ido,size_t l1,const cmplx * restrict cc,cmplx * restrict ch,const cmplx * restrict wa)417 NOINLINE static void pass3f (size_t ido, size_t l1, const cmplx * restrict cc,
418   cmplx * restrict ch, const cmplx * restrict wa)
419   {
420   const size_t cdim=3;
421   const double tw1r=-0.5, tw1i= -0.86602540378443864676;
422 
423   if (ido==1)
424     for (size_t k=0; k<l1; ++k)
425       {
426       PREP3(0)
427       PARTSTEP3a(1,2,tw1r,tw1i)
428       }
429   else
430     for (size_t k=0; k<l1; ++k)
431       {
432       {
433       PREP3(0)
434       PARTSTEP3a(1,2,tw1r,tw1i)
435       }
436       for (size_t i=1; i<ido; ++i)
437         {
438         PREP3(i)
439         PARTSTEP3f(1,2,tw1r,tw1i)
440         }
441       }
442   }
443 
pass4b(size_t ido,size_t l1,const cmplx * restrict cc,cmplx * restrict ch,const cmplx * restrict wa)444 NOINLINE static void pass4b (size_t ido, size_t l1, const cmplx * restrict cc,
445   cmplx * restrict ch, const cmplx * restrict wa)
446   {
447   const size_t cdim=4;
448 
449   if (ido==1)
450     for (size_t k=0; k<l1; ++k)
451       {
452       cmplx t1, t2, t3, t4;
453       PMC(t2,t1,CC(0,0,k),CC(0,2,k))
454       PMC(t3,t4,CC(0,1,k),CC(0,3,k))
455       ROT90(t4)
456       PMC(CH(0,k,0),CH(0,k,2),t2,t3)
457       PMC(CH(0,k,1),CH(0,k,3),t1,t4)
458       }
459   else
460     for (size_t k=0; k<l1; ++k)
461       {
462       {
463       cmplx t1, t2, t3, t4;
464       PMC(t2,t1,CC(0,0,k),CC(0,2,k))
465       PMC(t3,t4,CC(0,1,k),CC(0,3,k))
466       ROT90(t4)
467       PMC(CH(0,k,0),CH(0,k,2),t2,t3)
468       PMC(CH(0,k,1),CH(0,k,3),t1,t4)
469       }
470       for (size_t i=1; i<ido; ++i)
471         {
472         cmplx c2, c3, c4, t1, t2, t3, t4;
473         cmplx cc0=CC(i,0,k), cc1=CC(i,1,k),cc2=CC(i,2,k),cc3=CC(i,3,k);
474         PMC(t2,t1,cc0,cc2)
475         PMC(t3,t4,cc1,cc3)
476         ROT90(t4)
477         cmplx wa0=WA(0,i), wa1=WA(1,i),wa2=WA(2,i);
478         PMC(CH(i,k,0),c3,t2,t3)
479         PMC(c2,c4,t1,t4)
480         A_EQ_B_MUL_C (CH(i,k,1),wa0,c2)
481         A_EQ_B_MUL_C (CH(i,k,2),wa1,c3)
482         A_EQ_B_MUL_C (CH(i,k,3),wa2,c4)
483         }
484       }
485   }
pass4f(size_t ido,size_t l1,const cmplx * restrict cc,cmplx * restrict ch,const cmplx * restrict wa)486 NOINLINE static void pass4f (size_t ido, size_t l1, const cmplx * restrict cc,
487   cmplx * restrict ch, const cmplx * restrict wa)
488   {
489   const size_t cdim=4;
490 
491   if (ido==1)
492     for (size_t k=0; k<l1; ++k)
493       {
494       cmplx t1, t2, t3, t4;
495       PMC(t2,t1,CC(0,0,k),CC(0,2,k))
496       PMC(t3,t4,CC(0,1,k),CC(0,3,k))
497       ROTM90(t4)
498       PMC(CH(0,k,0),CH(0,k,2),t2,t3)
499       PMC(CH(0,k,1),CH(0,k,3),t1,t4)
500       }
501   else
502     for (size_t k=0; k<l1; ++k)
503       {
504       {
505       cmplx t1, t2, t3, t4;
506       PMC(t2,t1,CC(0,0,k),CC(0,2,k))
507       PMC(t3,t4,CC(0,1,k),CC(0,3,k))
508       ROTM90(t4)
509       PMC(CH(0,k,0),CH(0,k,2),t2,t3)
510       PMC (CH(0,k,1),CH(0,k,3),t1,t4)
511       }
512       for (size_t i=1; i<ido; ++i)
513         {
514         cmplx c2, c3, c4, t1, t2, t3, t4;
515         cmplx cc0=CC(i,0,k), cc1=CC(i,1,k),cc2=CC(i,2,k),cc3=CC(i,3,k);
516         PMC(t2,t1,cc0,cc2)
517         PMC(t3,t4,cc1,cc3)
518         ROTM90(t4)
519         cmplx wa0=WA(0,i), wa1=WA(1,i),wa2=WA(2,i);
520         PMC(CH(i,k,0),c3,t2,t3)
521         PMC(c2,c4,t1,t4)
522         A_EQ_CB_MUL_C (CH(i,k,1),wa0,c2)
523         A_EQ_CB_MUL_C (CH(i,k,2),wa1,c3)
524         A_EQ_CB_MUL_C (CH(i,k,3),wa2,c4)
525         }
526       }
527   }
528 
529 #define PREP5(idx) \
530         cmplx t0 = CC(idx,0,k), t1, t2, t3, t4; \
531         PMC (t1,t4,CC(idx,1,k),CC(idx,4,k)) \
532         PMC (t2,t3,CC(idx,2,k),CC(idx,3,k)) \
533         CH(idx,k,0).r=t0.r+t1.r+t2.r; \
534         CH(idx,k,0).i=t0.i+t1.i+t2.i;
535 
536 #define PARTSTEP5a(u1,u2,twar,twbr,twai,twbi) \
537         { \
538         cmplx ca,cb; \
539         ca.r=t0.r+twar*t1.r+twbr*t2.r; \
540         ca.i=t0.i+twar*t1.i+twbr*t2.i; \
541         cb.i=twai*t4.r twbi*t3.r; \
542         cb.r=-(twai*t4.i twbi*t3.i); \
543         PMC(CH(0,k,u1),CH(0,k,u2),ca,cb) \
544         }
545 
546 #define PARTSTEP5b(u1,u2,twar,twbr,twai,twbi) \
547         { \
548         cmplx ca,cb,da,db; \
549         ca.r=t0.r+twar*t1.r+twbr*t2.r; \
550         ca.i=t0.i+twar*t1.i+twbr*t2.i; \
551         cb.i=twai*t4.r twbi*t3.r; \
552         cb.r=-(twai*t4.i twbi*t3.i); \
553         PMC(da,db,ca,cb) \
554         A_EQ_B_MUL_C (CH(i,k,u1),WA(u1-1,i),da) \
555         A_EQ_B_MUL_C (CH(i,k,u2),WA(u2-1,i),db) \
556         }
pass5b(size_t ido,size_t l1,const cmplx * restrict cc,cmplx * restrict ch,const cmplx * restrict wa)557 NOINLINE static void pass5b (size_t ido, size_t l1, const cmplx * restrict cc,
558   cmplx * restrict ch, const cmplx * restrict wa)
559   {
560   const size_t cdim=5;
561   const double tw1r= 0.3090169943749474241,
562                tw1i= 0.95105651629515357212,
563                tw2r= -0.8090169943749474241,
564                tw2i= 0.58778525229247312917;
565 
566   if (ido==1)
567     for (size_t k=0; k<l1; ++k)
568       {
569       PREP5(0)
570       PARTSTEP5a(1,4,tw1r,tw2r,+tw1i,+tw2i)
571       PARTSTEP5a(2,3,tw2r,tw1r,+tw2i,-tw1i)
572       }
573   else
574     for (size_t k=0; k<l1; ++k)
575       {
576       {
577       PREP5(0)
578       PARTSTEP5a(1,4,tw1r,tw2r,+tw1i,+tw2i)
579       PARTSTEP5a(2,3,tw2r,tw1r,+tw2i,-tw1i)
580       }
581       for (size_t i=1; i<ido; ++i)
582         {
583         PREP5(i)
584         PARTSTEP5b(1,4,tw1r,tw2r,+tw1i,+tw2i)
585         PARTSTEP5b(2,3,tw2r,tw1r,+tw2i,-tw1i)
586         }
587       }
588   }
589 #define PARTSTEP5f(u1,u2,twar,twbr,twai,twbi) \
590         { \
591         cmplx ca,cb,da,db; \
592         ca.r=t0.r+twar*t1.r+twbr*t2.r; \
593         ca.i=t0.i+twar*t1.i+twbr*t2.i; \
594         cb.i=twai*t4.r twbi*t3.r; \
595         cb.r=-(twai*t4.i twbi*t3.i); \
596         PMC(da,db,ca,cb) \
597         A_EQ_CB_MUL_C (CH(i,k,u1),WA(u1-1,i),da) \
598         A_EQ_CB_MUL_C (CH(i,k,u2),WA(u2-1,i),db) \
599         }
pass5f(size_t ido,size_t l1,const cmplx * restrict cc,cmplx * restrict ch,const cmplx * restrict wa)600 NOINLINE static void pass5f (size_t ido, size_t l1, const cmplx * restrict cc,
601   cmplx * restrict ch, const cmplx * restrict wa)
602   {
603   const size_t cdim=5;
604   const double tw1r= 0.3090169943749474241,
605                tw1i= -0.95105651629515357212,
606                tw2r= -0.8090169943749474241,
607                tw2i= -0.58778525229247312917;
608 
609   if (ido==1)
610     for (size_t k=0; k<l1; ++k)
611       {
612       PREP5(0)
613       PARTSTEP5a(1,4,tw1r,tw2r,+tw1i,+tw2i)
614       PARTSTEP5a(2,3,tw2r,tw1r,+tw2i,-tw1i)
615       }
616   else
617     for (size_t k=0; k<l1; ++k)
618       {
619       {
620       PREP5(0)
621       PARTSTEP5a(1,4,tw1r,tw2r,+tw1i,+tw2i)
622       PARTSTEP5a(2,3,tw2r,tw1r,+tw2i,-tw1i)
623       }
624       for (size_t i=1; i<ido; ++i)
625         {
626         PREP5(i)
627         PARTSTEP5f(1,4,tw1r,tw2r,+tw1i,+tw2i)
628         PARTSTEP5f(2,3,tw2r,tw1r,+tw2i,-tw1i)
629         }
630       }
631   }
632 
633 #define PREP7(idx) \
634         cmplx t1 = CC(idx,0,k), t2, t3, t4, t5, t6, t7; \
635         PMC (t2,t7,CC(idx,1,k),CC(idx,6,k)) \
636         PMC (t3,t6,CC(idx,2,k),CC(idx,5,k)) \
637         PMC (t4,t5,CC(idx,3,k),CC(idx,4,k)) \
638         CH(idx,k,0).r=t1.r+t2.r+t3.r+t4.r; \
639         CH(idx,k,0).i=t1.i+t2.i+t3.i+t4.i;
640 
641 #define PARTSTEP7a0(u1,u2,x1,x2,x3,y1,y2,y3,out1,out2) \
642         { \
643         cmplx ca,cb; \
644         ca.r=t1.r+x1*t2.r+x2*t3.r+x3*t4.r; \
645         ca.i=t1.i+x1*t2.i+x2*t3.i+x3*t4.i; \
646         cb.i=y1*t7.r y2*t6.r y3*t5.r; \
647         cb.r=-(y1*t7.i y2*t6.i y3*t5.i); \
648         PMC(out1,out2,ca,cb) \
649         }
650 #define PARTSTEP7a(u1,u2,x1,x2,x3,y1,y2,y3) \
651         PARTSTEP7a0(u1,u2,x1,x2,x3,y1,y2,y3,CH(0,k,u1),CH(0,k,u2))
652 #define PARTSTEP7(u1,u2,x1,x2,x3,y1,y2,y3) \
653         { \
654         cmplx da,db; \
655         PARTSTEP7a0(u1,u2,x1,x2,x3,y1,y2,y3,da,db) \
656         MULPMSIGNC (CH(i,k,u1),WA(u1-1,i),da) \
657         MULPMSIGNC (CH(i,k,u2),WA(u2-1,i),db) \
658         }
659 
pass7(size_t ido,size_t l1,const cmplx * restrict cc,cmplx * restrict ch,const cmplx * restrict wa,const int sign)660 NOINLINE static void pass7(size_t ido, size_t l1, const cmplx * restrict cc,
661   cmplx * restrict ch, const cmplx * restrict wa, const int sign)
662   {
663   const size_t cdim=7;
664   const double tw1r= 0.623489801858733530525,
665                tw1i= sign * 0.7818314824680298087084,
666                tw2r= -0.222520933956314404289,
667                tw2i= sign * 0.9749279121818236070181,
668                tw3r= -0.9009688679024191262361,
669                tw3i= sign * 0.4338837391175581204758;
670 
671   if (ido==1)
672     for (size_t k=0; k<l1; ++k)
673       {
674       PREP7(0)
675       PARTSTEP7a(1,6,tw1r,tw2r,tw3r,+tw1i,+tw2i,+tw3i)
676       PARTSTEP7a(2,5,tw2r,tw3r,tw1r,+tw2i,-tw3i,-tw1i)
677       PARTSTEP7a(3,4,tw3r,tw1r,tw2r,+tw3i,-tw1i,+tw2i)
678       }
679   else
680     for (size_t k=0; k<l1; ++k)
681       {
682       {
683       PREP7(0)
684       PARTSTEP7a(1,6,tw1r,tw2r,tw3r,+tw1i,+tw2i,+tw3i)
685       PARTSTEP7a(2,5,tw2r,tw3r,tw1r,+tw2i,-tw3i,-tw1i)
686       PARTSTEP7a(3,4,tw3r,tw1r,tw2r,+tw3i,-tw1i,+tw2i)
687       }
688       for (size_t i=1; i<ido; ++i)
689         {
690         PREP7(i)
691         PARTSTEP7(1,6,tw1r,tw2r,tw3r,+tw1i,+tw2i,+tw3i)
692         PARTSTEP7(2,5,tw2r,tw3r,tw1r,+tw2i,-tw3i,-tw1i)
693         PARTSTEP7(3,4,tw3r,tw1r,tw2r,+tw3i,-tw1i,+tw2i)
694         }
695       }
696   }
697 
698 #define PREP11(idx) \
699         cmplx t1 = CC(idx,0,k), t2, t3, t4, t5, t6, t7, t8, t9, t10, t11; \
700         PMC (t2,t11,CC(idx,1,k),CC(idx,10,k)) \
701         PMC (t3,t10,CC(idx,2,k),CC(idx, 9,k)) \
702         PMC (t4,t9 ,CC(idx,3,k),CC(idx, 8,k)) \
703         PMC (t5,t8 ,CC(idx,4,k),CC(idx, 7,k)) \
704         PMC (t6,t7 ,CC(idx,5,k),CC(idx, 6,k)) \
705         CH(idx,k,0).r=t1.r+t2.r+t3.r+t4.r+t5.r+t6.r; \
706         CH(idx,k,0).i=t1.i+t2.i+t3.i+t4.i+t5.i+t6.i;
707 
708 #define PARTSTEP11a0(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5,out1,out2) \
709         { \
710         cmplx ca,cb; \
711         ca.r=t1.r+x1*t2.r+x2*t3.r+x3*t4.r+x4*t5.r+x5*t6.r; \
712         ca.i=t1.i+x1*t2.i+x2*t3.i+x3*t4.i+x4*t5.i+x5*t6.i; \
713         cb.i=y1*t11.r y2*t10.r y3*t9.r y4*t8.r y5*t7.r; \
714         cb.r=-(y1*t11.i y2*t10.i y3*t9.i y4*t8.i y5*t7.i ); \
715         PMC(out1,out2,ca,cb) \
716         }
717 #define PARTSTEP11a(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5) \
718         PARTSTEP11a0(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5,CH(0,k,u1),CH(0,k,u2))
719 #define PARTSTEP11(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5) \
720         { \
721         cmplx da,db; \
722         PARTSTEP11a0(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5,da,db) \
723         MULPMSIGNC (CH(i,k,u1),WA(u1-1,i),da) \
724         MULPMSIGNC (CH(i,k,u2),WA(u2-1,i),db) \
725         }
726 
pass11(size_t ido,size_t l1,const cmplx * restrict cc,cmplx * restrict ch,const cmplx * restrict wa,const int sign)727 NOINLINE static void pass11 (size_t ido, size_t l1, const cmplx * restrict cc,
728   cmplx * restrict ch, const cmplx * restrict wa, const int sign)
729   {
730   const size_t cdim=11;
731   const double tw1r =        0.8412535328311811688618,
732                tw1i = sign * 0.5406408174555975821076,
733                tw2r =        0.4154150130018864255293,
734                tw2i = sign * 0.9096319953545183714117,
735                tw3r =       -0.1423148382732851404438,
736                tw3i = sign * 0.9898214418809327323761,
737                tw4r =       -0.6548607339452850640569,
738                tw4i = sign * 0.755749574354258283774,
739                tw5r =       -0.9594929736144973898904,
740                tw5i = sign * 0.2817325568414296977114;
741 
742   if (ido==1)
743     for (size_t k=0; k<l1; ++k)
744       {
745       PREP11(0)
746       PARTSTEP11a(1,10,tw1r,tw2r,tw3r,tw4r,tw5r,+tw1i,+tw2i,+tw3i,+tw4i,+tw5i)
747       PARTSTEP11a(2, 9,tw2r,tw4r,tw5r,tw3r,tw1r,+tw2i,+tw4i,-tw5i,-tw3i,-tw1i)
748       PARTSTEP11a(3, 8,tw3r,tw5r,tw2r,tw1r,tw4r,+tw3i,-tw5i,-tw2i,+tw1i,+tw4i)
749       PARTSTEP11a(4, 7,tw4r,tw3r,tw1r,tw5r,tw2r,+tw4i,-tw3i,+tw1i,+tw5i,-tw2i)
750       PARTSTEP11a(5, 6,tw5r,tw1r,tw4r,tw2r,tw3r,+tw5i,-tw1i,+tw4i,-tw2i,+tw3i)
751       }
752   else
753     for (size_t k=0; k<l1; ++k)
754       {
755       {
756       PREP11(0)
757       PARTSTEP11a(1,10,tw1r,tw2r,tw3r,tw4r,tw5r,+tw1i,+tw2i,+tw3i,+tw4i,+tw5i)
758       PARTSTEP11a(2, 9,tw2r,tw4r,tw5r,tw3r,tw1r,+tw2i,+tw4i,-tw5i,-tw3i,-tw1i)
759       PARTSTEP11a(3, 8,tw3r,tw5r,tw2r,tw1r,tw4r,+tw3i,-tw5i,-tw2i,+tw1i,+tw4i)
760       PARTSTEP11a(4, 7,tw4r,tw3r,tw1r,tw5r,tw2r,+tw4i,-tw3i,+tw1i,+tw5i,-tw2i)
761       PARTSTEP11a(5, 6,tw5r,tw1r,tw4r,tw2r,tw3r,+tw5i,-tw1i,+tw4i,-tw2i,+tw3i)
762       }
763       for (size_t i=1; i<ido; ++i)
764         {
765         PREP11(i)
766         PARTSTEP11(1,10,tw1r,tw2r,tw3r,tw4r,tw5r,+tw1i,+tw2i,+tw3i,+tw4i,+tw5i)
767         PARTSTEP11(2, 9,tw2r,tw4r,tw5r,tw3r,tw1r,+tw2i,+tw4i,-tw5i,-tw3i,-tw1i)
768         PARTSTEP11(3, 8,tw3r,tw5r,tw2r,tw1r,tw4r,+tw3i,-tw5i,-tw2i,+tw1i,+tw4i)
769         PARTSTEP11(4, 7,tw4r,tw3r,tw1r,tw5r,tw2r,+tw4i,-tw3i,+tw1i,+tw5i,-tw2i)
770         PARTSTEP11(5, 6,tw5r,tw1r,tw4r,tw2r,tw3r,+tw5i,-tw1i,+tw4i,-tw2i,+tw3i)
771         }
772       }
773   }
774 
775 #define CX(a,b,c) cc[(a)+ido*((b)+l1*(c))]
776 #define CX2(a,b) cc[(a)+idl1*(b)]
777 #define CH2(a,b) ch[(a)+idl1*(b)]
778 
passg(size_t ido,size_t ip,size_t l1,cmplx * restrict cc,cmplx * restrict ch,const cmplx * restrict wa,const cmplx * restrict csarr,const int sign)779 NOINLINE static int passg (size_t ido, size_t ip, size_t l1,
780   cmplx * restrict cc, cmplx * restrict ch, const cmplx * restrict wa,
781   const cmplx * restrict csarr, const int sign)
782   {
783   const size_t cdim=ip;
784   size_t ipph = (ip+1)/2;
785   size_t idl1 = ido*l1;
786 
787   cmplx * restrict wal=RALLOC(cmplx,ip);
788   if (!wal) return -1;
789   wal[0]=(cmplx){1.,0.};
790   for (size_t i=1; i<ip; ++i)
791     wal[i]=(cmplx){csarr[i].r,sign*csarr[i].i};
792 
793   for (size_t k=0; k<l1; ++k)
794     for (size_t i=0; i<ido; ++i)
795       CH(i,k,0) = CC(i,0,k);
796   for (size_t j=1, jc=ip-1; j<ipph; ++j, --jc)
797     for (size_t k=0; k<l1; ++k)
798       for (size_t i=0; i<ido; ++i)
799         PMC(CH(i,k,j),CH(i,k,jc),CC(i,j,k),CC(i,jc,k))
800   for (size_t k=0; k<l1; ++k)
801     for (size_t i=0; i<ido; ++i)
802       {
803       cmplx tmp = CH(i,k,0);
804       for (size_t j=1; j<ipph; ++j)
805         ADDC(tmp,tmp,CH(i,k,j))
806       CX(i,k,0) = tmp;
807       }
808   for (size_t l=1, lc=ip-1; l<ipph; ++l, --lc)
809     {
810     // j=0
811     for (size_t ik=0; ik<idl1; ++ik)
812       {
813       CX2(ik,l).r = CH2(ik,0).r+wal[l].r*CH2(ik,1).r+wal[2*l].r*CH2(ik,2).r;
814       CX2(ik,l).i = CH2(ik,0).i+wal[l].r*CH2(ik,1).i+wal[2*l].r*CH2(ik,2).i;
815       CX2(ik,lc).r=-wal[l].i*CH2(ik,ip-1).i-wal[2*l].i*CH2(ik,ip-2).i;
816       CX2(ik,lc).i=wal[l].i*CH2(ik,ip-1).r+wal[2*l].i*CH2(ik,ip-2).r;
817       }
818 
819     size_t iwal=2*l;
820     size_t j=3, jc=ip-3;
821     for (; j<ipph-1; j+=2, jc-=2)
822       {
823       iwal+=l; if (iwal>ip) iwal-=ip;
824       cmplx xwal=wal[iwal];
825       iwal+=l; if (iwal>ip) iwal-=ip;
826       cmplx xwal2=wal[iwal];
827       for (size_t ik=0; ik<idl1; ++ik)
828         {
829         CX2(ik,l).r += CH2(ik,j).r*xwal.r+CH2(ik,j+1).r*xwal2.r;
830         CX2(ik,l).i += CH2(ik,j).i*xwal.r+CH2(ik,j+1).i*xwal2.r;
831         CX2(ik,lc).r -= CH2(ik,jc).i*xwal.i+CH2(ik,jc-1).i*xwal2.i;
832         CX2(ik,lc).i += CH2(ik,jc).r*xwal.i+CH2(ik,jc-1).r*xwal2.i;
833         }
834       }
835     for (; j<ipph; ++j, --jc)
836       {
837       iwal+=l; if (iwal>ip) iwal-=ip;
838       cmplx xwal=wal[iwal];
839       for (size_t ik=0; ik<idl1; ++ik)
840         {
841         CX2(ik,l).r += CH2(ik,j).r*xwal.r;
842         CX2(ik,l).i += CH2(ik,j).i*xwal.r;
843         CX2(ik,lc).r -= CH2(ik,jc).i*xwal.i;
844         CX2(ik,lc).i += CH2(ik,jc).r*xwal.i;
845         }
846       }
847     }
848   DEALLOC(wal);
849 
850   // shuffling and twiddling
851   if (ido==1)
852     for (size_t j=1, jc=ip-1; j<ipph; ++j, --jc)
853       for (size_t ik=0; ik<idl1; ++ik)
854         {
855         cmplx t1=CX2(ik,j), t2=CX2(ik,jc);
856         PMC(CX2(ik,j),CX2(ik,jc),t1,t2)
857         }
858   else
859     {
860     for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc)
861       for (size_t k=0; k<l1; ++k)
862         {
863         cmplx t1=CX(0,k,j), t2=CX(0,k,jc);
864         PMC(CX(0,k,j),CX(0,k,jc),t1,t2)
865         for (size_t i=1; i<ido; ++i)
866           {
867           cmplx x1, x2;
868           PMC(x1,x2,CX(i,k,j),CX(i,k,jc))
869           size_t idij=(j-1)*(ido-1)+i-1;
870           MULPMSIGNC (CX(i,k,j),wa[idij],x1)
871           idij=(jc-1)*(ido-1)+i-1;
872           MULPMSIGNC (CX(i,k,jc),wa[idij],x2)
873           }
874         }
875     }
876   return 0;
877   }
878 
879 #undef CH2
880 #undef CX2
881 #undef CX
882 
pass_all(cfftp_plan plan,cmplx c[],double fct,const int sign)883 NOINLINE WARN_UNUSED_RESULT static int pass_all(cfftp_plan plan, cmplx c[], double fct,
884   const int sign)
885   {
886   if (plan->length==1) return 0;
887   size_t len=plan->length;
888   size_t l1=1, nf=plan->nfct;
889   cmplx *ch = RALLOC(cmplx, len);
890   if (!ch) return -1;
891   cmplx *p1=c, *p2=ch;
892 
893   for(size_t k1=0; k1<nf; k1++)
894     {
895     size_t ip=plan->fct[k1].fct;
896     size_t l2=ip*l1;
897     size_t ido = len/l2;
898     if     (ip==4)
899       sign>0 ? pass4b (ido, l1, p1, p2, plan->fct[k1].tw)
900              : pass4f (ido, l1, p1, p2, plan->fct[k1].tw);
901     else if(ip==2)
902       sign>0 ? pass2b (ido, l1, p1, p2, plan->fct[k1].tw)
903              : pass2f (ido, l1, p1, p2, plan->fct[k1].tw);
904     else if(ip==3)
905       sign>0 ? pass3b (ido, l1, p1, p2, plan->fct[k1].tw)
906              : pass3f (ido, l1, p1, p2, plan->fct[k1].tw);
907     else if(ip==5)
908       sign>0 ? pass5b (ido, l1, p1, p2, plan->fct[k1].tw)
909              : pass5f (ido, l1, p1, p2, plan->fct[k1].tw);
910     else if(ip==7)  pass7 (ido, l1, p1, p2, plan->fct[k1].tw, sign);
911     else if(ip==11) pass11(ido, l1, p1, p2, plan->fct[k1].tw, sign);
912     else
913       {
914       if (passg(ido, ip, l1, p1, p2, plan->fct[k1].tw, plan->fct[k1].tws, sign))
915         { DEALLOC(ch); return -1; }
916       SWAP(p1,p2,cmplx *);
917       }
918     SWAP(p1,p2,cmplx *);
919     l1=l2;
920     }
921   if (p1!=c)
922     {
923     if (fct!=1.)
924       for (size_t i=0; i<len; ++i)
925         {
926         c[i].r = ch[i].r*fct;
927         c[i].i = ch[i].i*fct;
928         }
929     else
930       memcpy (c,p1,len*sizeof(cmplx));
931     }
932   else
933     if (fct!=1.)
934       for (size_t i=0; i<len; ++i)
935         {
936         c[i].r *= fct;
937         c[i].i *= fct;
938         }
939   DEALLOC(ch);
940   return 0;
941   }
942 
943 #undef PMSIGNC
944 #undef A_EQ_B_MUL_C
945 #undef A_EQ_CB_MUL_C
946 #undef MULPMSIGNC
947 #undef MULPMSIGNCEQ
948 
949 #undef WA
950 #undef CC
951 #undef CH
952 #undef ROT90
953 #undef SCALEC
954 #undef ADDC
955 #undef PMC
956 
957 NOINLINE WARN_UNUSED_RESULT
cfftp_forward(cfftp_plan plan,double c[],double fct)958 static int cfftp_forward(cfftp_plan plan, double c[], double fct)
959   { return pass_all(plan,(cmplx *)c, fct, -1); }
960 
961 NOINLINE WARN_UNUSED_RESULT
cfftp_backward(cfftp_plan plan,double c[],double fct)962 static int cfftp_backward(cfftp_plan plan, double c[], double fct)
963   { return pass_all(plan,(cmplx *)c, fct, 1); }
964 
965 NOINLINE WARN_UNUSED_RESULT
cfftp_factorize(cfftp_plan plan)966 static int cfftp_factorize (cfftp_plan plan)
967   {
968   size_t length=plan->length;
969   size_t nfct=0;
970   while ((length%4)==0)
971     { if (nfct>=NFCT) return -1; plan->fct[nfct++].fct=4; length>>=2; }
972   if ((length%2)==0)
973     {
974     length>>=1;
975     // factor 2 should be at the front of the factor list
976     if (nfct>=NFCT) return -1;
977     plan->fct[nfct++].fct=2;
978     SWAP(plan->fct[0].fct, plan->fct[nfct-1].fct,size_t);
979     }
980   size_t maxl=(size_t)(sqrt((double)length))+1;
981   for (size_t divisor=3; (length>1)&&(divisor<maxl); divisor+=2)
982     if ((length%divisor)==0)
983       {
984       while ((length%divisor)==0)
985         {
986         if (nfct>=NFCT) return -1;
987         plan->fct[nfct++].fct=divisor;
988         length/=divisor;
989         }
990       maxl=(size_t)(sqrt((double)length))+1;
991       }
992   if (length>1) plan->fct[nfct++].fct=length;
993   plan->nfct=nfct;
994   return 0;
995   }
996 
cfftp_twsize(cfftp_plan plan)997 NOINLINE static size_t cfftp_twsize (cfftp_plan plan)
998   {
999   size_t twsize=0, l1=1;
1000   for (size_t k=0; k<plan->nfct; ++k)
1001     {
1002     size_t ip=plan->fct[k].fct, ido= plan->length/(l1*ip);
1003     twsize+=(ip-1)*(ido-1);
1004     if (ip>11)
1005       twsize+=ip;
1006     l1*=ip;
1007     }
1008   return twsize;
1009   }
1010 
cfftp_comp_twiddle(cfftp_plan plan)1011 NOINLINE WARN_UNUSED_RESULT static int cfftp_comp_twiddle (cfftp_plan plan)
1012   {
1013   size_t length=plan->length;
1014   double *twid = RALLOC(double, 2*length);
1015   if (!twid) return -1;
1016   sincos_2pibyn(length, twid);
1017   size_t l1=1;
1018   size_t memofs=0;
1019   for (size_t k=0; k<plan->nfct; ++k)
1020     {
1021     size_t ip=plan->fct[k].fct, ido= length/(l1*ip);
1022     plan->fct[k].tw=plan->mem+memofs;
1023     memofs+=(ip-1)*(ido-1);
1024     for (size_t j=1; j<ip; ++j)
1025       for (size_t i=1; i<ido; ++i)
1026         {
1027         plan->fct[k].tw[(j-1)*(ido-1)+i-1].r = twid[2*j*l1*i];
1028         plan->fct[k].tw[(j-1)*(ido-1)+i-1].i = twid[2*j*l1*i+1];
1029         }
1030     if (ip>11)
1031       {
1032       plan->fct[k].tws=plan->mem+memofs;
1033       memofs+=ip;
1034       for (size_t j=0; j<ip; ++j)
1035         {
1036         plan->fct[k].tws[j].r = twid[2*j*l1*ido];
1037         plan->fct[k].tws[j].i = twid[2*j*l1*ido+1];
1038         }
1039       }
1040     l1*=ip;
1041     }
1042   DEALLOC(twid);
1043   return 0;
1044   }
1045 
make_cfftp_plan(size_t length)1046 static cfftp_plan make_cfftp_plan (size_t length)
1047   {
1048   if (length==0) return NULL;
1049   cfftp_plan plan = RALLOC(cfftp_plan_i,1);
1050   if (!plan) return NULL;
1051   plan->length=length;
1052   plan->nfct=0;
1053   for (size_t i=0; i<NFCT; ++i)
1054     plan->fct[i]=(cfftp_fctdata){0,0,0};
1055   plan->mem=0;
1056   if (length==1) return plan;
1057   if (cfftp_factorize(plan)!=0) { DEALLOC(plan); return NULL; }
1058   size_t tws=cfftp_twsize(plan);
1059   plan->mem=RALLOC(cmplx,tws);
1060   if (!plan->mem) { DEALLOC(plan); return NULL; }
1061   if (cfftp_comp_twiddle(plan)!=0)
1062     { DEALLOC(plan->mem); DEALLOC(plan); return NULL; }
1063   return plan;
1064   }
1065 
destroy_cfftp_plan(cfftp_plan plan)1066 static void destroy_cfftp_plan (cfftp_plan plan)
1067   {
1068   DEALLOC(plan->mem);
1069   DEALLOC(plan);
1070   }
1071 
1072 typedef struct rfftp_fctdata
1073   {
1074   size_t fct;
1075   double *tw, *tws;
1076   } rfftp_fctdata;
1077 
1078 typedef struct rfftp_plan_i
1079   {
1080   size_t length, nfct;
1081   double *mem;
1082   rfftp_fctdata fct[NFCT];
1083   } rfftp_plan_i;
1084 typedef struct rfftp_plan_i * rfftp_plan;
1085 
1086 #define WA(x,i) wa[(i)+(x)*(ido-1)]
1087 #define PM(a,b,c,d) { a=c+d; b=c-d; }
1088 /* (a+ib) = conj(c+id) * (e+if) */
1089 #define MULPM(a,b,c,d,e,f) { a=c*e+d*f; b=c*f-d*e; }
1090 
1091 #define CC(a,b,c) cc[(a)+ido*((b)+l1*(c))]
1092 #define CH(a,b,c) ch[(a)+ido*((b)+cdim*(c))]
1093 
radf2(size_t ido,size_t l1,const double * restrict cc,double * restrict ch,const double * restrict wa)1094 NOINLINE static void radf2 (size_t ido, size_t l1, const double * restrict cc,
1095   double * restrict ch, const double * restrict wa)
1096   {
1097   const size_t cdim=2;
1098 
1099   for (size_t k=0; k<l1; k++)
1100     PM (CH(0,0,k),CH(ido-1,1,k),CC(0,k,0),CC(0,k,1))
1101   if ((ido&1)==0)
1102     for (size_t k=0; k<l1; k++)
1103       {
1104       CH(    0,1,k) = -CC(ido-1,k,1);
1105       CH(ido-1,0,k) =  CC(ido-1,k,0);
1106       }
1107   if (ido<=2) return;
1108   for (size_t k=0; k<l1; k++)
1109     for (size_t i=2; i<ido; i+=2)
1110       {
1111       size_t ic=ido-i;
1112       double tr2, ti2;
1113       MULPM (tr2,ti2,WA(0,i-2),WA(0,i-1),CC(i-1,k,1),CC(i,k,1))
1114       PM (CH(i-1,0,k),CH(ic-1,1,k),CC(i-1,k,0),tr2)
1115       PM (CH(i  ,0,k),CH(ic  ,1,k),ti2,CC(i  ,k,0))
1116       }
1117   }
1118 
radf3(size_t ido,size_t l1,const double * restrict cc,double * restrict ch,const double * restrict wa)1119 NOINLINE static void radf3(size_t ido, size_t l1, const double * restrict cc,
1120   double * restrict ch, const double * restrict wa)
1121   {
1122   const size_t cdim=3;
1123   static const double taur=-0.5, taui=0.86602540378443864676;
1124 
1125   for (size_t k=0; k<l1; k++)
1126     {
1127     double cr2=CC(0,k,1)+CC(0,k,2);
1128     CH(0,0,k) = CC(0,k,0)+cr2;
1129     CH(0,2,k) = taui*(CC(0,k,2)-CC(0,k,1));
1130     CH(ido-1,1,k) = CC(0,k,0)+taur*cr2;
1131     }
1132   if (ido==1) return;
1133   for (size_t k=0; k<l1; k++)
1134     for (size_t i=2; i<ido; i+=2)
1135       {
1136       size_t ic=ido-i;
1137       double di2, di3, dr2, dr3;
1138       MULPM (dr2,di2,WA(0,i-2),WA(0,i-1),CC(i-1,k,1),CC(i,k,1)) // d2=conj(WA0)*CC1
1139       MULPM (dr3,di3,WA(1,i-2),WA(1,i-1),CC(i-1,k,2),CC(i,k,2)) // d3=conj(WA1)*CC2
1140       double cr2=dr2+dr3; // c add
1141       double ci2=di2+di3;
1142       CH(i-1,0,k) = CC(i-1,k,0)+cr2; // c add
1143       CH(i  ,0,k) = CC(i  ,k,0)+ci2;
1144       double tr2 = CC(i-1,k,0)+taur*cr2; // c add
1145       double ti2 = CC(i  ,k,0)+taur*ci2;
1146       double tr3 = taui*(di2-di3);  // t3 = taui*i*(d3-d2)?
1147       double ti3 = taui*(dr3-dr2);
1148       PM(CH(i-1,2,k),CH(ic-1,1,k),tr2,tr3) // PM(i) = t2+t3
1149       PM(CH(i  ,2,k),CH(ic  ,1,k),ti3,ti2) // PM(ic) = conj(t2-t3)
1150       }
1151   }
1152 
radf4(size_t ido,size_t l1,const double * restrict cc,double * restrict ch,const double * restrict wa)1153 NOINLINE static void radf4(size_t ido, size_t l1, const double * restrict cc,
1154   double * restrict ch, const double * restrict wa)
1155   {
1156   const size_t cdim=4;
1157   static const double hsqt2=0.70710678118654752440;
1158 
1159   for (size_t k=0; k<l1; k++)
1160     {
1161     double tr1,tr2;
1162     PM (tr1,CH(0,2,k),CC(0,k,3),CC(0,k,1))
1163     PM (tr2,CH(ido-1,1,k),CC(0,k,0),CC(0,k,2))
1164     PM (CH(0,0,k),CH(ido-1,3,k),tr2,tr1)
1165     }
1166   if ((ido&1)==0)
1167     for (size_t k=0; k<l1; k++)
1168       {
1169       double ti1=-hsqt2*(CC(ido-1,k,1)+CC(ido-1,k,3));
1170       double tr1= hsqt2*(CC(ido-1,k,1)-CC(ido-1,k,3));
1171       PM (CH(ido-1,0,k),CH(ido-1,2,k),CC(ido-1,k,0),tr1)
1172       PM (CH(    0,3,k),CH(    0,1,k),ti1,CC(ido-1,k,2))
1173       }
1174   if (ido<=2) return;
1175   for (size_t k=0; k<l1; k++)
1176     for (size_t i=2; i<ido; i+=2)
1177       {
1178       size_t ic=ido-i;
1179       double ci2, ci3, ci4, cr2, cr3, cr4, ti1, ti2, ti3, ti4, tr1, tr2, tr3, tr4;
1180       MULPM(cr2,ci2,WA(0,i-2),WA(0,i-1),CC(i-1,k,1),CC(i,k,1))
1181       MULPM(cr3,ci3,WA(1,i-2),WA(1,i-1),CC(i-1,k,2),CC(i,k,2))
1182       MULPM(cr4,ci4,WA(2,i-2),WA(2,i-1),CC(i-1,k,3),CC(i,k,3))
1183       PM(tr1,tr4,cr4,cr2)
1184       PM(ti1,ti4,ci2,ci4)
1185       PM(tr2,tr3,CC(i-1,k,0),cr3)
1186       PM(ti2,ti3,CC(i  ,k,0),ci3)
1187       PM(CH(i-1,0,k),CH(ic-1,3,k),tr2,tr1)
1188       PM(CH(i  ,0,k),CH(ic  ,3,k),ti1,ti2)
1189       PM(CH(i-1,2,k),CH(ic-1,1,k),tr3,ti4)
1190       PM(CH(i  ,2,k),CH(ic  ,1,k),tr4,ti3)
1191       }
1192   }
1193 
radf5(size_t ido,size_t l1,const double * restrict cc,double * restrict ch,const double * restrict wa)1194 NOINLINE static void radf5(size_t ido, size_t l1, const double * restrict cc,
1195   double * restrict ch, const double * restrict wa)
1196   {
1197   const size_t cdim=5;
1198   static const double tr11= 0.3090169943749474241, ti11=0.95105651629515357212,
1199                       tr12=-0.8090169943749474241, ti12=0.58778525229247312917;
1200 
1201   for (size_t k=0; k<l1; k++)
1202     {
1203     double cr2, cr3, ci4, ci5;
1204     PM (cr2,ci5,CC(0,k,4),CC(0,k,1))
1205     PM (cr3,ci4,CC(0,k,3),CC(0,k,2))
1206     CH(0,0,k)=CC(0,k,0)+cr2+cr3;
1207     CH(ido-1,1,k)=CC(0,k,0)+tr11*cr2+tr12*cr3;
1208     CH(0,2,k)=ti11*ci5+ti12*ci4;
1209     CH(ido-1,3,k)=CC(0,k,0)+tr12*cr2+tr11*cr3;
1210     CH(0,4,k)=ti12*ci5-ti11*ci4;
1211     }
1212   if (ido==1) return;
1213   for (size_t k=0; k<l1;++k)
1214     for (size_t i=2; i<ido; i+=2)
1215       {
1216       double ci2, di2, ci4, ci5, di3, di4, di5, ci3, cr2, cr3, dr2, dr3,
1217          dr4, dr5, cr5, cr4, ti2, ti3, ti5, ti4, tr2, tr3, tr4, tr5;
1218       size_t ic=ido-i;
1219       MULPM (dr2,di2,WA(0,i-2),WA(0,i-1),CC(i-1,k,1),CC(i,k,1))
1220       MULPM (dr3,di3,WA(1,i-2),WA(1,i-1),CC(i-1,k,2),CC(i,k,2))
1221       MULPM (dr4,di4,WA(2,i-2),WA(2,i-1),CC(i-1,k,3),CC(i,k,3))
1222       MULPM (dr5,di5,WA(3,i-2),WA(3,i-1),CC(i-1,k,4),CC(i,k,4))
1223       PM(cr2,ci5,dr5,dr2)
1224       PM(ci2,cr5,di2,di5)
1225       PM(cr3,ci4,dr4,dr3)
1226       PM(ci3,cr4,di3,di4)
1227       CH(i-1,0,k)=CC(i-1,k,0)+cr2+cr3;
1228       CH(i  ,0,k)=CC(i  ,k,0)+ci2+ci3;
1229       tr2=CC(i-1,k,0)+tr11*cr2+tr12*cr3;
1230       ti2=CC(i  ,k,0)+tr11*ci2+tr12*ci3;
1231       tr3=CC(i-1,k,0)+tr12*cr2+tr11*cr3;
1232       ti3=CC(i  ,k,0)+tr12*ci2+tr11*ci3;
1233       MULPM(tr5,tr4,cr5,cr4,ti11,ti12)
1234       MULPM(ti5,ti4,ci5,ci4,ti11,ti12)
1235       PM(CH(i-1,2,k),CH(ic-1,1,k),tr2,tr5)
1236       PM(CH(i  ,2,k),CH(ic  ,1,k),ti5,ti2)
1237       PM(CH(i-1,4,k),CH(ic-1,3,k),tr3,tr4)
1238       PM(CH(i  ,4,k),CH(ic  ,3,k),ti4,ti3)
1239       }
1240   }
1241 
1242 #undef CC
1243 #undef CH
1244 #define C1(a,b,c) cc[(a)+ido*((b)+l1*(c))]
1245 #define C2(a,b) cc[(a)+idl1*(b)]
1246 #define CH2(a,b) ch[(a)+idl1*(b)]
1247 #define CC(a,b,c) cc[(a)+ido*((b)+cdim*(c))]
1248 #define CH(a,b,c) ch[(a)+ido*((b)+l1*(c))]
radfg(size_t ido,size_t ip,size_t l1,double * restrict cc,double * restrict ch,const double * restrict wa,const double * restrict csarr)1249 NOINLINE static void radfg(size_t ido, size_t ip, size_t l1,
1250   double * restrict cc, double * restrict ch, const double * restrict wa,
1251   const double * restrict csarr)
1252   {
1253   const size_t cdim=ip;
1254   size_t ipph=(ip+1)/2;
1255   size_t idl1 = ido*l1;
1256 
1257   if (ido>1)
1258     {
1259     for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc)              // 114
1260       {
1261       size_t is=(j-1)*(ido-1),
1262              is2=(jc-1)*(ido-1);
1263       for (size_t k=0; k<l1; ++k)                            // 113
1264         {
1265         size_t idij=is;
1266         size_t idij2=is2;
1267         for (size_t i=1; i<=ido-2; i+=2)                      // 112
1268           {
1269           double t1=C1(i,k,j ), t2=C1(i+1,k,j ),
1270                  t3=C1(i,k,jc), t4=C1(i+1,k,jc);
1271           double x1=wa[idij]*t1 + wa[idij+1]*t2,
1272                  x2=wa[idij]*t2 - wa[idij+1]*t1,
1273                  x3=wa[idij2]*t3 + wa[idij2+1]*t4,
1274                  x4=wa[idij2]*t4 - wa[idij2+1]*t3;
1275           C1(i  ,k,j ) = x1+x3;
1276           C1(i  ,k,jc) = x2-x4;
1277           C1(i+1,k,j ) = x2+x4;
1278           C1(i+1,k,jc) = x3-x1;
1279           idij+=2;
1280           idij2+=2;
1281           }
1282         }
1283       }
1284     }
1285 
1286   for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc)                // 123
1287     for (size_t k=0; k<l1; ++k)                              // 122
1288       {
1289       double t1=C1(0,k,j), t2=C1(0,k,jc);
1290       C1(0,k,j ) = t1+t2;
1291       C1(0,k,jc) = t2-t1;
1292       }
1293 
1294 //everything in C
1295 //memset(ch,0,ip*l1*ido*sizeof(double));
1296 
1297   for (size_t l=1,lc=ip-1; l<ipph; ++l,--lc)                 // 127
1298     {
1299     for (size_t ik=0; ik<idl1; ++ik)                         // 124
1300       {
1301       CH2(ik,l ) = C2(ik,0)+csarr[2*l]*C2(ik,1)+csarr[4*l]*C2(ik,2);
1302       CH2(ik,lc) = csarr[2*l+1]*C2(ik,ip-1)+csarr[4*l+1]*C2(ik,ip-2);
1303       }
1304     size_t iang = 2*l;
1305     size_t j=3, jc=ip-3;
1306     for (; j<ipph-3; j+=4,jc-=4)              // 126
1307       {
1308       iang+=l; if (iang>=ip) iang-=ip;
1309       double ar1=csarr[2*iang], ai1=csarr[2*iang+1];
1310       iang+=l; if (iang>=ip) iang-=ip;
1311       double ar2=csarr[2*iang], ai2=csarr[2*iang+1];
1312       iang+=l; if (iang>=ip) iang-=ip;
1313       double ar3=csarr[2*iang], ai3=csarr[2*iang+1];
1314       iang+=l; if (iang>=ip) iang-=ip;
1315       double ar4=csarr[2*iang], ai4=csarr[2*iang+1];
1316       for (size_t ik=0; ik<idl1; ++ik)                       // 125
1317         {
1318         CH2(ik,l ) += ar1*C2(ik,j )+ar2*C2(ik,j +1)
1319                      +ar3*C2(ik,j +2)+ar4*C2(ik,j +3);
1320         CH2(ik,lc) += ai1*C2(ik,jc)+ai2*C2(ik,jc-1)
1321                      +ai3*C2(ik,jc-2)+ai4*C2(ik,jc-3);
1322         }
1323       }
1324     for (; j<ipph-1; j+=2,jc-=2)              // 126
1325       {
1326       iang+=l; if (iang>=ip) iang-=ip;
1327       double ar1=csarr[2*iang], ai1=csarr[2*iang+1];
1328       iang+=l; if (iang>=ip) iang-=ip;
1329       double ar2=csarr[2*iang], ai2=csarr[2*iang+1];
1330       for (size_t ik=0; ik<idl1; ++ik)                       // 125
1331         {
1332         CH2(ik,l ) += ar1*C2(ik,j )+ar2*C2(ik,j +1);
1333         CH2(ik,lc) += ai1*C2(ik,jc)+ai2*C2(ik,jc-1);
1334         }
1335       }
1336     for (; j<ipph; ++j,--jc)              // 126
1337       {
1338       iang+=l; if (iang>=ip) iang-=ip;
1339       double ar=csarr[2*iang], ai=csarr[2*iang+1];
1340       for (size_t ik=0; ik<idl1; ++ik)                       // 125
1341         {
1342         CH2(ik,l ) += ar*C2(ik,j );
1343         CH2(ik,lc) += ai*C2(ik,jc);
1344         }
1345       }
1346     }
1347   for (size_t ik=0; ik<idl1; ++ik)                         // 101
1348     CH2(ik,0) = C2(ik,0);
1349   for (size_t j=1; j<ipph; ++j)                              // 129
1350     for (size_t ik=0; ik<idl1; ++ik)                         // 128
1351       CH2(ik,0) += C2(ik,j);
1352 
1353 // everything in CH at this point!
1354 //memset(cc,0,ip*l1*ido*sizeof(double));
1355 
1356   for (size_t k=0; k<l1; ++k)                                // 131
1357     for (size_t i=0; i<ido; ++i)                             // 130
1358       CC(i,0,k) = CH(i,k,0);
1359 
1360   for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc)                // 137
1361     {
1362     size_t j2=2*j-1;
1363     for (size_t k=0; k<l1; ++k)                              // 136
1364       {
1365       CC(ido-1,j2,k) = CH(0,k,j);
1366       CC(0,j2+1,k) = CH(0,k,jc);
1367       }
1368     }
1369 
1370   if (ido==1) return;
1371 
1372   for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc)                // 140
1373     {
1374     size_t j2=2*j-1;
1375     for(size_t k=0; k<l1; ++k)                               // 139
1376       for(size_t i=1, ic=ido-i-2; i<=ido-2; i+=2, ic-=2)      // 138
1377         {
1378         CC(i   ,j2+1,k) = CH(i  ,k,j )+CH(i  ,k,jc);
1379         CC(ic  ,j2  ,k) = CH(i  ,k,j )-CH(i  ,k,jc);
1380         CC(i+1 ,j2+1,k) = CH(i+1,k,j )+CH(i+1,k,jc);
1381         CC(ic+1,j2  ,k) = CH(i+1,k,jc)-CH(i+1,k,j );
1382         }
1383     }
1384   }
1385 #undef C1
1386 #undef C2
1387 #undef CH2
1388 
1389 #undef CH
1390 #undef CC
1391 #define CH(a,b,c) ch[(a)+ido*((b)+l1*(c))]
1392 #define CC(a,b,c) cc[(a)+ido*((b)+cdim*(c))]
1393 
radb2(size_t ido,size_t l1,const double * restrict cc,double * restrict ch,const double * restrict wa)1394 NOINLINE static void radb2(size_t ido, size_t l1, const double * restrict cc,
1395   double * restrict ch, const double * restrict wa)
1396   {
1397   const size_t cdim=2;
1398 
1399   for (size_t k=0; k<l1; k++)
1400     PM (CH(0,k,0),CH(0,k,1),CC(0,0,k),CC(ido-1,1,k))
1401   if ((ido&1)==0)
1402     for (size_t k=0; k<l1; k++)
1403       {
1404       CH(ido-1,k,0) = 2.*CC(ido-1,0,k);
1405       CH(ido-1,k,1) =-2.*CC(0    ,1,k);
1406       }
1407   if (ido<=2) return;
1408   for (size_t k=0; k<l1;++k)
1409     for (size_t i=2; i<ido; i+=2)
1410       {
1411       size_t ic=ido-i;
1412       double ti2, tr2;
1413       PM (CH(i-1,k,0),tr2,CC(i-1,0,k),CC(ic-1,1,k))
1414       PM (ti2,CH(i  ,k,0),CC(i  ,0,k),CC(ic  ,1,k))
1415       MULPM (CH(i,k,1),CH(i-1,k,1),WA(0,i-2),WA(0,i-1),ti2,tr2)
1416       }
1417   }
1418 
radb3(size_t ido,size_t l1,const double * restrict cc,double * restrict ch,const double * restrict wa)1419 NOINLINE static void radb3(size_t ido, size_t l1, const double * restrict cc,
1420   double * restrict ch, const double * restrict wa)
1421   {
1422   const size_t cdim=3;
1423   static const double taur=-0.5, taui=0.86602540378443864676;
1424 
1425   for (size_t k=0; k<l1; k++)
1426     {
1427     double tr2=2.*CC(ido-1,1,k);
1428     double cr2=CC(0,0,k)+taur*tr2;
1429     CH(0,k,0)=CC(0,0,k)+tr2;
1430     double ci3=2.*taui*CC(0,2,k);
1431     PM (CH(0,k,2),CH(0,k,1),cr2,ci3);
1432     }
1433   if (ido==1) return;
1434   for (size_t k=0; k<l1; k++)
1435     for (size_t i=2; i<ido; i+=2)
1436       {
1437       size_t ic=ido-i;
1438       double tr2=CC(i-1,2,k)+CC(ic-1,1,k); // t2=CC(I) + conj(CC(ic))
1439       double ti2=CC(i  ,2,k)-CC(ic  ,1,k);
1440       double cr2=CC(i-1,0,k)+taur*tr2;     // c2=CC +taur*t2
1441       double ci2=CC(i  ,0,k)+taur*ti2;
1442       CH(i-1,k,0)=CC(i-1,0,k)+tr2;         // CH=CC+t2
1443       CH(i  ,k,0)=CC(i  ,0,k)+ti2;
1444       double cr3=taui*(CC(i-1,2,k)-CC(ic-1,1,k));// c3=taui*(CC(i)-conj(CC(ic)))
1445       double ci3=taui*(CC(i  ,2,k)+CC(ic  ,1,k));
1446       double di2, di3, dr2, dr3;
1447       PM(dr3,dr2,cr2,ci3) // d2= (cr2-ci3, ci2+cr3) = c2+i*c3
1448       PM(di2,di3,ci2,cr3) // d3= (cr2+ci3, ci2-cr3) = c2-i*c3
1449       MULPM(CH(i,k,1),CH(i-1,k,1),WA(0,i-2),WA(0,i-1),di2,dr2) // ch = WA*d2
1450       MULPM(CH(i,k,2),CH(i-1,k,2),WA(1,i-2),WA(1,i-1),di3,dr3)
1451       }
1452   }
1453 
radb4(size_t ido,size_t l1,const double * restrict cc,double * restrict ch,const double * restrict wa)1454 NOINLINE static void radb4(size_t ido, size_t l1, const double * restrict cc,
1455   double * restrict ch, const double * restrict wa)
1456   {
1457   const size_t cdim=4;
1458   static const double sqrt2=1.41421356237309504880;
1459 
1460   for (size_t k=0; k<l1; k++)
1461     {
1462     double tr1, tr2;
1463     PM (tr2,tr1,CC(0,0,k),CC(ido-1,3,k))
1464     double tr3=2.*CC(ido-1,1,k);
1465     double tr4=2.*CC(0,2,k);
1466     PM (CH(0,k,0),CH(0,k,2),tr2,tr3)
1467     PM (CH(0,k,3),CH(0,k,1),tr1,tr4)
1468     }
1469   if ((ido&1)==0)
1470     for (size_t k=0; k<l1; k++)
1471       {
1472       double tr1,tr2,ti1,ti2;
1473       PM (ti1,ti2,CC(0    ,3,k),CC(0    ,1,k))
1474       PM (tr2,tr1,CC(ido-1,0,k),CC(ido-1,2,k))
1475       CH(ido-1,k,0)=tr2+tr2;
1476       CH(ido-1,k,1)=sqrt2*(tr1-ti1);
1477       CH(ido-1,k,2)=ti2+ti2;
1478       CH(ido-1,k,3)=-sqrt2*(tr1+ti1);
1479       }
1480   if (ido<=2) return;
1481   for (size_t k=0; k<l1;++k)
1482     for (size_t i=2; i<ido; i+=2)
1483       {
1484       double ci2, ci3, ci4, cr2, cr3, cr4, ti1, ti2, ti3, ti4, tr1, tr2, tr3, tr4;
1485       size_t ic=ido-i;
1486       PM (tr2,tr1,CC(i-1,0,k),CC(ic-1,3,k))
1487       PM (ti1,ti2,CC(i  ,0,k),CC(ic  ,3,k))
1488       PM (tr4,ti3,CC(i  ,2,k),CC(ic  ,1,k))
1489       PM (tr3,ti4,CC(i-1,2,k),CC(ic-1,1,k))
1490       PM (CH(i-1,k,0),cr3,tr2,tr3)
1491       PM (CH(i  ,k,0),ci3,ti2,ti3)
1492       PM (cr4,cr2,tr1,tr4)
1493       PM (ci2,ci4,ti1,ti4)
1494       MULPM (CH(i,k,1),CH(i-1,k,1),WA(0,i-2),WA(0,i-1),ci2,cr2)
1495       MULPM (CH(i,k,2),CH(i-1,k,2),WA(1,i-2),WA(1,i-1),ci3,cr3)
1496       MULPM (CH(i,k,3),CH(i-1,k,3),WA(2,i-2),WA(2,i-1),ci4,cr4)
1497       }
1498   }
1499 
radb5(size_t ido,size_t l1,const double * restrict cc,double * restrict ch,const double * restrict wa)1500 NOINLINE static void radb5(size_t ido, size_t l1, const double * restrict cc,
1501   double * restrict ch, const double * restrict wa)
1502   {
1503   const size_t cdim=5;
1504   static const double tr11= 0.3090169943749474241, ti11=0.95105651629515357212,
1505                       tr12=-0.8090169943749474241, ti12=0.58778525229247312917;
1506 
1507   for (size_t k=0; k<l1; k++)
1508     {
1509     double ti5=CC(0,2,k)+CC(0,2,k);
1510     double ti4=CC(0,4,k)+CC(0,4,k);
1511     double tr2=CC(ido-1,1,k)+CC(ido-1,1,k);
1512     double tr3=CC(ido-1,3,k)+CC(ido-1,3,k);
1513     CH(0,k,0)=CC(0,0,k)+tr2+tr3;
1514     double cr2=CC(0,0,k)+tr11*tr2+tr12*tr3;
1515     double cr3=CC(0,0,k)+tr12*tr2+tr11*tr3;
1516     double ci4, ci5;
1517     MULPM(ci5,ci4,ti5,ti4,ti11,ti12)
1518     PM(CH(0,k,4),CH(0,k,1),cr2,ci5)
1519     PM(CH(0,k,3),CH(0,k,2),cr3,ci4)
1520     }
1521   if (ido==1) return;
1522   for (size_t k=0; k<l1;++k)
1523     for (size_t i=2; i<ido; i+=2)
1524       {
1525       size_t ic=ido-i;
1526       double tr2, tr3, tr4, tr5, ti2, ti3, ti4, ti5;
1527       PM(tr2,tr5,CC(i-1,2,k),CC(ic-1,1,k))
1528       PM(ti5,ti2,CC(i  ,2,k),CC(ic  ,1,k))
1529       PM(tr3,tr4,CC(i-1,4,k),CC(ic-1,3,k))
1530       PM(ti4,ti3,CC(i  ,4,k),CC(ic  ,3,k))
1531       CH(i-1,k,0)=CC(i-1,0,k)+tr2+tr3;
1532       CH(i  ,k,0)=CC(i  ,0,k)+ti2+ti3;
1533       double cr2=CC(i-1,0,k)+tr11*tr2+tr12*tr3;
1534       double ci2=CC(i  ,0,k)+tr11*ti2+tr12*ti3;
1535       double cr3=CC(i-1,0,k)+tr12*tr2+tr11*tr3;
1536       double ci3=CC(i  ,0,k)+tr12*ti2+tr11*ti3;
1537       double ci4, ci5, cr5, cr4;
1538       MULPM(cr5,cr4,tr5,tr4,ti11,ti12)
1539       MULPM(ci5,ci4,ti5,ti4,ti11,ti12)
1540       double dr2, dr3, dr4, dr5, di2, di3, di4, di5;
1541       PM(dr4,dr3,cr3,ci4)
1542       PM(di3,di4,ci3,cr4)
1543       PM(dr5,dr2,cr2,ci5)
1544       PM(di2,di5,ci2,cr5)
1545       MULPM(CH(i,k,1),CH(i-1,k,1),WA(0,i-2),WA(0,i-1),di2,dr2)
1546       MULPM(CH(i,k,2),CH(i-1,k,2),WA(1,i-2),WA(1,i-1),di3,dr3)
1547       MULPM(CH(i,k,3),CH(i-1,k,3),WA(2,i-2),WA(2,i-1),di4,dr4)
1548       MULPM(CH(i,k,4),CH(i-1,k,4),WA(3,i-2),WA(3,i-1),di5,dr5)
1549       }
1550   }
1551 
1552 #undef CC
1553 #undef CH
1554 #define CC(a,b,c) cc[(a)+ido*((b)+cdim*(c))]
1555 #define CH(a,b,c) ch[(a)+ido*((b)+l1*(c))]
1556 #define C1(a,b,c) cc[(a)+ido*((b)+l1*(c))]
1557 #define C2(a,b) cc[(a)+idl1*(b)]
1558 #define CH2(a,b) ch[(a)+idl1*(b)]
1559 
radbg(size_t ido,size_t ip,size_t l1,double * restrict cc,double * restrict ch,const double * restrict wa,const double * restrict csarr)1560 NOINLINE static void radbg(size_t ido, size_t ip, size_t l1,
1561   double * restrict cc, double * restrict ch, const double * restrict wa,
1562   const double * restrict csarr)
1563   {
1564   const size_t cdim=ip;
1565   size_t ipph=(ip+1)/ 2;
1566   size_t idl1 = ido*l1;
1567 
1568   for (size_t k=0; k<l1; ++k)        // 102
1569     for (size_t i=0; i<ido; ++i)     // 101
1570       CH(i,k,0) = CC(i,0,k);
1571   for (size_t j=1, jc=ip-1; j<ipph; ++j, --jc)   // 108
1572     {
1573     size_t j2=2*j-1;
1574     for (size_t k=0; k<l1; ++k)
1575       {
1576       CH(0,k,j ) = 2*CC(ido-1,j2,k);
1577       CH(0,k,jc) = 2*CC(0,j2+1,k);
1578       }
1579     }
1580 
1581   if (ido!=1)
1582     {
1583     for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc)   // 111
1584       {
1585       size_t j2=2*j-1;
1586       for (size_t k=0; k<l1; ++k)
1587         for (size_t i=1, ic=ido-i-2; i<=ido-2; i+=2, ic-=2)      // 109
1588           {
1589           CH(i  ,k,j ) = CC(i  ,j2+1,k)+CC(ic  ,j2,k);
1590           CH(i  ,k,jc) = CC(i  ,j2+1,k)-CC(ic  ,j2,k);
1591           CH(i+1,k,j ) = CC(i+1,j2+1,k)-CC(ic+1,j2,k);
1592           CH(i+1,k,jc) = CC(i+1,j2+1,k)+CC(ic+1,j2,k);
1593           }
1594       }
1595     }
1596   for (size_t l=1,lc=ip-1; l<ipph; ++l,--lc)
1597     {
1598     for (size_t ik=0; ik<idl1; ++ik)
1599       {
1600       C2(ik,l ) = CH2(ik,0)+csarr[2*l]*CH2(ik,1)+csarr[4*l]*CH2(ik,2);
1601       C2(ik,lc) = csarr[2*l+1]*CH2(ik,ip-1)+csarr[4*l+1]*CH2(ik,ip-2);
1602       }
1603     size_t iang=2*l;
1604     size_t j=3,jc=ip-3;
1605     for(; j<ipph-3; j+=4,jc-=4)
1606       {
1607       iang+=l; if(iang>ip) iang-=ip;
1608       double ar1=csarr[2*iang], ai1=csarr[2*iang+1];
1609       iang+=l; if(iang>ip) iang-=ip;
1610       double ar2=csarr[2*iang], ai2=csarr[2*iang+1];
1611       iang+=l; if(iang>ip) iang-=ip;
1612       double ar3=csarr[2*iang], ai3=csarr[2*iang+1];
1613       iang+=l; if(iang>ip) iang-=ip;
1614       double ar4=csarr[2*iang], ai4=csarr[2*iang+1];
1615       for (size_t ik=0; ik<idl1; ++ik)
1616         {
1617         C2(ik,l ) += ar1*CH2(ik,j )+ar2*CH2(ik,j +1)
1618                     +ar3*CH2(ik,j +2)+ar4*CH2(ik,j +3);
1619         C2(ik,lc) += ai1*CH2(ik,jc)+ai2*CH2(ik,jc-1)
1620                     +ai3*CH2(ik,jc-2)+ai4*CH2(ik,jc-3);
1621         }
1622       }
1623     for(; j<ipph-1; j+=2,jc-=2)
1624       {
1625       iang+=l; if(iang>ip) iang-=ip;
1626       double ar1=csarr[2*iang], ai1=csarr[2*iang+1];
1627       iang+=l; if(iang>ip) iang-=ip;
1628       double ar2=csarr[2*iang], ai2=csarr[2*iang+1];
1629       for (size_t ik=0; ik<idl1; ++ik)
1630         {
1631         C2(ik,l ) += ar1*CH2(ik,j )+ar2*CH2(ik,j +1);
1632         C2(ik,lc) += ai1*CH2(ik,jc)+ai2*CH2(ik,jc-1);
1633         }
1634       }
1635     for(; j<ipph; ++j,--jc)
1636       {
1637       iang+=l; if(iang>ip) iang-=ip;
1638       double war=csarr[2*iang], wai=csarr[2*iang+1];
1639       for (size_t ik=0; ik<idl1; ++ik)
1640         {
1641         C2(ik,l ) += war*CH2(ik,j );
1642         C2(ik,lc) += wai*CH2(ik,jc);
1643         }
1644       }
1645     }
1646   for (size_t j=1; j<ipph; ++j)
1647     for (size_t ik=0; ik<idl1; ++ik)
1648       CH2(ik,0) += CH2(ik,j);
1649   for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc)   // 124
1650     for (size_t k=0; k<l1; ++k)
1651       {
1652       CH(0,k,j ) = C1(0,k,j)-C1(0,k,jc);
1653       CH(0,k,jc) = C1(0,k,j)+C1(0,k,jc);
1654       }
1655 
1656   if (ido==1) return;
1657 
1658   for (size_t j=1, jc=ip-1; j<ipph; ++j, --jc)  // 127
1659     for (size_t k=0; k<l1; ++k)
1660       for (size_t i=1; i<=ido-2; i+=2)
1661         {
1662         CH(i  ,k,j ) = C1(i  ,k,j)-C1(i+1,k,jc);
1663         CH(i  ,k,jc) = C1(i  ,k,j)+C1(i+1,k,jc);
1664         CH(i+1,k,j ) = C1(i+1,k,j)+C1(i  ,k,jc);
1665         CH(i+1,k,jc) = C1(i+1,k,j)-C1(i  ,k,jc);
1666         }
1667 
1668 // All in CH
1669 
1670   for (size_t j=1; j<ip; ++j)
1671     {
1672     size_t is = (j-1)*(ido-1);
1673     for (size_t k=0; k<l1; ++k)
1674       {
1675       size_t idij = is;
1676       for (size_t i=1; i<=ido-2; i+=2)
1677         {
1678         double t1=CH(i,k,j), t2=CH(i+1,k,j);
1679         CH(i  ,k,j) = wa[idij]*t1-wa[idij+1]*t2;
1680         CH(i+1,k,j) = wa[idij]*t2+wa[idij+1]*t1;
1681         idij+=2;
1682         }
1683       }
1684     }
1685   }
1686 #undef C1
1687 #undef C2
1688 #undef CH2
1689 
1690 #undef CC
1691 #undef CH
1692 #undef PM
1693 #undef MULPM
1694 #undef WA
1695 
copy_and_norm(double * c,double * p1,size_t n,double fct)1696 static void copy_and_norm(double *c, double *p1, size_t n, double fct)
1697   {
1698   if (p1!=c)
1699     {
1700     if (fct!=1.)
1701       for (size_t i=0; i<n; ++i)
1702         c[i] = fct*p1[i];
1703     else
1704       memcpy (c,p1,n*sizeof(double));
1705     }
1706   else
1707     if (fct!=1.)
1708       for (size_t i=0; i<n; ++i)
1709         c[i] *= fct;
1710   }
1711 
1712 WARN_UNUSED_RESULT
rfftp_forward(rfftp_plan plan,double c[],double fct)1713 static int rfftp_forward(rfftp_plan plan, double c[], double fct)
1714   {
1715   if (plan->length==1) return 0;
1716   size_t n=plan->length;
1717   size_t l1=n, nf=plan->nfct;
1718   double *ch = RALLOC(double, n);
1719   if (!ch) return -1;
1720   double *p1=c, *p2=ch;
1721 
1722   for(size_t k1=0; k1<nf;++k1)
1723     {
1724     size_t k=nf-k1-1;
1725     size_t ip=plan->fct[k].fct;
1726     size_t ido=n / l1;
1727     l1 /= ip;
1728     if(ip==4)
1729       radf4(ido, l1, p1, p2, plan->fct[k].tw);
1730     else if(ip==2)
1731       radf2(ido, l1, p1, p2, plan->fct[k].tw);
1732     else if(ip==3)
1733       radf3(ido, l1, p1, p2, plan->fct[k].tw);
1734     else if(ip==5)
1735       radf5(ido, l1, p1, p2, plan->fct[k].tw);
1736     else
1737       {
1738       radfg(ido, ip, l1, p1, p2, plan->fct[k].tw, plan->fct[k].tws);
1739       SWAP (p1,p2,double *);
1740       }
1741     SWAP (p1,p2,double *);
1742     }
1743   copy_and_norm(c,p1,n,fct);
1744   DEALLOC(ch);
1745   return 0;
1746   }
1747 
1748 WARN_UNUSED_RESULT
rfftp_backward(rfftp_plan plan,double c[],double fct)1749 static int rfftp_backward(rfftp_plan plan, double c[], double fct)
1750   {
1751   if (plan->length==1) return 0;
1752   size_t n=plan->length;
1753   size_t l1=1, nf=plan->nfct;
1754   double *ch = RALLOC(double, n);
1755   if (!ch) return -1;
1756   double *p1=c, *p2=ch;
1757 
1758   for(size_t k=0; k<nf; k++)
1759     {
1760     size_t ip = plan->fct[k].fct,
1761            ido= n/(ip*l1);
1762     if(ip==4)
1763       radb4(ido, l1, p1, p2, plan->fct[k].tw);
1764     else if(ip==2)
1765       radb2(ido, l1, p1, p2, plan->fct[k].tw);
1766     else if(ip==3)
1767       radb3(ido, l1, p1, p2, plan->fct[k].tw);
1768     else if(ip==5)
1769       radb5(ido, l1, p1, p2, plan->fct[k].tw);
1770     else
1771       radbg(ido, ip, l1, p1, p2, plan->fct[k].tw, plan->fct[k].tws);
1772     SWAP (p1,p2,double *);
1773     l1*=ip;
1774     }
1775   copy_and_norm(c,p1,n,fct);
1776   DEALLOC(ch);
1777   return 0;
1778   }
1779 
1780 WARN_UNUSED_RESULT
rfftp_factorize(rfftp_plan plan)1781 static int rfftp_factorize (rfftp_plan plan)
1782   {
1783   size_t length=plan->length;
1784   size_t nfct=0;
1785   while ((length%4)==0)
1786     { if (nfct>=NFCT) return -1; plan->fct[nfct++].fct=4; length>>=2; }
1787   if ((length%2)==0)
1788     {
1789     length>>=1;
1790     // factor 2 should be at the front of the factor list
1791     if (nfct>=NFCT) return -1;
1792     plan->fct[nfct++].fct=2;
1793     SWAP(plan->fct[0].fct, plan->fct[nfct-1].fct,size_t);
1794     }
1795   size_t maxl=(size_t)(sqrt((double)length))+1;
1796   for (size_t divisor=3; (length>1)&&(divisor<maxl); divisor+=2)
1797     if ((length%divisor)==0)
1798       {
1799       while ((length%divisor)==0)
1800         {
1801         if (nfct>=NFCT) return -1;
1802         plan->fct[nfct++].fct=divisor;
1803         length/=divisor;
1804         }
1805       maxl=(size_t)(sqrt((double)length))+1;
1806       }
1807   if (length>1) plan->fct[nfct++].fct=length;
1808   plan->nfct=nfct;
1809   return 0;
1810   }
1811 
rfftp_twsize(rfftp_plan plan)1812 static size_t rfftp_twsize(rfftp_plan plan)
1813   {
1814   size_t twsize=0, l1=1;
1815   for (size_t k=0; k<plan->nfct; ++k)
1816     {
1817     size_t ip=plan->fct[k].fct, ido= plan->length/(l1*ip);
1818     twsize+=(ip-1)*(ido-1);
1819     if (ip>5) twsize+=2*ip;
1820     l1*=ip;
1821     }
1822   return twsize;
1823   return 0;
1824   }
1825 
rfftp_comp_twiddle(rfftp_plan plan)1826 WARN_UNUSED_RESULT NOINLINE static int rfftp_comp_twiddle (rfftp_plan plan)
1827   {
1828   size_t length=plan->length;
1829   double *twid = RALLOC(double, 2*length);
1830   if (!twid) return -1;
1831   sincos_2pibyn_half(length, twid);
1832   size_t l1=1;
1833   double *ptr=plan->mem;
1834   for (size_t k=0; k<plan->nfct; ++k)
1835     {
1836     size_t ip=plan->fct[k].fct, ido=length/(l1*ip);
1837     if (k<plan->nfct-1) // last factor doesn't need twiddles
1838       {
1839       plan->fct[k].tw=ptr; ptr+=(ip-1)*(ido-1);
1840       for (size_t j=1; j<ip; ++j)
1841         for (size_t i=1; i<=(ido-1)/2; ++i)
1842           {
1843           plan->fct[k].tw[(j-1)*(ido-1)+2*i-2] = twid[2*j*l1*i];
1844           plan->fct[k].tw[(j-1)*(ido-1)+2*i-1] = twid[2*j*l1*i+1];
1845           }
1846       }
1847     if (ip>5) // special factors required by *g functions
1848       {
1849       plan->fct[k].tws=ptr; ptr+=2*ip;
1850       plan->fct[k].tws[0] = 1.;
1851       plan->fct[k].tws[1] = 0.;
1852       for (size_t i=1; i<=(ip>>1); ++i)
1853         {
1854         plan->fct[k].tws[2*i  ] = twid[2*i*(length/ip)];
1855         plan->fct[k].tws[2*i+1] = twid[2*i*(length/ip)+1];
1856         plan->fct[k].tws[2*(ip-i)  ] = twid[2*i*(length/ip)];
1857         plan->fct[k].tws[2*(ip-i)+1] = -twid[2*i*(length/ip)+1];
1858         }
1859       }
1860     l1*=ip;
1861     }
1862   DEALLOC(twid);
1863   return 0;
1864   }
1865 
make_rfftp_plan(size_t length)1866 NOINLINE static rfftp_plan make_rfftp_plan (size_t length)
1867   {
1868   if (length==0) return NULL;
1869   rfftp_plan plan = RALLOC(rfftp_plan_i,1);
1870   if (!plan) return NULL;
1871   plan->length=length;
1872   plan->nfct=0;
1873   plan->mem=NULL;
1874   for (size_t i=0; i<NFCT; ++i)
1875     plan->fct[i]=(rfftp_fctdata){0,0,0};
1876   if (length==1) return plan;
1877   if (rfftp_factorize(plan)!=0) { DEALLOC(plan); return NULL; }
1878   size_t tws=rfftp_twsize(plan);
1879   plan->mem=RALLOC(double,tws);
1880   if (!plan->mem) { DEALLOC(plan); return NULL; }
1881   if (rfftp_comp_twiddle(plan)!=0)
1882     { DEALLOC(plan->mem); DEALLOC(plan); return NULL; }
1883   return plan;
1884   }
1885 
destroy_rfftp_plan(rfftp_plan plan)1886 NOINLINE static void destroy_rfftp_plan (rfftp_plan plan)
1887   {
1888   DEALLOC(plan->mem);
1889   DEALLOC(plan);
1890   }
1891 
1892 typedef struct fftblue_plan_i
1893   {
1894   size_t n, n2;
1895   cfftp_plan plan;
1896   double *mem;
1897   double *bk, *bkf;
1898   } fftblue_plan_i;
1899 typedef struct fftblue_plan_i * fftblue_plan;
1900 
make_fftblue_plan(size_t length)1901 NOINLINE static fftblue_plan make_fftblue_plan (size_t length)
1902   {
1903   fftblue_plan plan = RALLOC(fftblue_plan_i,1);
1904   if (!plan) return NULL;
1905   plan->n = length;
1906   plan->n2 = good_size(plan->n*2-1);
1907   plan->mem = RALLOC(double, 2*plan->n+2*plan->n2);
1908   if (!plan->mem) { DEALLOC(plan); return NULL; }
1909   plan->bk  = plan->mem;
1910   plan->bkf = plan->bk+2*plan->n;
1911 
1912 /* initialize b_k */
1913   double *tmp = RALLOC(double,4*plan->n);
1914   if (!tmp) { DEALLOC(plan->mem); DEALLOC(plan); return NULL; }
1915   sincos_2pibyn(2*plan->n,tmp);
1916   plan->bk[0] = 1;
1917   plan->bk[1] = 0;
1918 
1919   size_t coeff=0;
1920   for (size_t m=1; m<plan->n; ++m)
1921     {
1922     coeff+=2*m-1;
1923     if (coeff>=2*plan->n) coeff-=2*plan->n;
1924     plan->bk[2*m  ] = tmp[2*coeff  ];
1925     plan->bk[2*m+1] = tmp[2*coeff+1];
1926     }
1927 
1928   /* initialize the zero-padded, Fourier transformed b_k. Add normalisation. */
1929   double xn2 = 1./plan->n2;
1930   plan->bkf[0] = plan->bk[0]*xn2;
1931   plan->bkf[1] = plan->bk[1]*xn2;
1932   for (size_t m=2; m<2*plan->n; m+=2)
1933     {
1934     plan->bkf[m]   = plan->bkf[2*plan->n2-m]   = plan->bk[m]   *xn2;
1935     plan->bkf[m+1] = plan->bkf[2*plan->n2-m+1] = plan->bk[m+1] *xn2;
1936     }
1937   for (size_t m=2*plan->n;m<=(2*plan->n2-2*plan->n+1);++m)
1938     plan->bkf[m]=0.;
1939   plan->plan=make_cfftp_plan(plan->n2);
1940   if (!plan->plan)
1941     { DEALLOC(tmp); DEALLOC(plan->mem); DEALLOC(plan); return NULL; }
1942   if (cfftp_forward(plan->plan,plan->bkf,1.)!=0)
1943     { DEALLOC(tmp); DEALLOC(plan->mem); DEALLOC(plan); return NULL; }
1944   DEALLOC(tmp);
1945 
1946   return plan;
1947   }
1948 
destroy_fftblue_plan(fftblue_plan plan)1949 NOINLINE static void destroy_fftblue_plan (fftblue_plan plan)
1950   {
1951   DEALLOC(plan->mem);
1952   destroy_cfftp_plan(plan->plan);
1953   DEALLOC(plan);
1954   }
1955 
1956 NOINLINE WARN_UNUSED_RESULT
fftblue_fft(fftblue_plan plan,double c[],int isign,double fct)1957 static int fftblue_fft(fftblue_plan plan, double c[], int isign, double fct)
1958   {
1959   size_t n=plan->n;
1960   size_t n2=plan->n2;
1961   double *bk  = plan->bk;
1962   double *bkf = plan->bkf;
1963   double *akf = RALLOC(double, 2*n2);
1964   if (!akf) return -1;
1965 
1966 /* initialize a_k and FFT it */
1967   if (isign>0)
1968     for (size_t m=0; m<2*n; m+=2)
1969       {
1970       akf[m]   = c[m]*bk[m]   - c[m+1]*bk[m+1];
1971       akf[m+1] = c[m]*bk[m+1] + c[m+1]*bk[m];
1972       }
1973   else
1974     for (size_t m=0; m<2*n; m+=2)
1975       {
1976       akf[m]   = c[m]*bk[m]   + c[m+1]*bk[m+1];
1977       akf[m+1] =-c[m]*bk[m+1] + c[m+1]*bk[m];
1978       }
1979   for (size_t m=2*n; m<2*n2; ++m)
1980     akf[m]=0;
1981 
1982   if (cfftp_forward (plan->plan,akf,fct)!=0)
1983     { DEALLOC(akf); return -1; }
1984 
1985 /* do the convolution */
1986   if (isign>0)
1987     for (size_t m=0; m<2*n2; m+=2)
1988       {
1989       double im = -akf[m]*bkf[m+1] + akf[m+1]*bkf[m];
1990       akf[m  ]  =  akf[m]*bkf[m]   + akf[m+1]*bkf[m+1];
1991       akf[m+1]  = im;
1992       }
1993   else
1994     for (size_t m=0; m<2*n2; m+=2)
1995       {
1996       double im = akf[m]*bkf[m+1] + akf[m+1]*bkf[m];
1997       akf[m  ]  = akf[m]*bkf[m]   - akf[m+1]*bkf[m+1];
1998       akf[m+1]  = im;
1999       }
2000 
2001 /* inverse FFT */
2002   if (cfftp_backward (plan->plan,akf,1.)!=0)
2003     { DEALLOC(akf); return -1; }
2004 
2005 /* multiply by b_k */
2006   if (isign>0)
2007     for (size_t m=0; m<2*n; m+=2)
2008       {
2009       c[m]   = bk[m]  *akf[m] - bk[m+1]*akf[m+1];
2010       c[m+1] = bk[m+1]*akf[m] + bk[m]  *akf[m+1];
2011       }
2012   else
2013     for (size_t m=0; m<2*n; m+=2)
2014       {
2015       c[m]   = bk[m]  *akf[m] + bk[m+1]*akf[m+1];
2016       c[m+1] =-bk[m+1]*akf[m] + bk[m]  *akf[m+1];
2017       }
2018   DEALLOC(akf);
2019   return 0;
2020   }
2021 
2022 WARN_UNUSED_RESULT
cfftblue_backward(fftblue_plan plan,double c[],double fct)2023 static int cfftblue_backward(fftblue_plan plan, double c[], double fct)
2024   { return fftblue_fft(plan,c,1,fct); }
2025 
2026 WARN_UNUSED_RESULT
cfftblue_forward(fftblue_plan plan,double c[],double fct)2027 static int cfftblue_forward(fftblue_plan plan, double c[], double fct)
2028   { return fftblue_fft(plan,c,-1,fct); }
2029 
2030 WARN_UNUSED_RESULT
rfftblue_backward(fftblue_plan plan,double c[],double fct)2031 static int rfftblue_backward(fftblue_plan plan, double c[], double fct)
2032   {
2033   size_t n=plan->n;
2034   double *tmp = RALLOC(double,2*n);
2035   if (!tmp) return -1;
2036   tmp[0]=c[0];
2037   tmp[1]=0.;
2038   memcpy (tmp+2,c+1, (n-1)*sizeof(double));
2039   if ((n&1)==0) tmp[n+1]=0.;
2040   for (size_t m=2; m<n; m+=2)
2041     {
2042     tmp[2*n-m]=tmp[m];
2043     tmp[2*n-m+1]=-tmp[m+1];
2044     }
2045   if (fftblue_fft(plan,tmp,1,fct)!=0)
2046     { DEALLOC(tmp); return -1; }
2047   for (size_t m=0; m<n; ++m)
2048     c[m] = tmp[2*m];
2049   DEALLOC(tmp);
2050   return 0;
2051   }
2052 
2053 WARN_UNUSED_RESULT
rfftblue_forward(fftblue_plan plan,double c[],double fct)2054 static int rfftblue_forward(fftblue_plan plan, double c[], double fct)
2055   {
2056   size_t n=plan->n;
2057   double *tmp = RALLOC(double,2*n);
2058   if (!tmp) return -1;
2059   for (size_t m=0; m<n; ++m)
2060     {
2061     tmp[2*m] = c[m];
2062     tmp[2*m+1] = 0.;
2063     }
2064   if (fftblue_fft(plan,tmp,-1,fct)!=0)
2065     { DEALLOC(tmp); return -1; }
2066   c[0] = tmp[0];
2067   memcpy (c+1, tmp+2, (n-1)*sizeof(double));
2068   DEALLOC(tmp);
2069   return 0;
2070   }
2071 
2072 typedef struct cfft_plan_i
2073   {
2074   cfftp_plan packplan;
2075   fftblue_plan blueplan;
2076   } cfft_plan_i;
2077 
make_cfft_plan(size_t length)2078 static cfft_plan make_cfft_plan (size_t length)
2079   {
2080   if (length==0) return NULL;
2081   cfft_plan plan = RALLOC(cfft_plan_i,1);
2082   if (!plan) return NULL;
2083   plan->blueplan=0;
2084   plan->packplan=0;
2085   if ((length<50) || (largest_prime_factor(length)<=sqrt(length)))
2086     {
2087     plan->packplan=make_cfftp_plan(length);
2088     if (!plan->packplan) { DEALLOC(plan); return NULL; }
2089     return plan;
2090     }
2091   double comp1 = cost_guess(length);
2092   double comp2 = 2*cost_guess(good_size(2*length-1));
2093   comp2*=1.5; /* fudge factor that appears to give good overall performance */
2094   if (comp2<comp1) // use Bluestein
2095     {
2096     plan->blueplan=make_fftblue_plan(length);
2097     if (!plan->blueplan) { DEALLOC(plan); return NULL; }
2098     }
2099   else
2100     {
2101     plan->packplan=make_cfftp_plan(length);
2102     if (!plan->packplan) { DEALLOC(plan); return NULL; }
2103     }
2104   return plan;
2105   }
2106 
destroy_cfft_plan(cfft_plan plan)2107 static void destroy_cfft_plan (cfft_plan plan)
2108   {
2109   if (plan->blueplan)
2110     destroy_fftblue_plan(plan->blueplan);
2111   if (plan->packplan)
2112     destroy_cfftp_plan(plan->packplan);
2113   DEALLOC(plan);
2114   }
2115 
cfft_backward(cfft_plan plan,double c[],double fct)2116 WARN_UNUSED_RESULT static int cfft_backward(cfft_plan plan, double c[], double fct)
2117   {
2118   if (plan->packplan)
2119     return cfftp_backward(plan->packplan,c,fct);
2120   // if (plan->blueplan)
2121   return cfftblue_backward(plan->blueplan,c,fct);
2122   }
2123 
cfft_forward(cfft_plan plan,double c[],double fct)2124 WARN_UNUSED_RESULT static int cfft_forward(cfft_plan plan, double c[], double fct)
2125   {
2126   if (plan->packplan)
2127     return cfftp_forward(plan->packplan,c,fct);
2128   // if (plan->blueplan)
2129   return cfftblue_forward(plan->blueplan,c,fct);
2130   }
2131 
2132 typedef struct rfft_plan_i
2133   {
2134   rfftp_plan packplan;
2135   fftblue_plan blueplan;
2136   } rfft_plan_i;
2137 
make_rfft_plan(size_t length)2138 static rfft_plan make_rfft_plan (size_t length)
2139   {
2140   if (length==0) return NULL;
2141   rfft_plan plan = RALLOC(rfft_plan_i,1);
2142   if (!plan) return NULL;
2143   plan->blueplan=0;
2144   plan->packplan=0;
2145   if ((length<50) || (largest_prime_factor(length)<=sqrt(length)))
2146     {
2147     plan->packplan=make_rfftp_plan(length);
2148     if (!plan->packplan) { DEALLOC(plan); return NULL; }
2149     return plan;
2150     }
2151   double comp1 = 0.5*cost_guess(length);
2152   double comp2 = 2*cost_guess(good_size(2*length-1));
2153   comp2*=1.5; /* fudge factor that appears to give good overall performance */
2154   if (comp2<comp1) // use Bluestein
2155     {
2156     plan->blueplan=make_fftblue_plan(length);
2157     if (!plan->blueplan) { DEALLOC(plan); return NULL; }
2158     }
2159   else
2160     {
2161     plan->packplan=make_rfftp_plan(length);
2162     if (!plan->packplan) { DEALLOC(plan); return NULL; }
2163     }
2164   return plan;
2165   }
2166 
destroy_rfft_plan(rfft_plan plan)2167 static void destroy_rfft_plan (rfft_plan plan)
2168   {
2169   if (plan->blueplan)
2170     destroy_fftblue_plan(plan->blueplan);
2171   if (plan->packplan)
2172     destroy_rfftp_plan(plan->packplan);
2173   DEALLOC(plan);
2174   }
2175 
rfft_backward(rfft_plan plan,double c[],double fct)2176 WARN_UNUSED_RESULT static int rfft_backward(rfft_plan plan, double c[], double fct)
2177   {
2178   if (plan->packplan)
2179     return rfftp_backward(plan->packplan,c,fct);
2180   else // if (plan->blueplan)
2181     return rfftblue_backward(plan->blueplan,c,fct);
2182   }
2183 
rfft_forward(rfft_plan plan,double c[],double fct)2184 WARN_UNUSED_RESULT static int rfft_forward(rfft_plan plan, double c[], double fct)
2185   {
2186   if (plan->packplan)
2187     return rfftp_forward(plan->packplan,c,fct);
2188   else // if (plan->blueplan)
2189     return rfftblue_forward(plan->blueplan,c,fct);
2190   }
2191 
2192 static PyObject *
execute_complex(PyObject * a1,int is_forward,double fct)2193 execute_complex(PyObject *a1, int is_forward, double fct)
2194 {
2195     PyArrayObject *data = (PyArrayObject *)PyArray_FromAny(a1,
2196             PyArray_DescrFromType(NPY_CDOUBLE), 1, 0,
2197             NPY_ARRAY_ENSURECOPY | NPY_ARRAY_DEFAULT |
2198             NPY_ARRAY_ENSUREARRAY | NPY_ARRAY_FORCECAST,
2199             NULL);
2200     if (!data) return NULL;
2201 
2202     int npts = PyArray_DIM(data, PyArray_NDIM(data) - 1);
2203     cfft_plan plan=NULL;
2204 
2205     int nrepeats = PyArray_SIZE(data)/npts;
2206     double *dptr = (double *)PyArray_DATA(data);
2207     int fail=0;
2208     Py_BEGIN_ALLOW_THREADS;
2209     plan = make_cfft_plan(npts);
2210     if (!plan) fail=1;
2211     if (!fail)
2212       for (int i = 0; i < nrepeats; i++) {
2213           int res = is_forward ?
2214             cfft_forward(plan, dptr, fct) : cfft_backward(plan, dptr, fct);
2215           if (res!=0) { fail=1; break; }
2216           dptr += npts*2;
2217       }
2218     if (plan) destroy_cfft_plan(plan);
2219     Py_END_ALLOW_THREADS;
2220     if (fail) {
2221       Py_XDECREF(data);
2222       return PyErr_NoMemory();
2223     }
2224     return (PyObject *)data;
2225 }
2226 
2227 static PyObject *
execute_real_forward(PyObject * a1,double fct)2228 execute_real_forward(PyObject *a1, double fct)
2229 {
2230     rfft_plan plan=NULL;
2231     int fail = 0;
2232     PyArrayObject *data = (PyArrayObject *)PyArray_FromAny(a1,
2233             PyArray_DescrFromType(NPY_DOUBLE), 1, 0,
2234             NPY_ARRAY_DEFAULT | NPY_ARRAY_ENSUREARRAY | NPY_ARRAY_FORCECAST,
2235             NULL);
2236     if (!data) return NULL;
2237 
2238     int ndim = PyArray_NDIM(data);
2239     const npy_intp *odim = PyArray_DIMS(data);
2240     int npts = odim[ndim - 1];
2241     npy_intp *tdim=(npy_intp *)malloc(ndim*sizeof(npy_intp));
2242     if (!tdim)
2243       { Py_XDECREF(data); return NULL; }
2244     for (int d=0; d<ndim-1; ++d)
2245       tdim[d] = odim[d];
2246     tdim[ndim-1] = npts/2 + 1;
2247     PyArrayObject *ret = (PyArrayObject *)PyArray_Empty(ndim,
2248             tdim, PyArray_DescrFromType(NPY_CDOUBLE), 0);
2249     free(tdim);
2250     if (!ret) fail=1;
2251     if (!fail) {
2252       int rstep = PyArray_DIM(ret, PyArray_NDIM(ret) - 1)*2;
2253 
2254       int nrepeats = PyArray_SIZE(data)/npts;
2255       double *rptr = (double *)PyArray_DATA(ret),
2256              *dptr = (double *)PyArray_DATA(data);
2257 
2258       Py_BEGIN_ALLOW_THREADS;
2259       plan = make_rfft_plan(npts);
2260       if (!plan) fail=1;
2261       if (!fail)
2262         for (int i = 0; i < nrepeats; i++) {
2263             rptr[rstep-1] = 0.0;
2264             memcpy((char *)(rptr+1), dptr, npts*sizeof(double));
2265             if (rfft_forward(plan, rptr+1, fct)!=0) {fail=1; break;}
2266             rptr[0] = rptr[1];
2267             rptr[1] = 0.0;
2268             rptr += rstep;
2269             dptr += npts;
2270       }
2271       if (plan) destroy_rfft_plan(plan);
2272       Py_END_ALLOW_THREADS;
2273     }
2274     if (fail) {
2275       Py_XDECREF(data);
2276       Py_XDECREF(ret);
2277       return PyErr_NoMemory();
2278     }
2279     Py_DECREF(data);
2280     return (PyObject *)ret;
2281 }
2282 static PyObject *
execute_real_backward(PyObject * a1,double fct)2283 execute_real_backward(PyObject *a1, double fct)
2284 {
2285     rfft_plan plan=NULL;
2286     PyArrayObject *data = (PyArrayObject *)PyArray_FromAny(a1,
2287             PyArray_DescrFromType(NPY_CDOUBLE), 1, 0,
2288             NPY_ARRAY_DEFAULT | NPY_ARRAY_ENSUREARRAY | NPY_ARRAY_FORCECAST,
2289             NULL);
2290     if (!data) return NULL;
2291     int npts = PyArray_DIM(data, PyArray_NDIM(data) - 1);
2292     PyArrayObject *ret = (PyArrayObject *)PyArray_Empty(PyArray_NDIM(data),
2293             PyArray_DIMS(data), PyArray_DescrFromType(NPY_DOUBLE), 0);
2294     int fail = 0;
2295     if (!ret) fail=1;
2296     if (!fail) {
2297       int nrepeats = PyArray_SIZE(ret)/npts;
2298       double *rptr = (double *)PyArray_DATA(ret),
2299              *dptr = (double *)PyArray_DATA(data);
2300 
2301       Py_BEGIN_ALLOW_THREADS;
2302       plan = make_rfft_plan(npts);
2303       if (!plan) fail=1;
2304       if (!fail) {
2305         for (int i = 0; i < nrepeats; i++) {
2306           memcpy((char *)(rptr + 1), (dptr + 2), (npts - 1)*sizeof(double));
2307           rptr[0] = dptr[0];
2308           if (rfft_backward(plan, rptr, fct)!=0) {fail=1; break;}
2309           rptr += npts;
2310           dptr += npts*2;
2311         }
2312       }
2313       if (plan) destroy_rfft_plan(plan);
2314       Py_END_ALLOW_THREADS;
2315     }
2316     if (fail) {
2317       Py_XDECREF(data);
2318       Py_XDECREF(ret);
2319       return PyErr_NoMemory();
2320     }
2321     Py_DECREF(data);
2322     return (PyObject *)ret;
2323 }
2324 
2325 static PyObject *
execute_real(PyObject * a1,int is_forward,double fct)2326 execute_real(PyObject *a1, int is_forward, double fct)
2327 {
2328     return is_forward ? execute_real_forward(a1, fct)
2329                       : execute_real_backward(a1, fct);
2330 }
2331 
2332 static const char execute__doc__[] = "";
2333 
2334 static PyObject *
execute(PyObject * NPY_UNUSED (self),PyObject * args)2335 execute(PyObject *NPY_UNUSED(self), PyObject *args)
2336 {
2337     PyObject *a1;
2338     int is_real, is_forward;
2339     double fct;
2340 
2341     if(!PyArg_ParseTuple(args, "Oiid:execute", &a1, &is_real, &is_forward, &fct)) {
2342         return NULL;
2343     }
2344 
2345     return is_real ? execute_real(a1, is_forward, fct)
2346                    : execute_complex(a1, is_forward, fct);
2347 }
2348 
2349 /* List of methods defined in the module */
2350 
2351 static struct PyMethodDef methods[] = {
2352     {"execute",   execute,   1, execute__doc__},
2353     {NULL, NULL, 0, NULL}          /* sentinel */
2354 };
2355 
2356 static struct PyModuleDef moduledef = {
2357         PyModuleDef_HEAD_INIT,
2358         "_pocketfft_internal",
2359         NULL,
2360         -1,
2361         methods,
2362         NULL,
2363         NULL,
2364         NULL,
2365         NULL
2366 };
2367 
2368 /* Initialization function for the module */
PyInit__pocketfft_internal(void)2369 PyMODINIT_FUNC PyInit__pocketfft_internal(void)
2370 {
2371     PyObject *m;
2372     m = PyModule_Create(&moduledef);
2373     if (m == NULL) {
2374         return NULL;
2375     }
2376 
2377     /* Import the array object */
2378     import_array();
2379 
2380     /* XXXX Add constants here */
2381 
2382     return m;
2383 }
2384