1 /*
2 * This file is part of pocketfft.
3 * Licensed under a 3-clause BSD style license - see LICENSE.md
4 */
5
6 /*
7 * Main implementation file.
8 *
9 * Copyright (C) 2004-2018 Max-Planck-Society
10 * \author Martin Reinecke
11 */
12
13 #define NPY_NO_DEPRECATED_API NPY_API_VERSION
14
15 #include "Python.h"
16 #include "numpy/arrayobject.h"
17
18 #include <math.h>
19 #include <string.h>
20 #include <stdlib.h>
21
22 #include "npy_config.h"
23 #define restrict NPY_RESTRICT
24
25 #define RALLOC(type,num) \
26 ((type *)malloc((num)*sizeof(type)))
27 #define DEALLOC(ptr) \
28 do { free(ptr); (ptr)=NULL; } while(0)
29
30 #define SWAP(a,b,type) \
31 do { type tmp_=(a); (a)=(b); (b)=tmp_; } while(0)
32
33 #ifdef __GNUC__
34 #define NOINLINE __attribute__((noinline))
35 #define WARN_UNUSED_RESULT __attribute__ ((warn_unused_result))
36 #else
37 #define NOINLINE
38 #define WARN_UNUSED_RESULT
39 #endif
40
41 struct cfft_plan_i;
42 typedef struct cfft_plan_i * cfft_plan;
43 struct rfft_plan_i;
44 typedef struct rfft_plan_i * rfft_plan;
45
46 // adapted from https://stackoverflow.com/questions/42792939/
47 // CAUTION: this function only works for arguments in the range [-0.25; 0.25]!
my_sincosm1pi(double a,double * restrict res)48 static void my_sincosm1pi (double a, double *restrict res)
49 {
50 double s = a * a;
51 /* Approximate cos(pi*x)-1 for x in [-0.25,0.25] */
52 double r = -1.0369917389758117e-4;
53 r = fma (r, s, 1.9294935641298806e-3);
54 r = fma (r, s, -2.5806887942825395e-2);
55 r = fma (r, s, 2.3533063028328211e-1);
56 r = fma (r, s, -1.3352627688538006e+0);
57 r = fma (r, s, 4.0587121264167623e+0);
58 r = fma (r, s, -4.9348022005446790e+0);
59 double c = r*s;
60 /* Approximate sin(pi*x) for x in [-0.25,0.25] */
61 r = 4.6151442520157035e-4;
62 r = fma (r, s, -7.3700183130883555e-3);
63 r = fma (r, s, 8.2145868949323936e-2);
64 r = fma (r, s, -5.9926452893214921e-1);
65 r = fma (r, s, 2.5501640398732688e+0);
66 r = fma (r, s, -5.1677127800499516e+0);
67 s = s * a;
68 r = r * s;
69 s = fma (a, 3.1415926535897931e+0, r);
70 res[0] = c;
71 res[1] = s;
72 }
73
calc_first_octant(size_t den,double * restrict res)74 NOINLINE static void calc_first_octant(size_t den, double * restrict res)
75 {
76 size_t n = (den+4)>>3;
77 if (n==0) return;
78 res[0]=1.; res[1]=0.;
79 if (n==1) return;
80 size_t l1=(size_t)sqrt(n);
81 for (size_t i=1; i<l1; ++i)
82 my_sincosm1pi((2.*i)/den,&res[2*i]);
83 size_t start=l1;
84 while(start<n)
85 {
86 double cs[2];
87 my_sincosm1pi((2.*start)/den,cs);
88 res[2*start] = cs[0]+1.;
89 res[2*start+1] = cs[1];
90 size_t end = l1;
91 if (start+end>n) end = n-start;
92 for (size_t i=1; i<end; ++i)
93 {
94 double csx[2]={res[2*i], res[2*i+1]};
95 res[2*(start+i)] = ((cs[0]*csx[0] - cs[1]*csx[1] + cs[0]) + csx[0]) + 1.;
96 res[2*(start+i)+1] = (cs[0]*csx[1] + cs[1]*csx[0]) + cs[1] + csx[1];
97 }
98 start += l1;
99 }
100 for (size_t i=1; i<l1; ++i)
101 res[2*i] += 1.;
102 }
103
calc_first_quadrant(size_t n,double * restrict res)104 NOINLINE static void calc_first_quadrant(size_t n, double * restrict res)
105 {
106 double * restrict p = res+n;
107 calc_first_octant(n<<1, p);
108 size_t ndone=(n+2)>>2;
109 size_t i=0, idx1=0, idx2=2*ndone-2;
110 for (; i+1<ndone; i+=2, idx1+=2, idx2-=2)
111 {
112 res[idx1] = p[2*i];
113 res[idx1+1] = p[2*i+1];
114 res[idx2] = p[2*i+3];
115 res[idx2+1] = p[2*i+2];
116 }
117 if (i!=ndone)
118 {
119 res[idx1 ] = p[2*i];
120 res[idx1+1] = p[2*i+1];
121 }
122 }
123
calc_first_half(size_t n,double * restrict res)124 NOINLINE static void calc_first_half(size_t n, double * restrict res)
125 {
126 int ndone=(n+1)>>1;
127 double * p = res+n-1;
128 calc_first_octant(n<<2, p);
129 int i4=0, in=n, i=0;
130 for (; i4<=in-i4; ++i, i4+=4) // octant 0
131 {
132 res[2*i] = p[2*i4]; res[2*i+1] = p[2*i4+1];
133 }
134 for (; i4-in <= 0; ++i, i4+=4) // octant 1
135 {
136 int xm = in-i4;
137 res[2*i] = p[2*xm+1]; res[2*i+1] = p[2*xm];
138 }
139 for (; i4<=3*in-i4; ++i, i4+=4) // octant 2
140 {
141 int xm = i4-in;
142 res[2*i] = -p[2*xm+1]; res[2*i+1] = p[2*xm];
143 }
144 for (; i<ndone; ++i, i4+=4) // octant 3
145 {
146 int xm = 2*in-i4;
147 res[2*i] = -p[2*xm]; res[2*i+1] = p[2*xm+1];
148 }
149 }
150
fill_first_quadrant(size_t n,double * restrict res)151 NOINLINE static void fill_first_quadrant(size_t n, double * restrict res)
152 {
153 const double hsqt2 = 0.707106781186547524400844362104849;
154 size_t quart = n>>2;
155 if ((n&7)==0)
156 res[quart] = res[quart+1] = hsqt2;
157 for (size_t i=2, j=2*quart-2; i<quart; i+=2, j-=2)
158 {
159 res[j ] = res[i+1];
160 res[j+1] = res[i ];
161 }
162 }
163
fill_first_half(size_t n,double * restrict res)164 NOINLINE static void fill_first_half(size_t n, double * restrict res)
165 {
166 size_t half = n>>1;
167 if ((n&3)==0)
168 for (size_t i=0; i<half; i+=2)
169 {
170 res[i+half] = -res[i+1];
171 res[i+half+1] = res[i ];
172 }
173 else
174 for (size_t i=2, j=2*half-2; i<half; i+=2, j-=2)
175 {
176 res[j ] = -res[i ];
177 res[j+1] = res[i+1];
178 }
179 }
180
fill_second_half(size_t n,double * restrict res)181 NOINLINE static void fill_second_half(size_t n, double * restrict res)
182 {
183 if ((n&1)==0)
184 for (size_t i=0; i<n; ++i)
185 res[i+n] = -res[i];
186 else
187 for (size_t i=2, j=2*n-2; i<n; i+=2, j-=2)
188 {
189 res[j ] = res[i ];
190 res[j+1] = -res[i+1];
191 }
192 }
193
sincos_2pibyn_half(size_t n,double * restrict res)194 NOINLINE static void sincos_2pibyn_half(size_t n, double * restrict res)
195 {
196 if ((n&3)==0)
197 {
198 calc_first_octant(n, res);
199 fill_first_quadrant(n, res);
200 fill_first_half(n, res);
201 }
202 else if ((n&1)==0)
203 {
204 calc_first_quadrant(n, res);
205 fill_first_half(n, res);
206 }
207 else
208 calc_first_half(n, res);
209 }
210
sincos_2pibyn(size_t n,double * restrict res)211 NOINLINE static void sincos_2pibyn(size_t n, double * restrict res)
212 {
213 sincos_2pibyn_half(n, res);
214 fill_second_half(n, res);
215 }
216
largest_prime_factor(size_t n)217 NOINLINE static size_t largest_prime_factor (size_t n)
218 {
219 size_t res=1;
220 size_t tmp;
221 while (((tmp=(n>>1))<<1)==n)
222 { res=2; n=tmp; }
223
224 size_t limit=(size_t)sqrt(n+0.01);
225 for (size_t x=3; x<=limit; x+=2)
226 while (((tmp=(n/x))*x)==n)
227 {
228 res=x;
229 n=tmp;
230 limit=(size_t)sqrt(n+0.01);
231 }
232 if (n>1) res=n;
233
234 return res;
235 }
236
cost_guess(size_t n)237 NOINLINE static double cost_guess (size_t n)
238 {
239 const double lfp=1.1; // penalty for non-hardcoded larger factors
240 size_t ni=n;
241 double result=0.;
242 size_t tmp;
243 while (((tmp=(n>>1))<<1)==n)
244 { result+=2; n=tmp; }
245
246 size_t limit=(size_t)sqrt(n+0.01);
247 for (size_t x=3; x<=limit; x+=2)
248 while ((tmp=(n/x))*x==n)
249 {
250 result+= (x<=5) ? x : lfp*x; // penalize larger prime factors
251 n=tmp;
252 limit=(size_t)sqrt(n+0.01);
253 }
254 if (n>1) result+=(n<=5) ? n : lfp*n;
255
256 return result*ni;
257 }
258
259 /* returns the smallest composite of 2, 3, 5, 7 and 11 which is >= n */
good_size(size_t n)260 NOINLINE static size_t good_size(size_t n)
261 {
262 if (n<=6) return n;
263
264 size_t bestfac=2*n;
265 for (size_t f2=1; f2<bestfac; f2*=2)
266 for (size_t f23=f2; f23<bestfac; f23*=3)
267 for (size_t f235=f23; f235<bestfac; f235*=5)
268 for (size_t f2357=f235; f2357<bestfac; f2357*=7)
269 for (size_t f235711=f2357; f235711<bestfac; f235711*=11)
270 if (f235711>=n) bestfac=f235711;
271 return bestfac;
272 }
273
274 typedef struct cmplx {
275 double r,i;
276 } cmplx;
277
278 #define NFCT 25
279 typedef struct cfftp_fctdata
280 {
281 size_t fct;
282 cmplx *tw, *tws;
283 } cfftp_fctdata;
284
285 typedef struct cfftp_plan_i
286 {
287 size_t length, nfct;
288 cmplx *mem;
289 cfftp_fctdata fct[NFCT];
290 } cfftp_plan_i;
291 typedef struct cfftp_plan_i * cfftp_plan;
292
293 #define PMC(a,b,c,d) { a.r=c.r+d.r; a.i=c.i+d.i; b.r=c.r-d.r; b.i=c.i-d.i; }
294 #define ADDC(a,b,c) { a.r=b.r+c.r; a.i=b.i+c.i; }
295 #define SCALEC(a,b) { a.r*=b; a.i*=b; }
296 #define ROT90(a) { double tmp_=a.r; a.r=-a.i; a.i=tmp_; }
297 #define ROTM90(a) { double tmp_=-a.r; a.r=a.i; a.i=tmp_; }
298 #define CH(a,b,c) ch[(a)+ido*((b)+l1*(c))]
299 #define CC(a,b,c) cc[(a)+ido*((b)+cdim*(c))]
300 #define WA(x,i) wa[(i)-1+(x)*(ido-1)]
301 /* a = b*c */
302 #define A_EQ_B_MUL_C(a,b,c) { a.r=b.r*c.r-b.i*c.i; a.i=b.r*c.i+b.i*c.r; }
303 /* a = conj(b)*c*/
304 #define A_EQ_CB_MUL_C(a,b,c) { a.r=b.r*c.r+b.i*c.i; a.i=b.r*c.i-b.i*c.r; }
305
306 #define PMSIGNC(a,b,c,d) { a.r=c.r+sign*d.r; a.i=c.i+sign*d.i; b.r=c.r-sign*d.r; b.i=c.i-sign*d.i; }
307 /* a = b*c */
308 #define MULPMSIGNC(a,b,c) { a.r=b.r*c.r-sign*b.i*c.i; a.i=b.r*c.i+sign*b.i*c.r; }
309 /* a *= b */
310 #define MULPMSIGNCEQ(a,b) { double xtmp=a.r; a.r=b.r*a.r-sign*b.i*a.i; a.i=b.r*a.i+sign*b.i*xtmp; }
311
pass2b(size_t ido,size_t l1,const cmplx * restrict cc,cmplx * restrict ch,const cmplx * restrict wa)312 NOINLINE static void pass2b (size_t ido, size_t l1, const cmplx * restrict cc,
313 cmplx * restrict ch, const cmplx * restrict wa)
314 {
315 const size_t cdim=2;
316
317 if (ido==1)
318 for (size_t k=0; k<l1; ++k)
319 PMC (CH(0,k,0),CH(0,k,1),CC(0,0,k),CC(0,1,k))
320 else
321 for (size_t k=0; k<l1; ++k)
322 {
323 PMC (CH(0,k,0),CH(0,k,1),CC(0,0,k),CC(0,1,k))
324 for (size_t i=1; i<ido; ++i)
325 {
326 cmplx t;
327 PMC (CH(i,k,0),t,CC(i,0,k),CC(i,1,k))
328 A_EQ_B_MUL_C (CH(i,k,1),WA(0,i),t)
329 }
330 }
331 }
332
pass2f(size_t ido,size_t l1,const cmplx * restrict cc,cmplx * restrict ch,const cmplx * restrict wa)333 NOINLINE static void pass2f (size_t ido, size_t l1, const cmplx * restrict cc,
334 cmplx * restrict ch, const cmplx * restrict wa)
335 {
336 const size_t cdim=2;
337
338 if (ido==1)
339 for (size_t k=0; k<l1; ++k)
340 PMC (CH(0,k,0),CH(0,k,1),CC(0,0,k),CC(0,1,k))
341 else
342 for (size_t k=0; k<l1; ++k)
343 {
344 PMC (CH(0,k,0),CH(0,k,1),CC(0,0,k),CC(0,1,k))
345 for (size_t i=1; i<ido; ++i)
346 {
347 cmplx t;
348 PMC (CH(i,k,0),t,CC(i,0,k),CC(i,1,k))
349 A_EQ_CB_MUL_C (CH(i,k,1),WA(0,i),t)
350 }
351 }
352 }
353
354 #define PREP3(idx) \
355 cmplx t0 = CC(idx,0,k), t1, t2; \
356 PMC (t1,t2,CC(idx,1,k),CC(idx,2,k)) \
357 CH(idx,k,0).r=t0.r+t1.r; \
358 CH(idx,k,0).i=t0.i+t1.i;
359 #define PARTSTEP3a(u1,u2,twr,twi) \
360 { \
361 cmplx ca,cb; \
362 ca.r=t0.r+twr*t1.r; \
363 ca.i=t0.i+twr*t1.i; \
364 cb.i=twi*t2.r; \
365 cb.r=-(twi*t2.i); \
366 PMC(CH(0,k,u1),CH(0,k,u2),ca,cb) \
367 }
368
369 #define PARTSTEP3b(u1,u2,twr,twi) \
370 { \
371 cmplx ca,cb,da,db; \
372 ca.r=t0.r+twr*t1.r; \
373 ca.i=t0.i+twr*t1.i; \
374 cb.i=twi*t2.r; \
375 cb.r=-(twi*t2.i); \
376 PMC(da,db,ca,cb) \
377 A_EQ_B_MUL_C (CH(i,k,u1),WA(u1-1,i),da) \
378 A_EQ_B_MUL_C (CH(i,k,u2),WA(u2-1,i),db) \
379 }
pass3b(size_t ido,size_t l1,const cmplx * restrict cc,cmplx * restrict ch,const cmplx * restrict wa)380 NOINLINE static void pass3b (size_t ido, size_t l1, const cmplx * restrict cc,
381 cmplx * restrict ch, const cmplx * restrict wa)
382 {
383 const size_t cdim=3;
384 const double tw1r=-0.5, tw1i= 0.86602540378443864676;
385
386 if (ido==1)
387 for (size_t k=0; k<l1; ++k)
388 {
389 PREP3(0)
390 PARTSTEP3a(1,2,tw1r,tw1i)
391 }
392 else
393 for (size_t k=0; k<l1; ++k)
394 {
395 {
396 PREP3(0)
397 PARTSTEP3a(1,2,tw1r,tw1i)
398 }
399 for (size_t i=1; i<ido; ++i)
400 {
401 PREP3(i)
402 PARTSTEP3b(1,2,tw1r,tw1i)
403 }
404 }
405 }
406 #define PARTSTEP3f(u1,u2,twr,twi) \
407 { \
408 cmplx ca,cb,da,db; \
409 ca.r=t0.r+twr*t1.r; \
410 ca.i=t0.i+twr*t1.i; \
411 cb.i=twi*t2.r; \
412 cb.r=-(twi*t2.i); \
413 PMC(da,db,ca,cb) \
414 A_EQ_CB_MUL_C (CH(i,k,u1),WA(u1-1,i),da) \
415 A_EQ_CB_MUL_C (CH(i,k,u2),WA(u2-1,i),db) \
416 }
pass3f(size_t ido,size_t l1,const cmplx * restrict cc,cmplx * restrict ch,const cmplx * restrict wa)417 NOINLINE static void pass3f (size_t ido, size_t l1, const cmplx * restrict cc,
418 cmplx * restrict ch, const cmplx * restrict wa)
419 {
420 const size_t cdim=3;
421 const double tw1r=-0.5, tw1i= -0.86602540378443864676;
422
423 if (ido==1)
424 for (size_t k=0; k<l1; ++k)
425 {
426 PREP3(0)
427 PARTSTEP3a(1,2,tw1r,tw1i)
428 }
429 else
430 for (size_t k=0; k<l1; ++k)
431 {
432 {
433 PREP3(0)
434 PARTSTEP3a(1,2,tw1r,tw1i)
435 }
436 for (size_t i=1; i<ido; ++i)
437 {
438 PREP3(i)
439 PARTSTEP3f(1,2,tw1r,tw1i)
440 }
441 }
442 }
443
pass4b(size_t ido,size_t l1,const cmplx * restrict cc,cmplx * restrict ch,const cmplx * restrict wa)444 NOINLINE static void pass4b (size_t ido, size_t l1, const cmplx * restrict cc,
445 cmplx * restrict ch, const cmplx * restrict wa)
446 {
447 const size_t cdim=4;
448
449 if (ido==1)
450 for (size_t k=0; k<l1; ++k)
451 {
452 cmplx t1, t2, t3, t4;
453 PMC(t2,t1,CC(0,0,k),CC(0,2,k))
454 PMC(t3,t4,CC(0,1,k),CC(0,3,k))
455 ROT90(t4)
456 PMC(CH(0,k,0),CH(0,k,2),t2,t3)
457 PMC(CH(0,k,1),CH(0,k,3),t1,t4)
458 }
459 else
460 for (size_t k=0; k<l1; ++k)
461 {
462 {
463 cmplx t1, t2, t3, t4;
464 PMC(t2,t1,CC(0,0,k),CC(0,2,k))
465 PMC(t3,t4,CC(0,1,k),CC(0,3,k))
466 ROT90(t4)
467 PMC(CH(0,k,0),CH(0,k,2),t2,t3)
468 PMC(CH(0,k,1),CH(0,k,3),t1,t4)
469 }
470 for (size_t i=1; i<ido; ++i)
471 {
472 cmplx c2, c3, c4, t1, t2, t3, t4;
473 cmplx cc0=CC(i,0,k), cc1=CC(i,1,k),cc2=CC(i,2,k),cc3=CC(i,3,k);
474 PMC(t2,t1,cc0,cc2)
475 PMC(t3,t4,cc1,cc3)
476 ROT90(t4)
477 cmplx wa0=WA(0,i), wa1=WA(1,i),wa2=WA(2,i);
478 PMC(CH(i,k,0),c3,t2,t3)
479 PMC(c2,c4,t1,t4)
480 A_EQ_B_MUL_C (CH(i,k,1),wa0,c2)
481 A_EQ_B_MUL_C (CH(i,k,2),wa1,c3)
482 A_EQ_B_MUL_C (CH(i,k,3),wa2,c4)
483 }
484 }
485 }
pass4f(size_t ido,size_t l1,const cmplx * restrict cc,cmplx * restrict ch,const cmplx * restrict wa)486 NOINLINE static void pass4f (size_t ido, size_t l1, const cmplx * restrict cc,
487 cmplx * restrict ch, const cmplx * restrict wa)
488 {
489 const size_t cdim=4;
490
491 if (ido==1)
492 for (size_t k=0; k<l1; ++k)
493 {
494 cmplx t1, t2, t3, t4;
495 PMC(t2,t1,CC(0,0,k),CC(0,2,k))
496 PMC(t3,t4,CC(0,1,k),CC(0,3,k))
497 ROTM90(t4)
498 PMC(CH(0,k,0),CH(0,k,2),t2,t3)
499 PMC(CH(0,k,1),CH(0,k,3),t1,t4)
500 }
501 else
502 for (size_t k=0; k<l1; ++k)
503 {
504 {
505 cmplx t1, t2, t3, t4;
506 PMC(t2,t1,CC(0,0,k),CC(0,2,k))
507 PMC(t3,t4,CC(0,1,k),CC(0,3,k))
508 ROTM90(t4)
509 PMC(CH(0,k,0),CH(0,k,2),t2,t3)
510 PMC (CH(0,k,1),CH(0,k,3),t1,t4)
511 }
512 for (size_t i=1; i<ido; ++i)
513 {
514 cmplx c2, c3, c4, t1, t2, t3, t4;
515 cmplx cc0=CC(i,0,k), cc1=CC(i,1,k),cc2=CC(i,2,k),cc3=CC(i,3,k);
516 PMC(t2,t1,cc0,cc2)
517 PMC(t3,t4,cc1,cc3)
518 ROTM90(t4)
519 cmplx wa0=WA(0,i), wa1=WA(1,i),wa2=WA(2,i);
520 PMC(CH(i,k,0),c3,t2,t3)
521 PMC(c2,c4,t1,t4)
522 A_EQ_CB_MUL_C (CH(i,k,1),wa0,c2)
523 A_EQ_CB_MUL_C (CH(i,k,2),wa1,c3)
524 A_EQ_CB_MUL_C (CH(i,k,3),wa2,c4)
525 }
526 }
527 }
528
529 #define PREP5(idx) \
530 cmplx t0 = CC(idx,0,k), t1, t2, t3, t4; \
531 PMC (t1,t4,CC(idx,1,k),CC(idx,4,k)) \
532 PMC (t2,t3,CC(idx,2,k),CC(idx,3,k)) \
533 CH(idx,k,0).r=t0.r+t1.r+t2.r; \
534 CH(idx,k,0).i=t0.i+t1.i+t2.i;
535
536 #define PARTSTEP5a(u1,u2,twar,twbr,twai,twbi) \
537 { \
538 cmplx ca,cb; \
539 ca.r=t0.r+twar*t1.r+twbr*t2.r; \
540 ca.i=t0.i+twar*t1.i+twbr*t2.i; \
541 cb.i=twai*t4.r twbi*t3.r; \
542 cb.r=-(twai*t4.i twbi*t3.i); \
543 PMC(CH(0,k,u1),CH(0,k,u2),ca,cb) \
544 }
545
546 #define PARTSTEP5b(u1,u2,twar,twbr,twai,twbi) \
547 { \
548 cmplx ca,cb,da,db; \
549 ca.r=t0.r+twar*t1.r+twbr*t2.r; \
550 ca.i=t0.i+twar*t1.i+twbr*t2.i; \
551 cb.i=twai*t4.r twbi*t3.r; \
552 cb.r=-(twai*t4.i twbi*t3.i); \
553 PMC(da,db,ca,cb) \
554 A_EQ_B_MUL_C (CH(i,k,u1),WA(u1-1,i),da) \
555 A_EQ_B_MUL_C (CH(i,k,u2),WA(u2-1,i),db) \
556 }
pass5b(size_t ido,size_t l1,const cmplx * restrict cc,cmplx * restrict ch,const cmplx * restrict wa)557 NOINLINE static void pass5b (size_t ido, size_t l1, const cmplx * restrict cc,
558 cmplx * restrict ch, const cmplx * restrict wa)
559 {
560 const size_t cdim=5;
561 const double tw1r= 0.3090169943749474241,
562 tw1i= 0.95105651629515357212,
563 tw2r= -0.8090169943749474241,
564 tw2i= 0.58778525229247312917;
565
566 if (ido==1)
567 for (size_t k=0; k<l1; ++k)
568 {
569 PREP5(0)
570 PARTSTEP5a(1,4,tw1r,tw2r,+tw1i,+tw2i)
571 PARTSTEP5a(2,3,tw2r,tw1r,+tw2i,-tw1i)
572 }
573 else
574 for (size_t k=0; k<l1; ++k)
575 {
576 {
577 PREP5(0)
578 PARTSTEP5a(1,4,tw1r,tw2r,+tw1i,+tw2i)
579 PARTSTEP5a(2,3,tw2r,tw1r,+tw2i,-tw1i)
580 }
581 for (size_t i=1; i<ido; ++i)
582 {
583 PREP5(i)
584 PARTSTEP5b(1,4,tw1r,tw2r,+tw1i,+tw2i)
585 PARTSTEP5b(2,3,tw2r,tw1r,+tw2i,-tw1i)
586 }
587 }
588 }
589 #define PARTSTEP5f(u1,u2,twar,twbr,twai,twbi) \
590 { \
591 cmplx ca,cb,da,db; \
592 ca.r=t0.r+twar*t1.r+twbr*t2.r; \
593 ca.i=t0.i+twar*t1.i+twbr*t2.i; \
594 cb.i=twai*t4.r twbi*t3.r; \
595 cb.r=-(twai*t4.i twbi*t3.i); \
596 PMC(da,db,ca,cb) \
597 A_EQ_CB_MUL_C (CH(i,k,u1),WA(u1-1,i),da) \
598 A_EQ_CB_MUL_C (CH(i,k,u2),WA(u2-1,i),db) \
599 }
pass5f(size_t ido,size_t l1,const cmplx * restrict cc,cmplx * restrict ch,const cmplx * restrict wa)600 NOINLINE static void pass5f (size_t ido, size_t l1, const cmplx * restrict cc,
601 cmplx * restrict ch, const cmplx * restrict wa)
602 {
603 const size_t cdim=5;
604 const double tw1r= 0.3090169943749474241,
605 tw1i= -0.95105651629515357212,
606 tw2r= -0.8090169943749474241,
607 tw2i= -0.58778525229247312917;
608
609 if (ido==1)
610 for (size_t k=0; k<l1; ++k)
611 {
612 PREP5(0)
613 PARTSTEP5a(1,4,tw1r,tw2r,+tw1i,+tw2i)
614 PARTSTEP5a(2,3,tw2r,tw1r,+tw2i,-tw1i)
615 }
616 else
617 for (size_t k=0; k<l1; ++k)
618 {
619 {
620 PREP5(0)
621 PARTSTEP5a(1,4,tw1r,tw2r,+tw1i,+tw2i)
622 PARTSTEP5a(2,3,tw2r,tw1r,+tw2i,-tw1i)
623 }
624 for (size_t i=1; i<ido; ++i)
625 {
626 PREP5(i)
627 PARTSTEP5f(1,4,tw1r,tw2r,+tw1i,+tw2i)
628 PARTSTEP5f(2,3,tw2r,tw1r,+tw2i,-tw1i)
629 }
630 }
631 }
632
633 #define PREP7(idx) \
634 cmplx t1 = CC(idx,0,k), t2, t3, t4, t5, t6, t7; \
635 PMC (t2,t7,CC(idx,1,k),CC(idx,6,k)) \
636 PMC (t3,t6,CC(idx,2,k),CC(idx,5,k)) \
637 PMC (t4,t5,CC(idx,3,k),CC(idx,4,k)) \
638 CH(idx,k,0).r=t1.r+t2.r+t3.r+t4.r; \
639 CH(idx,k,0).i=t1.i+t2.i+t3.i+t4.i;
640
641 #define PARTSTEP7a0(u1,u2,x1,x2,x3,y1,y2,y3,out1,out2) \
642 { \
643 cmplx ca,cb; \
644 ca.r=t1.r+x1*t2.r+x2*t3.r+x3*t4.r; \
645 ca.i=t1.i+x1*t2.i+x2*t3.i+x3*t4.i; \
646 cb.i=y1*t7.r y2*t6.r y3*t5.r; \
647 cb.r=-(y1*t7.i y2*t6.i y3*t5.i); \
648 PMC(out1,out2,ca,cb) \
649 }
650 #define PARTSTEP7a(u1,u2,x1,x2,x3,y1,y2,y3) \
651 PARTSTEP7a0(u1,u2,x1,x2,x3,y1,y2,y3,CH(0,k,u1),CH(0,k,u2))
652 #define PARTSTEP7(u1,u2,x1,x2,x3,y1,y2,y3) \
653 { \
654 cmplx da,db; \
655 PARTSTEP7a0(u1,u2,x1,x2,x3,y1,y2,y3,da,db) \
656 MULPMSIGNC (CH(i,k,u1),WA(u1-1,i),da) \
657 MULPMSIGNC (CH(i,k,u2),WA(u2-1,i),db) \
658 }
659
pass7(size_t ido,size_t l1,const cmplx * restrict cc,cmplx * restrict ch,const cmplx * restrict wa,const int sign)660 NOINLINE static void pass7(size_t ido, size_t l1, const cmplx * restrict cc,
661 cmplx * restrict ch, const cmplx * restrict wa, const int sign)
662 {
663 const size_t cdim=7;
664 const double tw1r= 0.623489801858733530525,
665 tw1i= sign * 0.7818314824680298087084,
666 tw2r= -0.222520933956314404289,
667 tw2i= sign * 0.9749279121818236070181,
668 tw3r= -0.9009688679024191262361,
669 tw3i= sign * 0.4338837391175581204758;
670
671 if (ido==1)
672 for (size_t k=0; k<l1; ++k)
673 {
674 PREP7(0)
675 PARTSTEP7a(1,6,tw1r,tw2r,tw3r,+tw1i,+tw2i,+tw3i)
676 PARTSTEP7a(2,5,tw2r,tw3r,tw1r,+tw2i,-tw3i,-tw1i)
677 PARTSTEP7a(3,4,tw3r,tw1r,tw2r,+tw3i,-tw1i,+tw2i)
678 }
679 else
680 for (size_t k=0; k<l1; ++k)
681 {
682 {
683 PREP7(0)
684 PARTSTEP7a(1,6,tw1r,tw2r,tw3r,+tw1i,+tw2i,+tw3i)
685 PARTSTEP7a(2,5,tw2r,tw3r,tw1r,+tw2i,-tw3i,-tw1i)
686 PARTSTEP7a(3,4,tw3r,tw1r,tw2r,+tw3i,-tw1i,+tw2i)
687 }
688 for (size_t i=1; i<ido; ++i)
689 {
690 PREP7(i)
691 PARTSTEP7(1,6,tw1r,tw2r,tw3r,+tw1i,+tw2i,+tw3i)
692 PARTSTEP7(2,5,tw2r,tw3r,tw1r,+tw2i,-tw3i,-tw1i)
693 PARTSTEP7(3,4,tw3r,tw1r,tw2r,+tw3i,-tw1i,+tw2i)
694 }
695 }
696 }
697
698 #define PREP11(idx) \
699 cmplx t1 = CC(idx,0,k), t2, t3, t4, t5, t6, t7, t8, t9, t10, t11; \
700 PMC (t2,t11,CC(idx,1,k),CC(idx,10,k)) \
701 PMC (t3,t10,CC(idx,2,k),CC(idx, 9,k)) \
702 PMC (t4,t9 ,CC(idx,3,k),CC(idx, 8,k)) \
703 PMC (t5,t8 ,CC(idx,4,k),CC(idx, 7,k)) \
704 PMC (t6,t7 ,CC(idx,5,k),CC(idx, 6,k)) \
705 CH(idx,k,0).r=t1.r+t2.r+t3.r+t4.r+t5.r+t6.r; \
706 CH(idx,k,0).i=t1.i+t2.i+t3.i+t4.i+t5.i+t6.i;
707
708 #define PARTSTEP11a0(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5,out1,out2) \
709 { \
710 cmplx ca,cb; \
711 ca.r=t1.r+x1*t2.r+x2*t3.r+x3*t4.r+x4*t5.r+x5*t6.r; \
712 ca.i=t1.i+x1*t2.i+x2*t3.i+x3*t4.i+x4*t5.i+x5*t6.i; \
713 cb.i=y1*t11.r y2*t10.r y3*t9.r y4*t8.r y5*t7.r; \
714 cb.r=-(y1*t11.i y2*t10.i y3*t9.i y4*t8.i y5*t7.i ); \
715 PMC(out1,out2,ca,cb) \
716 }
717 #define PARTSTEP11a(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5) \
718 PARTSTEP11a0(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5,CH(0,k,u1),CH(0,k,u2))
719 #define PARTSTEP11(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5) \
720 { \
721 cmplx da,db; \
722 PARTSTEP11a0(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5,da,db) \
723 MULPMSIGNC (CH(i,k,u1),WA(u1-1,i),da) \
724 MULPMSIGNC (CH(i,k,u2),WA(u2-1,i),db) \
725 }
726
pass11(size_t ido,size_t l1,const cmplx * restrict cc,cmplx * restrict ch,const cmplx * restrict wa,const int sign)727 NOINLINE static void pass11 (size_t ido, size_t l1, const cmplx * restrict cc,
728 cmplx * restrict ch, const cmplx * restrict wa, const int sign)
729 {
730 const size_t cdim=11;
731 const double tw1r = 0.8412535328311811688618,
732 tw1i = sign * 0.5406408174555975821076,
733 tw2r = 0.4154150130018864255293,
734 tw2i = sign * 0.9096319953545183714117,
735 tw3r = -0.1423148382732851404438,
736 tw3i = sign * 0.9898214418809327323761,
737 tw4r = -0.6548607339452850640569,
738 tw4i = sign * 0.755749574354258283774,
739 tw5r = -0.9594929736144973898904,
740 tw5i = sign * 0.2817325568414296977114;
741
742 if (ido==1)
743 for (size_t k=0; k<l1; ++k)
744 {
745 PREP11(0)
746 PARTSTEP11a(1,10,tw1r,tw2r,tw3r,tw4r,tw5r,+tw1i,+tw2i,+tw3i,+tw4i,+tw5i)
747 PARTSTEP11a(2, 9,tw2r,tw4r,tw5r,tw3r,tw1r,+tw2i,+tw4i,-tw5i,-tw3i,-tw1i)
748 PARTSTEP11a(3, 8,tw3r,tw5r,tw2r,tw1r,tw4r,+tw3i,-tw5i,-tw2i,+tw1i,+tw4i)
749 PARTSTEP11a(4, 7,tw4r,tw3r,tw1r,tw5r,tw2r,+tw4i,-tw3i,+tw1i,+tw5i,-tw2i)
750 PARTSTEP11a(5, 6,tw5r,tw1r,tw4r,tw2r,tw3r,+tw5i,-tw1i,+tw4i,-tw2i,+tw3i)
751 }
752 else
753 for (size_t k=0; k<l1; ++k)
754 {
755 {
756 PREP11(0)
757 PARTSTEP11a(1,10,tw1r,tw2r,tw3r,tw4r,tw5r,+tw1i,+tw2i,+tw3i,+tw4i,+tw5i)
758 PARTSTEP11a(2, 9,tw2r,tw4r,tw5r,tw3r,tw1r,+tw2i,+tw4i,-tw5i,-tw3i,-tw1i)
759 PARTSTEP11a(3, 8,tw3r,tw5r,tw2r,tw1r,tw4r,+tw3i,-tw5i,-tw2i,+tw1i,+tw4i)
760 PARTSTEP11a(4, 7,tw4r,tw3r,tw1r,tw5r,tw2r,+tw4i,-tw3i,+tw1i,+tw5i,-tw2i)
761 PARTSTEP11a(5, 6,tw5r,tw1r,tw4r,tw2r,tw3r,+tw5i,-tw1i,+tw4i,-tw2i,+tw3i)
762 }
763 for (size_t i=1; i<ido; ++i)
764 {
765 PREP11(i)
766 PARTSTEP11(1,10,tw1r,tw2r,tw3r,tw4r,tw5r,+tw1i,+tw2i,+tw3i,+tw4i,+tw5i)
767 PARTSTEP11(2, 9,tw2r,tw4r,tw5r,tw3r,tw1r,+tw2i,+tw4i,-tw5i,-tw3i,-tw1i)
768 PARTSTEP11(3, 8,tw3r,tw5r,tw2r,tw1r,tw4r,+tw3i,-tw5i,-tw2i,+tw1i,+tw4i)
769 PARTSTEP11(4, 7,tw4r,tw3r,tw1r,tw5r,tw2r,+tw4i,-tw3i,+tw1i,+tw5i,-tw2i)
770 PARTSTEP11(5, 6,tw5r,tw1r,tw4r,tw2r,tw3r,+tw5i,-tw1i,+tw4i,-tw2i,+tw3i)
771 }
772 }
773 }
774
775 #define CX(a,b,c) cc[(a)+ido*((b)+l1*(c))]
776 #define CX2(a,b) cc[(a)+idl1*(b)]
777 #define CH2(a,b) ch[(a)+idl1*(b)]
778
passg(size_t ido,size_t ip,size_t l1,cmplx * restrict cc,cmplx * restrict ch,const cmplx * restrict wa,const cmplx * restrict csarr,const int sign)779 NOINLINE static int passg (size_t ido, size_t ip, size_t l1,
780 cmplx * restrict cc, cmplx * restrict ch, const cmplx * restrict wa,
781 const cmplx * restrict csarr, const int sign)
782 {
783 const size_t cdim=ip;
784 size_t ipph = (ip+1)/2;
785 size_t idl1 = ido*l1;
786
787 cmplx * restrict wal=RALLOC(cmplx,ip);
788 if (!wal) return -1;
789 wal[0]=(cmplx){1.,0.};
790 for (size_t i=1; i<ip; ++i)
791 wal[i]=(cmplx){csarr[i].r,sign*csarr[i].i};
792
793 for (size_t k=0; k<l1; ++k)
794 for (size_t i=0; i<ido; ++i)
795 CH(i,k,0) = CC(i,0,k);
796 for (size_t j=1, jc=ip-1; j<ipph; ++j, --jc)
797 for (size_t k=0; k<l1; ++k)
798 for (size_t i=0; i<ido; ++i)
799 PMC(CH(i,k,j),CH(i,k,jc),CC(i,j,k),CC(i,jc,k))
800 for (size_t k=0; k<l1; ++k)
801 for (size_t i=0; i<ido; ++i)
802 {
803 cmplx tmp = CH(i,k,0);
804 for (size_t j=1; j<ipph; ++j)
805 ADDC(tmp,tmp,CH(i,k,j))
806 CX(i,k,0) = tmp;
807 }
808 for (size_t l=1, lc=ip-1; l<ipph; ++l, --lc)
809 {
810 // j=0
811 for (size_t ik=0; ik<idl1; ++ik)
812 {
813 CX2(ik,l).r = CH2(ik,0).r+wal[l].r*CH2(ik,1).r+wal[2*l].r*CH2(ik,2).r;
814 CX2(ik,l).i = CH2(ik,0).i+wal[l].r*CH2(ik,1).i+wal[2*l].r*CH2(ik,2).i;
815 CX2(ik,lc).r=-wal[l].i*CH2(ik,ip-1).i-wal[2*l].i*CH2(ik,ip-2).i;
816 CX2(ik,lc).i=wal[l].i*CH2(ik,ip-1).r+wal[2*l].i*CH2(ik,ip-2).r;
817 }
818
819 size_t iwal=2*l;
820 size_t j=3, jc=ip-3;
821 for (; j<ipph-1; j+=2, jc-=2)
822 {
823 iwal+=l; if (iwal>ip) iwal-=ip;
824 cmplx xwal=wal[iwal];
825 iwal+=l; if (iwal>ip) iwal-=ip;
826 cmplx xwal2=wal[iwal];
827 for (size_t ik=0; ik<idl1; ++ik)
828 {
829 CX2(ik,l).r += CH2(ik,j).r*xwal.r+CH2(ik,j+1).r*xwal2.r;
830 CX2(ik,l).i += CH2(ik,j).i*xwal.r+CH2(ik,j+1).i*xwal2.r;
831 CX2(ik,lc).r -= CH2(ik,jc).i*xwal.i+CH2(ik,jc-1).i*xwal2.i;
832 CX2(ik,lc).i += CH2(ik,jc).r*xwal.i+CH2(ik,jc-1).r*xwal2.i;
833 }
834 }
835 for (; j<ipph; ++j, --jc)
836 {
837 iwal+=l; if (iwal>ip) iwal-=ip;
838 cmplx xwal=wal[iwal];
839 for (size_t ik=0; ik<idl1; ++ik)
840 {
841 CX2(ik,l).r += CH2(ik,j).r*xwal.r;
842 CX2(ik,l).i += CH2(ik,j).i*xwal.r;
843 CX2(ik,lc).r -= CH2(ik,jc).i*xwal.i;
844 CX2(ik,lc).i += CH2(ik,jc).r*xwal.i;
845 }
846 }
847 }
848 DEALLOC(wal);
849
850 // shuffling and twiddling
851 if (ido==1)
852 for (size_t j=1, jc=ip-1; j<ipph; ++j, --jc)
853 for (size_t ik=0; ik<idl1; ++ik)
854 {
855 cmplx t1=CX2(ik,j), t2=CX2(ik,jc);
856 PMC(CX2(ik,j),CX2(ik,jc),t1,t2)
857 }
858 else
859 {
860 for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc)
861 for (size_t k=0; k<l1; ++k)
862 {
863 cmplx t1=CX(0,k,j), t2=CX(0,k,jc);
864 PMC(CX(0,k,j),CX(0,k,jc),t1,t2)
865 for (size_t i=1; i<ido; ++i)
866 {
867 cmplx x1, x2;
868 PMC(x1,x2,CX(i,k,j),CX(i,k,jc))
869 size_t idij=(j-1)*(ido-1)+i-1;
870 MULPMSIGNC (CX(i,k,j),wa[idij],x1)
871 idij=(jc-1)*(ido-1)+i-1;
872 MULPMSIGNC (CX(i,k,jc),wa[idij],x2)
873 }
874 }
875 }
876 return 0;
877 }
878
879 #undef CH2
880 #undef CX2
881 #undef CX
882
pass_all(cfftp_plan plan,cmplx c[],double fct,const int sign)883 NOINLINE WARN_UNUSED_RESULT static int pass_all(cfftp_plan plan, cmplx c[], double fct,
884 const int sign)
885 {
886 if (plan->length==1) return 0;
887 size_t len=plan->length;
888 size_t l1=1, nf=plan->nfct;
889 cmplx *ch = RALLOC(cmplx, len);
890 if (!ch) return -1;
891 cmplx *p1=c, *p2=ch;
892
893 for(size_t k1=0; k1<nf; k1++)
894 {
895 size_t ip=plan->fct[k1].fct;
896 size_t l2=ip*l1;
897 size_t ido = len/l2;
898 if (ip==4)
899 sign>0 ? pass4b (ido, l1, p1, p2, plan->fct[k1].tw)
900 : pass4f (ido, l1, p1, p2, plan->fct[k1].tw);
901 else if(ip==2)
902 sign>0 ? pass2b (ido, l1, p1, p2, plan->fct[k1].tw)
903 : pass2f (ido, l1, p1, p2, plan->fct[k1].tw);
904 else if(ip==3)
905 sign>0 ? pass3b (ido, l1, p1, p2, plan->fct[k1].tw)
906 : pass3f (ido, l1, p1, p2, plan->fct[k1].tw);
907 else if(ip==5)
908 sign>0 ? pass5b (ido, l1, p1, p2, plan->fct[k1].tw)
909 : pass5f (ido, l1, p1, p2, plan->fct[k1].tw);
910 else if(ip==7) pass7 (ido, l1, p1, p2, plan->fct[k1].tw, sign);
911 else if(ip==11) pass11(ido, l1, p1, p2, plan->fct[k1].tw, sign);
912 else
913 {
914 if (passg(ido, ip, l1, p1, p2, plan->fct[k1].tw, plan->fct[k1].tws, sign))
915 { DEALLOC(ch); return -1; }
916 SWAP(p1,p2,cmplx *);
917 }
918 SWAP(p1,p2,cmplx *);
919 l1=l2;
920 }
921 if (p1!=c)
922 {
923 if (fct!=1.)
924 for (size_t i=0; i<len; ++i)
925 {
926 c[i].r = ch[i].r*fct;
927 c[i].i = ch[i].i*fct;
928 }
929 else
930 memcpy (c,p1,len*sizeof(cmplx));
931 }
932 else
933 if (fct!=1.)
934 for (size_t i=0; i<len; ++i)
935 {
936 c[i].r *= fct;
937 c[i].i *= fct;
938 }
939 DEALLOC(ch);
940 return 0;
941 }
942
943 #undef PMSIGNC
944 #undef A_EQ_B_MUL_C
945 #undef A_EQ_CB_MUL_C
946 #undef MULPMSIGNC
947 #undef MULPMSIGNCEQ
948
949 #undef WA
950 #undef CC
951 #undef CH
952 #undef ROT90
953 #undef SCALEC
954 #undef ADDC
955 #undef PMC
956
957 NOINLINE WARN_UNUSED_RESULT
cfftp_forward(cfftp_plan plan,double c[],double fct)958 static int cfftp_forward(cfftp_plan plan, double c[], double fct)
959 { return pass_all(plan,(cmplx *)c, fct, -1); }
960
961 NOINLINE WARN_UNUSED_RESULT
cfftp_backward(cfftp_plan plan,double c[],double fct)962 static int cfftp_backward(cfftp_plan plan, double c[], double fct)
963 { return pass_all(plan,(cmplx *)c, fct, 1); }
964
965 NOINLINE WARN_UNUSED_RESULT
cfftp_factorize(cfftp_plan plan)966 static int cfftp_factorize (cfftp_plan plan)
967 {
968 size_t length=plan->length;
969 size_t nfct=0;
970 while ((length%4)==0)
971 { if (nfct>=NFCT) return -1; plan->fct[nfct++].fct=4; length>>=2; }
972 if ((length%2)==0)
973 {
974 length>>=1;
975 // factor 2 should be at the front of the factor list
976 if (nfct>=NFCT) return -1;
977 plan->fct[nfct++].fct=2;
978 SWAP(plan->fct[0].fct, plan->fct[nfct-1].fct,size_t);
979 }
980 size_t maxl=(size_t)(sqrt((double)length))+1;
981 for (size_t divisor=3; (length>1)&&(divisor<maxl); divisor+=2)
982 if ((length%divisor)==0)
983 {
984 while ((length%divisor)==0)
985 {
986 if (nfct>=NFCT) return -1;
987 plan->fct[nfct++].fct=divisor;
988 length/=divisor;
989 }
990 maxl=(size_t)(sqrt((double)length))+1;
991 }
992 if (length>1) plan->fct[nfct++].fct=length;
993 plan->nfct=nfct;
994 return 0;
995 }
996
cfftp_twsize(cfftp_plan plan)997 NOINLINE static size_t cfftp_twsize (cfftp_plan plan)
998 {
999 size_t twsize=0, l1=1;
1000 for (size_t k=0; k<plan->nfct; ++k)
1001 {
1002 size_t ip=plan->fct[k].fct, ido= plan->length/(l1*ip);
1003 twsize+=(ip-1)*(ido-1);
1004 if (ip>11)
1005 twsize+=ip;
1006 l1*=ip;
1007 }
1008 return twsize;
1009 }
1010
cfftp_comp_twiddle(cfftp_plan plan)1011 NOINLINE WARN_UNUSED_RESULT static int cfftp_comp_twiddle (cfftp_plan plan)
1012 {
1013 size_t length=plan->length;
1014 double *twid = RALLOC(double, 2*length);
1015 if (!twid) return -1;
1016 sincos_2pibyn(length, twid);
1017 size_t l1=1;
1018 size_t memofs=0;
1019 for (size_t k=0; k<plan->nfct; ++k)
1020 {
1021 size_t ip=plan->fct[k].fct, ido= length/(l1*ip);
1022 plan->fct[k].tw=plan->mem+memofs;
1023 memofs+=(ip-1)*(ido-1);
1024 for (size_t j=1; j<ip; ++j)
1025 for (size_t i=1; i<ido; ++i)
1026 {
1027 plan->fct[k].tw[(j-1)*(ido-1)+i-1].r = twid[2*j*l1*i];
1028 plan->fct[k].tw[(j-1)*(ido-1)+i-1].i = twid[2*j*l1*i+1];
1029 }
1030 if (ip>11)
1031 {
1032 plan->fct[k].tws=plan->mem+memofs;
1033 memofs+=ip;
1034 for (size_t j=0; j<ip; ++j)
1035 {
1036 plan->fct[k].tws[j].r = twid[2*j*l1*ido];
1037 plan->fct[k].tws[j].i = twid[2*j*l1*ido+1];
1038 }
1039 }
1040 l1*=ip;
1041 }
1042 DEALLOC(twid);
1043 return 0;
1044 }
1045
make_cfftp_plan(size_t length)1046 static cfftp_plan make_cfftp_plan (size_t length)
1047 {
1048 if (length==0) return NULL;
1049 cfftp_plan plan = RALLOC(cfftp_plan_i,1);
1050 if (!plan) return NULL;
1051 plan->length=length;
1052 plan->nfct=0;
1053 for (size_t i=0; i<NFCT; ++i)
1054 plan->fct[i]=(cfftp_fctdata){0,0,0};
1055 plan->mem=0;
1056 if (length==1) return plan;
1057 if (cfftp_factorize(plan)!=0) { DEALLOC(plan); return NULL; }
1058 size_t tws=cfftp_twsize(plan);
1059 plan->mem=RALLOC(cmplx,tws);
1060 if (!plan->mem) { DEALLOC(plan); return NULL; }
1061 if (cfftp_comp_twiddle(plan)!=0)
1062 { DEALLOC(plan->mem); DEALLOC(plan); return NULL; }
1063 return plan;
1064 }
1065
destroy_cfftp_plan(cfftp_plan plan)1066 static void destroy_cfftp_plan (cfftp_plan plan)
1067 {
1068 DEALLOC(plan->mem);
1069 DEALLOC(plan);
1070 }
1071
1072 typedef struct rfftp_fctdata
1073 {
1074 size_t fct;
1075 double *tw, *tws;
1076 } rfftp_fctdata;
1077
1078 typedef struct rfftp_plan_i
1079 {
1080 size_t length, nfct;
1081 double *mem;
1082 rfftp_fctdata fct[NFCT];
1083 } rfftp_plan_i;
1084 typedef struct rfftp_plan_i * rfftp_plan;
1085
1086 #define WA(x,i) wa[(i)+(x)*(ido-1)]
1087 #define PM(a,b,c,d) { a=c+d; b=c-d; }
1088 /* (a+ib) = conj(c+id) * (e+if) */
1089 #define MULPM(a,b,c,d,e,f) { a=c*e+d*f; b=c*f-d*e; }
1090
1091 #define CC(a,b,c) cc[(a)+ido*((b)+l1*(c))]
1092 #define CH(a,b,c) ch[(a)+ido*((b)+cdim*(c))]
1093
radf2(size_t ido,size_t l1,const double * restrict cc,double * restrict ch,const double * restrict wa)1094 NOINLINE static void radf2 (size_t ido, size_t l1, const double * restrict cc,
1095 double * restrict ch, const double * restrict wa)
1096 {
1097 const size_t cdim=2;
1098
1099 for (size_t k=0; k<l1; k++)
1100 PM (CH(0,0,k),CH(ido-1,1,k),CC(0,k,0),CC(0,k,1))
1101 if ((ido&1)==0)
1102 for (size_t k=0; k<l1; k++)
1103 {
1104 CH( 0,1,k) = -CC(ido-1,k,1);
1105 CH(ido-1,0,k) = CC(ido-1,k,0);
1106 }
1107 if (ido<=2) return;
1108 for (size_t k=0; k<l1; k++)
1109 for (size_t i=2; i<ido; i+=2)
1110 {
1111 size_t ic=ido-i;
1112 double tr2, ti2;
1113 MULPM (tr2,ti2,WA(0,i-2),WA(0,i-1),CC(i-1,k,1),CC(i,k,1))
1114 PM (CH(i-1,0,k),CH(ic-1,1,k),CC(i-1,k,0),tr2)
1115 PM (CH(i ,0,k),CH(ic ,1,k),ti2,CC(i ,k,0))
1116 }
1117 }
1118
radf3(size_t ido,size_t l1,const double * restrict cc,double * restrict ch,const double * restrict wa)1119 NOINLINE static void radf3(size_t ido, size_t l1, const double * restrict cc,
1120 double * restrict ch, const double * restrict wa)
1121 {
1122 const size_t cdim=3;
1123 static const double taur=-0.5, taui=0.86602540378443864676;
1124
1125 for (size_t k=0; k<l1; k++)
1126 {
1127 double cr2=CC(0,k,1)+CC(0,k,2);
1128 CH(0,0,k) = CC(0,k,0)+cr2;
1129 CH(0,2,k) = taui*(CC(0,k,2)-CC(0,k,1));
1130 CH(ido-1,1,k) = CC(0,k,0)+taur*cr2;
1131 }
1132 if (ido==1) return;
1133 for (size_t k=0; k<l1; k++)
1134 for (size_t i=2; i<ido; i+=2)
1135 {
1136 size_t ic=ido-i;
1137 double di2, di3, dr2, dr3;
1138 MULPM (dr2,di2,WA(0,i-2),WA(0,i-1),CC(i-1,k,1),CC(i,k,1)) // d2=conj(WA0)*CC1
1139 MULPM (dr3,di3,WA(1,i-2),WA(1,i-1),CC(i-1,k,2),CC(i,k,2)) // d3=conj(WA1)*CC2
1140 double cr2=dr2+dr3; // c add
1141 double ci2=di2+di3;
1142 CH(i-1,0,k) = CC(i-1,k,0)+cr2; // c add
1143 CH(i ,0,k) = CC(i ,k,0)+ci2;
1144 double tr2 = CC(i-1,k,0)+taur*cr2; // c add
1145 double ti2 = CC(i ,k,0)+taur*ci2;
1146 double tr3 = taui*(di2-di3); // t3 = taui*i*(d3-d2)?
1147 double ti3 = taui*(dr3-dr2);
1148 PM(CH(i-1,2,k),CH(ic-1,1,k),tr2,tr3) // PM(i) = t2+t3
1149 PM(CH(i ,2,k),CH(ic ,1,k),ti3,ti2) // PM(ic) = conj(t2-t3)
1150 }
1151 }
1152
radf4(size_t ido,size_t l1,const double * restrict cc,double * restrict ch,const double * restrict wa)1153 NOINLINE static void radf4(size_t ido, size_t l1, const double * restrict cc,
1154 double * restrict ch, const double * restrict wa)
1155 {
1156 const size_t cdim=4;
1157 static const double hsqt2=0.70710678118654752440;
1158
1159 for (size_t k=0; k<l1; k++)
1160 {
1161 double tr1,tr2;
1162 PM (tr1,CH(0,2,k),CC(0,k,3),CC(0,k,1))
1163 PM (tr2,CH(ido-1,1,k),CC(0,k,0),CC(0,k,2))
1164 PM (CH(0,0,k),CH(ido-1,3,k),tr2,tr1)
1165 }
1166 if ((ido&1)==0)
1167 for (size_t k=0; k<l1; k++)
1168 {
1169 double ti1=-hsqt2*(CC(ido-1,k,1)+CC(ido-1,k,3));
1170 double tr1= hsqt2*(CC(ido-1,k,1)-CC(ido-1,k,3));
1171 PM (CH(ido-1,0,k),CH(ido-1,2,k),CC(ido-1,k,0),tr1)
1172 PM (CH( 0,3,k),CH( 0,1,k),ti1,CC(ido-1,k,2))
1173 }
1174 if (ido<=2) return;
1175 for (size_t k=0; k<l1; k++)
1176 for (size_t i=2; i<ido; i+=2)
1177 {
1178 size_t ic=ido-i;
1179 double ci2, ci3, ci4, cr2, cr3, cr4, ti1, ti2, ti3, ti4, tr1, tr2, tr3, tr4;
1180 MULPM(cr2,ci2,WA(0,i-2),WA(0,i-1),CC(i-1,k,1),CC(i,k,1))
1181 MULPM(cr3,ci3,WA(1,i-2),WA(1,i-1),CC(i-1,k,2),CC(i,k,2))
1182 MULPM(cr4,ci4,WA(2,i-2),WA(2,i-1),CC(i-1,k,3),CC(i,k,3))
1183 PM(tr1,tr4,cr4,cr2)
1184 PM(ti1,ti4,ci2,ci4)
1185 PM(tr2,tr3,CC(i-1,k,0),cr3)
1186 PM(ti2,ti3,CC(i ,k,0),ci3)
1187 PM(CH(i-1,0,k),CH(ic-1,3,k),tr2,tr1)
1188 PM(CH(i ,0,k),CH(ic ,3,k),ti1,ti2)
1189 PM(CH(i-1,2,k),CH(ic-1,1,k),tr3,ti4)
1190 PM(CH(i ,2,k),CH(ic ,1,k),tr4,ti3)
1191 }
1192 }
1193
radf5(size_t ido,size_t l1,const double * restrict cc,double * restrict ch,const double * restrict wa)1194 NOINLINE static void radf5(size_t ido, size_t l1, const double * restrict cc,
1195 double * restrict ch, const double * restrict wa)
1196 {
1197 const size_t cdim=5;
1198 static const double tr11= 0.3090169943749474241, ti11=0.95105651629515357212,
1199 tr12=-0.8090169943749474241, ti12=0.58778525229247312917;
1200
1201 for (size_t k=0; k<l1; k++)
1202 {
1203 double cr2, cr3, ci4, ci5;
1204 PM (cr2,ci5,CC(0,k,4),CC(0,k,1))
1205 PM (cr3,ci4,CC(0,k,3),CC(0,k,2))
1206 CH(0,0,k)=CC(0,k,0)+cr2+cr3;
1207 CH(ido-1,1,k)=CC(0,k,0)+tr11*cr2+tr12*cr3;
1208 CH(0,2,k)=ti11*ci5+ti12*ci4;
1209 CH(ido-1,3,k)=CC(0,k,0)+tr12*cr2+tr11*cr3;
1210 CH(0,4,k)=ti12*ci5-ti11*ci4;
1211 }
1212 if (ido==1) return;
1213 for (size_t k=0; k<l1;++k)
1214 for (size_t i=2; i<ido; i+=2)
1215 {
1216 double ci2, di2, ci4, ci5, di3, di4, di5, ci3, cr2, cr3, dr2, dr3,
1217 dr4, dr5, cr5, cr4, ti2, ti3, ti5, ti4, tr2, tr3, tr4, tr5;
1218 size_t ic=ido-i;
1219 MULPM (dr2,di2,WA(0,i-2),WA(0,i-1),CC(i-1,k,1),CC(i,k,1))
1220 MULPM (dr3,di3,WA(1,i-2),WA(1,i-1),CC(i-1,k,2),CC(i,k,2))
1221 MULPM (dr4,di4,WA(2,i-2),WA(2,i-1),CC(i-1,k,3),CC(i,k,3))
1222 MULPM (dr5,di5,WA(3,i-2),WA(3,i-1),CC(i-1,k,4),CC(i,k,4))
1223 PM(cr2,ci5,dr5,dr2)
1224 PM(ci2,cr5,di2,di5)
1225 PM(cr3,ci4,dr4,dr3)
1226 PM(ci3,cr4,di3,di4)
1227 CH(i-1,0,k)=CC(i-1,k,0)+cr2+cr3;
1228 CH(i ,0,k)=CC(i ,k,0)+ci2+ci3;
1229 tr2=CC(i-1,k,0)+tr11*cr2+tr12*cr3;
1230 ti2=CC(i ,k,0)+tr11*ci2+tr12*ci3;
1231 tr3=CC(i-1,k,0)+tr12*cr2+tr11*cr3;
1232 ti3=CC(i ,k,0)+tr12*ci2+tr11*ci3;
1233 MULPM(tr5,tr4,cr5,cr4,ti11,ti12)
1234 MULPM(ti5,ti4,ci5,ci4,ti11,ti12)
1235 PM(CH(i-1,2,k),CH(ic-1,1,k),tr2,tr5)
1236 PM(CH(i ,2,k),CH(ic ,1,k),ti5,ti2)
1237 PM(CH(i-1,4,k),CH(ic-1,3,k),tr3,tr4)
1238 PM(CH(i ,4,k),CH(ic ,3,k),ti4,ti3)
1239 }
1240 }
1241
1242 #undef CC
1243 #undef CH
1244 #define C1(a,b,c) cc[(a)+ido*((b)+l1*(c))]
1245 #define C2(a,b) cc[(a)+idl1*(b)]
1246 #define CH2(a,b) ch[(a)+idl1*(b)]
1247 #define CC(a,b,c) cc[(a)+ido*((b)+cdim*(c))]
1248 #define CH(a,b,c) ch[(a)+ido*((b)+l1*(c))]
radfg(size_t ido,size_t ip,size_t l1,double * restrict cc,double * restrict ch,const double * restrict wa,const double * restrict csarr)1249 NOINLINE static void radfg(size_t ido, size_t ip, size_t l1,
1250 double * restrict cc, double * restrict ch, const double * restrict wa,
1251 const double * restrict csarr)
1252 {
1253 const size_t cdim=ip;
1254 size_t ipph=(ip+1)/2;
1255 size_t idl1 = ido*l1;
1256
1257 if (ido>1)
1258 {
1259 for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc) // 114
1260 {
1261 size_t is=(j-1)*(ido-1),
1262 is2=(jc-1)*(ido-1);
1263 for (size_t k=0; k<l1; ++k) // 113
1264 {
1265 size_t idij=is;
1266 size_t idij2=is2;
1267 for (size_t i=1; i<=ido-2; i+=2) // 112
1268 {
1269 double t1=C1(i,k,j ), t2=C1(i+1,k,j ),
1270 t3=C1(i,k,jc), t4=C1(i+1,k,jc);
1271 double x1=wa[idij]*t1 + wa[idij+1]*t2,
1272 x2=wa[idij]*t2 - wa[idij+1]*t1,
1273 x3=wa[idij2]*t3 + wa[idij2+1]*t4,
1274 x4=wa[idij2]*t4 - wa[idij2+1]*t3;
1275 C1(i ,k,j ) = x1+x3;
1276 C1(i ,k,jc) = x2-x4;
1277 C1(i+1,k,j ) = x2+x4;
1278 C1(i+1,k,jc) = x3-x1;
1279 idij+=2;
1280 idij2+=2;
1281 }
1282 }
1283 }
1284 }
1285
1286 for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc) // 123
1287 for (size_t k=0; k<l1; ++k) // 122
1288 {
1289 double t1=C1(0,k,j), t2=C1(0,k,jc);
1290 C1(0,k,j ) = t1+t2;
1291 C1(0,k,jc) = t2-t1;
1292 }
1293
1294 //everything in C
1295 //memset(ch,0,ip*l1*ido*sizeof(double));
1296
1297 for (size_t l=1,lc=ip-1; l<ipph; ++l,--lc) // 127
1298 {
1299 for (size_t ik=0; ik<idl1; ++ik) // 124
1300 {
1301 CH2(ik,l ) = C2(ik,0)+csarr[2*l]*C2(ik,1)+csarr[4*l]*C2(ik,2);
1302 CH2(ik,lc) = csarr[2*l+1]*C2(ik,ip-1)+csarr[4*l+1]*C2(ik,ip-2);
1303 }
1304 size_t iang = 2*l;
1305 size_t j=3, jc=ip-3;
1306 for (; j<ipph-3; j+=4,jc-=4) // 126
1307 {
1308 iang+=l; if (iang>=ip) iang-=ip;
1309 double ar1=csarr[2*iang], ai1=csarr[2*iang+1];
1310 iang+=l; if (iang>=ip) iang-=ip;
1311 double ar2=csarr[2*iang], ai2=csarr[2*iang+1];
1312 iang+=l; if (iang>=ip) iang-=ip;
1313 double ar3=csarr[2*iang], ai3=csarr[2*iang+1];
1314 iang+=l; if (iang>=ip) iang-=ip;
1315 double ar4=csarr[2*iang], ai4=csarr[2*iang+1];
1316 for (size_t ik=0; ik<idl1; ++ik) // 125
1317 {
1318 CH2(ik,l ) += ar1*C2(ik,j )+ar2*C2(ik,j +1)
1319 +ar3*C2(ik,j +2)+ar4*C2(ik,j +3);
1320 CH2(ik,lc) += ai1*C2(ik,jc)+ai2*C2(ik,jc-1)
1321 +ai3*C2(ik,jc-2)+ai4*C2(ik,jc-3);
1322 }
1323 }
1324 for (; j<ipph-1; j+=2,jc-=2) // 126
1325 {
1326 iang+=l; if (iang>=ip) iang-=ip;
1327 double ar1=csarr[2*iang], ai1=csarr[2*iang+1];
1328 iang+=l; if (iang>=ip) iang-=ip;
1329 double ar2=csarr[2*iang], ai2=csarr[2*iang+1];
1330 for (size_t ik=0; ik<idl1; ++ik) // 125
1331 {
1332 CH2(ik,l ) += ar1*C2(ik,j )+ar2*C2(ik,j +1);
1333 CH2(ik,lc) += ai1*C2(ik,jc)+ai2*C2(ik,jc-1);
1334 }
1335 }
1336 for (; j<ipph; ++j,--jc) // 126
1337 {
1338 iang+=l; if (iang>=ip) iang-=ip;
1339 double ar=csarr[2*iang], ai=csarr[2*iang+1];
1340 for (size_t ik=0; ik<idl1; ++ik) // 125
1341 {
1342 CH2(ik,l ) += ar*C2(ik,j );
1343 CH2(ik,lc) += ai*C2(ik,jc);
1344 }
1345 }
1346 }
1347 for (size_t ik=0; ik<idl1; ++ik) // 101
1348 CH2(ik,0) = C2(ik,0);
1349 for (size_t j=1; j<ipph; ++j) // 129
1350 for (size_t ik=0; ik<idl1; ++ik) // 128
1351 CH2(ik,0) += C2(ik,j);
1352
1353 // everything in CH at this point!
1354 //memset(cc,0,ip*l1*ido*sizeof(double));
1355
1356 for (size_t k=0; k<l1; ++k) // 131
1357 for (size_t i=0; i<ido; ++i) // 130
1358 CC(i,0,k) = CH(i,k,0);
1359
1360 for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc) // 137
1361 {
1362 size_t j2=2*j-1;
1363 for (size_t k=0; k<l1; ++k) // 136
1364 {
1365 CC(ido-1,j2,k) = CH(0,k,j);
1366 CC(0,j2+1,k) = CH(0,k,jc);
1367 }
1368 }
1369
1370 if (ido==1) return;
1371
1372 for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc) // 140
1373 {
1374 size_t j2=2*j-1;
1375 for(size_t k=0; k<l1; ++k) // 139
1376 for(size_t i=1, ic=ido-i-2; i<=ido-2; i+=2, ic-=2) // 138
1377 {
1378 CC(i ,j2+1,k) = CH(i ,k,j )+CH(i ,k,jc);
1379 CC(ic ,j2 ,k) = CH(i ,k,j )-CH(i ,k,jc);
1380 CC(i+1 ,j2+1,k) = CH(i+1,k,j )+CH(i+1,k,jc);
1381 CC(ic+1,j2 ,k) = CH(i+1,k,jc)-CH(i+1,k,j );
1382 }
1383 }
1384 }
1385 #undef C1
1386 #undef C2
1387 #undef CH2
1388
1389 #undef CH
1390 #undef CC
1391 #define CH(a,b,c) ch[(a)+ido*((b)+l1*(c))]
1392 #define CC(a,b,c) cc[(a)+ido*((b)+cdim*(c))]
1393
radb2(size_t ido,size_t l1,const double * restrict cc,double * restrict ch,const double * restrict wa)1394 NOINLINE static void radb2(size_t ido, size_t l1, const double * restrict cc,
1395 double * restrict ch, const double * restrict wa)
1396 {
1397 const size_t cdim=2;
1398
1399 for (size_t k=0; k<l1; k++)
1400 PM (CH(0,k,0),CH(0,k,1),CC(0,0,k),CC(ido-1,1,k))
1401 if ((ido&1)==0)
1402 for (size_t k=0; k<l1; k++)
1403 {
1404 CH(ido-1,k,0) = 2.*CC(ido-1,0,k);
1405 CH(ido-1,k,1) =-2.*CC(0 ,1,k);
1406 }
1407 if (ido<=2) return;
1408 for (size_t k=0; k<l1;++k)
1409 for (size_t i=2; i<ido; i+=2)
1410 {
1411 size_t ic=ido-i;
1412 double ti2, tr2;
1413 PM (CH(i-1,k,0),tr2,CC(i-1,0,k),CC(ic-1,1,k))
1414 PM (ti2,CH(i ,k,0),CC(i ,0,k),CC(ic ,1,k))
1415 MULPM (CH(i,k,1),CH(i-1,k,1),WA(0,i-2),WA(0,i-1),ti2,tr2)
1416 }
1417 }
1418
radb3(size_t ido,size_t l1,const double * restrict cc,double * restrict ch,const double * restrict wa)1419 NOINLINE static void radb3(size_t ido, size_t l1, const double * restrict cc,
1420 double * restrict ch, const double * restrict wa)
1421 {
1422 const size_t cdim=3;
1423 static const double taur=-0.5, taui=0.86602540378443864676;
1424
1425 for (size_t k=0; k<l1; k++)
1426 {
1427 double tr2=2.*CC(ido-1,1,k);
1428 double cr2=CC(0,0,k)+taur*tr2;
1429 CH(0,k,0)=CC(0,0,k)+tr2;
1430 double ci3=2.*taui*CC(0,2,k);
1431 PM (CH(0,k,2),CH(0,k,1),cr2,ci3);
1432 }
1433 if (ido==1) return;
1434 for (size_t k=0; k<l1; k++)
1435 for (size_t i=2; i<ido; i+=2)
1436 {
1437 size_t ic=ido-i;
1438 double tr2=CC(i-1,2,k)+CC(ic-1,1,k); // t2=CC(I) + conj(CC(ic))
1439 double ti2=CC(i ,2,k)-CC(ic ,1,k);
1440 double cr2=CC(i-1,0,k)+taur*tr2; // c2=CC +taur*t2
1441 double ci2=CC(i ,0,k)+taur*ti2;
1442 CH(i-1,k,0)=CC(i-1,0,k)+tr2; // CH=CC+t2
1443 CH(i ,k,0)=CC(i ,0,k)+ti2;
1444 double cr3=taui*(CC(i-1,2,k)-CC(ic-1,1,k));// c3=taui*(CC(i)-conj(CC(ic)))
1445 double ci3=taui*(CC(i ,2,k)+CC(ic ,1,k));
1446 double di2, di3, dr2, dr3;
1447 PM(dr3,dr2,cr2,ci3) // d2= (cr2-ci3, ci2+cr3) = c2+i*c3
1448 PM(di2,di3,ci2,cr3) // d3= (cr2+ci3, ci2-cr3) = c2-i*c3
1449 MULPM(CH(i,k,1),CH(i-1,k,1),WA(0,i-2),WA(0,i-1),di2,dr2) // ch = WA*d2
1450 MULPM(CH(i,k,2),CH(i-1,k,2),WA(1,i-2),WA(1,i-1),di3,dr3)
1451 }
1452 }
1453
radb4(size_t ido,size_t l1,const double * restrict cc,double * restrict ch,const double * restrict wa)1454 NOINLINE static void radb4(size_t ido, size_t l1, const double * restrict cc,
1455 double * restrict ch, const double * restrict wa)
1456 {
1457 const size_t cdim=4;
1458 static const double sqrt2=1.41421356237309504880;
1459
1460 for (size_t k=0; k<l1; k++)
1461 {
1462 double tr1, tr2;
1463 PM (tr2,tr1,CC(0,0,k),CC(ido-1,3,k))
1464 double tr3=2.*CC(ido-1,1,k);
1465 double tr4=2.*CC(0,2,k);
1466 PM (CH(0,k,0),CH(0,k,2),tr2,tr3)
1467 PM (CH(0,k,3),CH(0,k,1),tr1,tr4)
1468 }
1469 if ((ido&1)==0)
1470 for (size_t k=0; k<l1; k++)
1471 {
1472 double tr1,tr2,ti1,ti2;
1473 PM (ti1,ti2,CC(0 ,3,k),CC(0 ,1,k))
1474 PM (tr2,tr1,CC(ido-1,0,k),CC(ido-1,2,k))
1475 CH(ido-1,k,0)=tr2+tr2;
1476 CH(ido-1,k,1)=sqrt2*(tr1-ti1);
1477 CH(ido-1,k,2)=ti2+ti2;
1478 CH(ido-1,k,3)=-sqrt2*(tr1+ti1);
1479 }
1480 if (ido<=2) return;
1481 for (size_t k=0; k<l1;++k)
1482 for (size_t i=2; i<ido; i+=2)
1483 {
1484 double ci2, ci3, ci4, cr2, cr3, cr4, ti1, ti2, ti3, ti4, tr1, tr2, tr3, tr4;
1485 size_t ic=ido-i;
1486 PM (tr2,tr1,CC(i-1,0,k),CC(ic-1,3,k))
1487 PM (ti1,ti2,CC(i ,0,k),CC(ic ,3,k))
1488 PM (tr4,ti3,CC(i ,2,k),CC(ic ,1,k))
1489 PM (tr3,ti4,CC(i-1,2,k),CC(ic-1,1,k))
1490 PM (CH(i-1,k,0),cr3,tr2,tr3)
1491 PM (CH(i ,k,0),ci3,ti2,ti3)
1492 PM (cr4,cr2,tr1,tr4)
1493 PM (ci2,ci4,ti1,ti4)
1494 MULPM (CH(i,k,1),CH(i-1,k,1),WA(0,i-2),WA(0,i-1),ci2,cr2)
1495 MULPM (CH(i,k,2),CH(i-1,k,2),WA(1,i-2),WA(1,i-1),ci3,cr3)
1496 MULPM (CH(i,k,3),CH(i-1,k,3),WA(2,i-2),WA(2,i-1),ci4,cr4)
1497 }
1498 }
1499
radb5(size_t ido,size_t l1,const double * restrict cc,double * restrict ch,const double * restrict wa)1500 NOINLINE static void radb5(size_t ido, size_t l1, const double * restrict cc,
1501 double * restrict ch, const double * restrict wa)
1502 {
1503 const size_t cdim=5;
1504 static const double tr11= 0.3090169943749474241, ti11=0.95105651629515357212,
1505 tr12=-0.8090169943749474241, ti12=0.58778525229247312917;
1506
1507 for (size_t k=0; k<l1; k++)
1508 {
1509 double ti5=CC(0,2,k)+CC(0,2,k);
1510 double ti4=CC(0,4,k)+CC(0,4,k);
1511 double tr2=CC(ido-1,1,k)+CC(ido-1,1,k);
1512 double tr3=CC(ido-1,3,k)+CC(ido-1,3,k);
1513 CH(0,k,0)=CC(0,0,k)+tr2+tr3;
1514 double cr2=CC(0,0,k)+tr11*tr2+tr12*tr3;
1515 double cr3=CC(0,0,k)+tr12*tr2+tr11*tr3;
1516 double ci4, ci5;
1517 MULPM(ci5,ci4,ti5,ti4,ti11,ti12)
1518 PM(CH(0,k,4),CH(0,k,1),cr2,ci5)
1519 PM(CH(0,k,3),CH(0,k,2),cr3,ci4)
1520 }
1521 if (ido==1) return;
1522 for (size_t k=0; k<l1;++k)
1523 for (size_t i=2; i<ido; i+=2)
1524 {
1525 size_t ic=ido-i;
1526 double tr2, tr3, tr4, tr5, ti2, ti3, ti4, ti5;
1527 PM(tr2,tr5,CC(i-1,2,k),CC(ic-1,1,k))
1528 PM(ti5,ti2,CC(i ,2,k),CC(ic ,1,k))
1529 PM(tr3,tr4,CC(i-1,4,k),CC(ic-1,3,k))
1530 PM(ti4,ti3,CC(i ,4,k),CC(ic ,3,k))
1531 CH(i-1,k,0)=CC(i-1,0,k)+tr2+tr3;
1532 CH(i ,k,0)=CC(i ,0,k)+ti2+ti3;
1533 double cr2=CC(i-1,0,k)+tr11*tr2+tr12*tr3;
1534 double ci2=CC(i ,0,k)+tr11*ti2+tr12*ti3;
1535 double cr3=CC(i-1,0,k)+tr12*tr2+tr11*tr3;
1536 double ci3=CC(i ,0,k)+tr12*ti2+tr11*ti3;
1537 double ci4, ci5, cr5, cr4;
1538 MULPM(cr5,cr4,tr5,tr4,ti11,ti12)
1539 MULPM(ci5,ci4,ti5,ti4,ti11,ti12)
1540 double dr2, dr3, dr4, dr5, di2, di3, di4, di5;
1541 PM(dr4,dr3,cr3,ci4)
1542 PM(di3,di4,ci3,cr4)
1543 PM(dr5,dr2,cr2,ci5)
1544 PM(di2,di5,ci2,cr5)
1545 MULPM(CH(i,k,1),CH(i-1,k,1),WA(0,i-2),WA(0,i-1),di2,dr2)
1546 MULPM(CH(i,k,2),CH(i-1,k,2),WA(1,i-2),WA(1,i-1),di3,dr3)
1547 MULPM(CH(i,k,3),CH(i-1,k,3),WA(2,i-2),WA(2,i-1),di4,dr4)
1548 MULPM(CH(i,k,4),CH(i-1,k,4),WA(3,i-2),WA(3,i-1),di5,dr5)
1549 }
1550 }
1551
1552 #undef CC
1553 #undef CH
1554 #define CC(a,b,c) cc[(a)+ido*((b)+cdim*(c))]
1555 #define CH(a,b,c) ch[(a)+ido*((b)+l1*(c))]
1556 #define C1(a,b,c) cc[(a)+ido*((b)+l1*(c))]
1557 #define C2(a,b) cc[(a)+idl1*(b)]
1558 #define CH2(a,b) ch[(a)+idl1*(b)]
1559
radbg(size_t ido,size_t ip,size_t l1,double * restrict cc,double * restrict ch,const double * restrict wa,const double * restrict csarr)1560 NOINLINE static void radbg(size_t ido, size_t ip, size_t l1,
1561 double * restrict cc, double * restrict ch, const double * restrict wa,
1562 const double * restrict csarr)
1563 {
1564 const size_t cdim=ip;
1565 size_t ipph=(ip+1)/ 2;
1566 size_t idl1 = ido*l1;
1567
1568 for (size_t k=0; k<l1; ++k) // 102
1569 for (size_t i=0; i<ido; ++i) // 101
1570 CH(i,k,0) = CC(i,0,k);
1571 for (size_t j=1, jc=ip-1; j<ipph; ++j, --jc) // 108
1572 {
1573 size_t j2=2*j-1;
1574 for (size_t k=0; k<l1; ++k)
1575 {
1576 CH(0,k,j ) = 2*CC(ido-1,j2,k);
1577 CH(0,k,jc) = 2*CC(0,j2+1,k);
1578 }
1579 }
1580
1581 if (ido!=1)
1582 {
1583 for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc) // 111
1584 {
1585 size_t j2=2*j-1;
1586 for (size_t k=0; k<l1; ++k)
1587 for (size_t i=1, ic=ido-i-2; i<=ido-2; i+=2, ic-=2) // 109
1588 {
1589 CH(i ,k,j ) = CC(i ,j2+1,k)+CC(ic ,j2,k);
1590 CH(i ,k,jc) = CC(i ,j2+1,k)-CC(ic ,j2,k);
1591 CH(i+1,k,j ) = CC(i+1,j2+1,k)-CC(ic+1,j2,k);
1592 CH(i+1,k,jc) = CC(i+1,j2+1,k)+CC(ic+1,j2,k);
1593 }
1594 }
1595 }
1596 for (size_t l=1,lc=ip-1; l<ipph; ++l,--lc)
1597 {
1598 for (size_t ik=0; ik<idl1; ++ik)
1599 {
1600 C2(ik,l ) = CH2(ik,0)+csarr[2*l]*CH2(ik,1)+csarr[4*l]*CH2(ik,2);
1601 C2(ik,lc) = csarr[2*l+1]*CH2(ik,ip-1)+csarr[4*l+1]*CH2(ik,ip-2);
1602 }
1603 size_t iang=2*l;
1604 size_t j=3,jc=ip-3;
1605 for(; j<ipph-3; j+=4,jc-=4)
1606 {
1607 iang+=l; if(iang>ip) iang-=ip;
1608 double ar1=csarr[2*iang], ai1=csarr[2*iang+1];
1609 iang+=l; if(iang>ip) iang-=ip;
1610 double ar2=csarr[2*iang], ai2=csarr[2*iang+1];
1611 iang+=l; if(iang>ip) iang-=ip;
1612 double ar3=csarr[2*iang], ai3=csarr[2*iang+1];
1613 iang+=l; if(iang>ip) iang-=ip;
1614 double ar4=csarr[2*iang], ai4=csarr[2*iang+1];
1615 for (size_t ik=0; ik<idl1; ++ik)
1616 {
1617 C2(ik,l ) += ar1*CH2(ik,j )+ar2*CH2(ik,j +1)
1618 +ar3*CH2(ik,j +2)+ar4*CH2(ik,j +3);
1619 C2(ik,lc) += ai1*CH2(ik,jc)+ai2*CH2(ik,jc-1)
1620 +ai3*CH2(ik,jc-2)+ai4*CH2(ik,jc-3);
1621 }
1622 }
1623 for(; j<ipph-1; j+=2,jc-=2)
1624 {
1625 iang+=l; if(iang>ip) iang-=ip;
1626 double ar1=csarr[2*iang], ai1=csarr[2*iang+1];
1627 iang+=l; if(iang>ip) iang-=ip;
1628 double ar2=csarr[2*iang], ai2=csarr[2*iang+1];
1629 for (size_t ik=0; ik<idl1; ++ik)
1630 {
1631 C2(ik,l ) += ar1*CH2(ik,j )+ar2*CH2(ik,j +1);
1632 C2(ik,lc) += ai1*CH2(ik,jc)+ai2*CH2(ik,jc-1);
1633 }
1634 }
1635 for(; j<ipph; ++j,--jc)
1636 {
1637 iang+=l; if(iang>ip) iang-=ip;
1638 double war=csarr[2*iang], wai=csarr[2*iang+1];
1639 for (size_t ik=0; ik<idl1; ++ik)
1640 {
1641 C2(ik,l ) += war*CH2(ik,j );
1642 C2(ik,lc) += wai*CH2(ik,jc);
1643 }
1644 }
1645 }
1646 for (size_t j=1; j<ipph; ++j)
1647 for (size_t ik=0; ik<idl1; ++ik)
1648 CH2(ik,0) += CH2(ik,j);
1649 for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc) // 124
1650 for (size_t k=0; k<l1; ++k)
1651 {
1652 CH(0,k,j ) = C1(0,k,j)-C1(0,k,jc);
1653 CH(0,k,jc) = C1(0,k,j)+C1(0,k,jc);
1654 }
1655
1656 if (ido==1) return;
1657
1658 for (size_t j=1, jc=ip-1; j<ipph; ++j, --jc) // 127
1659 for (size_t k=0; k<l1; ++k)
1660 for (size_t i=1; i<=ido-2; i+=2)
1661 {
1662 CH(i ,k,j ) = C1(i ,k,j)-C1(i+1,k,jc);
1663 CH(i ,k,jc) = C1(i ,k,j)+C1(i+1,k,jc);
1664 CH(i+1,k,j ) = C1(i+1,k,j)+C1(i ,k,jc);
1665 CH(i+1,k,jc) = C1(i+1,k,j)-C1(i ,k,jc);
1666 }
1667
1668 // All in CH
1669
1670 for (size_t j=1; j<ip; ++j)
1671 {
1672 size_t is = (j-1)*(ido-1);
1673 for (size_t k=0; k<l1; ++k)
1674 {
1675 size_t idij = is;
1676 for (size_t i=1; i<=ido-2; i+=2)
1677 {
1678 double t1=CH(i,k,j), t2=CH(i+1,k,j);
1679 CH(i ,k,j) = wa[idij]*t1-wa[idij+1]*t2;
1680 CH(i+1,k,j) = wa[idij]*t2+wa[idij+1]*t1;
1681 idij+=2;
1682 }
1683 }
1684 }
1685 }
1686 #undef C1
1687 #undef C2
1688 #undef CH2
1689
1690 #undef CC
1691 #undef CH
1692 #undef PM
1693 #undef MULPM
1694 #undef WA
1695
copy_and_norm(double * c,double * p1,size_t n,double fct)1696 static void copy_and_norm(double *c, double *p1, size_t n, double fct)
1697 {
1698 if (p1!=c)
1699 {
1700 if (fct!=1.)
1701 for (size_t i=0; i<n; ++i)
1702 c[i] = fct*p1[i];
1703 else
1704 memcpy (c,p1,n*sizeof(double));
1705 }
1706 else
1707 if (fct!=1.)
1708 for (size_t i=0; i<n; ++i)
1709 c[i] *= fct;
1710 }
1711
1712 WARN_UNUSED_RESULT
rfftp_forward(rfftp_plan plan,double c[],double fct)1713 static int rfftp_forward(rfftp_plan plan, double c[], double fct)
1714 {
1715 if (plan->length==1) return 0;
1716 size_t n=plan->length;
1717 size_t l1=n, nf=plan->nfct;
1718 double *ch = RALLOC(double, n);
1719 if (!ch) return -1;
1720 double *p1=c, *p2=ch;
1721
1722 for(size_t k1=0; k1<nf;++k1)
1723 {
1724 size_t k=nf-k1-1;
1725 size_t ip=plan->fct[k].fct;
1726 size_t ido=n / l1;
1727 l1 /= ip;
1728 if(ip==4)
1729 radf4(ido, l1, p1, p2, plan->fct[k].tw);
1730 else if(ip==2)
1731 radf2(ido, l1, p1, p2, plan->fct[k].tw);
1732 else if(ip==3)
1733 radf3(ido, l1, p1, p2, plan->fct[k].tw);
1734 else if(ip==5)
1735 radf5(ido, l1, p1, p2, plan->fct[k].tw);
1736 else
1737 {
1738 radfg(ido, ip, l1, p1, p2, plan->fct[k].tw, plan->fct[k].tws);
1739 SWAP (p1,p2,double *);
1740 }
1741 SWAP (p1,p2,double *);
1742 }
1743 copy_and_norm(c,p1,n,fct);
1744 DEALLOC(ch);
1745 return 0;
1746 }
1747
1748 WARN_UNUSED_RESULT
rfftp_backward(rfftp_plan plan,double c[],double fct)1749 static int rfftp_backward(rfftp_plan plan, double c[], double fct)
1750 {
1751 if (plan->length==1) return 0;
1752 size_t n=plan->length;
1753 size_t l1=1, nf=plan->nfct;
1754 double *ch = RALLOC(double, n);
1755 if (!ch) return -1;
1756 double *p1=c, *p2=ch;
1757
1758 for(size_t k=0; k<nf; k++)
1759 {
1760 size_t ip = plan->fct[k].fct,
1761 ido= n/(ip*l1);
1762 if(ip==4)
1763 radb4(ido, l1, p1, p2, plan->fct[k].tw);
1764 else if(ip==2)
1765 radb2(ido, l1, p1, p2, plan->fct[k].tw);
1766 else if(ip==3)
1767 radb3(ido, l1, p1, p2, plan->fct[k].tw);
1768 else if(ip==5)
1769 radb5(ido, l1, p1, p2, plan->fct[k].tw);
1770 else
1771 radbg(ido, ip, l1, p1, p2, plan->fct[k].tw, plan->fct[k].tws);
1772 SWAP (p1,p2,double *);
1773 l1*=ip;
1774 }
1775 copy_and_norm(c,p1,n,fct);
1776 DEALLOC(ch);
1777 return 0;
1778 }
1779
1780 WARN_UNUSED_RESULT
rfftp_factorize(rfftp_plan plan)1781 static int rfftp_factorize (rfftp_plan plan)
1782 {
1783 size_t length=plan->length;
1784 size_t nfct=0;
1785 while ((length%4)==0)
1786 { if (nfct>=NFCT) return -1; plan->fct[nfct++].fct=4; length>>=2; }
1787 if ((length%2)==0)
1788 {
1789 length>>=1;
1790 // factor 2 should be at the front of the factor list
1791 if (nfct>=NFCT) return -1;
1792 plan->fct[nfct++].fct=2;
1793 SWAP(plan->fct[0].fct, plan->fct[nfct-1].fct,size_t);
1794 }
1795 size_t maxl=(size_t)(sqrt((double)length))+1;
1796 for (size_t divisor=3; (length>1)&&(divisor<maxl); divisor+=2)
1797 if ((length%divisor)==0)
1798 {
1799 while ((length%divisor)==0)
1800 {
1801 if (nfct>=NFCT) return -1;
1802 plan->fct[nfct++].fct=divisor;
1803 length/=divisor;
1804 }
1805 maxl=(size_t)(sqrt((double)length))+1;
1806 }
1807 if (length>1) plan->fct[nfct++].fct=length;
1808 plan->nfct=nfct;
1809 return 0;
1810 }
1811
rfftp_twsize(rfftp_plan plan)1812 static size_t rfftp_twsize(rfftp_plan plan)
1813 {
1814 size_t twsize=0, l1=1;
1815 for (size_t k=0; k<plan->nfct; ++k)
1816 {
1817 size_t ip=plan->fct[k].fct, ido= plan->length/(l1*ip);
1818 twsize+=(ip-1)*(ido-1);
1819 if (ip>5) twsize+=2*ip;
1820 l1*=ip;
1821 }
1822 return twsize;
1823 return 0;
1824 }
1825
rfftp_comp_twiddle(rfftp_plan plan)1826 WARN_UNUSED_RESULT NOINLINE static int rfftp_comp_twiddle (rfftp_plan plan)
1827 {
1828 size_t length=plan->length;
1829 double *twid = RALLOC(double, 2*length);
1830 if (!twid) return -1;
1831 sincos_2pibyn_half(length, twid);
1832 size_t l1=1;
1833 double *ptr=plan->mem;
1834 for (size_t k=0; k<plan->nfct; ++k)
1835 {
1836 size_t ip=plan->fct[k].fct, ido=length/(l1*ip);
1837 if (k<plan->nfct-1) // last factor doesn't need twiddles
1838 {
1839 plan->fct[k].tw=ptr; ptr+=(ip-1)*(ido-1);
1840 for (size_t j=1; j<ip; ++j)
1841 for (size_t i=1; i<=(ido-1)/2; ++i)
1842 {
1843 plan->fct[k].tw[(j-1)*(ido-1)+2*i-2] = twid[2*j*l1*i];
1844 plan->fct[k].tw[(j-1)*(ido-1)+2*i-1] = twid[2*j*l1*i+1];
1845 }
1846 }
1847 if (ip>5) // special factors required by *g functions
1848 {
1849 plan->fct[k].tws=ptr; ptr+=2*ip;
1850 plan->fct[k].tws[0] = 1.;
1851 plan->fct[k].tws[1] = 0.;
1852 for (size_t i=1; i<=(ip>>1); ++i)
1853 {
1854 plan->fct[k].tws[2*i ] = twid[2*i*(length/ip)];
1855 plan->fct[k].tws[2*i+1] = twid[2*i*(length/ip)+1];
1856 plan->fct[k].tws[2*(ip-i) ] = twid[2*i*(length/ip)];
1857 plan->fct[k].tws[2*(ip-i)+1] = -twid[2*i*(length/ip)+1];
1858 }
1859 }
1860 l1*=ip;
1861 }
1862 DEALLOC(twid);
1863 return 0;
1864 }
1865
make_rfftp_plan(size_t length)1866 NOINLINE static rfftp_plan make_rfftp_plan (size_t length)
1867 {
1868 if (length==0) return NULL;
1869 rfftp_plan plan = RALLOC(rfftp_plan_i,1);
1870 if (!plan) return NULL;
1871 plan->length=length;
1872 plan->nfct=0;
1873 plan->mem=NULL;
1874 for (size_t i=0; i<NFCT; ++i)
1875 plan->fct[i]=(rfftp_fctdata){0,0,0};
1876 if (length==1) return plan;
1877 if (rfftp_factorize(plan)!=0) { DEALLOC(plan); return NULL; }
1878 size_t tws=rfftp_twsize(plan);
1879 plan->mem=RALLOC(double,tws);
1880 if (!plan->mem) { DEALLOC(plan); return NULL; }
1881 if (rfftp_comp_twiddle(plan)!=0)
1882 { DEALLOC(plan->mem); DEALLOC(plan); return NULL; }
1883 return plan;
1884 }
1885
destroy_rfftp_plan(rfftp_plan plan)1886 NOINLINE static void destroy_rfftp_plan (rfftp_plan plan)
1887 {
1888 DEALLOC(plan->mem);
1889 DEALLOC(plan);
1890 }
1891
1892 typedef struct fftblue_plan_i
1893 {
1894 size_t n, n2;
1895 cfftp_plan plan;
1896 double *mem;
1897 double *bk, *bkf;
1898 } fftblue_plan_i;
1899 typedef struct fftblue_plan_i * fftblue_plan;
1900
make_fftblue_plan(size_t length)1901 NOINLINE static fftblue_plan make_fftblue_plan (size_t length)
1902 {
1903 fftblue_plan plan = RALLOC(fftblue_plan_i,1);
1904 if (!plan) return NULL;
1905 plan->n = length;
1906 plan->n2 = good_size(plan->n*2-1);
1907 plan->mem = RALLOC(double, 2*plan->n+2*plan->n2);
1908 if (!plan->mem) { DEALLOC(plan); return NULL; }
1909 plan->bk = plan->mem;
1910 plan->bkf = plan->bk+2*plan->n;
1911
1912 /* initialize b_k */
1913 double *tmp = RALLOC(double,4*plan->n);
1914 if (!tmp) { DEALLOC(plan->mem); DEALLOC(plan); return NULL; }
1915 sincos_2pibyn(2*plan->n,tmp);
1916 plan->bk[0] = 1;
1917 plan->bk[1] = 0;
1918
1919 size_t coeff=0;
1920 for (size_t m=1; m<plan->n; ++m)
1921 {
1922 coeff+=2*m-1;
1923 if (coeff>=2*plan->n) coeff-=2*plan->n;
1924 plan->bk[2*m ] = tmp[2*coeff ];
1925 plan->bk[2*m+1] = tmp[2*coeff+1];
1926 }
1927
1928 /* initialize the zero-padded, Fourier transformed b_k. Add normalisation. */
1929 double xn2 = 1./plan->n2;
1930 plan->bkf[0] = plan->bk[0]*xn2;
1931 plan->bkf[1] = plan->bk[1]*xn2;
1932 for (size_t m=2; m<2*plan->n; m+=2)
1933 {
1934 plan->bkf[m] = plan->bkf[2*plan->n2-m] = plan->bk[m] *xn2;
1935 plan->bkf[m+1] = plan->bkf[2*plan->n2-m+1] = plan->bk[m+1] *xn2;
1936 }
1937 for (size_t m=2*plan->n;m<=(2*plan->n2-2*plan->n+1);++m)
1938 plan->bkf[m]=0.;
1939 plan->plan=make_cfftp_plan(plan->n2);
1940 if (!plan->plan)
1941 { DEALLOC(tmp); DEALLOC(plan->mem); DEALLOC(plan); return NULL; }
1942 if (cfftp_forward(plan->plan,plan->bkf,1.)!=0)
1943 { DEALLOC(tmp); DEALLOC(plan->mem); DEALLOC(plan); return NULL; }
1944 DEALLOC(tmp);
1945
1946 return plan;
1947 }
1948
destroy_fftblue_plan(fftblue_plan plan)1949 NOINLINE static void destroy_fftblue_plan (fftblue_plan plan)
1950 {
1951 DEALLOC(plan->mem);
1952 destroy_cfftp_plan(plan->plan);
1953 DEALLOC(plan);
1954 }
1955
1956 NOINLINE WARN_UNUSED_RESULT
fftblue_fft(fftblue_plan plan,double c[],int isign,double fct)1957 static int fftblue_fft(fftblue_plan plan, double c[], int isign, double fct)
1958 {
1959 size_t n=plan->n;
1960 size_t n2=plan->n2;
1961 double *bk = plan->bk;
1962 double *bkf = plan->bkf;
1963 double *akf = RALLOC(double, 2*n2);
1964 if (!akf) return -1;
1965
1966 /* initialize a_k and FFT it */
1967 if (isign>0)
1968 for (size_t m=0; m<2*n; m+=2)
1969 {
1970 akf[m] = c[m]*bk[m] - c[m+1]*bk[m+1];
1971 akf[m+1] = c[m]*bk[m+1] + c[m+1]*bk[m];
1972 }
1973 else
1974 for (size_t m=0; m<2*n; m+=2)
1975 {
1976 akf[m] = c[m]*bk[m] + c[m+1]*bk[m+1];
1977 akf[m+1] =-c[m]*bk[m+1] + c[m+1]*bk[m];
1978 }
1979 for (size_t m=2*n; m<2*n2; ++m)
1980 akf[m]=0;
1981
1982 if (cfftp_forward (plan->plan,akf,fct)!=0)
1983 { DEALLOC(akf); return -1; }
1984
1985 /* do the convolution */
1986 if (isign>0)
1987 for (size_t m=0; m<2*n2; m+=2)
1988 {
1989 double im = -akf[m]*bkf[m+1] + akf[m+1]*bkf[m];
1990 akf[m ] = akf[m]*bkf[m] + akf[m+1]*bkf[m+1];
1991 akf[m+1] = im;
1992 }
1993 else
1994 for (size_t m=0; m<2*n2; m+=2)
1995 {
1996 double im = akf[m]*bkf[m+1] + akf[m+1]*bkf[m];
1997 akf[m ] = akf[m]*bkf[m] - akf[m+1]*bkf[m+1];
1998 akf[m+1] = im;
1999 }
2000
2001 /* inverse FFT */
2002 if (cfftp_backward (plan->plan,akf,1.)!=0)
2003 { DEALLOC(akf); return -1; }
2004
2005 /* multiply by b_k */
2006 if (isign>0)
2007 for (size_t m=0; m<2*n; m+=2)
2008 {
2009 c[m] = bk[m] *akf[m] - bk[m+1]*akf[m+1];
2010 c[m+1] = bk[m+1]*akf[m] + bk[m] *akf[m+1];
2011 }
2012 else
2013 for (size_t m=0; m<2*n; m+=2)
2014 {
2015 c[m] = bk[m] *akf[m] + bk[m+1]*akf[m+1];
2016 c[m+1] =-bk[m+1]*akf[m] + bk[m] *akf[m+1];
2017 }
2018 DEALLOC(akf);
2019 return 0;
2020 }
2021
2022 WARN_UNUSED_RESULT
cfftblue_backward(fftblue_plan plan,double c[],double fct)2023 static int cfftblue_backward(fftblue_plan plan, double c[], double fct)
2024 { return fftblue_fft(plan,c,1,fct); }
2025
2026 WARN_UNUSED_RESULT
cfftblue_forward(fftblue_plan plan,double c[],double fct)2027 static int cfftblue_forward(fftblue_plan plan, double c[], double fct)
2028 { return fftblue_fft(plan,c,-1,fct); }
2029
2030 WARN_UNUSED_RESULT
rfftblue_backward(fftblue_plan plan,double c[],double fct)2031 static int rfftblue_backward(fftblue_plan plan, double c[], double fct)
2032 {
2033 size_t n=plan->n;
2034 double *tmp = RALLOC(double,2*n);
2035 if (!tmp) return -1;
2036 tmp[0]=c[0];
2037 tmp[1]=0.;
2038 memcpy (tmp+2,c+1, (n-1)*sizeof(double));
2039 if ((n&1)==0) tmp[n+1]=0.;
2040 for (size_t m=2; m<n; m+=2)
2041 {
2042 tmp[2*n-m]=tmp[m];
2043 tmp[2*n-m+1]=-tmp[m+1];
2044 }
2045 if (fftblue_fft(plan,tmp,1,fct)!=0)
2046 { DEALLOC(tmp); return -1; }
2047 for (size_t m=0; m<n; ++m)
2048 c[m] = tmp[2*m];
2049 DEALLOC(tmp);
2050 return 0;
2051 }
2052
2053 WARN_UNUSED_RESULT
rfftblue_forward(fftblue_plan plan,double c[],double fct)2054 static int rfftblue_forward(fftblue_plan plan, double c[], double fct)
2055 {
2056 size_t n=plan->n;
2057 double *tmp = RALLOC(double,2*n);
2058 if (!tmp) return -1;
2059 for (size_t m=0; m<n; ++m)
2060 {
2061 tmp[2*m] = c[m];
2062 tmp[2*m+1] = 0.;
2063 }
2064 if (fftblue_fft(plan,tmp,-1,fct)!=0)
2065 { DEALLOC(tmp); return -1; }
2066 c[0] = tmp[0];
2067 memcpy (c+1, tmp+2, (n-1)*sizeof(double));
2068 DEALLOC(tmp);
2069 return 0;
2070 }
2071
2072 typedef struct cfft_plan_i
2073 {
2074 cfftp_plan packplan;
2075 fftblue_plan blueplan;
2076 } cfft_plan_i;
2077
make_cfft_plan(size_t length)2078 static cfft_plan make_cfft_plan (size_t length)
2079 {
2080 if (length==0) return NULL;
2081 cfft_plan plan = RALLOC(cfft_plan_i,1);
2082 if (!plan) return NULL;
2083 plan->blueplan=0;
2084 plan->packplan=0;
2085 if ((length<50) || (largest_prime_factor(length)<=sqrt(length)))
2086 {
2087 plan->packplan=make_cfftp_plan(length);
2088 if (!plan->packplan) { DEALLOC(plan); return NULL; }
2089 return plan;
2090 }
2091 double comp1 = cost_guess(length);
2092 double comp2 = 2*cost_guess(good_size(2*length-1));
2093 comp2*=1.5; /* fudge factor that appears to give good overall performance */
2094 if (comp2<comp1) // use Bluestein
2095 {
2096 plan->blueplan=make_fftblue_plan(length);
2097 if (!plan->blueplan) { DEALLOC(plan); return NULL; }
2098 }
2099 else
2100 {
2101 plan->packplan=make_cfftp_plan(length);
2102 if (!plan->packplan) { DEALLOC(plan); return NULL; }
2103 }
2104 return plan;
2105 }
2106
destroy_cfft_plan(cfft_plan plan)2107 static void destroy_cfft_plan (cfft_plan plan)
2108 {
2109 if (plan->blueplan)
2110 destroy_fftblue_plan(plan->blueplan);
2111 if (plan->packplan)
2112 destroy_cfftp_plan(plan->packplan);
2113 DEALLOC(plan);
2114 }
2115
cfft_backward(cfft_plan plan,double c[],double fct)2116 WARN_UNUSED_RESULT static int cfft_backward(cfft_plan plan, double c[], double fct)
2117 {
2118 if (plan->packplan)
2119 return cfftp_backward(plan->packplan,c,fct);
2120 // if (plan->blueplan)
2121 return cfftblue_backward(plan->blueplan,c,fct);
2122 }
2123
cfft_forward(cfft_plan plan,double c[],double fct)2124 WARN_UNUSED_RESULT static int cfft_forward(cfft_plan plan, double c[], double fct)
2125 {
2126 if (plan->packplan)
2127 return cfftp_forward(plan->packplan,c,fct);
2128 // if (plan->blueplan)
2129 return cfftblue_forward(plan->blueplan,c,fct);
2130 }
2131
2132 typedef struct rfft_plan_i
2133 {
2134 rfftp_plan packplan;
2135 fftblue_plan blueplan;
2136 } rfft_plan_i;
2137
make_rfft_plan(size_t length)2138 static rfft_plan make_rfft_plan (size_t length)
2139 {
2140 if (length==0) return NULL;
2141 rfft_plan plan = RALLOC(rfft_plan_i,1);
2142 if (!plan) return NULL;
2143 plan->blueplan=0;
2144 plan->packplan=0;
2145 if ((length<50) || (largest_prime_factor(length)<=sqrt(length)))
2146 {
2147 plan->packplan=make_rfftp_plan(length);
2148 if (!plan->packplan) { DEALLOC(plan); return NULL; }
2149 return plan;
2150 }
2151 double comp1 = 0.5*cost_guess(length);
2152 double comp2 = 2*cost_guess(good_size(2*length-1));
2153 comp2*=1.5; /* fudge factor that appears to give good overall performance */
2154 if (comp2<comp1) // use Bluestein
2155 {
2156 plan->blueplan=make_fftblue_plan(length);
2157 if (!plan->blueplan) { DEALLOC(plan); return NULL; }
2158 }
2159 else
2160 {
2161 plan->packplan=make_rfftp_plan(length);
2162 if (!plan->packplan) { DEALLOC(plan); return NULL; }
2163 }
2164 return plan;
2165 }
2166
destroy_rfft_plan(rfft_plan plan)2167 static void destroy_rfft_plan (rfft_plan plan)
2168 {
2169 if (plan->blueplan)
2170 destroy_fftblue_plan(plan->blueplan);
2171 if (plan->packplan)
2172 destroy_rfftp_plan(plan->packplan);
2173 DEALLOC(plan);
2174 }
2175
rfft_backward(rfft_plan plan,double c[],double fct)2176 WARN_UNUSED_RESULT static int rfft_backward(rfft_plan plan, double c[], double fct)
2177 {
2178 if (plan->packplan)
2179 return rfftp_backward(plan->packplan,c,fct);
2180 else // if (plan->blueplan)
2181 return rfftblue_backward(plan->blueplan,c,fct);
2182 }
2183
rfft_forward(rfft_plan plan,double c[],double fct)2184 WARN_UNUSED_RESULT static int rfft_forward(rfft_plan plan, double c[], double fct)
2185 {
2186 if (plan->packplan)
2187 return rfftp_forward(plan->packplan,c,fct);
2188 else // if (plan->blueplan)
2189 return rfftblue_forward(plan->blueplan,c,fct);
2190 }
2191
2192 static PyObject *
execute_complex(PyObject * a1,int is_forward,double fct)2193 execute_complex(PyObject *a1, int is_forward, double fct)
2194 {
2195 PyArrayObject *data = (PyArrayObject *)PyArray_FromAny(a1,
2196 PyArray_DescrFromType(NPY_CDOUBLE), 1, 0,
2197 NPY_ARRAY_ENSURECOPY | NPY_ARRAY_DEFAULT |
2198 NPY_ARRAY_ENSUREARRAY | NPY_ARRAY_FORCECAST,
2199 NULL);
2200 if (!data) return NULL;
2201
2202 int npts = PyArray_DIM(data, PyArray_NDIM(data) - 1);
2203 cfft_plan plan=NULL;
2204
2205 int nrepeats = PyArray_SIZE(data)/npts;
2206 double *dptr = (double *)PyArray_DATA(data);
2207 int fail=0;
2208 Py_BEGIN_ALLOW_THREADS;
2209 plan = make_cfft_plan(npts);
2210 if (!plan) fail=1;
2211 if (!fail)
2212 for (int i = 0; i < nrepeats; i++) {
2213 int res = is_forward ?
2214 cfft_forward(plan, dptr, fct) : cfft_backward(plan, dptr, fct);
2215 if (res!=0) { fail=1; break; }
2216 dptr += npts*2;
2217 }
2218 if (plan) destroy_cfft_plan(plan);
2219 Py_END_ALLOW_THREADS;
2220 if (fail) {
2221 Py_XDECREF(data);
2222 return PyErr_NoMemory();
2223 }
2224 return (PyObject *)data;
2225 }
2226
2227 static PyObject *
execute_real_forward(PyObject * a1,double fct)2228 execute_real_forward(PyObject *a1, double fct)
2229 {
2230 rfft_plan plan=NULL;
2231 int fail = 0;
2232 PyArrayObject *data = (PyArrayObject *)PyArray_FromAny(a1,
2233 PyArray_DescrFromType(NPY_DOUBLE), 1, 0,
2234 NPY_ARRAY_DEFAULT | NPY_ARRAY_ENSUREARRAY | NPY_ARRAY_FORCECAST,
2235 NULL);
2236 if (!data) return NULL;
2237
2238 int ndim = PyArray_NDIM(data);
2239 const npy_intp *odim = PyArray_DIMS(data);
2240 int npts = odim[ndim - 1];
2241 npy_intp *tdim=(npy_intp *)malloc(ndim*sizeof(npy_intp));
2242 if (!tdim)
2243 { Py_XDECREF(data); return NULL; }
2244 for (int d=0; d<ndim-1; ++d)
2245 tdim[d] = odim[d];
2246 tdim[ndim-1] = npts/2 + 1;
2247 PyArrayObject *ret = (PyArrayObject *)PyArray_Empty(ndim,
2248 tdim, PyArray_DescrFromType(NPY_CDOUBLE), 0);
2249 free(tdim);
2250 if (!ret) fail=1;
2251 if (!fail) {
2252 int rstep = PyArray_DIM(ret, PyArray_NDIM(ret) - 1)*2;
2253
2254 int nrepeats = PyArray_SIZE(data)/npts;
2255 double *rptr = (double *)PyArray_DATA(ret),
2256 *dptr = (double *)PyArray_DATA(data);
2257
2258 Py_BEGIN_ALLOW_THREADS;
2259 plan = make_rfft_plan(npts);
2260 if (!plan) fail=1;
2261 if (!fail)
2262 for (int i = 0; i < nrepeats; i++) {
2263 rptr[rstep-1] = 0.0;
2264 memcpy((char *)(rptr+1), dptr, npts*sizeof(double));
2265 if (rfft_forward(plan, rptr+1, fct)!=0) {fail=1; break;}
2266 rptr[0] = rptr[1];
2267 rptr[1] = 0.0;
2268 rptr += rstep;
2269 dptr += npts;
2270 }
2271 if (plan) destroy_rfft_plan(plan);
2272 Py_END_ALLOW_THREADS;
2273 }
2274 if (fail) {
2275 Py_XDECREF(data);
2276 Py_XDECREF(ret);
2277 return PyErr_NoMemory();
2278 }
2279 Py_DECREF(data);
2280 return (PyObject *)ret;
2281 }
2282 static PyObject *
execute_real_backward(PyObject * a1,double fct)2283 execute_real_backward(PyObject *a1, double fct)
2284 {
2285 rfft_plan plan=NULL;
2286 PyArrayObject *data = (PyArrayObject *)PyArray_FromAny(a1,
2287 PyArray_DescrFromType(NPY_CDOUBLE), 1, 0,
2288 NPY_ARRAY_DEFAULT | NPY_ARRAY_ENSUREARRAY | NPY_ARRAY_FORCECAST,
2289 NULL);
2290 if (!data) return NULL;
2291 int npts = PyArray_DIM(data, PyArray_NDIM(data) - 1);
2292 PyArrayObject *ret = (PyArrayObject *)PyArray_Empty(PyArray_NDIM(data),
2293 PyArray_DIMS(data), PyArray_DescrFromType(NPY_DOUBLE), 0);
2294 int fail = 0;
2295 if (!ret) fail=1;
2296 if (!fail) {
2297 int nrepeats = PyArray_SIZE(ret)/npts;
2298 double *rptr = (double *)PyArray_DATA(ret),
2299 *dptr = (double *)PyArray_DATA(data);
2300
2301 Py_BEGIN_ALLOW_THREADS;
2302 plan = make_rfft_plan(npts);
2303 if (!plan) fail=1;
2304 if (!fail) {
2305 for (int i = 0; i < nrepeats; i++) {
2306 memcpy((char *)(rptr + 1), (dptr + 2), (npts - 1)*sizeof(double));
2307 rptr[0] = dptr[0];
2308 if (rfft_backward(plan, rptr, fct)!=0) {fail=1; break;}
2309 rptr += npts;
2310 dptr += npts*2;
2311 }
2312 }
2313 if (plan) destroy_rfft_plan(plan);
2314 Py_END_ALLOW_THREADS;
2315 }
2316 if (fail) {
2317 Py_XDECREF(data);
2318 Py_XDECREF(ret);
2319 return PyErr_NoMemory();
2320 }
2321 Py_DECREF(data);
2322 return (PyObject *)ret;
2323 }
2324
2325 static PyObject *
execute_real(PyObject * a1,int is_forward,double fct)2326 execute_real(PyObject *a1, int is_forward, double fct)
2327 {
2328 return is_forward ? execute_real_forward(a1, fct)
2329 : execute_real_backward(a1, fct);
2330 }
2331
2332 static const char execute__doc__[] = "";
2333
2334 static PyObject *
execute(PyObject * NPY_UNUSED (self),PyObject * args)2335 execute(PyObject *NPY_UNUSED(self), PyObject *args)
2336 {
2337 PyObject *a1;
2338 int is_real, is_forward;
2339 double fct;
2340
2341 if(!PyArg_ParseTuple(args, "Oiid:execute", &a1, &is_real, &is_forward, &fct)) {
2342 return NULL;
2343 }
2344
2345 return is_real ? execute_real(a1, is_forward, fct)
2346 : execute_complex(a1, is_forward, fct);
2347 }
2348
2349 /* List of methods defined in the module */
2350
2351 static struct PyMethodDef methods[] = {
2352 {"execute", execute, 1, execute__doc__},
2353 {NULL, NULL, 0, NULL} /* sentinel */
2354 };
2355
2356 static struct PyModuleDef moduledef = {
2357 PyModuleDef_HEAD_INIT,
2358 "_pocketfft_internal",
2359 NULL,
2360 -1,
2361 methods,
2362 NULL,
2363 NULL,
2364 NULL,
2365 NULL
2366 };
2367
2368 /* Initialization function for the module */
PyInit__pocketfft_internal(void)2369 PyMODINIT_FUNC PyInit__pocketfft_internal(void)
2370 {
2371 PyObject *m;
2372 m = PyModule_Create(&moduledef);
2373 if (m == NULL) {
2374 return NULL;
2375 }
2376
2377 /* Import the array object */
2378 import_array();
2379
2380 /* XXXX Add constants here */
2381
2382 return m;
2383 }
2384