1 /*
2  * Copyright (c) 2002, 2020 Jens Keiner, Stefan Kunis, Daniel Potts
3  *
4  * This program is free software; you can redistribute it and/or modify it under
5  * the terms of the GNU General Public License as published by the Free Software
6  * Foundation; either version 2 of the License, or (at your option) any later
7  * version.
8  *
9  * This program is distributed in the hope that it will be useful, but WITHOUT
10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11  * FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
12  * details.
13  *
14  * You should have received a copy of the GNU General Public License along with
15  * this program; if not, write to the Free Software Foundation, Inc., 51
16  * Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
17  */
18 
19 /* Nonequispaced fast cosine transform */
20 
21 /* Author: Steffen Klatt 2004-2006, Jens Keiner 2010 */
22 
23 /* configure header */
24 #include "config.h"
25 
26 /* complex datatype (maybe) */
27 #ifdef HAVE_COMPLEX_H
28 #include<complex.h>
29 #endif
30 
31 /* NFFT headers */
32 #include "nfft3.h"
33 #include "infft.h"
34 
35 #ifdef _OPENMP
36 #include <omp.h>
37 #endif
38 
39 #ifdef OMP_ASSERT
40 #include <assert.h>
41 #endif
42 
43 #undef X
44 #define X(name) NFCT(name)
45 
46 /** Compute aggregated product of integer array. */
intprod(const INT * vec,const INT a,const INT d)47 static inline INT intprod(const INT *vec, const INT a, const INT d)
48 {
49   INT t, p;
50 
51   p = 1;
52   for (t = 0; t < d; t++)
53     p *= vec[t] - a;
54 
55   return p;
56 }
57 
58 /* handy shortcuts */
59 #define BASE(x) COS(x)
60 #define NN(x) (x - 1)
61 #define OFFSET 0
62 #define FOURIER_TRAFO FFTW_REDFT00
63 #define FFTW_DEFAULT_FLAGS FFTW_ESTIMATE | FFTW_DESTROY_INPUT
64 
65 #define NODE(p,r) (ths->x[(p) * ths->d + (r)])
66 
67 #define MACRO_with_FG_PSI fg_psi[t][lj[t]]
68 #define MACRO_with_PRE_PSI ths->psi[(j * ths->d + t) * (2 * ths->m + 2) + lj[t]]
69 #define MACRO_without_PRE_PSI PHI((2 * NN(ths->n[t])), ((ths->x[(j) * ths->d + t]) \
70   - ((R)(lj[t] + u[t])) / (K(2.0) * ((R)NN(ths->n[t])))), t)
71 #define MACRO_compute_PSI PHI((2 * NN(ths->n[t])), (NODE(j,t) - ((R)(lj[t] + u[t])) / (K(2.0) * ((R)NN(ths->n[t])))), t)
72 
73 /**
74  * Direct computation of non equispaced cosine transforms
75  *  nfct_trafo_direct,  nfct_adjoint_direct
76  *  require O(M N^d) arithemtical operations
77  *
78  * direct computation of the nfct_trafo_direct, formula (1.1)
79  * nfct_trafo_direct:
80  * for j=0,...,M-1
81  *  f[j] = sum_{k in I_N^d} f_hat[k] * cos(2 (pi) k x[j])
82  *
83  * direct computation of the nfft_adjoint_direct, formula (1.2)
84  * nfct_adjoint_direct:
85  * for k in I_N^d
86  *  f_hat[k] = sum_{j=0}^{M-1} f[j] * cos(2 (pi) k x[j])
87  */
X(trafo_direct)88 void X(trafo_direct)(const X(plan) *ths)
89 {
90   R *f_hat = (R*)ths->f_hat, *f = (R*)ths->f;
91 
92   memset(f, 0, (size_t)(ths->M_total) * sizeof(R));
93 
94   if (ths->d == 1)
95   {
96     /* specialize for univariate case, rationale: faster */
97     INT j;
98 #ifdef _OPENMP
99     #pragma omp parallel for default(shared) private(j)
100 #endif
101     for (j = 0; j < ths->M_total; j++)
102     {
103       INT k_L;
104       for (k_L = 0; k_L < ths->N_total; k_L++)
105       {
106         R omega = K2PI * ((R)(k_L + OFFSET)) * ths->x[j];
107         f[j] += f_hat[k_L] * BASE(omega);
108       }
109     }
110   }
111   else
112   {
113     /* multivariate case */
114     INT j;
115 #ifdef _OPENMP
116     #pragma omp parallel for default(shared) private(j)
117 #endif
118     for (j = 0; j < ths->M_total; j++)
119     {
120       R x[ths->d], omega, Omega[ths->d + 1];
121       INT t, t2, k_L, k[ths->d];
122       Omega[0] = K(1.0);
123       for (t = 0; t < ths->d; t++)
124       {
125         k[t] = OFFSET;
126         x[t] = K2PI * ths->x[j * ths->d + t];
127         Omega[t+1] = BASE(((R)(k[t])) * x[t]) * Omega[t];
128       }
129       omega = Omega[ths->d];
130 
131       for (k_L = 0; k_L < ths->N_total; k_L++)
132       {
133         f[j] += f_hat[k_L] * omega;
134         {
135           for (t = ths->d - 1; (t >= 1) && (k[t] == (ths->N[t] - 1)); t--)
136             k[t] = OFFSET;
137 
138           k[t]++;
139 
140           for (t2 = t; t2 < ths->d; t2++)
141             Omega[t2+1] = BASE(((R)(k[t2])) * x[t2]) * Omega[t2];
142 
143           omega = Omega[ths->d];
144         }
145       }
146     }
147   }
148 }
149 
X(adjoint_direct)150 void X(adjoint_direct)(const X(plan) *ths)
151 {
152   R *f_hat = (R*)ths->f_hat, *f = (R*)ths->f;
153 
154   memset(f_hat, 0, (size_t)(ths->N_total) * sizeof(R));
155 
156   if (ths->d == 1)
157   {
158     /* specialize for univariate case, rationale: faster */
159 #ifdef _OPENMP
160       INT k_L;
161       #pragma omp parallel for default(shared) private(k_L)
162       for (k_L = 0; k_L < ths->N_total; k_L++)
163       {
164         INT j;
165         for (j = 0; j < ths->M_total; j++)
166         {
167           R omega = K2PI * ((R)(k_L + OFFSET)) * ths->x[j];
168           f_hat[k_L] += f[j] * BASE(omega);
169         }
170       }
171 #else
172       INT j;
173       for (j = 0; j < ths->M_total; j++)
174       {
175         INT k_L;
176         for (k_L = 0; k_L < ths->N_total; k_L++)
177         {
178           R omega = K2PI * ((R)(k_L + OFFSET)) * ths->x[j];
179           f_hat[k_L] += f[j] * BASE(omega);
180         }
181       }
182 #endif
183   }
184   else
185   {
186     /* multivariate case */
187     INT j, k_L;
188 #ifdef _OPENMP
189     #pragma omp parallel for default(shared) private(j, k_L)
190     for (k_L = 0; k_L < ths->N_total; k_L++)
191     {
192       INT k[ths->d], k_temp, t;
193 
194       k_temp = k_L;
195 
196       for (t = ths->d - 1; t >= 0; t--)
197       {
198         k[t] = k_temp % ths->N[t];
199         k_temp /= ths->N[t];
200       }
201 
202       for (j = 0; j < ths->M_total; j++)
203       {
204         R omega = K(1.0);
205         for (t = 0; t < ths->d; t++)
206           omega *= BASE(K2PI * (k[t] + OFFSET) * ths->x[j * ths->d + t]);
207         f_hat[k_L] += f[j] * omega;
208       }
209     }
210 #else
211     for (j = 0; j < ths->M_total; j++)
212     {
213       R x[ths->d], omega, Omega[ths->d+1];
214       INT t, t2, k[ths->d];
215       Omega[0] = K(1.0);
216       for (t = 0; t < ths->d; t++)
217       {
218         k[t] = OFFSET;
219         x[t] = K2PI * ths->x[j * ths->d + t];
220         Omega[t+1] = BASE(((R)(k[t])) * x[t]) * Omega[t];
221       }
222       omega = Omega[ths->d];
223       for (k_L = 0; k_L < ths->N_total; k_L++)
224       {
225         f_hat[k_L] += f[j] * omega;
226 
227         for (t = ths->d-1; (t >= 1) && (k[t] == ths->N[t] - 1); t--)
228           k[t] = OFFSET;
229 
230         k[t]++;
231 
232         for (t2 = t; t2 < ths->d; t2++)
233           Omega[t2+1] = BASE(((R)(k[t2])) * x[t2]) * Omega[t2];
234 
235         omega = Omega[ths->d];
236       }
237     }
238 #endif
239   }
240 }
241 
242 /** fast computation of non equispaced cosine transforms
243  *  require O(N^d log(N) + M) arithemtical operations
244  *
245  * fast computation of the nfct_trafo, formula (1.1)
246  * nfct_trafo:
247  * for j=0,...,M-1
248  *  f[j] = sum_{k in I_N^d} f_hat[k] * cos(2 (pi) k x[j])
249  *
250  * direct computation of the nfct_adjoint, formula (1.2)
251  * nfct_adjoint:
252  * for k in I_N^d
253  *  f_hat[k] = sum_{j=0}^{M-1} f[j] * cos(2 (pi) k x[j])
254  */
255 
256 /** macros and small sub routines for the fast transforms
257  */
258 
259 /** computes 2m+2 indices for the matrix B
260  */
uo(const X (plan)* ths,const INT j,INT * up,INT * op,const INT act_dim)261 static inline void uo(const X(plan) *ths, const INT j, INT *up, INT *op,
262   const INT act_dim)
263 {
264   const R xj = ths->x[j * ths->d + act_dim];
265   INT c = LRINT(xj * (2 * NN(ths->n[(act_dim)])));
266 
267   (*up) = c - (ths->m);
268   (*op) = c + 1 + (ths->m);
269 }
270 
271 #define MACRO_D_compute_A \
272 { \
273   g_hat[kg_plain[ths->d]] = f_hat[k_L] * c_phi_inv_k[ths->d]; \
274 }
275 
276 #define MACRO_D_compute_T \
277 { \
278   f_hat[k_L] = g_hat[kg_plain[ths->d]] * c_phi_inv_k[ths->d]; \
279 }
280 
281 #define MACRO_D_init_result_A memset(g_hat, 0, (size_t)(ths->n_total) * sizeof(R));
282 
283 #define MACRO_D_init_result_T memset(f_hat, 0, (size_t)(ths->N_total) * sizeof(R));
284 
285 #define MACRO_with_PRE_PHI_HUT ths->c_phi_inv[t][kg[t]]
286 
287 #define MACRO_compute_PHI_HUT_INV (K(1.0) / (PHI_HUT((2 * NN(ths->n[t])), kg[t] + OFFSET, t)))
288 
289 #define MACRO_init_k_ks \
290 { \
291   for (t = 0; t < ths->d; t++) \
292   { \
293     kg[t] = 0; \
294   } \
295   i = 0; \
296 }
297 
298 #define MACRO_update_c_phi_inv_k(what_kind, which_phi) \
299 { \
300   for (t = i; t < ths->d; t++) \
301   { \
302     MACRO_update_c_phi_inv_k_ ## what_kind(which_phi); \
303     kg_plain[t+1] = kg_plain[t] * ths->n[t] + kg[t]; \
304   } \
305 }
306 
307 #define MACRO_update_c_phi_inv_k_A(which_phi) \
308 { \
309   c_phi_inv_k[t+1] = (kg[t] == 0 ? K(1.0) : K(0.5)) * c_phi_inv_k[t] * MACRO_ ## which_phi; \
310 }
311 
312 #define MACRO_update_c_phi_inv_k_T(which_phi) \
313 { \
314   c_phi_inv_k[t+1] = c_phi_inv_k[t] * MACRO_ ## which_phi; \
315 }
316 
317 #define MACRO_count_k_ks \
318 { \
319   kg[ths->d - 1]++; \
320   i = ths->d - 1; \
321 \
322   while ((kg[i] == ths->N[i]) && (i > 0)) \
323   { \
324     kg[i - 1]++; \
325     kg[i] = 0; \
326     i--; \
327   } \
328 }
329 
330 /* sub routines for the fast transforms  matrix vector multiplication with D, D^T */
331 #define MACRO_D(which_one) \
332 static inline void D_ ## which_one (X(plan) *ths) \
333 { \
334   R *g_hat, *f_hat; /* local copy */ \
335   R c_phi_inv_k[ths->d+1]; /* postfix product of PHI_HUT */ \
336   INT t; /* index dimensions */ \
337   INT i; \
338   INT k_L; /* plain index */ \
339   INT kg[ths->d]; /* multi index in g_hat */ \
340   INT kg_plain[ths->d+1]; /* postfix plain index */ \
341 \
342   f_hat = (R*)ths->f_hat; g_hat = (R*)ths->g_hat; \
343   MACRO_D_init_result_ ## which_one; \
344 \
345   c_phi_inv_k[0] = K(1.0); \
346   kg_plain[0] = 0; \
347 \
348   MACRO_init_k_ks; \
349 \
350   if (ths->flags & PRE_PHI_HUT) \
351   { \
352     for (k_L = 0; k_L < ths->N_total; k_L++) \
353     { \
354       MACRO_update_c_phi_inv_k(which_one, with_PRE_PHI_HUT); \
355       MACRO_D_compute_ ## which_one; \
356       MACRO_count_k_ks; \
357     } \
358   } \
359   else \
360   { \
361     for (k_L = 0; k_L < ths->N_total; k_L++) \
362     { \
363       MACRO_update_c_phi_inv_k(which_one,compute_PHI_HUT_INV); \
364       MACRO_D_compute_ ## which_one; \
365       MACRO_count_k_ks; \
366     } \
367   } \
368 }
369 
370 MACRO_D(A)
MACRO_D(T)371 MACRO_D(T)
372 
373 /* sub routines for the fast transforms matrix vector multiplication with B, B^T */
374 #define MACRO_B_init_result_A memset(f, 0, (size_t)(ths->M_total) * sizeof(R));
375 #define MACRO_B_init_result_T memset(g, 0, (size_t)(ths->n_total) * sizeof(R));
376 
377 #define MACRO_B_PRE_FULL_PSI_compute_A \
378 { \
379   (*fj) += ths->psi[ix] * g[ths->psi_index_g[ix]]; \
380 }
381 
382 #define MACRO_B_PRE_FULL_PSI_compute_T \
383 { \
384   R factor = K(1.0); \
385   INT d = ths->psi_index_g[ix]; \
386   for (t = ths->d - 1; t >= 0; t--) \
387   { \
388     INT m = d % ths->n[t]; \
389     if (m != 0 && m != ths->n[t] - 1) \
390       factor *= K(0.5); \
391     d = d / ths->n[t]; \
392   } \
393   g[ths->psi_index_g[ix]] += factor * ths->psi[ix] * (*fj); \
394 }
395 
396 #define MACRO_B_compute_A \
397 { \
398   (*fj) += phi_prod[ths->d] * g[ll_plain[ths->d]]; \
399 }
400 
401 #define MACRO_B_compute_T \
402 { \
403   g[ll_plain[ths->d]] += phi_prod[ths->d] * (*fj); \
404 }
405 
406 #define MACRO_init_uo_l_lj_t \
407 { \
408   for (t2 = 0; t2 < ths->d; t2++) \
409   { \
410     uo(ths, j, &u[t2], &o[t2], t2); \
411     \
412     /* determine index in g-array corresponding to u[(t2)] */ \
413     if (u[(t2)] < 0) \
414       lg_offset[(t2)] = \
415         (u[(t2)] % (2 * NN(ths->n[(t2)]))) + (2 * NN(ths->n[(t2)])); \
416     else \
417       lg_offset[(t2)] = u[(t2)] % (2 * NN(ths->n[(t2)])); \
418       if (lg_offset[(t2)] > NN(ths->n[(t2)])) \
419         lg_offset[(t2)] = -(2 * NN(ths->n[(t2)]) - lg_offset[(t2)]); \
420     \
421     if (lg_offset[t2] <= 0) \
422     { \
423       l[t2] = -lg_offset[t2]; \
424       count_lg[t2] = -1; \
425     } \
426     else \
427     { \
428       l[t2] = +lg_offset[t2]; \
429       count_lg[t2] = +1; \
430     } \
431  \
432     lj[t2] = 0; \
433    } \
434    t2 = 0; \
435 }
436 
437 #define FOO_A K(1.0)
438 
439 #define FOO_T ((l[t] == 0 || l[t] == ths->n[t] - 1) ? K(1.0) : K(0.5))
440 
441 #define MACRO_update_phi_prod_ll_plain(which_one,which_psi) \
442 { \
443   for (t = t2; t < ths->d; t++) \
444   { \
445     phi_prod[t+1] = (FOO_ ## which_one) * phi_prod[t] * (MACRO_ ## which_psi); \
446     ll_plain[t+1]  = ll_plain[t] * ths->n[t] + l[t]; \
447   } \
448 }
449 
450 #define MACRO_count_uo_l_lj_t \
451 { \
452   /* turn around if we hit one of the boundaries */ \
453   if ((l[(ths->d-1)] == 0) || (l[(ths->d-1)] == NN(ths->n[(ths->d-1)]))) \
454     count_lg[(ths->d-1)] *= -1; \
455  \
456   /* move array index */ \
457   l[(ths->d-1)] += count_lg[(ths->d-1)]; \
458  \
459   lj[ths->d - 1]++; \
460   t2 = ths->d - 1; \
461  \
462   while ((lj[t2] == (2 * ths->m + 2)) && (t2 > 0)) \
463   { \
464     lj[t2 - 1]++; \
465     lj[t2] = 0; \
466     /* ansonsten lg[i-1] verschieben */ \
467  \
468     /* turn around if we hit one of the boundaries */ \
469     if ((l[(t2 - 1)] == 0) || (l[(t2 - 1)] == NN(ths->n[(t2 - 1)]))) \
470       count_lg[(t2 - 1)] *= -1; \
471     /* move array index */ \
472     l[(t2 - 1)] += count_lg[(t2 - 1)]; \
473  \
474     /* lg[i] = anfangswert */ \
475     if (lg_offset[t2] <= 0) \
476     { \
477       l[t2] = -lg_offset[t2]; \
478       count_lg[t2] = -1; \
479     } \
480     else \
481     { \
482       l[t2] = +lg_offset[t2]; \
483       count_lg[t2] = +1; \
484     } \
485  \
486     t2--; \
487   } \
488 }
489 
490 #define MACRO_B(which_one) \
491 static inline void B_ ## which_one (X(plan) *ths) \
492 { \
493   INT lprod; /* 'regular bandwidth' of matrix B  */ \
494   INT u[ths->d], o[ths->d]; /* multi band with respect to x_j */ \
495   INT t, t2; /* index dimensions */ \
496   INT j; /* index nodes */ \
497   INT l_L, ix; /* index one row of B */ \
498   INT l[ths->d]; /* multi index u<=l<=o (real index of g in array) */ \
499   INT lj[ths->d]; /* multi index 0<=lc<2m+2 */ \
500   INT ll_plain[ths->d+1]; /* postfix plain index in g */ \
501   R phi_prod[ths->d+1]; /* postfix product of PHI */ \
502   R *f, *g; /* local copy */ \
503   R *fj; /* local copy */ \
504   R y[ths->d]; \
505   R fg_psi[ths->d][2*ths->m+2]; \
506   R fg_exp_l[ths->d][2*ths->m+2]; \
507   INT l_fg,lj_fg; \
508   R tmpEXP1, tmpEXP2, tmpEXP2sq, tmp1, tmp2, tmp3; \
509   R ip_w; \
510   INT ip_u; \
511   INT ip_s = ths->K/(ths->m+2); \
512   INT lg_offset[ths->d]; /* offset in g according to u */ \
513   INT count_lg[ths->d]; /* count summands (2m+2) */ \
514 \
515   f = (R*)ths->f; g = (R*)ths->g; \
516 \
517   MACRO_B_init_result_ ## which_one \
518 \
519   if (ths->flags & PRE_FULL_PSI) \
520   { \
521     for (ix = 0, j = 0, fj = f; j < ths->M_total; j++, fj++) \
522     { \
523       for (l_L = 0; l_L < ths->psi_index_f[j]; l_L++, ix++) \
524       { \
525         MACRO_B_PRE_FULL_PSI_compute_ ## which_one; \
526       } \
527     } \
528     return; \
529   } \
530 \
531   phi_prod[0] = K(1.0); \
532   ll_plain[0] = 0; \
533 \
534   for (t = 0, lprod = 1; t < ths->d; t++) \
535     lprod *= (2 * ths->m + 2); \
536 \
537   if (ths->flags & PRE_PSI) \
538   { \
539     for (j = 0, fj = f; j < ths->M_total; j++, fj++) \
540     { \
541       MACRO_init_uo_l_lj_t; \
542  \
543       for (l_L = 0; l_L < lprod; l_L++) \
544       { \
545         MACRO_update_phi_prod_ll_plain(which_one, with_PRE_PSI); \
546  \
547         MACRO_B_compute_ ## which_one; \
548  \
549         MACRO_count_uo_l_lj_t; \
550       } /* for(l_L) */ \
551     } /* for(j) */ \
552     return; \
553   } /* if(PRE_PSI) */ \
554  \
555   if (ths->flags & PRE_FG_PSI) \
556   { \
557     for (t = 0; t < ths->d; t++) \
558     { \
559       tmpEXP2 = EXP(K(-1.0) / ths->b[t]); \
560       tmpEXP2sq = tmpEXP2 * tmpEXP2; \
561       tmp2 = K(1.0); \
562       tmp3 = K(1.0); \
563       fg_exp_l[t][0] = K(1.0); \
564  \
565       for (lj_fg = 1; lj_fg <= (2 * ths->m + 2); lj_fg++) \
566       { \
567         tmp3 = tmp2 * tmpEXP2; \
568         tmp2 *= tmpEXP2sq; \
569         fg_exp_l[t][lj_fg] = fg_exp_l[t][lj_fg-1] * tmp3; \
570       } \
571     } \
572  \
573     for (j = 0, fj = f; j < ths->M_total; j++, fj++) \
574     { \
575       MACRO_init_uo_l_lj_t; \
576  \
577       for (t = 0; t < ths->d; t++) \
578       { \
579         fg_psi[t][0] = ths->psi[2 * (j * ths->d + t)]; \
580         tmpEXP1 = ths->psi[2 * (j * ths->d + t) + 1]; \
581         tmp1 = K(1.0); \
582  \
583         for (l_fg = u[t] + 1, lj_fg = 1; l_fg <= o[t]; l_fg++, lj_fg++) \
584         { \
585           tmp1 *= tmpEXP1; \
586           fg_psi[t][lj_fg] = fg_psi[t][0] * tmp1 * fg_exp_l[t][lj_fg]; \
587         } \
588       } \
589  \
590       for (l_L= 0; l_L < lprod; l_L++) \
591       { \
592         MACRO_update_phi_prod_ll_plain(which_one, with_FG_PSI); \
593  \
594         MACRO_B_compute_ ## which_one; \
595  \
596         MACRO_count_uo_l_lj_t; \
597       } \
598     } \
599     return; \
600   } \
601  \
602   if (ths->flags & FG_PSI) \
603   { \
604     for (t = 0; t < ths->d; t++) \
605     { \
606       tmpEXP2 = EXP(K(-1.0) / ths->b[t]); \
607       tmpEXP2sq = tmpEXP2 * tmpEXP2; \
608       tmp2 = K(1.0); \
609       tmp3 = K(1.0); \
610       fg_exp_l[t][0] = K(1.0); \
611       for (lj_fg = 1; lj_fg <= (2 * ths->m + 2); lj_fg++) \
612       { \
613         tmp3 = tmp2 * tmpEXP2; \
614         tmp2 *= tmpEXP2sq; \
615         fg_exp_l[t][lj_fg] = fg_exp_l[t][lj_fg-1] * tmp3; \
616       } \
617     } \
618  \
619     for (j = 0, fj = f; j < ths->M_total; j++, fj++) \
620     { \
621       MACRO_init_uo_l_lj_t; \
622  \
623       for (t = 0; t < ths->d; t++) \
624       { \
625         fg_psi[t][0] = (PHI((2 * NN(ths->n[t])), (ths->x[j*ths->d+t] - ((R)u[t])/(2 * NN(ths->n[t]))),(t)));\
626  \
627         tmpEXP1 = EXP(K(2.0) * ((2 * NN(ths->n[t])) * ths->x[j * ths->d + t] - u[t]) / ths->b[t]); \
628         tmp1 = K(1.0); \
629         for (l_fg = u[t] + 1, lj_fg = 1; l_fg <= o[t]; l_fg++, lj_fg++) \
630         { \
631           tmp1 *= tmpEXP1; \
632           fg_psi[t][lj_fg] = fg_psi[t][0] * tmp1 * fg_exp_l[t][lj_fg]; \
633         } \
634       } \
635   \
636       for (l_L = 0; l_L < lprod; l_L++) \
637       { \
638         MACRO_update_phi_prod_ll_plain(which_one, with_FG_PSI); \
639  \
640         MACRO_B_compute_ ## which_one; \
641  \
642         MACRO_count_uo_l_lj_t; \
643       } \
644     } \
645     return; \
646   } \
647  \
648   if (ths->flags & PRE_LIN_PSI) \
649   { \
650     for (j = 0, fj = f; j < ths->M_total; j++, fj++) \
651     { \
652       MACRO_init_uo_l_lj_t; \
653   \
654       for (t = 0; t < ths->d; t++) \
655       { \
656         y[t] = (((2 * NN(ths->n[t])) * ths->x[j * ths->d + t] - (R)u[t]) \
657                 * ((R)ths->K))/(ths->m + 2); \
658         ip_u  = LRINT(FLOOR(y[t])); \
659         ip_w  = y[t]-ip_u; \
660         for (l_fg = u[t], lj_fg = 0; l_fg <= o[t]; l_fg++, lj_fg++) \
661         { \
662           fg_psi[t][lj_fg] = ths->psi[(ths->K+1)*t + ABS(ip_u-lj_fg*ip_s)] \
663             * (1-ip_w) + ths->psi[(ths->K+1)*t + ABS(ip_u-lj_fg*ip_s+1)] \
664             * (ip_w); \
665         } \
666       } \
667   \
668       for (l_L = 0; l_L < lprod; l_L++) \
669       { \
670         MACRO_update_phi_prod_ll_plain(which_one, with_FG_PSI); \
671  \
672         MACRO_B_compute_ ## which_one; \
673  \
674         MACRO_count_uo_l_lj_t; \
675       }  /* for(l_L) */  \
676     } /* for(j) */  \
677     return; \
678   } /* if(PRE_LIN_PSI) */ \
679   \
680   /* no precomputed psi at all */ \
681   for (j = 0, fj = &f[0]; j < ths->M_total; j++, fj += 1) \
682   { \
683     MACRO_init_uo_l_lj_t; \
684  \
685     for (l_L = 0; l_L < lprod; l_L++) \
686     { \
687       MACRO_update_phi_prod_ll_plain(which_one, without_PRE_PSI); \
688  \
689       MACRO_B_compute_ ## which_one; \
690  \
691       MACRO_count_uo_l_lj_t; \
692     } /* for (l_L) */ \
693   } /* for (j) */ \
694 } /* B */
695 
696 MACRO_B(A)
697 MACRO_B(T)
698 
699 /**
700  * user routines
701  */
702 void X(trafo)(X(plan) *ths)
703 {
704   switch(ths->d)
705   {
706     default:
707     {
708       /* use ths->my_fftw_r2r_plan */
709       ths->g_hat = ths->g1;
710       ths->g = ths->g2;
711 
712       /* form \f$ \hat g_k = \frac{\hat f_k}{c_k\left(\phi\right)} \text{ for }
713        * k \in I_N \f$ */
714       TIC(0)
715       D_A(ths);
716       TOC(0)
717 
718       /* Compute by d-variate discrete Fourier transform
719        * \f$ g_l = \sum_{k \in I_N} \hat g_k {\rm e}^{-2\pi {\rm i} \frac{kl}{n}}
720        * \text{ for } l \in I_n \f$ */
721       TIC_FFTW(1)
722       FFTW(execute)(ths->my_fftw_r2r_plan);
723       TOC_FFTW(1)
724 
725       /*if (ths->flags & PRE_FULL_PSI)
726         full_psi__A(ths);*/
727 
728       /* Set \f$ f_j = \sum_{l \in I_n,m(x_j)} g_l \psi\left(x_j-\frac{l}{n}\right)
729        * \text{ for } j=0,\dots,M-1 \f$ */
730       TIC(2)
731       B_A(ths);
732       TOC(2)
733 
734       /*if (ths->flags & PRE_FULL_PSI)
735       {
736         Y(free)(ths->psi_index_g);
737         Y(free)(ths->psi_index_f);
738       }*/
739     }
740   }
741 } /* trafo */
742 
X(adjoint)743 void X(adjoint)(X(plan) *ths)
744 {
745   switch(ths->d)
746   {
747     default:
748     {
749       /* use ths->my_fftw_plan */
750       ths->g_hat = ths->g2;
751       ths->g = ths->g1;
752 
753       /*if (ths->flags & PRE_FULL_PSI)
754         full_psi__T(ths);*/
755 
756       /* Set \f$ g_l = \sum_{j=0}^{M-1} f_j \psi\left(x_j-\frac{l}{n}\right)
757        * \text{ for } l \in I_n,m(x_j) \f$ */
758       TIC(2)
759       B_T(ths);
760       TOC(2)
761 
762       /* Compute by d-variate discrete cosine transform
763        * \f$ \hat g_k = \sum_{l \in I_n} g_l {\rm e}^{-2\pi {\rm i} \frac{kl}{n}}
764        * \text{ for }  k \in I_N\f$ */
765       TIC_FFTW(1)
766       FFTW(execute)(ths->my_fftw_r2r_plan);
767       TOC_FFTW(1)
768 
769       /* Form \f$ \hat f_k = \frac{\hat g_k}{c_k\left(\phi\right)} \text{ for }
770        * k \in I_N \f$ */
771       TIC(0)
772       D_T(ths);
773       TOC(0)
774     }
775   }
776 } /* adjoint */
777 
778 /** initialisation of direct transform
779  */
precompute_phi_hut(X (plan)* ths)780 static inline void precompute_phi_hut(X(plan) *ths)
781 {
782   INT ks[ths->d]; /* index over all frequencies */
783   INT t; /* index over all dimensions */
784 
785   ths->c_phi_inv = (R**) Y(malloc)((size_t)(ths->d) * sizeof(R*));
786 
787   for (t = 0; t < ths->d; t++)
788   {
789     ths->c_phi_inv[t] = (R*)Y(malloc)((size_t)(ths->N[t] - OFFSET) * sizeof(R));
790 
791     for (ks[t] = 0; ks[t] < ths->N[t] - OFFSET; ks[t]++)
792     {
793       ths->c_phi_inv[t][ks[t]] = (K(1.0) / (PHI_HUT((2 * NN(ths->n[t])), ks[t] + OFFSET, t)));
794     }
795   }
796 } /* phi_hut */
797 
798 /** create a lookup table, but NOT for each node
799  *  good idea K=2^xx
800  *  TODO: estimate K, call from init
801  *  assumes an EVEN window function
802  */
X(precompute_lin_psi)803 void X(precompute_lin_psi)(X(plan) *ths)
804 {
805   INT t; /**< index over all dimensions */
806   INT j; /**< index over all nodes */
807   R step; /**< step size in [0,(m+2)/n] */
808 
809   for (t = 0; t < ths->d; t++)
810   {
811     step = ((R)(ths->m+2)) / (((R)ths->K) * (2 * NN(ths->n[t])));
812 
813     for (j = 0; j <= ths->K; j++)
814     {
815       ths->psi[(ths->K + 1) * t + j] = PHI((2 * NN(ths->n[t])), (j * step), t);
816     } /* for(j) */
817   } /* for(t) */
818 }
819 
X(precompute_fg_psi)820 void X(precompute_fg_psi)(X(plan) *ths)
821 {
822   INT t; /* index over all dimensions */
823   INT u, o; /* depends on x_j */
824 
825 //  sort(ths);
826 
827   for (t = 0; t < ths->d; t++)
828   {
829     INT j;
830 //    #pragma omp parallel for default(shared) private(j,u,o)
831     for (j = 0; j < ths->M_total; j++)
832     {
833       uo(ths, j, &u, &o, t);
834 
835       ths->psi[2 * (j*ths->d + t)] = (PHI((2 * NN(ths->n[t])),(ths->x[j * ths->d + t] - ((R)u) / (2 * NN(ths->n[t]))),(t)));
836       ths->psi[2 * (j*ths->d + t) + 1] = EXP(K(2.0) * ( (2 * NN(ths->n[t])) * ths->x[j * ths->d + t] - u) / ths->b[t]);
837       } /* for(j) */
838   }
839   /* for(t) */
840 } /* nfft_precompute_fg_psi */
841 
X(precompute_psi)842 void X(precompute_psi)(X(plan) *ths)
843 {
844   INT t; /* index over all dimensions */
845   INT lj; /* index 0<=lj<u+o+1 */
846   INT u, o; /* depends on x_j */
847 
848   //sort(ths);
849 
850   for (t = 0; t < ths->d; t++)
851   {
852     INT j;
853 
854     for (j = 0; j < ths->M_total; j++)
855     {
856       uo(ths, j, &u, &o, t);
857 
858       for(lj = 0; lj < (2 * ths->m + 2); lj++)
859         ths->psi[(j * ths->d + t) * (2 * ths->m + 2) + lj] =
860             (PHI((2 * NN(ths->n[t])), ((ths->x[(j) * ths->d + (t)]) - ((R)(lj + u)) / (K(2.0) * ((R)NN(ths->n[t])))), t));
861     } /* for (j) */
862   } /* for (t) */
863 } /* precompute_psi */
864 
X(precompute_full_psi)865 void X(precompute_full_psi)(X(plan) *ths)
866 {
867 //#ifdef _OPENMP
868 //  sort(ths);
869 //
870 //  nfft_precompute_full_psi_omp(ths);
871 //#else
872   INT t, t2; /* index over all dimensions */
873   INT j; /* index over all nodes */
874   INT l_L; /* plain index 0 <= l_L < lprod */
875   INT l[ths->d]; /* multi index u<=l<=o */
876   INT lj[ths->d]; /* multi index 0<=lj<u+o+1 */
877   INT ll_plain[ths->d+1]; /* postfix plain index */
878   INT lprod; /* 'bandwidth' of matrix B */
879   INT u[ths->d], o[ths->d]; /* depends on x_j */
880   INT count_lg[ths->d];
881   INT lg_offset[ths->d];
882 
883   R phi_prod[ths->d+1];
884 
885   INT ix, ix_old;
886 
887   //sort(ths);
888 
889   phi_prod[0] = K(1.0);
890   ll_plain[0]  = 0;
891 
892   for (t = 0, lprod = 1; t < ths->d; t++)
893     lprod *= 2 * ths->m + 2;
894 
895   for (j = 0, ix = 0, ix_old = 0; j < ths->M_total; j++)
896   {
897     MACRO_init_uo_l_lj_t;
898 
899     for (l_L = 0; l_L < lprod; l_L++, ix++)
900     {
901       MACRO_update_phi_prod_ll_plain(A, without_PRE_PSI);
902 
903       ths->psi_index_g[ix] = ll_plain[ths->d];
904       ths->psi[ix] = phi_prod[ths->d];
905 
906       MACRO_count_uo_l_lj_t;
907     } /* for (l_L) */
908 
909     ths->psi_index_f[j] = ix - ix_old;
910     ix_old = ix;
911   } /* for(j) */
912 //#endif
913 }
914 
X(precompute_one_psi)915 void X(precompute_one_psi)(X(plan) *ths)
916 {
917   if(ths->flags & PRE_PSI)
918     X(precompute_psi)(ths);
919   if(ths->flags & PRE_FULL_PSI)
920     X(precompute_full_psi)(ths);
921   if(ths->flags & PRE_FG_PSI)
922     X(precompute_fg_psi)(ths);
923   if(ths->flags & PRE_LIN_PSI)
924     X(precompute_lin_psi)(ths);
925 }
926 
init_help(X (plan)* ths)927 static inline void init_help(X(plan) *ths)
928 {
929   INT t; /* index over all dimensions */
930   INT lprod; /* 'bandwidth' of matrix B */
931 
932   if (ths->flags & NFFT_OMP_BLOCKWISE_ADJOINT)
933     ths->flags |= NFFT_SORT_NODES;
934 
935   ths->N_total = intprod(ths->N, OFFSET, ths->d);
936   ths->n_total = intprod(ths->n, 0, ths->d);
937 
938   ths->sigma = (R*)Y(malloc)((size_t)(ths->d) * sizeof(R));
939 
940   for (t = 0; t < ths->d; t++)
941     ths->sigma[t] = ((R)NN(ths->n[t])) / ths->N[t];
942 
943   /* Assign r2r transform kinds for each dimension */
944   ths->r2r_kind = (FFTW(r2r_kind)*)Y(malloc)((size_t)(ths->d) * sizeof (FFTW(r2r_kind)));
945   for (t = 0; t < ths->d; t++)
946     ths->r2r_kind[t] = FOURIER_TRAFO;
947 
948   WINDOW_HELP_INIT;
949 
950   if (ths->flags & MALLOC_X)
951     ths->x = (R*)Y(malloc)((size_t)(ths->d * ths->M_total) * sizeof(R));
952 
953   if (ths->flags & MALLOC_F_HAT)
954     ths->f_hat = (R*)Y(malloc)((size_t)(ths->N_total) * sizeof(R));
955 
956   if (ths->flags & MALLOC_F)
957     ths->f = (R*)Y(malloc)((size_t)(ths->M_total) * sizeof(R));
958 
959   if (ths->flags & PRE_PHI_HUT)
960     precompute_phi_hut(ths);
961 
962   if(ths->flags & PRE_LIN_PSI)
963   {
964       ths->K = (1U<< 10) * (ths->m+2);
965       ths->psi = (R*) Y(malloc)((size_t)((ths->K + 1) * ths->d) * sizeof(R));
966   }
967 
968   if(ths->flags & PRE_FG_PSI)
969     ths->psi = (R*) Y(malloc)((size_t)(ths->M_total * ths->d * 2) * sizeof(R));
970 
971   if (ths->flags & PRE_PSI)
972     ths->psi = (R*) Y(malloc)((size_t)(ths->M_total * ths->d * (2 * ths->m + 2 )) * sizeof(R));
973 
974   if(ths->flags & PRE_FULL_PSI)
975   {
976       for (t = 0, lprod = 1; t < ths->d; t++)
977         lprod *= 2 * ths->m + 2;
978 
979       ths->psi = (R*) Y(malloc)((size_t)(ths->M_total * lprod) * sizeof(R));
980 
981       ths->psi_index_f = (INT*) Y(malloc)((size_t)(ths->M_total) * sizeof(INT));
982       ths->psi_index_g = (INT*) Y(malloc)((size_t)(ths->M_total * lprod) * sizeof(INT));
983   }
984 
985   if (ths->flags & FFTW_INIT)
986   {
987     ths->g1 = (R*)Y(malloc)((size_t)(ths->n_total) * sizeof(R));
988 
989     if (ths->flags & FFT_OUT_OF_PLACE)
990       ths->g2 = (R*) Y(malloc)((size_t)(ths->n_total) * sizeof(R));
991     else
992       ths->g2 = ths->g1;
993 
994     {
995       int *_n = Y(malloc)((size_t)(ths->d) * sizeof(int));
996 
997       for (t = 0; t < ths->d; t++)
998         _n[t] = (int)(ths->n[t]);
999 
1000       ths->my_fftw_r2r_plan = FFTW(plan_r2r)((int)ths->d, _n, ths->g1, ths->g2, ths->r2r_kind, ths->fftw_flags);
1001       Y(free)(_n);
1002     }
1003   }
1004 
1005 //  if(ths->flags & NFFT_SORT_NODES)
1006 //    ths->index_x = (INT*) Y(malloc)(sizeof(INT)*2*ths->M_total);
1007 //  else
1008 //    ths->index_x = NULL;
1009 
1010   ths->mv_trafo = (void (*) (void* ))X(trafo);
1011   ths->mv_adjoint = (void (*) (void* ))X(adjoint);
1012 }
1013 
X(init)1014 void X(init)(X(plan) *ths, int d, int *N, int M_total)
1015 {
1016   int t; /* index over all dimensions */
1017 
1018   ths->d = (INT)d;
1019 
1020   ths->N = (INT*) Y(malloc)((size_t)(d) * sizeof(INT));
1021 
1022   for (t = 0; t < d; t++)
1023     ths->N[t] = (INT)N[t];
1024 
1025   ths->M_total = (INT)M_total;
1026 
1027   ths->n = (INT*) Y(malloc)((size_t)(d) * sizeof(INT));
1028 
1029   for (t = 0; t < d; t++)
1030     ths->n[t] = 2 * (Y(next_power_of_2)(ths->N[t]) - 1) + OFFSET;
1031 
1032   ths->m = WINDOW_HELP_ESTIMATE_m;
1033 
1034   if (d > 1)
1035   {
1036 //#ifdef _OPENMP
1037 //    ths->flags = PRE_PHI_HUT | PRE_PSI | MALLOC_X| MALLOC_F_HAT | MALLOC_F |
1038 //                      FFTW_INIT | NFFT_SORT_NODES | NFFT_OMP_BLOCKWISE_ADJOINT;
1039 //#else
1040     ths->flags = PRE_PHI_HUT | PRE_PSI | MALLOC_X| MALLOC_F_HAT | MALLOC_F |
1041                       FFTW_INIT | NFFT_SORT_NODES;
1042 //#endif
1043   }
1044   else
1045     ths->flags = PRE_PHI_HUT | PRE_PSI | MALLOC_X| MALLOC_F_HAT | MALLOC_F |
1046                       FFTW_INIT | FFT_OUT_OF_PLACE;
1047 
1048   ths->fftw_flags = FFTW_ESTIMATE | FFTW_DESTROY_INPUT;
1049 
1050   init_help(ths);
1051 }
1052 
X(init_guru)1053 void X(init_guru)(X(plan) *ths, int d, int *N, int M_total, int *n, int m,
1054   unsigned flags, unsigned fftw_flags)
1055 {
1056   INT t; /* index over all dimensions */
1057 
1058   ths->d = (INT)d;
1059   ths->M_total = (INT)M_total;
1060   ths->N = (INT*)Y(malloc)((size_t)(ths->d) * sizeof(INT));
1061 
1062   for (t = 0; t < d; t++)
1063     ths->N[t] = (INT)N[t];
1064 
1065   ths->n = (INT*)Y(malloc)((size_t)(ths->d) * sizeof(INT));
1066 
1067   for (t = 0; t < d; t++)
1068     ths->n[t] = (INT)n[t];
1069 
1070   ths->m = (INT)m;
1071 
1072   ths->flags = flags;
1073   ths->fftw_flags = fftw_flags;
1074 
1075   init_help(ths);
1076 }
1077 
X(init_1d)1078 void X(init_1d)(X(plan) *ths, int N1, int M_total)
1079 {
1080   int N[1];
1081 
1082   N[0] = N1;
1083 
1084   X(init)(ths, 1, N, M_total);
1085 }
1086 
X(init_2d)1087 void X(init_2d)(X(plan) *ths, int N1, int N2, int M_total)
1088 {
1089   int N[2];
1090 
1091   N[0] = N1;
1092   N[1] = N2;
1093 
1094   X(init)(ths, 2, N, M_total);
1095 }
1096 
X(init_3d)1097 void X(init_3d)(X(plan) *ths, int N1, int N2, int N3, int M_total)
1098 {
1099   int N[3];
1100 
1101   N[0] = N1;
1102   N[1] = N2;
1103   N[2] = N3;
1104 
1105   X(init)(ths, 3, N, M_total);
1106 }
1107 
X(check)1108 const char* X(check)(X(plan) *ths)
1109 {
1110   INT j;
1111 
1112   if (!ths->f)
1113       return "Member f not initialized.";
1114 
1115   if (!ths->x)
1116       return "Member x not initialized.";
1117 
1118   if (!ths->f_hat)
1119       return "Member f_hat not initialized.";
1120 
1121   for (j = 0; j < ths->M_total * ths->d; j++)
1122   {
1123     if ((ths->x[j] < K(0.0)) || (ths->x[j] >= K(0.5)))
1124     {
1125       return "ths->x out of range [0.0,0.5)";
1126     }
1127   }
1128 
1129   for (j = 0; j < ths->d; j++)
1130   {
1131     if (ths->sigma[j] <= 1)
1132       return "Oversampling factor too small";
1133 
1134     if(ths->N[j] - 1 <= ths->m)
1135       return "Polynomial degree N is smaller than cut-off m";
1136   }
1137   return 0;
1138 }
1139 
X(finalize)1140 void X(finalize)(X(plan) *ths)
1141 {
1142   INT t; /* index over dimensions */
1143 
1144 //  if(ths->flags & NFFT_SORT_NODES)
1145 //    Y(free)(ths->index_x);
1146 
1147   if (ths->flags & FFTW_INIT)
1148   {
1149 #ifdef _OPENMP
1150     #pragma omp critical (nfft_omp_critical_fftw_plan)
1151 #endif
1152     FFTW(destroy_plan)(ths->my_fftw_r2r_plan);
1153 
1154     if (ths->flags & FFT_OUT_OF_PLACE)
1155       Y(free)(ths->g2);
1156 
1157     Y(free)(ths->g1);
1158   }
1159 
1160   if(ths->flags & PRE_FULL_PSI)
1161   {
1162     Y(free)(ths->psi_index_g);
1163     Y(free)(ths->psi_index_f);
1164     Y(free)(ths->psi);
1165   }
1166 
1167   if (ths->flags & PRE_PSI)
1168     Y(free)(ths->psi);
1169 
1170   if(ths->flags & PRE_FG_PSI)
1171     Y(free)(ths->psi);
1172 
1173   if(ths->flags & PRE_LIN_PSI)
1174     Y(free)(ths->psi);
1175 
1176   if (ths->flags & PRE_PHI_HUT)
1177   {
1178     for (t = 0; t < ths->d; t++)
1179       Y(free)(ths->c_phi_inv[t]);
1180     Y(free)(ths->c_phi_inv);
1181   }
1182 
1183   if (ths->flags & MALLOC_F)
1184     Y(free)(ths->f);
1185 
1186   if(ths->flags & MALLOC_F_HAT)
1187     Y(free)(ths->f_hat);
1188 
1189   if (ths->flags & MALLOC_X)
1190     Y(free)(ths->x);
1191 
1192   WINDOW_HELP_FINALIZE;
1193 
1194   Y(free)(ths->N);
1195   Y(free)(ths->n);
1196   Y(free)(ths->sigma);
1197 
1198   Y(free)(ths->r2r_kind);
1199 } /* finalize */
1200