1 /*
2  * Copyright (c) 1997-1999, 2003 Massachusetts Institute of Technology
3  *
4  * This program is free software; you can redistribute it and/or modify
5  * it under the terms of the GNU General Public License as published by
6  * the Free Software Foundation; either version 2 of the License, or
7  * (at your option) any later version.
8  *
9  * This program is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12  * GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with this program; if not, write to the Free Software
16  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
17  *
18  */
19 
20 /* $Id: fftwnd.c,v 1.44 2003/03/16 23:43:46 stevenj Exp $ */
21 
22 #include "fftw-int.h"
23 
24 /* the number of buffers to use for buffered transforms: */
25 #define FFTWND_NBUFFERS 8
26 
27 /* the default number of buffers to use: */
28 #define FFTWND_DEFAULT_NBUFFERS 0
29 
30 /* the number of "padding" elements between consecutive buffer lines */
31 #define FFTWND_BUFFER_PADDING 8
32 
33 static void destroy_plan_array(int rank, fftw_plan *plans);
34 
init_test_array(fftw_complex * arr,int stride,int n)35 static void init_test_array(fftw_complex *arr, int stride, int n)
36 {
37      int j;
38 
39      for (j = 0; j < n; ++j) {
40 	  c_re(arr[stride * j]) = 0.0;
41 	  c_im(arr[stride * j]) = 0.0;
42      }
43 }
44 
45 /*
46  * Same as fftw_measure_runtime, except for fftwnd plan.
47  */
fftwnd_measure_runtime(fftwnd_plan plan,fftw_complex * in,int istride,fftw_complex * out,int ostride)48 double fftwnd_measure_runtime(fftwnd_plan plan,
49 			      fftw_complex *in, int istride,
50 			      fftw_complex *out, int ostride)
51 {
52      fftw_time begin, end, start;
53      double t, tmax, tmin;
54      int i, iter;
55      int n;
56      int repeat;
57 
58      if (plan->rank == 0)
59 	  return 0.0;
60 
61      n = 1;
62      for (i = 0; i < plan->rank; ++i)
63 	  n *= plan->n[i];
64 
65      iter = 1;
66 
67      for (;;) {
68 	  tmin = 1.0E10;
69 	  tmax = -1.0E10;
70 	  init_test_array(in, istride, n);
71 
72 	  start = fftw_get_time();
73 	  /* repeat the measurement FFTW_TIME_REPEAT times */
74 	  for (repeat = 0; repeat < FFTW_TIME_REPEAT; ++repeat) {
75 	       begin = fftw_get_time();
76 	       for (i = 0; i < iter; ++i) {
77 		    fftwnd(plan, 1, in, istride, 0, out, ostride, 0);
78 	       }
79 	       end = fftw_get_time();
80 
81 	       t = fftw_time_to_sec(fftw_time_diff(end, begin));
82 	       if (t < tmin)
83 		    tmin = t;
84 	       if (t > tmax)
85 		    tmax = t;
86 
87 	       /* do not run for too long */
88 	       t = fftw_time_to_sec(fftw_time_diff(end, start));
89 	       if (t > FFTW_TIME_LIMIT)
90 		    break;
91 	  }
92 
93 	  if (tmin >= FFTW_TIME_MIN)
94 	       break;
95 
96 	  iter *= 2;
97      }
98 
99      tmin /= (double) iter;
100      tmax /= (double) iter;
101 
102      return tmin;
103 }
104 
105 /********************** Initializing the FFTWND Plan ***********************/
106 
107 /* Initialize everything except for the 1D plans and the work array: */
fftwnd_create_plan_aux(int rank,const int * n,fftw_direction dir,int flags)108 fftwnd_plan fftwnd_create_plan_aux(int rank, const int *n,
109 				   fftw_direction dir, int flags)
110 {
111      int i;
112      fftwnd_plan p;
113 
114      if (rank < 0)
115 	  return 0;
116 
117      for (i = 0; i < rank; ++i)
118 	  if (n[i] <= 0)
119 	       return 0;
120 
121      p = (fftwnd_plan) fftw_malloc(sizeof(fftwnd_data));
122      p->n = 0;
123      p->n_before = 0;
124      p->n_after = 0;
125      p->plans = 0;
126      p->work = 0;
127      p->dir = dir;
128 
129      p->rank = rank;
130      p->is_in_place = flags & FFTW_IN_PLACE;
131 
132      p->nwork = 0;
133      p->nbuffers = 0;
134 
135      if (rank == 0)
136 	  return 0;
137 
138      p->n = (int *) fftw_malloc(sizeof(int) * rank);
139      p->n_before = (int *) fftw_malloc(sizeof(int) * rank);
140      p->n_after = (int *) fftw_malloc(sizeof(int) * rank);
141      p->n_before[0] = 1;
142      p->n_after[rank - 1] = 1;
143 
144      for (i = 0; i < rank; ++i) {
145 	  p->n[i] = n[i];
146 
147 	  if (i) {
148 	       p->n_before[i] = p->n_before[i - 1] * n[i - 1];
149 	       p->n_after[rank - 1 - i] = p->n_after[rank - i] * n[rank - i];
150 	  }
151      }
152 
153      return p;
154 }
155 
156 /* create an empty new array of rank 1d plans */
fftwnd_new_plan_array(int rank)157 fftw_plan *fftwnd_new_plan_array(int rank)
158 {
159      fftw_plan *plans;
160      int i;
161 
162      plans = (fftw_plan *) fftw_malloc(rank * sizeof(fftw_plan));
163      if (!plans)
164 	  return 0;
165      for (i = 0; i < rank; ++i)
166 	  plans[i] = 0;
167      return plans;
168 }
169 
170 /*
171  * create an array of plans using the ordinary 1d fftw_create_plan,
172  * which allocates its own array and creates plans optimized for
173  * contiguous data.
174  */
fftwnd_create_plans_generic(fftw_plan * plans,int rank,const int * n,fftw_direction dir,int flags)175 fftw_plan *fftwnd_create_plans_generic(fftw_plan *plans,
176 				       int rank, const int *n,
177 				       fftw_direction dir, int flags)
178 {
179      if (rank <= 0)
180 	  return 0;
181 
182      if (plans) {
183 	  int i, j;
184 	  int cur_flags;
185 
186 	  for (i = 0; i < rank; ++i) {
187 	       if (i < rank - 1 || (flags & FFTW_IN_PLACE)) {
188 		    /*
189 		     * fft's except the last dimension are always in-place
190 		     */
191 		    cur_flags = flags | FFTW_IN_PLACE;
192 		    for (j = i - 1; j >= 0 && n[i] != n[j]; --j);
193 	       } else {
194 		    cur_flags = flags;
195 		    /*
196 		     * we must create a separate plan for the last
197 		     * dimension
198 		     */
199 		    j = -1;
200 	       }
201 
202 	       if (j >= 0) {
203 		    /*
204 		     * If a plan already exists for this size
205 		     * array, reuse it:
206 		     */
207 		    plans[i] = plans[j];
208 	       } else {
209 		    /* generate a new plan: */
210 		    plans[i] = fftw_create_plan(n[i], dir, cur_flags);
211 		    if (!plans[i]) {
212 			 destroy_plan_array(rank, plans);
213 			 return 0;
214 		    }
215 	       }
216 	  }
217      }
218      return plans;
219 }
220 
get_maxdim(int rank,const int * n,int flags)221 static int get_maxdim(int rank, const int *n, int flags)
222 {
223      int i;
224      int maxdim = 0;
225 
226      for (i = 0; i < rank - 1; ++i)
227 	  if (n[i] > maxdim)
228 	       maxdim = n[i];
229      if (rank > 0 && flags & FFTW_IN_PLACE && n[rank - 1] > maxdim)
230 	  maxdim = n[rank - 1];
231 
232      return maxdim;
233 }
234 
235 /* compute number of elements required for work array (has to
236    be big enough to hold ncopies of the largest dimension in
237    n that will need an in-place transform. */
fftwnd_work_size(int rank,const int * n,int flags,int ncopies)238 int fftwnd_work_size(int rank, const int *n, int flags, int ncopies)
239 {
240      return (ncopies * get_maxdim(rank, n, flags)
241 	     + (ncopies - 1) * FFTWND_BUFFER_PADDING);
242 }
243 
244 /*
245  * create plans using the fftw_create_plan_specific planner, which
246  * allows us to create plans for each dimension that are specialized
247  * for the strides that we are going to use.
248  */
fftwnd_create_plans_specific(fftw_plan * plans,int rank,const int * n,const int * n_after,fftw_direction dir,int flags,fftw_complex * in,int istride,fftw_complex * out,int ostride)249 fftw_plan *fftwnd_create_plans_specific(fftw_plan *plans,
250 					int rank, const int *n,
251 					const int *n_after,
252 					fftw_direction dir, int flags,
253 					fftw_complex *in, int istride,
254 					fftw_complex *out, int ostride)
255 {
256      if (rank <= 0)
257 	  return 0;
258 
259      if (plans) {
260 	  int i, stride, cur_flags;
261 	  fftw_complex *work = 0;
262 	  int nwork;
263 
264 	  nwork = fftwnd_work_size(rank, n, flags, 1);
265 	  if (nwork)
266 	       work = (fftw_complex*)fftw_malloc(nwork * sizeof(fftw_complex));
267 
268 	  for (i = 0; i < rank; ++i) {
269 	       /* fft's except the last dimension are always in-place */
270 	       if (i < rank - 1)
271 		    cur_flags = flags | FFTW_IN_PLACE;
272 	       else
273 		    cur_flags = flags;
274 
275 	       /* stride for transforming ith dimension */
276 	       stride = n_after[i];
277 
278 	       if (cur_flags & FFTW_IN_PLACE)
279 		    plans[i] = fftw_create_plan_specific(n[i], dir, cur_flags,
280 						    in, istride * stride,
281 							 work, 1);
282 	       else
283 		    plans[i] = fftw_create_plan_specific(n[i], dir, cur_flags,
284 						    in, istride * stride,
285 						  out, ostride * stride);
286 	       if (!plans[i]) {
287 		    destroy_plan_array(rank, plans);
288 		    fftw_free(work);
289 		    return 0;
290 	       }
291 	  }
292 
293 	  if (work)
294 	       fftw_free(work);
295      }
296      return plans;
297 }
298 
299 /*
300  * Create an fftwnd_plan specialized for specific arrays.  (These
301  * arrays are ignored, however, if they are NULL or if the flags do
302  * not include FFTW_MEASURE.)  The main advantage of being provided
303  * arrays like this is that we can do runtime timing measurements of
304  * our options, without worrying about allocating excessive scratch
305  * space.
306  */
fftwnd_create_plan_specific(int rank,const int * n,fftw_direction dir,int flags,fftw_complex * in,int istride,fftw_complex * out,int ostride)307 fftwnd_plan fftwnd_create_plan_specific(int rank, const int *n,
308 					fftw_direction dir, int flags,
309 					fftw_complex *in, int istride,
310 					fftw_complex *out, int ostride)
311 {
312      fftwnd_plan p;
313 
314      if (!(p = fftwnd_create_plan_aux(rank, n, dir, flags)))
315 	  return 0;
316 
317      if (!(flags & FFTW_MEASURE) || in == 0
318 	 || (!p->is_in_place && out == 0)) {
319 
320 /**** use default plan ****/
321 
322 	  p->plans = fftwnd_create_plans_generic(fftwnd_new_plan_array(rank),
323 						 rank, n, dir, flags);
324 	  if (!p->plans) {
325 	       fftwnd_destroy_plan(p);
326 	       return 0;
327 	  }
328 	  if (flags & FFTWND_FORCE_BUFFERED)
329 	       p->nbuffers = FFTWND_NBUFFERS;
330 	  else
331 	       p->nbuffers = FFTWND_DEFAULT_NBUFFERS;
332 
333 	  p->nwork = fftwnd_work_size(rank, n, flags, p->nbuffers + 1);
334 	  if (p->nwork && !(flags & FFTW_THREADSAFE)) {
335 	       p->work = (fftw_complex*) fftw_malloc(p->nwork
336 						     * sizeof(fftw_complex));
337 	       if (!p->work) {
338 		    fftwnd_destroy_plan(p);
339 		    return 0;
340 	       }
341 	  }
342      } else {
343 /**** use runtime measurements to pick plan ****/
344 
345 	  fftw_plan *plans_buf, *plans_nobuf;
346 	  double t_buf, t_nobuf;
347 
348 	  p->nwork = fftwnd_work_size(rank, n, flags, FFTWND_NBUFFERS + 1);
349 	  if (p->nwork && !(flags & FFTW_THREADSAFE)) {
350 	       p->work = (fftw_complex*) fftw_malloc(p->nwork
351 						     * sizeof(fftw_complex));
352 	       if (!p->work) {
353 		    fftwnd_destroy_plan(p);
354 		    return 0;
355 	       }
356 	  }
357 	  else
358 	       p->work = (fftw_complex*) NULL;
359 
360 	  /* two possible sets of 1D plans: */
361 	  plans_buf = fftwnd_create_plans_generic(fftwnd_new_plan_array(rank),
362 						  rank, n, dir, flags);
363 	  plans_nobuf =
364 	       fftwnd_create_plans_specific(fftwnd_new_plan_array(rank),
365 					    rank, n, p->n_after, dir,
366 					    flags, in, istride,
367 					    out, ostride);
368 	  if (!plans_buf || !plans_nobuf) {
369 	       destroy_plan_array(rank, plans_nobuf);
370 	       destroy_plan_array(rank, plans_buf);
371 	       fftwnd_destroy_plan(p);
372 	       return 0;
373 	  }
374 	  /* time the two possible plans */
375 	  p->plans = plans_nobuf;
376 	  p->nbuffers = 0;
377 	  p->nwork = fftwnd_work_size(rank, n, flags, p->nbuffers + 1);
378 	  t_nobuf = fftwnd_measure_runtime(p, in, istride, out, ostride);
379 	  p->plans = plans_buf;
380 	  p->nbuffers = FFTWND_NBUFFERS;
381 	  p->nwork = fftwnd_work_size(rank, n, flags, p->nbuffers + 1);
382 	  t_buf = fftwnd_measure_runtime(p, in, istride, out, ostride);
383 
384 	  /* pick the better one: */
385 	  if (t_nobuf < t_buf) {	/* use unbuffered transform */
386 	       p->plans = plans_nobuf;
387 	       p->nbuffers = 0;
388 
389 	       /* work array is unnecessarily large */
390 	       if (p->work)
391 		    fftw_free(p->work);
392 	       p->work = 0;
393 
394 	       destroy_plan_array(rank, plans_buf);
395 
396 	       /* allocate a work array of the correct size: */
397 	       p->nwork = fftwnd_work_size(rank, n, flags, p->nbuffers + 1);
398 	       if (p->nwork && !(flags & FFTW_THREADSAFE)) {
399 		    p->work = (fftw_complex*) fftw_malloc(p->nwork
400 						       * sizeof(fftw_complex));
401 		    if (!p->work) {
402 			 fftwnd_destroy_plan(p);
403 			 return 0;
404 		    }
405 	       }
406 	  } else {		/* use buffered transform */
407 	       destroy_plan_array(rank, plans_nobuf);
408 	  }
409      }
410 
411      return p;
412 }
413 
fftw2d_create_plan_specific(int nx,int ny,fftw_direction dir,int flags,fftw_complex * in,int istride,fftw_complex * out,int ostride)414 fftwnd_plan fftw2d_create_plan_specific(int nx, int ny,
415 					fftw_direction dir, int flags,
416 					fftw_complex *in, int istride,
417 					fftw_complex *out, int ostride)
418 {
419      int n[2];
420 
421      n[0] = nx;
422      n[1] = ny;
423 
424      return fftwnd_create_plan_specific(2, n, dir, flags,
425 					in, istride, out, ostride);
426 }
427 
fftw3d_create_plan_specific(int nx,int ny,int nz,fftw_direction dir,int flags,fftw_complex * in,int istride,fftw_complex * out,int ostride)428 fftwnd_plan fftw3d_create_plan_specific(int nx, int ny, int nz,
429 					fftw_direction dir, int flags,
430 					fftw_complex *in, int istride,
431 					fftw_complex *out, int ostride)
432 {
433      int n[3];
434 
435      n[0] = nx;
436      n[1] = ny;
437      n[2] = nz;
438 
439      return fftwnd_create_plan_specific(3, n, dir, flags,
440 					in, istride, out, ostride);
441 }
442 
443 /* Create a generic fftwnd plan: */
444 
fftwnd_create_plan(int rank,const int * n,fftw_direction dir,int flags)445 fftwnd_plan fftwnd_create_plan(int rank, const int *n,
446 			       fftw_direction dir, int flags)
447 {
448      return fftwnd_create_plan_specific(rank, n, dir, flags, 0, 1, 0, 1);
449 }
450 
fftw2d_create_plan(int nx,int ny,fftw_direction dir,int flags)451 fftwnd_plan fftw2d_create_plan(int nx, int ny,
452 			       fftw_direction dir, int flags)
453 {
454      return fftw2d_create_plan_specific(nx, ny, dir, flags, 0, 1, 0, 1);
455 }
456 
fftw3d_create_plan(int nx,int ny,int nz,fftw_direction dir,int flags)457 fftwnd_plan fftw3d_create_plan(int nx, int ny, int nz,
458 			       fftw_direction dir, int flags)
459 {
460      return fftw3d_create_plan_specific(nx, ny, nz, dir, flags, 0, 1, 0, 1);
461 }
462 
463 /************************ Freeing the FFTWND Plan ************************/
464 
destroy_plan_array(int rank,fftw_plan * plans)465 static void destroy_plan_array(int rank, fftw_plan *plans)
466 {
467      if (plans) {
468 	  int i, j;
469 
470 	  for (i = 0; i < rank; ++i) {
471 	       for (j = i - 1;
472 		    j >= 0 && plans[i] != plans[j];
473 		    --j);
474 	       if (j < 0 && plans[i])
475 		    fftw_destroy_plan(plans[i]);
476 	  }
477 	  fftw_free(plans);
478      }
479 }
480 
fftwnd_destroy_plan(fftwnd_plan plan)481 void fftwnd_destroy_plan(fftwnd_plan plan)
482 {
483      if (plan) {
484 	  destroy_plan_array(plan->rank, plan->plans);
485 
486 	  if (plan->n)
487 	       fftw_free(plan->n);
488 
489 	  if (plan->n_before)
490 	       fftw_free(plan->n_before);
491 
492 	  if (plan->n_after)
493 	       fftw_free(plan->n_after);
494 
495 	  if (plan->work)
496 	       fftw_free(plan->work);
497 
498 	  fftw_free(plan);
499      }
500 }
501 
502 /************************ Printing the FFTWND Plan ************************/
503 
fftwnd_fprint_plan(FILE * f,fftwnd_plan plan)504 void fftwnd_fprint_plan(FILE *f, fftwnd_plan plan)
505 {
506      if (plan) {
507 	  int i, j;
508 
509 	  if (plan->rank == 0) {
510 	       fprintf(f, "plan for rank 0 (null) transform.\n");
511 	       return;
512 	  }
513 	  fprintf(f, "plan for ");
514 	  for (i = 0; i < plan->rank; ++i)
515 	       fprintf(f, "%s%d", i ? "x" : "", plan->n[i]);
516 	  fprintf(f, " transform:\n");
517 
518 	  if (plan->nbuffers > 0)
519 	       fprintf(f, "  -- using buffered transforms (%d buffers)\n",
520 		       plan->nbuffers);
521 	  else
522 	       fprintf(f, "  -- using unbuffered transform\n");
523 
524 	  for (i = 0; i < plan->rank; ++i) {
525 	       fprintf(f, "* dimension %d (size %d) ", i, plan->n[i]);
526 
527 	       for (j = i - 1; j >= 0; --j)
528 		    if (plan->plans[j] == plan->plans[i])
529 			 break;
530 
531 	       if (j < 0)
532 		    fftw_fprint_plan(f, plan->plans[i]);
533 	       else
534 		    fprintf(f, "plan is same as dimension %d plan.\n", j);
535 	  }
536      }
537 }
538 
fftwnd_print_plan(fftwnd_plan plan)539 void fftwnd_print_plan(fftwnd_plan plan)
540 {
541      fftwnd_fprint_plan(stdout, plan);
542 }
543 
544 /********************* Buffered FFTW (in-place) *********************/
545 
fftw_buffered(fftw_plan p,int howmany,fftw_complex * in,int istride,int idist,fftw_complex * work,int nbuffers,fftw_complex * buffers)546 void fftw_buffered(fftw_plan p, int howmany,
547 		   fftw_complex *in, int istride, int idist,
548 		   fftw_complex *work,
549 		   int nbuffers, fftw_complex *buffers)
550 {
551      int i = 0, n, nb;
552 
553      n = p->n;
554      nb = n + FFTWND_BUFFER_PADDING;
555 
556      do {
557 	  for (; i <= howmany - nbuffers; i += nbuffers) {
558 	       fftw_complex *cur_in = in + i * idist;
559 	       int j, buf;
560 
561 	       /*
562 	        * First, copy nbuffers strided arrays to the
563 	        * contiguous buffer arrays (reading consecutive
564 	        * locations, assuming that idist is 1):
565 	        */
566 	       for (j = 0; j < n; ++j) {
567 		    fftw_complex *cur_in2 = cur_in + j * istride;
568 		    fftw_complex *cur_buffers = buffers + j;
569 
570 		    for (buf = 0; buf <= nbuffers - 4; buf += 4) {
571 			 fftw_real r0, i0, r1, i1, r2, i2, r3, i3;
572 			 r0 = c_re(cur_in2[0]);
573 			 i0 = c_im(cur_in2[0]);
574 			 r1 = c_re(cur_in2[idist]);
575 			 i1 = c_im(cur_in2[idist]);
576 			 r2 = c_re(cur_in2[2 * idist]);
577 			 i2 = c_im(cur_in2[2 * idist]);
578 			 r3 = c_re(cur_in2[3 * idist]);
579 			 i3 = c_im(cur_in2[3 * idist]);
580 			 c_re(cur_buffers[0]) = r0;
581 			 c_im(cur_buffers[0]) = i0;
582 			 c_re(cur_buffers[nb]) = r1;
583 			 c_im(cur_buffers[nb]) = i1;
584 			 c_re(cur_buffers[2 * nb]) = r2;
585 			 c_im(cur_buffers[2 * nb]) = i2;
586 			 c_re(cur_buffers[3 * nb]) = r3;
587 			 c_im(cur_buffers[3 * nb]) = i3;
588 			 cur_buffers += 4 * nb;
589 			 cur_in2 += 4 * idist;
590 		    }
591 		    for (; buf < nbuffers; ++buf) {
592 			 *cur_buffers = *cur_in2;
593 			 cur_buffers += nb;
594 			 cur_in2 += idist;
595 		    }
596 	       }
597 
598 	       /*
599 	        * Now, compute the FFTs in the buffers (in-place
600 	        * using work):
601 	        */
602 	       fftw(p, nbuffers, buffers, 1, nb, work, 1, 0);
603 
604 	       /*
605 	        * Finally, copy the results back from the contiguous
606 	        * buffers to the strided arrays (writing consecutive
607 	        * locations):
608 	        */
609 	       for (j = 0; j < n; ++j) {
610 		    fftw_complex *cur_in2 = cur_in + j * istride;
611 		    fftw_complex *cur_buffers = buffers + j;
612 
613 		    for (buf = 0; buf <= nbuffers - 4; buf += 4) {
614 			 fftw_real r0, i0, r1, i1, r2, i2, r3, i3;
615 			 r0 = c_re(cur_buffers[0]);
616 			 i0 = c_im(cur_buffers[0]);
617 			 r1 = c_re(cur_buffers[nb]);
618 			 i1 = c_im(cur_buffers[nb]);
619 			 r2 = c_re(cur_buffers[2 * nb]);
620 			 i2 = c_im(cur_buffers[2 * nb]);
621 			 r3 = c_re(cur_buffers[3 * nb]);
622 			 i3 = c_im(cur_buffers[3 * nb]);
623 			 c_re(cur_in2[0]) = r0;
624 			 c_im(cur_in2[0]) = i0;
625 			 c_re(cur_in2[idist]) = r1;
626 			 c_im(cur_in2[idist]) = i1;
627 			 c_re(cur_in2[2 * idist]) = r2;
628 			 c_im(cur_in2[2 * idist]) = i2;
629 			 c_re(cur_in2[3 * idist]) = r3;
630 			 c_im(cur_in2[3 * idist]) = i3;
631 			 cur_buffers += 4 * nb;
632 			 cur_in2 += 4 * idist;
633 		    }
634 		    for (; buf < nbuffers; ++buf) {
635 			 *cur_in2 = *cur_buffers;
636 			 cur_buffers += nb;
637 			 cur_in2 += idist;
638 		    }
639 	       }
640 	  }
641 
642 	  /*
643 	   * we skip howmany % nbuffers ffts at the end of the loop,
644 	   * so we have to go back and do them:
645 	   */
646 	  nbuffers = howmany - i;
647      } while (i < howmany);
648 }
649 
650 /********************* Computing the N-Dimensional FFT *********************/
651 
fftwnd_aux(fftwnd_plan p,int cur_dim,fftw_complex * in,int istride,fftw_complex * out,int ostride,fftw_complex * work)652 void fftwnd_aux(fftwnd_plan p, int cur_dim,
653 		fftw_complex *in, int istride,
654 		fftw_complex *out, int ostride,
655 		fftw_complex *work)
656 {
657      int n_after = p->n_after[cur_dim], n = p->n[cur_dim];
658 
659      if (cur_dim == p->rank - 2) {
660 	  /* just do the last dimension directly: */
661 	  if (p->is_in_place)
662 	       fftw(p->plans[p->rank - 1], n,
663 		    in, istride, n_after * istride,
664 		    work, 1, 0);
665 	  else
666 	       fftw(p->plans[p->rank - 1], n,
667 		    in, istride, n_after * istride,
668 		    out, ostride, n_after * ostride);
669      } else {			/* we have at least two dimensions to go */
670 	  int i;
671 
672 	  /*
673 	   * process the subsequent dimensions recursively, in hyperslabs,
674 	   * to get maximum locality:
675 	   */
676 	  for (i = 0; i < n; ++i)
677 	       fftwnd_aux(p, cur_dim + 1,
678 			  in + i * n_after * istride, istride,
679 			  out + i * n_after * ostride, ostride, work);
680      }
681 
682      /* do the current dimension (in-place): */
683      if (p->nbuffers == 0) {
684 	  fftw(p->plans[cur_dim], n_after,
685 	       out, n_after * ostride, ostride,
686 	       work, 1, 0);
687      } else			/* using contiguous copy buffers: */
688 	  fftw_buffered(p->plans[cur_dim], n_after,
689 			out, n_after * ostride, ostride,
690 			work, p->nbuffers, work + n);
691 }
692 
693 /*
694  * alternate version of fftwnd_aux -- this version pushes the howmany
695  * loop down to the leaves of the computation, for greater locality in
696  * cases where dist < stride
697  */
fftwnd_aux_howmany(fftwnd_plan p,int cur_dim,int howmany,fftw_complex * in,int istride,int idist,fftw_complex * out,int ostride,int odist,fftw_complex * work)698 void fftwnd_aux_howmany(fftwnd_plan p, int cur_dim,
699 			int howmany,
700 			fftw_complex *in, int istride, int idist,
701 			fftw_complex *out, int ostride, int odist,
702 			fftw_complex *work)
703 {
704      int n_after = p->n_after[cur_dim], n = p->n[cur_dim];
705      int k;
706 
707      if (cur_dim == p->rank - 2) {
708 	  /* just do the last dimension directly: */
709 	  if (p->is_in_place)
710 	       for (k = 0; k < n; ++k)
711 		    fftw(p->plans[p->rank - 1], howmany,
712 			 in + k * n_after * istride, istride, idist,
713 			 work, 1, 0);
714 	  else
715 	       for (k = 0; k < n; ++k)
716 		    fftw(p->plans[p->rank - 1], howmany,
717 			 in + k * n_after * istride, istride, idist,
718 			 out + k * n_after * ostride, ostride, odist);
719      } else {			/* we have at least two dimensions to go */
720 	  int i;
721 
722 	  /*
723 	   * process the subsequent dimensions recursively, in
724 	   * hyperslabs, to get maximum locality:
725 	   */
726 	  for (i = 0; i < n; ++i)
727 	       fftwnd_aux_howmany(p, cur_dim + 1, howmany,
728 			      in + i * n_after * istride, istride, idist,
729 				  out + i * n_after * ostride, ostride, odist,
730 				  work);
731      }
732 
733      /* do the current dimension (in-place): */
734      if (p->nbuffers == 0)
735 	  for (k = 0; k < n_after; ++k)
736 	       fftw(p->plans[cur_dim], howmany,
737 		    out + k * ostride, n_after * ostride, odist,
738 		    work, 1, 0);
739      else			/* using contiguous copy buffers: */
740 	  for (k = 0; k < n_after; ++k)
741 	       fftw_buffered(p->plans[cur_dim], howmany,
742 			     out + k * ostride, n_after * ostride, odist,
743 			     work, p->nbuffers, work + n);
744 }
745 
fftwnd(fftwnd_plan p,int howmany,fftw_complex * in,int istride,int idist,fftw_complex * out,int ostride,int odist)746 void fftwnd(fftwnd_plan p, int howmany,
747 	    fftw_complex *in, int istride, int idist,
748 	    fftw_complex *out, int ostride, int odist)
749 {
750      fftw_complex *work;
751 
752 #ifdef FFTW_DEBUG
753      if (p->rank > 0 && (p->plans[0]->flags & FFTW_THREADSAFE)
754 	 && p->nwork && p->work)
755 	  fftw_die("bug with FFTW_THREADSAFE flag\n");
756 #endif
757 
758      if (p->nwork && !p->work)
759 	  work = (fftw_complex *) fftw_malloc(p->nwork * sizeof(fftw_complex));
760      else
761 	  work = p->work;
762 
763      switch (p->rank) {
764 	 case 0:
765 	      break;
766 	 case 1:
767 	      if (p->is_in_place)	/* fft is in-place */
768 		   fftw(p->plans[0], howmany, in, istride, idist,
769 			work, 1, 0);
770 	      else
771 		   fftw(p->plans[0], howmany, in, istride, idist,
772 			out, ostride, odist);
773 	      break;
774 	 default:		/* rank >= 2 */
775 	      {
776 		   if (p->is_in_place) {
777 			out = in;
778 			ostride = istride;
779 			odist = idist;
780 		   }
781 		   if (howmany > 1 && odist < ostride)
782 			fftwnd_aux_howmany(p, 0, howmany,
783 					   in, istride, idist,
784 					   out, ostride, odist,
785 					   work);
786 		   else {
787 			int i;
788 
789 			for (i = 0; i < howmany; ++i)
790 			     fftwnd_aux(p, 0,
791 					in + i * idist, istride,
792 					out + i * odist, ostride,
793 					work);
794 		   }
795 	      }
796      }
797 
798      if (p->nwork && !p->work)
799 	  fftw_free(work);
800 
801 }
802 
fftwnd_one(fftwnd_plan p,fftw_complex * in,fftw_complex * out)803 void fftwnd_one(fftwnd_plan p, fftw_complex *in, fftw_complex *out)
804 {
805      fftwnd(p, 1, in, 1, 1, out, 1, 1);
806 }
807