1 /*
2 * Copyright (c) 1997-1999, 2003 Massachusetts Institute of Technology
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 *
18 */
19
20 /* $Id: fftwnd.c,v 1.44 2003/03/16 23:43:46 stevenj Exp $ */
21
22 #include "fftw-int.h"
23
24 /* the number of buffers to use for buffered transforms: */
25 #define FFTWND_NBUFFERS 8
26
27 /* the default number of buffers to use: */
28 #define FFTWND_DEFAULT_NBUFFERS 0
29
30 /* the number of "padding" elements between consecutive buffer lines */
31 #define FFTWND_BUFFER_PADDING 8
32
33 static void destroy_plan_array(int rank, fftw_plan *plans);
34
init_test_array(fftw_complex * arr,int stride,int n)35 static void init_test_array(fftw_complex *arr, int stride, int n)
36 {
37 int j;
38
39 for (j = 0; j < n; ++j) {
40 c_re(arr[stride * j]) = 0.0;
41 c_im(arr[stride * j]) = 0.0;
42 }
43 }
44
45 /*
46 * Same as fftw_measure_runtime, except for fftwnd plan.
47 */
fftwnd_measure_runtime(fftwnd_plan plan,fftw_complex * in,int istride,fftw_complex * out,int ostride)48 double fftwnd_measure_runtime(fftwnd_plan plan,
49 fftw_complex *in, int istride,
50 fftw_complex *out, int ostride)
51 {
52 fftw_time begin, end, start;
53 double t, tmax, tmin;
54 int i, iter;
55 int n;
56 int repeat;
57
58 if (plan->rank == 0)
59 return 0.0;
60
61 n = 1;
62 for (i = 0; i < plan->rank; ++i)
63 n *= plan->n[i];
64
65 iter = 1;
66
67 for (;;) {
68 tmin = 1.0E10;
69 tmax = -1.0E10;
70 init_test_array(in, istride, n);
71
72 start = fftw_get_time();
73 /* repeat the measurement FFTW_TIME_REPEAT times */
74 for (repeat = 0; repeat < FFTW_TIME_REPEAT; ++repeat) {
75 begin = fftw_get_time();
76 for (i = 0; i < iter; ++i) {
77 fftwnd(plan, 1, in, istride, 0, out, ostride, 0);
78 }
79 end = fftw_get_time();
80
81 t = fftw_time_to_sec(fftw_time_diff(end, begin));
82 if (t < tmin)
83 tmin = t;
84 if (t > tmax)
85 tmax = t;
86
87 /* do not run for too long */
88 t = fftw_time_to_sec(fftw_time_diff(end, start));
89 if (t > FFTW_TIME_LIMIT)
90 break;
91 }
92
93 if (tmin >= FFTW_TIME_MIN)
94 break;
95
96 iter *= 2;
97 }
98
99 tmin /= (double) iter;
100 tmax /= (double) iter;
101
102 return tmin;
103 }
104
105 /********************** Initializing the FFTWND Plan ***********************/
106
107 /* Initialize everything except for the 1D plans and the work array: */
fftwnd_create_plan_aux(int rank,const int * n,fftw_direction dir,int flags)108 fftwnd_plan fftwnd_create_plan_aux(int rank, const int *n,
109 fftw_direction dir, int flags)
110 {
111 int i;
112 fftwnd_plan p;
113
114 if (rank < 0)
115 return 0;
116
117 for (i = 0; i < rank; ++i)
118 if (n[i] <= 0)
119 return 0;
120
121 p = (fftwnd_plan) fftw_malloc(sizeof(fftwnd_data));
122 p->n = 0;
123 p->n_before = 0;
124 p->n_after = 0;
125 p->plans = 0;
126 p->work = 0;
127 p->dir = dir;
128
129 p->rank = rank;
130 p->is_in_place = flags & FFTW_IN_PLACE;
131
132 p->nwork = 0;
133 p->nbuffers = 0;
134
135 if (rank == 0)
136 return 0;
137
138 p->n = (int *) fftw_malloc(sizeof(int) * rank);
139 p->n_before = (int *) fftw_malloc(sizeof(int) * rank);
140 p->n_after = (int *) fftw_malloc(sizeof(int) * rank);
141 p->n_before[0] = 1;
142 p->n_after[rank - 1] = 1;
143
144 for (i = 0; i < rank; ++i) {
145 p->n[i] = n[i];
146
147 if (i) {
148 p->n_before[i] = p->n_before[i - 1] * n[i - 1];
149 p->n_after[rank - 1 - i] = p->n_after[rank - i] * n[rank - i];
150 }
151 }
152
153 return p;
154 }
155
156 /* create an empty new array of rank 1d plans */
fftwnd_new_plan_array(int rank)157 fftw_plan *fftwnd_new_plan_array(int rank)
158 {
159 fftw_plan *plans;
160 int i;
161
162 plans = (fftw_plan *) fftw_malloc(rank * sizeof(fftw_plan));
163 if (!plans)
164 return 0;
165 for (i = 0; i < rank; ++i)
166 plans[i] = 0;
167 return plans;
168 }
169
170 /*
171 * create an array of plans using the ordinary 1d fftw_create_plan,
172 * which allocates its own array and creates plans optimized for
173 * contiguous data.
174 */
fftwnd_create_plans_generic(fftw_plan * plans,int rank,const int * n,fftw_direction dir,int flags)175 fftw_plan *fftwnd_create_plans_generic(fftw_plan *plans,
176 int rank, const int *n,
177 fftw_direction dir, int flags)
178 {
179 if (rank <= 0)
180 return 0;
181
182 if (plans) {
183 int i, j;
184 int cur_flags;
185
186 for (i = 0; i < rank; ++i) {
187 if (i < rank - 1 || (flags & FFTW_IN_PLACE)) {
188 /*
189 * fft's except the last dimension are always in-place
190 */
191 cur_flags = flags | FFTW_IN_PLACE;
192 for (j = i - 1; j >= 0 && n[i] != n[j]; --j);
193 } else {
194 cur_flags = flags;
195 /*
196 * we must create a separate plan for the last
197 * dimension
198 */
199 j = -1;
200 }
201
202 if (j >= 0) {
203 /*
204 * If a plan already exists for this size
205 * array, reuse it:
206 */
207 plans[i] = plans[j];
208 } else {
209 /* generate a new plan: */
210 plans[i] = fftw_create_plan(n[i], dir, cur_flags);
211 if (!plans[i]) {
212 destroy_plan_array(rank, plans);
213 return 0;
214 }
215 }
216 }
217 }
218 return plans;
219 }
220
get_maxdim(int rank,const int * n,int flags)221 static int get_maxdim(int rank, const int *n, int flags)
222 {
223 int i;
224 int maxdim = 0;
225
226 for (i = 0; i < rank - 1; ++i)
227 if (n[i] > maxdim)
228 maxdim = n[i];
229 if (rank > 0 && flags & FFTW_IN_PLACE && n[rank - 1] > maxdim)
230 maxdim = n[rank - 1];
231
232 return maxdim;
233 }
234
235 /* compute number of elements required for work array (has to
236 be big enough to hold ncopies of the largest dimension in
237 n that will need an in-place transform. */
fftwnd_work_size(int rank,const int * n,int flags,int ncopies)238 int fftwnd_work_size(int rank, const int *n, int flags, int ncopies)
239 {
240 return (ncopies * get_maxdim(rank, n, flags)
241 + (ncopies - 1) * FFTWND_BUFFER_PADDING);
242 }
243
244 /*
245 * create plans using the fftw_create_plan_specific planner, which
246 * allows us to create plans for each dimension that are specialized
247 * for the strides that we are going to use.
248 */
fftwnd_create_plans_specific(fftw_plan * plans,int rank,const int * n,const int * n_after,fftw_direction dir,int flags,fftw_complex * in,int istride,fftw_complex * out,int ostride)249 fftw_plan *fftwnd_create_plans_specific(fftw_plan *plans,
250 int rank, const int *n,
251 const int *n_after,
252 fftw_direction dir, int flags,
253 fftw_complex *in, int istride,
254 fftw_complex *out, int ostride)
255 {
256 if (rank <= 0)
257 return 0;
258
259 if (plans) {
260 int i, stride, cur_flags;
261 fftw_complex *work = 0;
262 int nwork;
263
264 nwork = fftwnd_work_size(rank, n, flags, 1);
265 if (nwork)
266 work = (fftw_complex*)fftw_malloc(nwork * sizeof(fftw_complex));
267
268 for (i = 0; i < rank; ++i) {
269 /* fft's except the last dimension are always in-place */
270 if (i < rank - 1)
271 cur_flags = flags | FFTW_IN_PLACE;
272 else
273 cur_flags = flags;
274
275 /* stride for transforming ith dimension */
276 stride = n_after[i];
277
278 if (cur_flags & FFTW_IN_PLACE)
279 plans[i] = fftw_create_plan_specific(n[i], dir, cur_flags,
280 in, istride * stride,
281 work, 1);
282 else
283 plans[i] = fftw_create_plan_specific(n[i], dir, cur_flags,
284 in, istride * stride,
285 out, ostride * stride);
286 if (!plans[i]) {
287 destroy_plan_array(rank, plans);
288 fftw_free(work);
289 return 0;
290 }
291 }
292
293 if (work)
294 fftw_free(work);
295 }
296 return plans;
297 }
298
299 /*
300 * Create an fftwnd_plan specialized for specific arrays. (These
301 * arrays are ignored, however, if they are NULL or if the flags do
302 * not include FFTW_MEASURE.) The main advantage of being provided
303 * arrays like this is that we can do runtime timing measurements of
304 * our options, without worrying about allocating excessive scratch
305 * space.
306 */
fftwnd_create_plan_specific(int rank,const int * n,fftw_direction dir,int flags,fftw_complex * in,int istride,fftw_complex * out,int ostride)307 fftwnd_plan fftwnd_create_plan_specific(int rank, const int *n,
308 fftw_direction dir, int flags,
309 fftw_complex *in, int istride,
310 fftw_complex *out, int ostride)
311 {
312 fftwnd_plan p;
313
314 if (!(p = fftwnd_create_plan_aux(rank, n, dir, flags)))
315 return 0;
316
317 if (!(flags & FFTW_MEASURE) || in == 0
318 || (!p->is_in_place && out == 0)) {
319
320 /**** use default plan ****/
321
322 p->plans = fftwnd_create_plans_generic(fftwnd_new_plan_array(rank),
323 rank, n, dir, flags);
324 if (!p->plans) {
325 fftwnd_destroy_plan(p);
326 return 0;
327 }
328 if (flags & FFTWND_FORCE_BUFFERED)
329 p->nbuffers = FFTWND_NBUFFERS;
330 else
331 p->nbuffers = FFTWND_DEFAULT_NBUFFERS;
332
333 p->nwork = fftwnd_work_size(rank, n, flags, p->nbuffers + 1);
334 if (p->nwork && !(flags & FFTW_THREADSAFE)) {
335 p->work = (fftw_complex*) fftw_malloc(p->nwork
336 * sizeof(fftw_complex));
337 if (!p->work) {
338 fftwnd_destroy_plan(p);
339 return 0;
340 }
341 }
342 } else {
343 /**** use runtime measurements to pick plan ****/
344
345 fftw_plan *plans_buf, *plans_nobuf;
346 double t_buf, t_nobuf;
347
348 p->nwork = fftwnd_work_size(rank, n, flags, FFTWND_NBUFFERS + 1);
349 if (p->nwork && !(flags & FFTW_THREADSAFE)) {
350 p->work = (fftw_complex*) fftw_malloc(p->nwork
351 * sizeof(fftw_complex));
352 if (!p->work) {
353 fftwnd_destroy_plan(p);
354 return 0;
355 }
356 }
357 else
358 p->work = (fftw_complex*) NULL;
359
360 /* two possible sets of 1D plans: */
361 plans_buf = fftwnd_create_plans_generic(fftwnd_new_plan_array(rank),
362 rank, n, dir, flags);
363 plans_nobuf =
364 fftwnd_create_plans_specific(fftwnd_new_plan_array(rank),
365 rank, n, p->n_after, dir,
366 flags, in, istride,
367 out, ostride);
368 if (!plans_buf || !plans_nobuf) {
369 destroy_plan_array(rank, plans_nobuf);
370 destroy_plan_array(rank, plans_buf);
371 fftwnd_destroy_plan(p);
372 return 0;
373 }
374 /* time the two possible plans */
375 p->plans = plans_nobuf;
376 p->nbuffers = 0;
377 p->nwork = fftwnd_work_size(rank, n, flags, p->nbuffers + 1);
378 t_nobuf = fftwnd_measure_runtime(p, in, istride, out, ostride);
379 p->plans = plans_buf;
380 p->nbuffers = FFTWND_NBUFFERS;
381 p->nwork = fftwnd_work_size(rank, n, flags, p->nbuffers + 1);
382 t_buf = fftwnd_measure_runtime(p, in, istride, out, ostride);
383
384 /* pick the better one: */
385 if (t_nobuf < t_buf) { /* use unbuffered transform */
386 p->plans = plans_nobuf;
387 p->nbuffers = 0;
388
389 /* work array is unnecessarily large */
390 if (p->work)
391 fftw_free(p->work);
392 p->work = 0;
393
394 destroy_plan_array(rank, plans_buf);
395
396 /* allocate a work array of the correct size: */
397 p->nwork = fftwnd_work_size(rank, n, flags, p->nbuffers + 1);
398 if (p->nwork && !(flags & FFTW_THREADSAFE)) {
399 p->work = (fftw_complex*) fftw_malloc(p->nwork
400 * sizeof(fftw_complex));
401 if (!p->work) {
402 fftwnd_destroy_plan(p);
403 return 0;
404 }
405 }
406 } else { /* use buffered transform */
407 destroy_plan_array(rank, plans_nobuf);
408 }
409 }
410
411 return p;
412 }
413
fftw2d_create_plan_specific(int nx,int ny,fftw_direction dir,int flags,fftw_complex * in,int istride,fftw_complex * out,int ostride)414 fftwnd_plan fftw2d_create_plan_specific(int nx, int ny,
415 fftw_direction dir, int flags,
416 fftw_complex *in, int istride,
417 fftw_complex *out, int ostride)
418 {
419 int n[2];
420
421 n[0] = nx;
422 n[1] = ny;
423
424 return fftwnd_create_plan_specific(2, n, dir, flags,
425 in, istride, out, ostride);
426 }
427
fftw3d_create_plan_specific(int nx,int ny,int nz,fftw_direction dir,int flags,fftw_complex * in,int istride,fftw_complex * out,int ostride)428 fftwnd_plan fftw3d_create_plan_specific(int nx, int ny, int nz,
429 fftw_direction dir, int flags,
430 fftw_complex *in, int istride,
431 fftw_complex *out, int ostride)
432 {
433 int n[3];
434
435 n[0] = nx;
436 n[1] = ny;
437 n[2] = nz;
438
439 return fftwnd_create_plan_specific(3, n, dir, flags,
440 in, istride, out, ostride);
441 }
442
443 /* Create a generic fftwnd plan: */
444
fftwnd_create_plan(int rank,const int * n,fftw_direction dir,int flags)445 fftwnd_plan fftwnd_create_plan(int rank, const int *n,
446 fftw_direction dir, int flags)
447 {
448 return fftwnd_create_plan_specific(rank, n, dir, flags, 0, 1, 0, 1);
449 }
450
fftw2d_create_plan(int nx,int ny,fftw_direction dir,int flags)451 fftwnd_plan fftw2d_create_plan(int nx, int ny,
452 fftw_direction dir, int flags)
453 {
454 return fftw2d_create_plan_specific(nx, ny, dir, flags, 0, 1, 0, 1);
455 }
456
fftw3d_create_plan(int nx,int ny,int nz,fftw_direction dir,int flags)457 fftwnd_plan fftw3d_create_plan(int nx, int ny, int nz,
458 fftw_direction dir, int flags)
459 {
460 return fftw3d_create_plan_specific(nx, ny, nz, dir, flags, 0, 1, 0, 1);
461 }
462
463 /************************ Freeing the FFTWND Plan ************************/
464
destroy_plan_array(int rank,fftw_plan * plans)465 static void destroy_plan_array(int rank, fftw_plan *plans)
466 {
467 if (plans) {
468 int i, j;
469
470 for (i = 0; i < rank; ++i) {
471 for (j = i - 1;
472 j >= 0 && plans[i] != plans[j];
473 --j);
474 if (j < 0 && plans[i])
475 fftw_destroy_plan(plans[i]);
476 }
477 fftw_free(plans);
478 }
479 }
480
fftwnd_destroy_plan(fftwnd_plan plan)481 void fftwnd_destroy_plan(fftwnd_plan plan)
482 {
483 if (plan) {
484 destroy_plan_array(plan->rank, plan->plans);
485
486 if (plan->n)
487 fftw_free(plan->n);
488
489 if (plan->n_before)
490 fftw_free(plan->n_before);
491
492 if (plan->n_after)
493 fftw_free(plan->n_after);
494
495 if (plan->work)
496 fftw_free(plan->work);
497
498 fftw_free(plan);
499 }
500 }
501
502 /************************ Printing the FFTWND Plan ************************/
503
fftwnd_fprint_plan(FILE * f,fftwnd_plan plan)504 void fftwnd_fprint_plan(FILE *f, fftwnd_plan plan)
505 {
506 if (plan) {
507 int i, j;
508
509 if (plan->rank == 0) {
510 fprintf(f, "plan for rank 0 (null) transform.\n");
511 return;
512 }
513 fprintf(f, "plan for ");
514 for (i = 0; i < plan->rank; ++i)
515 fprintf(f, "%s%d", i ? "x" : "", plan->n[i]);
516 fprintf(f, " transform:\n");
517
518 if (plan->nbuffers > 0)
519 fprintf(f, " -- using buffered transforms (%d buffers)\n",
520 plan->nbuffers);
521 else
522 fprintf(f, " -- using unbuffered transform\n");
523
524 for (i = 0; i < plan->rank; ++i) {
525 fprintf(f, "* dimension %d (size %d) ", i, plan->n[i]);
526
527 for (j = i - 1; j >= 0; --j)
528 if (plan->plans[j] == plan->plans[i])
529 break;
530
531 if (j < 0)
532 fftw_fprint_plan(f, plan->plans[i]);
533 else
534 fprintf(f, "plan is same as dimension %d plan.\n", j);
535 }
536 }
537 }
538
fftwnd_print_plan(fftwnd_plan plan)539 void fftwnd_print_plan(fftwnd_plan plan)
540 {
541 fftwnd_fprint_plan(stdout, plan);
542 }
543
544 /********************* Buffered FFTW (in-place) *********************/
545
fftw_buffered(fftw_plan p,int howmany,fftw_complex * in,int istride,int idist,fftw_complex * work,int nbuffers,fftw_complex * buffers)546 void fftw_buffered(fftw_plan p, int howmany,
547 fftw_complex *in, int istride, int idist,
548 fftw_complex *work,
549 int nbuffers, fftw_complex *buffers)
550 {
551 int i = 0, n, nb;
552
553 n = p->n;
554 nb = n + FFTWND_BUFFER_PADDING;
555
556 do {
557 for (; i <= howmany - nbuffers; i += nbuffers) {
558 fftw_complex *cur_in = in + i * idist;
559 int j, buf;
560
561 /*
562 * First, copy nbuffers strided arrays to the
563 * contiguous buffer arrays (reading consecutive
564 * locations, assuming that idist is 1):
565 */
566 for (j = 0; j < n; ++j) {
567 fftw_complex *cur_in2 = cur_in + j * istride;
568 fftw_complex *cur_buffers = buffers + j;
569
570 for (buf = 0; buf <= nbuffers - 4; buf += 4) {
571 fftw_real r0, i0, r1, i1, r2, i2, r3, i3;
572 r0 = c_re(cur_in2[0]);
573 i0 = c_im(cur_in2[0]);
574 r1 = c_re(cur_in2[idist]);
575 i1 = c_im(cur_in2[idist]);
576 r2 = c_re(cur_in2[2 * idist]);
577 i2 = c_im(cur_in2[2 * idist]);
578 r3 = c_re(cur_in2[3 * idist]);
579 i3 = c_im(cur_in2[3 * idist]);
580 c_re(cur_buffers[0]) = r0;
581 c_im(cur_buffers[0]) = i0;
582 c_re(cur_buffers[nb]) = r1;
583 c_im(cur_buffers[nb]) = i1;
584 c_re(cur_buffers[2 * nb]) = r2;
585 c_im(cur_buffers[2 * nb]) = i2;
586 c_re(cur_buffers[3 * nb]) = r3;
587 c_im(cur_buffers[3 * nb]) = i3;
588 cur_buffers += 4 * nb;
589 cur_in2 += 4 * idist;
590 }
591 for (; buf < nbuffers; ++buf) {
592 *cur_buffers = *cur_in2;
593 cur_buffers += nb;
594 cur_in2 += idist;
595 }
596 }
597
598 /*
599 * Now, compute the FFTs in the buffers (in-place
600 * using work):
601 */
602 fftw(p, nbuffers, buffers, 1, nb, work, 1, 0);
603
604 /*
605 * Finally, copy the results back from the contiguous
606 * buffers to the strided arrays (writing consecutive
607 * locations):
608 */
609 for (j = 0; j < n; ++j) {
610 fftw_complex *cur_in2 = cur_in + j * istride;
611 fftw_complex *cur_buffers = buffers + j;
612
613 for (buf = 0; buf <= nbuffers - 4; buf += 4) {
614 fftw_real r0, i0, r1, i1, r2, i2, r3, i3;
615 r0 = c_re(cur_buffers[0]);
616 i0 = c_im(cur_buffers[0]);
617 r1 = c_re(cur_buffers[nb]);
618 i1 = c_im(cur_buffers[nb]);
619 r2 = c_re(cur_buffers[2 * nb]);
620 i2 = c_im(cur_buffers[2 * nb]);
621 r3 = c_re(cur_buffers[3 * nb]);
622 i3 = c_im(cur_buffers[3 * nb]);
623 c_re(cur_in2[0]) = r0;
624 c_im(cur_in2[0]) = i0;
625 c_re(cur_in2[idist]) = r1;
626 c_im(cur_in2[idist]) = i1;
627 c_re(cur_in2[2 * idist]) = r2;
628 c_im(cur_in2[2 * idist]) = i2;
629 c_re(cur_in2[3 * idist]) = r3;
630 c_im(cur_in2[3 * idist]) = i3;
631 cur_buffers += 4 * nb;
632 cur_in2 += 4 * idist;
633 }
634 for (; buf < nbuffers; ++buf) {
635 *cur_in2 = *cur_buffers;
636 cur_buffers += nb;
637 cur_in2 += idist;
638 }
639 }
640 }
641
642 /*
643 * we skip howmany % nbuffers ffts at the end of the loop,
644 * so we have to go back and do them:
645 */
646 nbuffers = howmany - i;
647 } while (i < howmany);
648 }
649
650 /********************* Computing the N-Dimensional FFT *********************/
651
fftwnd_aux(fftwnd_plan p,int cur_dim,fftw_complex * in,int istride,fftw_complex * out,int ostride,fftw_complex * work)652 void fftwnd_aux(fftwnd_plan p, int cur_dim,
653 fftw_complex *in, int istride,
654 fftw_complex *out, int ostride,
655 fftw_complex *work)
656 {
657 int n_after = p->n_after[cur_dim], n = p->n[cur_dim];
658
659 if (cur_dim == p->rank - 2) {
660 /* just do the last dimension directly: */
661 if (p->is_in_place)
662 fftw(p->plans[p->rank - 1], n,
663 in, istride, n_after * istride,
664 work, 1, 0);
665 else
666 fftw(p->plans[p->rank - 1], n,
667 in, istride, n_after * istride,
668 out, ostride, n_after * ostride);
669 } else { /* we have at least two dimensions to go */
670 int i;
671
672 /*
673 * process the subsequent dimensions recursively, in hyperslabs,
674 * to get maximum locality:
675 */
676 for (i = 0; i < n; ++i)
677 fftwnd_aux(p, cur_dim + 1,
678 in + i * n_after * istride, istride,
679 out + i * n_after * ostride, ostride, work);
680 }
681
682 /* do the current dimension (in-place): */
683 if (p->nbuffers == 0) {
684 fftw(p->plans[cur_dim], n_after,
685 out, n_after * ostride, ostride,
686 work, 1, 0);
687 } else /* using contiguous copy buffers: */
688 fftw_buffered(p->plans[cur_dim], n_after,
689 out, n_after * ostride, ostride,
690 work, p->nbuffers, work + n);
691 }
692
693 /*
694 * alternate version of fftwnd_aux -- this version pushes the howmany
695 * loop down to the leaves of the computation, for greater locality in
696 * cases where dist < stride
697 */
fftwnd_aux_howmany(fftwnd_plan p,int cur_dim,int howmany,fftw_complex * in,int istride,int idist,fftw_complex * out,int ostride,int odist,fftw_complex * work)698 void fftwnd_aux_howmany(fftwnd_plan p, int cur_dim,
699 int howmany,
700 fftw_complex *in, int istride, int idist,
701 fftw_complex *out, int ostride, int odist,
702 fftw_complex *work)
703 {
704 int n_after = p->n_after[cur_dim], n = p->n[cur_dim];
705 int k;
706
707 if (cur_dim == p->rank - 2) {
708 /* just do the last dimension directly: */
709 if (p->is_in_place)
710 for (k = 0; k < n; ++k)
711 fftw(p->plans[p->rank - 1], howmany,
712 in + k * n_after * istride, istride, idist,
713 work, 1, 0);
714 else
715 for (k = 0; k < n; ++k)
716 fftw(p->plans[p->rank - 1], howmany,
717 in + k * n_after * istride, istride, idist,
718 out + k * n_after * ostride, ostride, odist);
719 } else { /* we have at least two dimensions to go */
720 int i;
721
722 /*
723 * process the subsequent dimensions recursively, in
724 * hyperslabs, to get maximum locality:
725 */
726 for (i = 0; i < n; ++i)
727 fftwnd_aux_howmany(p, cur_dim + 1, howmany,
728 in + i * n_after * istride, istride, idist,
729 out + i * n_after * ostride, ostride, odist,
730 work);
731 }
732
733 /* do the current dimension (in-place): */
734 if (p->nbuffers == 0)
735 for (k = 0; k < n_after; ++k)
736 fftw(p->plans[cur_dim], howmany,
737 out + k * ostride, n_after * ostride, odist,
738 work, 1, 0);
739 else /* using contiguous copy buffers: */
740 for (k = 0; k < n_after; ++k)
741 fftw_buffered(p->plans[cur_dim], howmany,
742 out + k * ostride, n_after * ostride, odist,
743 work, p->nbuffers, work + n);
744 }
745
fftwnd(fftwnd_plan p,int howmany,fftw_complex * in,int istride,int idist,fftw_complex * out,int ostride,int odist)746 void fftwnd(fftwnd_plan p, int howmany,
747 fftw_complex *in, int istride, int idist,
748 fftw_complex *out, int ostride, int odist)
749 {
750 fftw_complex *work;
751
752 #ifdef FFTW_DEBUG
753 if (p->rank > 0 && (p->plans[0]->flags & FFTW_THREADSAFE)
754 && p->nwork && p->work)
755 fftw_die("bug with FFTW_THREADSAFE flag\n");
756 #endif
757
758 if (p->nwork && !p->work)
759 work = (fftw_complex *) fftw_malloc(p->nwork * sizeof(fftw_complex));
760 else
761 work = p->work;
762
763 switch (p->rank) {
764 case 0:
765 break;
766 case 1:
767 if (p->is_in_place) /* fft is in-place */
768 fftw(p->plans[0], howmany, in, istride, idist,
769 work, 1, 0);
770 else
771 fftw(p->plans[0], howmany, in, istride, idist,
772 out, ostride, odist);
773 break;
774 default: /* rank >= 2 */
775 {
776 if (p->is_in_place) {
777 out = in;
778 ostride = istride;
779 odist = idist;
780 }
781 if (howmany > 1 && odist < ostride)
782 fftwnd_aux_howmany(p, 0, howmany,
783 in, istride, idist,
784 out, ostride, odist,
785 work);
786 else {
787 int i;
788
789 for (i = 0; i < howmany; ++i)
790 fftwnd_aux(p, 0,
791 in + i * idist, istride,
792 out + i * odist, ostride,
793 work);
794 }
795 }
796 }
797
798 if (p->nwork && !p->work)
799 fftw_free(work);
800
801 }
802
fftwnd_one(fftwnd_plan p,fftw_complex * in,fftw_complex * out)803 void fftwnd_one(fftwnd_plan p, fftw_complex *in, fftw_complex *out)
804 {
805 fftwnd(p, 1, in, 1, 1, out, 1, 1);
806 }
807