1 /*
2  * Copyright (c) 1997-1999, 2003 Massachusetts Institute of Technology
3  *
4  * This program is free software; you can redistribute it and/or modify
5  * it under the terms of the GNU General Public License as published by
6  * the Free Software Foundation; either version 2 of the License, or
7  * (at your option) any later version.
8  *
9  * This program is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12  * GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with this program; if not, write to the Free Software
16  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
17  *
18  */
19 
20 /*
21  * rexec.c -- execute the fft
22  */
23 
24 /* $Id: rexec.c,v 1.28 2003/03/16 23:43:46 stevenj Exp $ */
25 #include <stdio.h>
26 #include <stdlib.h>
27 
28 #include "fftw-int.h"
29 #include "rfftw.h"
30 
rfftw_strided_copy(int n,fftw_real * in,int ostride,fftw_real * out)31 void rfftw_strided_copy(int n, fftw_real *in, int ostride,
32 			fftw_real *out)
33 {
34      int i;
35      fftw_real r0, r1, r2, r3;
36 
37      i = 0;
38      for (; i < (n & 3); ++i) {
39 	  out[i * ostride] = in[i];
40      }
41      for (; i < n; i += 4) {
42 	  r0 = in[i];
43 	  r1 = in[i + 1];
44 	  r2 = in[i + 2];
45 	  r3 = in[i + 3];
46 	  out[i * ostride] = r0;
47 	  out[(i + 1) * ostride] = r1;
48 	  out[(i + 2) * ostride] = r2;
49 	  out[(i + 3) * ostride] = r3;
50      }
51 }
52 
rexecutor_many(int n,fftw_real * in,fftw_real * out,fftw_plan_node * p,int istride,int ostride,int howmany,int idist,int odist,fftw_recurse_kind recurse_kind)53 static void rexecutor_many(int n, fftw_real *in,
54 			   fftw_real *out,
55 			   fftw_plan_node *p,
56 			   int istride,
57 			   int ostride,
58 			   int howmany, int idist, int odist,
59 			   fftw_recurse_kind recurse_kind)
60 {
61      int s;
62 
63      switch (p->type) {
64 	 case FFTW_REAL2HC:
65 	      {
66 		   fftw_real2hc_codelet *codelet = p->nodeu.real2hc.codelet;
67 
68 		   HACK_ALIGN_STACK_ODD;
69 		   for (s = 0; s < howmany; ++s)
70 			codelet(in + s * idist, out + s * odist,
71 				out + n * ostride + s * odist,
72 				istride, ostride, -ostride);
73 		   break;
74 	      }
75 
76 	 case FFTW_HC2REAL:
77 	      {
78 		   fftw_hc2real_codelet *codelet = p->nodeu.hc2real.codelet;
79 
80 		   HACK_ALIGN_STACK_ODD;
81 		   for (s = 0; s < howmany; ++s)
82 			codelet(in + s * idist, in + n * istride + s * idist,
83 				out + s * odist,
84 				istride, -istride, ostride);
85 		   break;
86 	      }
87 
88 	 default:
89 	      for (s = 0; s < howmany; ++s)
90 		   rfftw_executor_simple(n, in + s * idist,
91 					 out + s * odist,
92 					 p, istride, ostride,
93 					 recurse_kind);
94      }
95 }
96 
97 #ifdef FFTW_ENABLE_VECTOR_RECURSE
98 
99 /* rexecutor_many_vector is like rexecutor_many, but it pushes the
100    howmany loop down to the leaves of the transform: */
rexecutor_many_vector(int n,fftw_real * in,fftw_real * out,fftw_plan_node * p,int istride,int ostride,int howmany,int idist,int odist)101 static void rexecutor_many_vector(int n, fftw_real *in,
102 				  fftw_real *out,
103 				  fftw_plan_node *p,
104 				  int istride,
105 				  int ostride,
106 				  int howmany, int idist, int odist)
107 {
108      switch (p->type) {
109 	 case FFTW_REAL2HC:
110 	      {
111 		   fftw_real2hc_codelet *codelet = p->nodeu.real2hc.codelet;
112 		   int s;
113 
114 		   HACK_ALIGN_STACK_ODD;
115 		   for (s = 0; s < howmany; ++s)
116 			codelet(in + s * idist, out + s * odist,
117 				out + n * ostride + s * odist,
118 				istride, ostride, -ostride);
119 		   break;
120 	      }
121 
122 	 case FFTW_HC2REAL:
123 	      {
124 		   fftw_hc2real_codelet *codelet = p->nodeu.hc2real.codelet;
125 		   int s;
126 
127 		   HACK_ALIGN_STACK_ODD;
128 		   for (s = 0; s < howmany; ++s)
129 			codelet(in + s * idist, in + n * istride + s * idist,
130 				out + s * odist,
131 				istride, -istride, ostride);
132 		   break;
133 	      }
134 
135 	 case FFTW_HC2HC:
136 	      {
137 		   int r = p->nodeu.hc2hc.size;
138 		   int m = n / r;
139 		   int i;
140 		   fftw_hc2hc_codelet *codelet;
141 		   fftw_complex *W;
142 
143 		   switch (p->nodeu.hc2hc.dir) {
144 		       case FFTW_REAL_TO_COMPLEX:
145 			    for (i = 0; i < r; ++i)
146 				 rexecutor_many_vector(m, in + i * istride,
147 						       out + i * (m*ostride),
148 						       p->nodeu.hc2hc.recurse,
149 						       istride * r, ostride,
150 						       howmany, idist, odist);
151 
152 			    W = p->nodeu.hc2hc.tw->twarray;
153 			    codelet = p->nodeu.hc2hc.codelet;
154 			    HACK_ALIGN_STACK_EVEN;
155 			    for (i = 0; i < howmany; ++i)
156 				 codelet(out + i * odist,
157 					 W, m * ostride, m, ostride);
158 			    break;
159 		       case FFTW_COMPLEX_TO_REAL:
160 			    W = p->nodeu.hc2hc.tw->twarray;
161 			    codelet = p->nodeu.hc2hc.codelet;
162 			    HACK_ALIGN_STACK_EVEN;
163 			    for (i = 0; i < howmany; ++i)
164 				 codelet(in + i * idist,
165 					 W, m * istride, m, istride);
166 
167 			    for (i = 0; i < r; ++i)
168 				 rexecutor_many_vector(m, in + i * (m*istride),
169 						       out + i * ostride,
170 						       p->nodeu.hc2hc.recurse,
171 						       istride, ostride * r,
172 						       howmany, idist, odist);
173 			    break;
174 		       default:
175 			    goto bug;
176 		   }
177 
178 		   break;
179 	      }
180 
181 	 case FFTW_RGENERIC:
182 	      {
183 		   int r = p->nodeu.rgeneric.size;
184 		   int m = n / r;
185 		   int i;
186 		   fftw_rgeneric_codelet *codelet = p->nodeu.rgeneric.codelet;
187 		   fftw_complex *W = p->nodeu.rgeneric.tw->twarray;
188 
189 		   switch (p->nodeu.rgeneric.dir) {
190 		       case FFTW_REAL_TO_COMPLEX:
191 			    for (i = 0; i < r; ++i)
192 				 rexecutor_many_vector(m, in + i * istride,
193 						 out + i * (m * ostride),
194 					       p->nodeu.rgeneric.recurse,
195 						   istride * r, ostride,
196 						       howmany, idist, odist);
197 
198 			    for (i = 0; i < howmany; ++i)
199 				 codelet(out + i * odist, W, m, r, n, ostride);
200 			    break;
201 		       case FFTW_COMPLEX_TO_REAL:
202 			    for (i = 0; i < howmany; ++i)
203 				 codelet(in + i * idist, W, m, r, n, istride);
204 
205 			    for (i = 0; i < r; ++i)
206 				 rexecutor_many_vector(m, in + i * m * istride,
207 						       out + i * ostride,
208 					       p->nodeu.rgeneric.recurse,
209 						   istride, ostride * r,
210 						       howmany, idist, odist);
211 			    break;
212 		       default:
213 			    goto bug;
214 		   }
215 
216 		   break;
217 	      }
218 
219 	 default:
220 	    bug:
221 	      fftw_die("BUG in rexecutor: invalid plan\n");
222 	      break;
223      }
224 }
225 
226 #endif /* FFTW_ENABLE_VECTOR_RECURSE */
227 
rfftw_executor_simple(int n,fftw_real * in,fftw_real * out,fftw_plan_node * p,int istride,int ostride,fftw_recurse_kind recurse_kind)228 void rfftw_executor_simple(int n, fftw_real *in,
229 			   fftw_real *out,
230 			   fftw_plan_node *p,
231 			   int istride,
232 			   int ostride,
233 			   fftw_recurse_kind recurse_kind)
234 {
235      switch (p->type) {
236 	 case FFTW_REAL2HC:
237 	      HACK_ALIGN_STACK_ODD;
238 	      (p->nodeu.real2hc.codelet) (in, out, out + n * ostride,
239 					  istride, ostride, -ostride);
240 	      break;
241 
242 	 case FFTW_HC2REAL:
243 	      HACK_ALIGN_STACK_ODD;
244 	      (p->nodeu.hc2real.codelet) (in, in + n * istride, out,
245 					  istride, -istride, ostride);
246 	      break;
247 
248 	 case FFTW_HC2HC:
249 	      {
250 		   int r = p->nodeu.hc2hc.size;
251 		   int m = n / r;
252 		   /*
253 		    * please do resist the temptation of initializing
254 		    * these variables here.  Doing so forces the
255 		    * compiler to keep a live variable across the
256 		    * recursive call.
257 		    */
258 		   fftw_hc2hc_codelet *codelet;
259 		   fftw_complex *W;
260 
261 		   switch (p->nodeu.hc2hc.dir) {
262 		       case FFTW_REAL_TO_COMPLEX:
263 #ifdef FFTW_ENABLE_VECTOR_RECURSE
264 			    if (recurse_kind == FFTW_NORMAL_RECURSE)
265 #endif
266 				 rexecutor_many(m, in, out,
267 						p->nodeu.hc2hc.recurse,
268 						istride * r, ostride,
269 						r, istride, m * ostride,
270 						FFTW_NORMAL_RECURSE);
271 #ifdef FFTW_ENABLE_VECTOR_RECURSE
272 			    else
273 				 rexecutor_many_vector(m, in, out,
274 						p->nodeu.hc2hc.recurse,
275 						istride * r, ostride,
276 						r, istride, m * ostride);
277 #endif
278 
279 			    W = p->nodeu.hc2hc.tw->twarray;
280 			    codelet = p->nodeu.hc2hc.codelet;
281 			    HACK_ALIGN_STACK_EVEN;
282 			    codelet(out, W, m * ostride, m, ostride);
283 			    break;
284 		       case FFTW_COMPLEX_TO_REAL:
285 			    W = p->nodeu.hc2hc.tw->twarray;
286 			    codelet = p->nodeu.hc2hc.codelet;
287 			    HACK_ALIGN_STACK_EVEN;
288 			    codelet(in, W, m * istride, m, istride);
289 
290 #ifdef FFTW_ENABLE_VECTOR_RECURSE
291 			    if (recurse_kind == FFTW_NORMAL_RECURSE)
292 #endif
293 				 rexecutor_many(m, in, out,
294 						p->nodeu.hc2hc.recurse,
295 						istride, ostride * r,
296 						r, m * istride, ostride,
297 						FFTW_NORMAL_RECURSE);
298 #ifdef FFTW_ENABLE_VECTOR_RECURSE
299 			    else
300 				 rexecutor_many_vector(m, in, out,
301 						p->nodeu.hc2hc.recurse,
302 						istride, ostride * r,
303 						r, m * istride, ostride);
304 #endif
305 			    break;
306 		       default:
307 			    goto bug;
308 		   }
309 
310 		   break;
311 	      }
312 
313 	 case FFTW_RGENERIC:
314 	      {
315 		   int r = p->nodeu.rgeneric.size;
316 		   int m = n / r;
317 		   fftw_rgeneric_codelet *codelet = p->nodeu.rgeneric.codelet;
318 		   fftw_complex *W = p->nodeu.rgeneric.tw->twarray;
319 
320 		   switch (p->nodeu.rgeneric.dir) {
321 		       case FFTW_REAL_TO_COMPLEX:
322 #ifdef FFTW_ENABLE_VECTOR_RECURSE
323 			    if (recurse_kind == FFTW_NORMAL_RECURSE)
324 #endif
325 				 rexecutor_many(m, in, out,
326 						p->nodeu.rgeneric.recurse,
327 						istride * r, ostride,
328 						r, istride, m * ostride,
329 						FFTW_NORMAL_RECURSE);
330 #ifdef FFTW_ENABLE_VECTOR_RECURSE
331 			    else
332 				 rexecutor_many_vector(m, in, out,
333 						p->nodeu.rgeneric.recurse,
334 						istride * r, ostride,
335 						r, istride, m * ostride);
336 #endif
337 
338 			    codelet(out, W, m, r, n, ostride);
339 			    break;
340 		       case FFTW_COMPLEX_TO_REAL:
341 			    codelet(in, W, m, r, n, istride);
342 
343 #ifdef FFTW_ENABLE_VECTOR_RECURSE
344 			    if (recurse_kind == FFTW_NORMAL_RECURSE)
345 #endif
346 				 rexecutor_many(m, in, out,
347 						p->nodeu.rgeneric.recurse,
348 						istride, ostride * r,
349 						r, m * istride, ostride,
350 						FFTW_NORMAL_RECURSE);
351 #ifdef FFTW_ENABLE_VECTOR_RECURSE
352 			    else
353 				 rexecutor_many_vector(m, in, out,
354 						p->nodeu.rgeneric.recurse,
355 						istride, ostride * r,
356 						r, m * istride, ostride);
357 #endif
358 			    break;
359 		       default:
360 			    goto bug;
361 		   }
362 
363 		   break;
364 	      }
365 
366 	 default:
367 	    bug:
368 	      fftw_die("BUG in rexecutor: invalid plan\n");
369 	      break;
370      }
371 }
372 
rexecutor_simple_inplace(int n,fftw_real * in,fftw_real * out,fftw_plan_node * p,int istride,fftw_recurse_kind recurse_kind)373 static void rexecutor_simple_inplace(int n, fftw_real *in,
374 				     fftw_real *out,
375 				     fftw_plan_node *p,
376 				     int istride,
377 				     fftw_recurse_kind recurse_kind)
378 {
379      switch (p->type) {
380 	 case FFTW_REAL2HC:
381 	      HACK_ALIGN_STACK_ODD;
382 	      (p->nodeu.real2hc.codelet) (in, in, in + n * istride,
383 					  istride, istride, -istride);
384 	      break;
385 
386 	 case FFTW_HC2REAL:
387 	      HACK_ALIGN_STACK_ODD;
388 	      (p->nodeu.hc2real.codelet) (in, in + n * istride, in,
389 					  istride, -istride, istride);
390 	      break;
391 
392 	 default:
393 	      {
394 		   fftw_real *tmp;
395 
396 		   if (out)
397 			tmp = out;
398 		   else
399 			tmp = (fftw_real *) fftw_malloc(n * sizeof(fftw_real));
400 
401 		   rfftw_executor_simple(n, in, tmp, p, istride, 1,
402 					 recurse_kind);
403 		   rfftw_strided_copy(n, tmp, istride, in);
404 
405 		   if (!out)
406 			fftw_free(tmp);
407 	      }
408      }
409 }
410 
rexecutor_many_inplace(int n,fftw_real * in,fftw_real * out,fftw_plan_node * p,int istride,int howmany,int idist,fftw_recurse_kind recurse_kind)411 static void rexecutor_many_inplace(int n, fftw_real *in,
412 				   fftw_real *out,
413 				   fftw_plan_node *p,
414 				   int istride,
415 				   int howmany, int idist,
416 				   fftw_recurse_kind recurse_kind)
417 {
418      switch (p->type) {
419 	 case FFTW_REAL2HC:
420 	      {
421 		   fftw_real2hc_codelet *codelet = p->nodeu.real2hc.codelet;
422 		   int s;
423 
424 		   HACK_ALIGN_STACK_ODD;
425 		   for (s = 0; s < howmany; ++s)
426 			codelet(in + s * idist, in + s * idist,
427 				in + n * istride + s * idist,
428 				istride, istride, -istride);
429 
430 		   break;
431 	      }
432 
433 	 case FFTW_HC2REAL:
434 	      {
435 		   fftw_hc2real_codelet *codelet = p->nodeu.hc2real.codelet;
436 		   int s;
437 
438 		   HACK_ALIGN_STACK_ODD;
439 		   for (s = 0; s < howmany; ++s)
440 			codelet(in + s * idist, in + n * istride + s * idist,
441 				in + s * idist,
442 				istride, -istride, istride);
443 
444 		   break;
445 	      }
446 
447 	 default:
448 	      {
449 		   int s;
450 		   fftw_real *tmp;
451 		   if (out)
452 			tmp = out;
453 		   else
454 			tmp = (fftw_real *) fftw_malloc(n * sizeof(fftw_real));
455 
456 		   for (s = 0; s < howmany; ++s) {
457 			rfftw_executor_simple(n,
458 					      in + s * idist,
459 					      tmp,
460 					      p, istride, 1, recurse_kind);
461 			rfftw_strided_copy(n, tmp, istride, in + s * idist);
462 		   }
463 
464 		   if (!out)
465 			fftw_free(tmp);
466 	      }
467      }
468 }
469 
470 /* user interface */
rfftw(fftw_plan plan,int howmany,fftw_real * in,int istride,int idist,fftw_real * out,int ostride,int odist)471 void rfftw(fftw_plan plan, int howmany, fftw_real *in, int istride,
472 	   int idist, fftw_real *out, int ostride, int odist)
473 {
474      int n = plan->n;
475 
476      if (plan->flags & FFTW_IN_PLACE) {
477 	  if (howmany == 1) {
478 	       rexecutor_simple_inplace(n, in, out, plan->root, istride,
479 					plan->recurse_kind);
480 	  } else {
481 	       rexecutor_many_inplace(n, in, out, plan->root, istride, howmany,
482 				      idist, plan->recurse_kind);
483 	  }
484      } else {
485 	  if (howmany == 1) {
486 	       rfftw_executor_simple(n, in, out, plan->root, istride, ostride,
487 				     plan->recurse_kind);
488 	  } else {
489 #ifdef FFTW_ENABLE_VECTOR_RECURSE
490                int vector_size = plan->vector_size;
491                if (vector_size <= 1)
492 #endif
493 		    rexecutor_many(n, in, out, plan->root, istride, ostride,
494 				   howmany, idist, odist,
495 				   plan->recurse_kind);
496 #ifdef FFTW_ENABLE_VECTOR_RECURSE
497                else {
498                     int s;
499                     int num_vects = howmany / vector_size;
500                     fftw_plan_node *root = plan->root;
501 
502                     for (s = 0; s < num_vects; ++s)
503                          rexecutor_many_vector(n,
504 					       in + s * (vector_size * idist),
505 					       out + s * (vector_size * odist),
506 					       root,
507 					       istride, ostride,
508 					       vector_size, idist, odist);
509 
510                     s = howmany % vector_size;
511                     if (s > 0)
512                          rexecutor_many(n,
513 					in + num_vects * (vector_size*idist),
514 					out + num_vects * (vector_size*odist),
515 					root,
516 					istride, ostride,
517 					s, idist, odist,
518 					FFTW_NORMAL_RECURSE);
519                }
520 #endif
521 	  }
522      }
523 }
524 
rfftw_one(fftw_plan plan,fftw_real * in,fftw_real * out)525 void rfftw_one(fftw_plan plan, fftw_real *in, fftw_real *out)
526 {
527      int n = plan->n;
528 
529      if (plan->flags & FFTW_IN_PLACE)
530 	  rexecutor_simple_inplace(n, in, out, plan->root, 1,
531 				   plan->recurse_kind);
532      else
533 	  rfftw_executor_simple(n, in, out, plan->root, 1, 1,
534 				plan->recurse_kind);
535 }
536