1 /*
2  * Copyright (c) 2003, 2007-14 Matteo Frigo
3  * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
4  *
5  * This program is free software; you can redistribute it and/or modify
6  * it under the terms of the GNU General Public License as published by
7  * the Free Software Foundation; either version 2 of the License, or
8  * (at your option) any later version.
9  *
10  * This program is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13  * GNU General Public License for more details.
14  *
15  * You should have received a copy of the GNU General Public License
16  * along with this program; if not, write to the Free Software
17  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
18  *
19  */
20 
21 
22 /* FFTW internal header file */
23 #ifndef __IFFTW_H__
24 #define __IFFTW_H__
25 
26 #include "config.h"
27 
28 #include <stdlib.h>		/* size_t */
29 #include <stdarg.h>		/* va_list */
30 #include <stddef.h>             /* ptrdiff_t */
31 #include <limits.h>             /* INT_MAX */
32 
33 #if HAVE_SYS_TYPES_H
34 # include <sys/types.h>
35 #endif
36 
37 #if HAVE_STDINT_H
38 # include <stdint.h>             /* uintptr_t, maybe */
39 #endif
40 
41 #if HAVE_INTTYPES_H
42 # include <inttypes.h>           /* uintptr_t, maybe */
43 #endif
44 
45 #ifdef __cplusplus
46 extern "C"
47 {
48 #endif /* __cplusplus */
49 
50 /* Windows annoyances -- since tests/hook.c uses some internal
51    FFTW functions, we need to given them the dllexport attribute
52    under Windows when compiling as a DLL (see api/fftw3.h). */
53 #if defined(FFTW_EXTERN)
54 #  define IFFTW_EXTERN FFTW_EXTERN
55 #elif (defined(FFTW_DLL) || defined(DLL_EXPORT)) \
56  && (defined(_WIN32) || defined(__WIN32__))
57 #  define IFFTW_EXTERN extern __declspec(dllexport)
58 #else
59 #  define IFFTW_EXTERN extern
60 #endif
61 
62 /* determine precision and name-mangling scheme */
63 #define CONCAT(prefix, name) prefix ## name
64 #if defined(FFTW_SINGLE)
65   typedef float R;
66 # define X(name) CONCAT(fftwf_, name)
67 #elif defined(FFTW_LDOUBLE)
68   typedef long double R;
69 # define X(name) CONCAT(fftwl_, name)
70 # define TRIGREAL_IS_LONG_DOUBLE
71 #elif defined(FFTW_QUAD)
72   typedef __float128 R;
73 # define X(name) CONCAT(fftwq_, name)
74 # define TRIGREAL_IS_QUAD
75 #else
76   typedef double R;
77 # define X(name) CONCAT(fftw_, name)
78 #endif
79 
80 /*
81   integral type large enough to contain a stride (what ``int'' should
82   have been in the first place.
83 */
84 typedef ptrdiff_t INT;
85 
86 /* dummy use of unused parameters to silence compiler warnings */
87 #define UNUSED(x) (void)x
88 
89 #define NELEM(array) ((sizeof(array) / sizeof((array)[0])))
90 
91 #define FFT_SIGN (-1)  /* sign convention for forward transforms */
92 extern void X(extract_reim)(int sign, R *c, R **r, R **i);
93 
94 #define REGISTER_SOLVER(p, s) X(solver_register)(p, s)
95 
96 #define STRINGIZEx(x) #x
97 #define STRINGIZE(x) STRINGIZEx(x)
98 #define CIMPLIES(ante, post) (!(ante) || (post))
99 
100 /* define HAVE_SIMD if any simd extensions are supported */
101 #if defined(HAVE_SSE) || defined(HAVE_SSE2) || \
102       defined(HAVE_AVX) || defined(HAVE_AVX_128_FMA) || \
103       defined(HAVE_AVX2) || defined(HAVE_AVX512) || \
104       defined(HAVE_KCVI) || \
105       defined(HAVE_ALTIVEC) || defined(HAVE_VSX) || \
106       defined(HAVE_MIPS_PS) || \
107       defined(HAVE_GENERIC_SIMD128) || defined(HAVE_GENERIC_SIMD256)
108 #define HAVE_SIMD 1
109 #else
110 #define HAVE_SIMD 0
111 #endif
112 
113 extern int X(have_simd_sse2)(void);
114 extern int X(have_simd_avx)(void);
115 extern int X(have_simd_avx_128_fma)(void);
116 extern int X(have_simd_avx2)(void);
117 extern int X(have_simd_avx2_128)(void);
118 extern int X(have_simd_avx512)(void);
119 extern int X(have_simd_altivec)(void);
120 extern int X(have_simd_vsx)(void);
121 extern int X(have_simd_neon)(void);
122 
123 /* forward declarations */
124 typedef struct problem_s problem;
125 typedef struct plan_s plan;
126 typedef struct solver_s solver;
127 typedef struct planner_s planner;
128 typedef struct printer_s printer;
129 typedef struct scanner_s scanner;
130 
131 /*-----------------------------------------------------------------------*/
132 /* alloca: */
133 #if HAVE_SIMD
134 #  if defined(HAVE_KCVI) || defined(HAVE_AVX512)
135 #    define MIN_ALIGNMENT 64
136 #  elif defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_GENERIC_SIMD256)
137 #    define MIN_ALIGNMENT 32  /* best alignment for AVX, conservative for
138 			       * everything else */
139 #  else
140      /* Note that we cannot use 32-byte alignment for all SIMD.  For
141 	example, MacOS X malloc is 16-byte aligned, but there was no
142 	posix_memalign in MacOS X until version 10.6. */
143 #    define MIN_ALIGNMENT 16
144 #  endif
145 #endif
146 
147 #if defined(HAVE_ALLOCA) && defined(FFTW_ENABLE_ALLOCA)
148    /* use alloca if available */
149 
150 #ifndef alloca
151 #ifdef __GNUC__
152 # define alloca __builtin_alloca
153 #else
154 # ifdef _MSC_VER
155 #  include <malloc.h>
156 #  define alloca _alloca
157 # else
158 #  if HAVE_ALLOCA_H
159 #   include <alloca.h>
160 #  else
161 #   ifdef _AIX
162  #pragma alloca
163 #   else
164 #    ifndef alloca /* predefined by HP cc +Olibcalls */
165 void *alloca(size_t);
166 #    endif
167 #   endif
168 #  endif
169 # endif
170 #endif
171 #endif
172 
173 #  ifdef MIN_ALIGNMENT
174 #    define STACK_MALLOC(T, p, n)				\
175      {								\
176          p = (T)alloca((n) + MIN_ALIGNMENT);			\
177          p = (T)(((uintptr_t)p + (MIN_ALIGNMENT - 1)) &	\
178                (~(uintptr_t)(MIN_ALIGNMENT - 1)));		\
179      }
180 #    define STACK_FREE(n)
181 #  else /* HAVE_ALLOCA && !defined(MIN_ALIGNMENT) */
182 #    define STACK_MALLOC(T, p, n) p = (T)alloca(n)
183 #    define STACK_FREE(n)
184 #  endif
185 
186 #else /* ! HAVE_ALLOCA */
187    /* use malloc instead of alloca */
188 #  define STACK_MALLOC(T, p, n) p = (T)MALLOC(n, OTHER)
189 #  define STACK_FREE(n) X(ifree)(n)
190 #endif /* ! HAVE_ALLOCA */
191 
192 /* allocation of buffers.  If these grow too large use malloc(), else
193    use STACK_MALLOC (hopefully reducing to alloca()). */
194 
195 /* 64KiB ought to be enough for anybody */
196 #define MAX_STACK_ALLOC ((size_t)64 * 1024)
197 
198 #define BUF_ALLOC(T, p, n)			\
199 {						\
200      if (n < MAX_STACK_ALLOC) {			\
201 	  STACK_MALLOC(T, p, n);		\
202      } else {					\
203 	  p = (T)MALLOC(n, BUFFERS);		\
204      }						\
205 }
206 
207 #define BUF_FREE(p, n)				\
208 {						\
209      if (n < MAX_STACK_ALLOC) {			\
210 	  STACK_FREE(p);			\
211      } else {					\
212 	  X(ifree)(p);				\
213      }						\
214 }
215 
216 /*-----------------------------------------------------------------------*/
217 /* define uintptr_t if it is not already defined */
218 
219 #ifndef HAVE_UINTPTR_T
220 #  if SIZEOF_VOID_P == 0
221 #    error sizeof void* is unknown!
222 #  elif SIZEOF_UNSIGNED_INT == SIZEOF_VOID_P
223      typedef unsigned int uintptr_t;
224 #  elif SIZEOF_UNSIGNED_LONG == SIZEOF_VOID_P
225      typedef unsigned long uintptr_t;
226 #  elif SIZEOF_UNSIGNED_LONG_LONG == SIZEOF_VOID_P
227      typedef unsigned long long uintptr_t;
228 #  else
229 #    error no unsigned integer type matches void* sizeof!
230 #  endif
231 #endif
232 
233 /*-----------------------------------------------------------------------*/
234 /* We can do an optimization for copying pairs of (aligned) floats
235    when in single precision if 2*float = double. */
236 
237 #define FFTW_2R_IS_DOUBLE (defined(FFTW_SINGLE) \
238                            && SIZEOF_FLOAT != 0 \
239                            && SIZEOF_DOUBLE == 2*SIZEOF_FLOAT)
240 
241 #define DOUBLE_ALIGNED(p) ((((uintptr_t)(p)) % sizeof(double)) == 0)
242 
243 /*-----------------------------------------------------------------------*/
244 /* assert.c: */
245 IFFTW_EXTERN void X(assertion_failed)(const char *s,
246 				      int line, const char *file);
247 
248 /* always check */
249 #define CK(ex)						 \
250       (void)((ex) || (X(assertion_failed)(#ex, __LINE__, __FILE__), 0))
251 
252 #ifdef FFTW_DEBUG
253 /* check only if debug enabled */
254 #define A(ex)						 \
255       (void)((ex) || (X(assertion_failed)(#ex, __LINE__, __FILE__), 0))
256 #else
257 #define A(ex) /* nothing */
258 #endif
259 
260 extern void X(debug)(const char *format, ...);
261 #define D X(debug)
262 
263 /*-----------------------------------------------------------------------*/
264 /* kalloc.c: */
265 extern void *X(kernel_malloc)(size_t n);
266 extern void X(kernel_free)(void *p);
267 
268 /*-----------------------------------------------------------------------*/
269 /* alloc.c: */
270 
271 /* objects allocated by malloc, for statistical purposes */
272 enum malloc_tag {
273      EVERYTHING,
274      PLANS,
275      SOLVERS,
276      PROBLEMS,
277      BUFFERS,
278      HASHT,
279      TENSORS,
280      PLANNERS,
281      SLVDESCS,
282      TWIDDLES,
283      STRIDES,
284      OTHER,
285      MALLOC_WHAT_LAST		/* must be last */
286 };
287 
288 IFFTW_EXTERN void X(ifree)(void *ptr);
289 extern void X(ifree0)(void *ptr);
290 
291 IFFTW_EXTERN void *X(malloc_plain)(size_t sz);
292 #define MALLOC(n, what)  X(malloc_plain)(n)
293 
294 /*-----------------------------------------------------------------------*/
295 /* low-resolution clock */
296 
297 #ifdef FAKE_CRUDE_TIME
298  typedef int crude_time;
299 #else
300 # if TIME_WITH_SYS_TIME
301 #  include <sys/time.h>
302 #  include <time.h>
303 # else
304 #  if HAVE_SYS_TIME_H
305 #   include <sys/time.h>
306 #  else
307 #   include <time.h>
308 #  endif
309 # endif
310 
311 # ifdef HAVE_BSDGETTIMEOFDAY
312 # ifndef HAVE_GETTIMEOFDAY
313 # define gettimeofday BSDgettimeofday
314 # define HAVE_GETTIMEOFDAY 1
315 # endif
316 # endif
317 
318 # if defined(HAVE_GETTIMEOFDAY)
319    typedef struct timeval crude_time;
320 # else
321    typedef clock_t crude_time;
322 # endif
323 #endif /* else FAKE_CRUDE_TIME */
324 
325 crude_time X(get_crude_time)(void);
326 double X(elapsed_since)(const planner *plnr, const problem *p,
327 			crude_time t0); /* time in seconds since t0 */
328 
329 /*-----------------------------------------------------------------------*/
330 /* ops.c: */
331 /*
332  * ops counter.  The total number of additions is add + fma
333  * and the total number of multiplications is mul + fma.
334  * Total flops = add + mul + 2 * fma
335  */
336 typedef struct {
337      double add;
338      double mul;
339      double fma;
340      double other;
341 } opcnt;
342 
343 void X(ops_zero)(opcnt *dst);
344 void X(ops_other)(INT o, opcnt *dst);
345 void X(ops_cpy)(const opcnt *src, opcnt *dst);
346 
347 void X(ops_add)(const opcnt *a, const opcnt *b, opcnt *dst);
348 void X(ops_add2)(const opcnt *a, opcnt *dst);
349 
350 /* dst = m * a + b */
351 void X(ops_madd)(INT m, const opcnt *a, const opcnt *b, opcnt *dst);
352 
353 /* dst += m * a */
354 void X(ops_madd2)(INT m, const opcnt *a, opcnt *dst);
355 
356 
357 /*-----------------------------------------------------------------------*/
358 /* minmax.c: */
359 INT X(imax)(INT a, INT b);
360 INT X(imin)(INT a, INT b);
361 
362 /*-----------------------------------------------------------------------*/
363 /* iabs.c: */
364 INT X(iabs)(INT a);
365 
366 /* inline version */
367 #define IABS(x) (((x) < 0) ? (0 - (x)) : (x))
368 
369 /*-----------------------------------------------------------------------*/
370 /* md5.c */
371 
372 #if SIZEOF_UNSIGNED_INT >= 4
373 typedef unsigned int md5uint;
374 #else
375 typedef unsigned long md5uint; /* at least 32 bits as per C standard */
376 #endif
377 
378 typedef md5uint md5sig[4];
379 
380 typedef struct {
381      md5sig s; /* state and signature */
382 
383      /* fields not meant to be used outside md5.c: */
384      unsigned char c[64]; /* stuff not yet processed */
385      unsigned l;  /* total length.  Should be 64 bits long, but this is
386 		     good enough for us */
387 } md5;
388 
389 void X(md5begin)(md5 *p);
390 void X(md5putb)(md5 *p, const void *d_, size_t len);
391 void X(md5puts)(md5 *p, const char *s);
392 void X(md5putc)(md5 *p, unsigned char c);
393 void X(md5int)(md5 *p, int i);
394 void X(md5INT)(md5 *p, INT i);
395 void X(md5unsigned)(md5 *p, unsigned i);
396 void X(md5end)(md5 *p);
397 
398 /*-----------------------------------------------------------------------*/
399 /* tensor.c: */
400 #define STRUCT_HACK_KR
401 #undef STRUCT_HACK_C99
402 
403 typedef struct {
404      INT n;
405      INT is;			/* input stride */
406      INT os;			/* output stride */
407 } iodim;
408 
409 typedef struct {
410      int rnk;
411 #if defined(STRUCT_HACK_KR)
412      iodim dims[1];
413 #elif defined(STRUCT_HACK_C99)
414      iodim dims[];
415 #else
416      iodim *dims;
417 #endif
418 } tensor;
419 
420 /*
421   Definition of rank -infinity.
422   This definition has the property that if you want rank 0 or 1,
423   you can simply test for rank <= 1.  This is a common case.
424 
425   A tensor of rank -infinity has size 0.
426 */
427 #define RNK_MINFTY  INT_MAX
428 #define FINITE_RNK(rnk) ((rnk) != RNK_MINFTY)
429 
430 typedef enum { INPLACE_IS, INPLACE_OS } inplace_kind;
431 
432 tensor *X(mktensor)(int rnk);
433 tensor *X(mktensor_0d)(void);
434 tensor *X(mktensor_1d)(INT n, INT is, INT os);
435 tensor *X(mktensor_2d)(INT n0, INT is0, INT os0,
436 		       INT n1, INT is1, INT os1);
437 tensor *X(mktensor_3d)(INT n0, INT is0, INT os0,
438 		       INT n1, INT is1, INT os1,
439 		       INT n2, INT is2, INT os2);
440 tensor *X(mktensor_4d)(INT n0, INT is0, INT os0,
441 		       INT n1, INT is1, INT os1,
442 		       INT n2, INT is2, INT os2,
443 		       INT n3, INT is3, INT os3);
444 tensor *X(mktensor_5d)(INT n0, INT is0, INT os0,
445 		       INT n1, INT is1, INT os1,
446 		       INT n2, INT is2, INT os2,
447 		       INT n3, INT is3, INT os3,
448 		       INT n4, INT is4, INT os4);
449 INT X(tensor_sz)(const tensor *sz);
450 void X(tensor_md5)(md5 *p, const tensor *t);
451 INT X(tensor_max_index)(const tensor *sz);
452 INT X(tensor_min_istride)(const tensor *sz);
453 INT X(tensor_min_ostride)(const tensor *sz);
454 INT X(tensor_min_stride)(const tensor *sz);
455 int X(tensor_inplace_strides)(const tensor *sz);
456 int X(tensor_inplace_strides2)(const tensor *a, const tensor *b);
457 int X(tensor_strides_decrease)(const tensor *sz, const tensor *vecsz,
458                                inplace_kind k);
459 tensor *X(tensor_copy)(const tensor *sz);
460 int X(tensor_kosherp)(const tensor *x);
461 
462 tensor *X(tensor_copy_inplace)(const tensor *sz, inplace_kind k);
463 tensor *X(tensor_copy_except)(const tensor *sz, int except_dim);
464 tensor *X(tensor_copy_sub)(const tensor *sz, int start_dim, int rnk);
465 tensor *X(tensor_compress)(const tensor *sz);
466 tensor *X(tensor_compress_contiguous)(const tensor *sz);
467 tensor *X(tensor_append)(const tensor *a, const tensor *b);
468 void X(tensor_split)(const tensor *sz, tensor **a, int a_rnk, tensor **b);
469 int X(tensor_tornk1)(const tensor *t, INT *n, INT *is, INT *os);
470 void X(tensor_destroy)(tensor *sz);
471 void X(tensor_destroy2)(tensor *a, tensor *b);
472 void X(tensor_destroy4)(tensor *a, tensor *b, tensor *c, tensor *d);
473 void X(tensor_print)(const tensor *sz, printer *p);
474 int X(dimcmp)(const iodim *a, const iodim *b);
475 int X(tensor_equal)(const tensor *a, const tensor *b);
476 int X(tensor_inplace_locations)(const tensor *sz, const tensor *vecsz);
477 
478 /*-----------------------------------------------------------------------*/
479 /* problem.c: */
480 enum {
481      /* a problem that cannot be solved */
482      PROBLEM_UNSOLVABLE,
483 
484      PROBLEM_DFT,
485      PROBLEM_RDFT,
486      PROBLEM_RDFT2,
487 
488      /* for mpi/ subdirectory */
489      PROBLEM_MPI_DFT,
490      PROBLEM_MPI_RDFT,
491      PROBLEM_MPI_RDFT2,
492      PROBLEM_MPI_TRANSPOSE,
493 
494      PROBLEM_LAST
495 };
496 
497 typedef struct {
498      int problem_kind;
499      void (*hash) (const problem *ego, md5 *p);
500      void (*zero) (const problem *ego);
501      void (*print) (const problem *ego, printer *p);
502      void (*destroy) (problem *ego);
503 } problem_adt;
504 
505 struct problem_s {
506      const problem_adt *adt;
507 };
508 
509 problem *X(mkproblem)(size_t sz, const problem_adt *adt);
510 void X(problem_destroy)(problem *ego);
511 problem *X(mkproblem_unsolvable)(void);
512 
513 /*-----------------------------------------------------------------------*/
514 /* print.c */
515 struct printer_s {
516      void (*print)(printer *p, const char *format, ...);
517      void (*vprint)(printer *p, const char *format, va_list ap);
518      void (*putchr)(printer *p, char c);
519      void (*cleanup)(printer *p);
520      int indent;
521      int indent_incr;
522 };
523 
524 printer *X(mkprinter)(size_t size,
525 		      void (*putchr)(printer *p, char c),
526 		      void (*cleanup)(printer *p));
527 IFFTW_EXTERN void X(printer_destroy)(printer *p);
528 
529 /*-----------------------------------------------------------------------*/
530 /* scan.c */
531 struct scanner_s {
532      int (*scan)(scanner *sc, const char *format, ...);
533      int (*vscan)(scanner *sc, const char *format, va_list ap);
534      int (*getchr)(scanner *sc);
535      int ungotc;
536 };
537 
538 scanner *X(mkscanner)(size_t size, int (*getchr)(scanner *sc));
539 void X(scanner_destroy)(scanner *sc);
540 
541 /*-----------------------------------------------------------------------*/
542 /* plan.c: */
543 
544 enum wakefulness {
545      SLEEPY,
546      AWAKE_ZERO,
547      AWAKE_SQRTN_TABLE,
548      AWAKE_SINCOS
549 };
550 
551 typedef struct {
552      void (*solve)(const plan *ego, const problem *p);
553      void (*awake)(plan *ego, enum wakefulness wakefulness);
554      void (*print)(const plan *ego, printer *p);
555      void (*destroy)(plan *ego);
556 } plan_adt;
557 
558 struct plan_s {
559      const plan_adt *adt;
560      opcnt ops;
561      double pcost;
562      enum wakefulness wakefulness; /* used for debugging only */
563      int could_prune_now_p;
564 };
565 
566 plan *X(mkplan)(size_t size, const plan_adt *adt);
567 void X(plan_destroy_internal)(plan *ego);
568 IFFTW_EXTERN void X(plan_awake)(plan *ego, enum wakefulness wakefulness);
569 void X(plan_null_destroy)(plan *ego);
570 
571 /*-----------------------------------------------------------------------*/
572 /* solver.c: */
573 typedef struct {
574      int problem_kind;
575      plan *(*mkplan)(const solver *ego, const problem *p, planner *plnr);
576      void (*destroy)(solver *ego);
577 } solver_adt;
578 
579 struct solver_s {
580      const solver_adt *adt;
581      int refcnt;
582 };
583 
584 solver *X(mksolver)(size_t size, const solver_adt *adt);
585 void X(solver_use)(solver *ego);
586 void X(solver_destroy)(solver *ego);
587 void X(solver_register)(planner *plnr, solver *s);
588 
589 /* shorthand */
590 #define MKSOLVER(type, adt) (type *)X(mksolver)(sizeof(type), adt)
591 
592 /*-----------------------------------------------------------------------*/
593 /* planner.c */
594 
595 typedef struct slvdesc_s {
596      solver *slv;
597      const char *reg_nam;
598      unsigned nam_hash;
599      int reg_id;
600      int next_for_same_problem_kind;
601 } slvdesc;
602 
603 typedef struct solution_s solution; /* opaque */
604 
605 /* interpretation of L and U:
606 
607    - if it returns a plan, the planner guarantees that all applicable
608      plans at least as impatient as U have been tried, and that each
609      plan in the solution is at least as impatient as L.
610 
611    - if it returns 0, the planner guarantees to have tried all solvers
612      at least as impatient as L, and that none of them was applicable.
613 
614    The structure is packed to fit into 64 bits.
615 */
616 
617 typedef struct {
618      unsigned l:20;
619      unsigned hash_info:3;
620 #    define BITS_FOR_TIMELIMIT 9
621      unsigned timelimit_impatience:BITS_FOR_TIMELIMIT;
622      unsigned u:20;
623 
624      /* abstraction break: we store the solver here to pad the
625 	structure to 64 bits.  Otherwise, the struct is padded to 64
626 	bits anyway, and another word is allocated for slvndx. */
627 #    define BITS_FOR_SLVNDX 12
628      unsigned slvndx:BITS_FOR_SLVNDX;
629 } flags_t;
630 
631 /* impatience flags  */
632 enum {
633      BELIEVE_PCOST = 0x0001,
634      ESTIMATE = 0x0002,
635      NO_DFT_R2HC = 0x0004,
636      NO_SLOW = 0x0008,
637      NO_VRECURSE = 0x0010,
638      NO_INDIRECT_OP = 0x0020,
639      NO_LARGE_GENERIC = 0x0040,
640      NO_RANK_SPLITS = 0x0080,
641      NO_VRANK_SPLITS = 0x0100,
642      NO_NONTHREADED = 0x0200,
643      NO_BUFFERING = 0x0400,
644      NO_FIXED_RADIX_LARGE_N = 0x0800,
645      NO_DESTROY_INPUT = 0x1000,
646      NO_SIMD = 0x2000,
647      CONSERVE_MEMORY = 0x4000,
648      NO_DHT_R2HC = 0x8000,
649      NO_UGLY = 0x10000,
650      ALLOW_PRUNING = 0x20000
651 };
652 
653 /* hashtable information */
654 enum {
655      BLESSING = 0x1u,   /* save this entry */
656      H_VALID = 0x2u,    /* valid hastable entry */
657      H_LIVE = 0x4u      /* entry is nonempty, implies H_VALID */
658 };
659 
660 #define PLNR_L(plnr) ((plnr)->flags.l)
661 #define PLNR_U(plnr) ((plnr)->flags.u)
662 #define PLNR_TIMELIMIT_IMPATIENCE(plnr) ((plnr)->flags.timelimit_impatience)
663 
664 #define ESTIMATEP(plnr) (PLNR_U(plnr) & ESTIMATE)
665 #define BELIEVE_PCOSTP(plnr) (PLNR_U(plnr) & BELIEVE_PCOST)
666 #define ALLOW_PRUNINGP(plnr) (PLNR_U(plnr) & ALLOW_PRUNING)
667 
668 #define NO_INDIRECT_OP_P(plnr) (PLNR_L(plnr) & NO_INDIRECT_OP)
669 #define NO_LARGE_GENERICP(plnr) (PLNR_L(plnr) & NO_LARGE_GENERIC)
670 #define NO_RANK_SPLITSP(plnr) (PLNR_L(plnr) & NO_RANK_SPLITS)
671 #define NO_VRANK_SPLITSP(plnr) (PLNR_L(plnr) & NO_VRANK_SPLITS)
672 #define NO_VRECURSEP(plnr) (PLNR_L(plnr) & NO_VRECURSE)
673 #define NO_DFT_R2HCP(plnr) (PLNR_L(plnr) & NO_DFT_R2HC)
674 #define NO_SLOWP(plnr) (PLNR_L(plnr) & NO_SLOW)
675 #define NO_UGLYP(plnr) (PLNR_L(plnr) & NO_UGLY)
676 #define NO_FIXED_RADIX_LARGE_NP(plnr) \
677   (PLNR_L(plnr) & NO_FIXED_RADIX_LARGE_N)
678 #define NO_NONTHREADEDP(plnr) \
679   ((PLNR_L(plnr) & NO_NONTHREADED) && (plnr)->nthr > 1)
680 
681 #define NO_DESTROY_INPUTP(plnr) (PLNR_L(plnr) & NO_DESTROY_INPUT)
682 #define NO_SIMDP(plnr) (PLNR_L(plnr) & NO_SIMD)
683 #define CONSERVE_MEMORYP(plnr) (PLNR_L(plnr) & CONSERVE_MEMORY)
684 #define NO_DHT_R2HCP(plnr) (PLNR_L(plnr) & NO_DHT_R2HC)
685 #define NO_BUFFERINGP(plnr) (PLNR_L(plnr) & NO_BUFFERING)
686 
687 typedef enum { FORGET_ACCURSED, FORGET_EVERYTHING } amnesia;
688 
689 typedef enum {
690      /* WISDOM_NORMAL: planner may or may not use wisdom */
691      WISDOM_NORMAL,
692 
693      /* WISDOM_ONLY: planner must use wisdom and must avoid searching */
694      WISDOM_ONLY,
695 
696      /* WISDOM_IS_BOGUS: planner must return 0 as quickly as possible */
697      WISDOM_IS_BOGUS,
698 
699      /* WISDOM_IGNORE_INFEASIBLE: planner ignores infeasible wisdom */
700      WISDOM_IGNORE_INFEASIBLE,
701 
702      /* WISDOM_IGNORE_ALL: planner ignores all */
703      WISDOM_IGNORE_ALL
704 } wisdom_state_t;
705 
706 typedef struct {
707      void (*register_solver)(planner *ego, solver *s);
708      plan *(*mkplan)(planner *ego, const problem *p);
709      void (*forget)(planner *ego, amnesia a);
710      void (*exprt)(planner *ego, printer *p); /* ``export'' is a reserved
711 						 word in C++. */
712      int (*imprt)(planner *ego, scanner *sc);
713 } planner_adt;
714 
715 /* hash table of solutions */
716 typedef struct {
717      solution *solutions;
718      unsigned hashsiz, nelem;
719 
720      /* statistics */
721      int lookup, succ_lookup, lookup_iter;
722      int insert, insert_iter, insert_unknown;
723      int nrehash;
724 } hashtab;
725 
726 typedef enum { COST_SUM, COST_MAX } cost_kind;
727 
728 struct planner_s {
729      const planner_adt *adt;
730      void (*hook)(struct planner_s *plnr, plan *pln,
731 		  const problem *p, int optimalp);
732      double (*cost_hook)(const problem *p, double t, cost_kind k);
733      int (*wisdom_ok_hook)(const problem *p, flags_t flags);
734      void (*nowisdom_hook)(const problem *p);
735      wisdom_state_t (*bogosity_hook)(wisdom_state_t state, const problem *p);
736 
737      /* solver descriptors */
738      slvdesc *slvdescs;
739      unsigned nslvdesc, slvdescsiz;
740      const char *cur_reg_nam;
741      int cur_reg_id;
742      int slvdescs_for_problem_kind[PROBLEM_LAST];
743 
744      wisdom_state_t wisdom_state;
745 
746      hashtab htab_blessed;
747      hashtab htab_unblessed;
748 
749      int nthr;
750      flags_t flags;
751 
752      crude_time start_time;
753      double timelimit; /* elapsed_since(start_time) at which to bail out */
754      int timed_out; /* whether most recent search timed out */
755      int need_timeout_check;
756 
757      /* various statistics */
758      int nplan;    /* number of plans evaluated */
759      double pcost, epcost; /* total pcost of measured/estimated plans */
760      int nprob;    /* number of problems evaluated */
761 };
762 
763 planner *X(mkplanner)(void);
764 void X(planner_destroy)(planner *ego);
765 
766 /*
767   Iterate over all solvers.   Read:
768 
769   @article{ baker93iterators,
770   author = "Henry G. Baker, Jr.",
771   title = "Iterators: Signs of Weakness in Object-Oriented Languages",
772   journal = "{ACM} {OOPS} Messenger",
773   volume = "4",
774   number = "3",
775   pages = "18--25"
776   }
777 */
778 #define FORALL_SOLVERS(ego, s, p, what)			\
779 {							\
780      unsigned _cnt;					\
781      for (_cnt = 0; _cnt < ego->nslvdesc; ++_cnt) {	\
782 	  slvdesc *p = ego->slvdescs + _cnt;		\
783 	  solver *s = p->slv;				\
784 	  what;						\
785      }							\
786 }
787 
788 #define FORALL_SOLVERS_OF_KIND(kind, ego, s, p, what)		\
789 {								\
790      int _cnt = ego->slvdescs_for_problem_kind[kind]; 		\
791      while (_cnt >= 0) {					\
792 	  slvdesc *p = ego->slvdescs + _cnt;			\
793 	  solver *s = p->slv;					\
794 	  what;							\
795 	  _cnt = p->next_for_same_problem_kind;			\
796      }								\
797 }
798 
799 
800 /* make plan, destroy problem */
801 plan *X(mkplan_d)(planner *ego, problem *p);
802 plan *X(mkplan_f_d)(planner *ego, problem *p,
803 		    unsigned l_set, unsigned u_set, unsigned u_reset);
804 
805 /*-----------------------------------------------------------------------*/
806 /* stride.c: */
807 
808 /* If PRECOMPUTE_ARRAY_INDICES is defined, precompute all strides. */
809 #if (defined(__i386__) || defined(__x86_64__) || _M_IX86 >= 500) && !defined(FFTW_LDOUBLE)
810 #define PRECOMPUTE_ARRAY_INDICES
811 #endif
812 
813 extern const INT X(an_INT_guaranteed_to_be_zero);
814 
815 #ifdef PRECOMPUTE_ARRAY_INDICES
816 typedef INT *stride;
817 #define WS(stride, i)  (stride[i])
818 extern stride X(mkstride)(INT n, INT s);
819 void X(stride_destroy)(stride p);
820 /* hackery to prevent the compiler from copying the strides array
821    onto the stack */
822 #define MAKE_VOLATILE_STRIDE(nptr, x) (x) = (x) + X(an_INT_guaranteed_to_be_zero)
823 #else
824 
825 typedef INT stride;
826 #define WS(stride, i)  (stride * i)
827 #define fftwf_mkstride(n, stride) stride
828 #define fftw_mkstride(n, stride) stride
829 #define fftwl_mkstride(n, stride) stride
830 #define fftwf_stride_destroy(p) ((void) p)
831 #define fftw_stride_destroy(p) ((void) p)
832 #define fftwl_stride_destroy(p) ((void) p)
833 
834 /* hackery to prevent the compiler from ``optimizing'' induction
835    variables in codelet loops.  The problem is that for each K and for
836    each expression of the form P[I + STRIDE * K] in a loop, most
837    compilers will try to lift an induction variable PK := &P[I + STRIDE * K].
838    For large values of K this behavior overflows the
839    register set, which is likely worse than doing the index computation
840    in the first place.
841 
842    If we guess that there are more than
843    ESTIMATED_AVAILABLE_INDEX_REGISTERS such pointers, we deliberately confuse
844    the compiler by setting STRIDE ^= ZERO, where ZERO is a value guaranteed to
845    be 0, but the compiler does not know this.
846 
847    16 registers ought to be enough for anybody, or so the amd64 and ARM ISA's
848    seem to imply.
849 */
850 #define ESTIMATED_AVAILABLE_INDEX_REGISTERS 16
851 #define MAKE_VOLATILE_STRIDE(nptr, x)                   \
852      (nptr <= ESTIMATED_AVAILABLE_INDEX_REGISTERS ?     \
853         0 :                                             \
854       ((x) = (x) ^ X(an_INT_guaranteed_to_be_zero)))
855 #endif /* PRECOMPUTE_ARRAY_INDICES */
856 
857 /*-----------------------------------------------------------------------*/
858 /* solvtab.c */
859 
860 struct solvtab_s { void (*reg)(planner *); const char *reg_nam; };
861 typedef struct solvtab_s solvtab[];
862 void X(solvtab_exec)(const solvtab tbl, planner *p);
863 #define SOLVTAB(s) { s, STRINGIZE(s) }
864 #define SOLVTAB_END { 0, 0 }
865 
866 /*-----------------------------------------------------------------------*/
867 /* pickdim.c */
868 int X(pickdim)(int which_dim, const int *buddies, size_t nbuddies,
869 	       const tensor *sz, int oop, int *dp);
870 
871 /*-----------------------------------------------------------------------*/
872 /* twiddle.c */
873 /* little language to express twiddle factors computation */
874 enum { TW_COS = 0, TW_SIN = 1, TW_CEXP = 2, TW_NEXT = 3,
875        TW_FULL = 4, TW_HALF = 5 };
876 
877 typedef struct {
878      unsigned char op;
879      signed char v;
880      short i;
881 } tw_instr;
882 
883 typedef struct twid_s {
884      R *W;                     /* array of twiddle factors */
885      INT n, r, m;                /* transform order, radix, # twiddle rows */
886      int refcnt;
887      const tw_instr *instr;
888      struct twid_s *cdr;
889      enum wakefulness wakefulness;
890 } twid;
891 
892 INT X(twiddle_length)(INT r, const tw_instr *p);
893 void X(twiddle_awake)(enum wakefulness wakefulness,
894 		      twid **pp, const tw_instr *instr, INT n, INT r, INT m);
895 
896 /*-----------------------------------------------------------------------*/
897 /* trig.c */
898 #if defined(TRIGREAL_IS_LONG_DOUBLE)
899    typedef long double trigreal;
900 #elif defined(TRIGREAL_IS_QUAD)
901    typedef __float128 trigreal;
902 #else
903    typedef double trigreal;
904 #endif
905 
906 typedef struct triggen_s triggen;
907 
908 struct triggen_s {
909      void (*cexp)(triggen *t, INT m, R *result);
910      void (*cexpl)(triggen *t, INT m, trigreal *result);
911      void (*rotate)(triggen *p, INT m, R xr, R xi, R *res);
912 
913      INT twshft;
914      INT twradix;
915      INT twmsk;
916      trigreal *W0, *W1;
917      INT n;
918 };
919 
920 triggen *X(mktriggen)(enum wakefulness wakefulness, INT n);
921 void X(triggen_destroy)(triggen *p);
922 
923 /*-----------------------------------------------------------------------*/
924 /* primes.c: */
925 
926 #define MULMOD(x, y, p) \
927    (((x) <= 92681 - (y)) ? ((x) * (y)) % (p) : X(safe_mulmod)(x, y, p))
928 
929 INT X(safe_mulmod)(INT x, INT y, INT p);
930 INT X(power_mod)(INT n, INT m, INT p);
931 INT X(find_generator)(INT p);
932 INT X(first_divisor)(INT n);
933 int X(is_prime)(INT n);
934 INT X(next_prime)(INT n);
935 int X(factors_into)(INT n, const INT *primes);
936 int X(factors_into_small_primes)(INT n);
937 INT X(choose_radix)(INT r, INT n);
938 INT X(isqrt)(INT n);
939 INT X(modulo)(INT a, INT n);
940 
941 #define GENERIC_MIN_BAD 173 /* min prime for which generic becomes bad */
942 
943 /* thresholds below which certain solvers are considered SLOW.  These are guesses
944    believed to be conservative */
945 #define GENERIC_MAX_SLOW     16
946 #define RADER_MAX_SLOW       32
947 #define BLUESTEIN_MAX_SLOW   24
948 
949 /*-----------------------------------------------------------------------*/
950 /* rader.c: */
951 typedef struct rader_tls rader_tl;
952 
953 void X(rader_tl_insert)(INT k1, INT k2, INT k3, R *W, rader_tl **tl);
954 R *X(rader_tl_find)(INT k1, INT k2, INT k3, rader_tl *t);
955 void X(rader_tl_delete)(R *W, rader_tl **tl);
956 
957 /*-----------------------------------------------------------------------*/
958 /* copy/transposition routines */
959 
960 /* lower bound to the cache size, for tiled routines */
961 #define CACHESIZE 8192
962 
963 INT X(compute_tilesz)(INT vl, int how_many_tiles_in_cache);
964 
965 void X(tile2d)(INT n0l, INT n0u, INT n1l, INT n1u, INT tilesz,
966 	       void (*f)(INT n0l, INT n0u, INT n1l, INT n1u, void *args),
967 	       void *args);
968 void X(cpy1d)(R *I, R *O, INT n0, INT is0, INT os0, INT vl);
969 void X(zero1d_pair)(R *O0, R *O1, INT n0, INT os0);
970 void X(cpy2d)(R *I, R *O,
971 	      INT n0, INT is0, INT os0,
972 	      INT n1, INT is1, INT os1,
973 	      INT vl);
974 void X(cpy2d_ci)(R *I, R *O,
975 		 INT n0, INT is0, INT os0,
976 		 INT n1, INT is1, INT os1,
977 		 INT vl);
978 void X(cpy2d_co)(R *I, R *O,
979 		 INT n0, INT is0, INT os0,
980 		 INT n1, INT is1, INT os1,
981 		 INT vl);
982 void X(cpy2d_tiled)(R *I, R *O,
983 		    INT n0, INT is0, INT os0,
984 		    INT n1, INT is1, INT os1,
985 		    INT vl);
986 void X(cpy2d_tiledbuf)(R *I, R *O,
987 		       INT n0, INT is0, INT os0,
988 		       INT n1, INT is1, INT os1,
989 		       INT vl);
990 void X(cpy2d_pair)(R *I0, R *I1, R *O0, R *O1,
991 		   INT n0, INT is0, INT os0,
992 		   INT n1, INT is1, INT os1);
993 void X(cpy2d_pair_ci)(R *I0, R *I1, R *O0, R *O1,
994 		      INT n0, INT is0, INT os0,
995 		      INT n1, INT is1, INT os1);
996 void X(cpy2d_pair_co)(R *I0, R *I1, R *O0, R *O1,
997 		      INT n0, INT is0, INT os0,
998 		      INT n1, INT is1, INT os1);
999 
1000 void X(transpose)(R *I, INT n, INT s0, INT s1, INT vl);
1001 void X(transpose_tiled)(R *I, INT n, INT s0, INT s1, INT vl);
1002 void X(transpose_tiledbuf)(R *I, INT n, INT s0, INT s1, INT vl);
1003 
1004 typedef void (*transpose_func)(R *I, INT n, INT s0, INT s1, INT vl);
1005 typedef void (*cpy2d_func)(R *I, R *O,
1006 			   INT n0, INT is0, INT os0,
1007 			   INT n1, INT is1, INT os1,
1008 			   INT vl);
1009 
1010 /*-----------------------------------------------------------------------*/
1011 /* misc stuff */
1012 void X(null_awake)(plan *ego, enum wakefulness wakefulness);
1013 double X(iestimate_cost)(const planner *, const plan *, const problem *);
1014 
1015 #ifdef FFTW_RANDOM_ESTIMATOR
1016 extern unsigned X(random_estimate_seed);
1017 #endif
1018 
1019 double X(measure_execution_time)(const planner *plnr,
1020 				 plan *pln, const problem *p);
1021 IFFTW_EXTERN int X(ialignment_of)(R *p);
1022 unsigned X(hash)(const char *s);
1023 INT X(nbuf)(INT n, INT vl, INT maxnbuf);
1024 int X(nbuf_redundant)(INT n, INT vl, size_t which,
1025 		      const INT *maxnbuf, size_t nmaxnbuf);
1026 INT X(bufdist)(INT n, INT vl);
1027 int X(toobig)(INT n);
1028 int X(ct_uglyp)(INT min_n, INT v, INT n, INT r);
1029 
1030 #if HAVE_SIMD
1031 R *X(taint)(R *p, INT s);
1032 R *X(join_taint)(R *p1, R *p2);
1033 #define TAINT(p, s) X(taint)(p, s)
1034 #define UNTAINT(p) ((R *) (((uintptr_t) (p)) & ~(uintptr_t)3))
1035 #define TAINTOF(p) (((uintptr_t)(p)) & 3)
1036 #define JOIN_TAINT(p1, p2) X(join_taint)(p1, p2)
1037 #else
1038 #define TAINT(p, s) (p)
1039 #define UNTAINT(p) (p)
1040 #define TAINTOF(p) 0
1041 #define JOIN_TAINT(p1, p2) p1
1042 #endif
1043 
1044 #define ASSERT_ALIGNED_DOUBLE  /*unused, legacy*/
1045 
1046 /*-----------------------------------------------------------------------*/
1047 /* macros used in codelets to reduce source code size */
1048 
1049 typedef R E;  /* internal precision of codelets. */
1050 
1051 #if defined(FFTW_LDOUBLE)
1052 #  define K(x) ((E) x##L)
1053 #elif defined(FFTW_QUAD)
1054 #  define K(x) ((E) x##Q)
1055 #else
1056 #  define K(x) ((E) x)
1057 #endif
1058 #define DK(name, value) const E name = K(value)
1059 
1060 /* FMA macros */
1061 
1062 #if defined(__GNUC__) && (defined(__powerpc__) || defined(__ppc__) || defined(_POWER))
1063 /* The obvious expression a * b + c does not work.  If both x = a * b
1064    + c and y = a * b - c appear in the source, gcc computes t = a * b,
1065    x = t + c, y = t - c, thus destroying the fma.
1066 
1067    This peculiar coding seems to do the right thing on all of
1068    gcc-2.95, gcc-3.1, gcc-3.2, and gcc-3.3.  It does the right thing
1069    on gcc-3.4 -fno-web (because the ``web'' pass splits the variable
1070    `x' for the single-assignment form).
1071 
1072    However, gcc-4.0 is a formidable adversary which succeeds in
1073    pessimizing two fma's into one multiplication and two additions.
1074    It does it very early in the game---before the optimization passes
1075    even start.  The only real workaround seems to use fake inline asm
1076    such as
1077 
1078      asm ("# confuse gcc %0" : "=f"(a) : "0"(a));
1079      return a * b + c;
1080 
1081    in each of the FMA, FMS, FNMA, and FNMS functions.  However, this
1082    does not solve the problem either, because two equal asm statements
1083    count as a common subexpression!  One must use *different* fake asm
1084    statements:
1085 
1086    in FMA:
1087      asm ("# confuse gcc for fma %0" : "=f"(a) : "0"(a));
1088 
1089    in FMS:
1090      asm ("# confuse gcc for fms %0" : "=f"(a) : "0"(a));
1091 
1092    etc.
1093 
1094    After these changes, gcc recalcitrantly generates the fma that was
1095    in the source to begin with.  However, the extra asm() cruft
1096    confuses other passes of gcc, notably the instruction scheduler.
1097    (Of course, one could also generate the fma directly via inline
1098    asm, but this confuses the scheduler even more.)
1099 
1100    Steven and I have submitted more than one bug report to the gcc
1101    mailing list over the past few years, to no effect.  Thus, I give
1102    up.  gcc-4.0 can go to hell.  I'll wait at least until gcc-4.3 is
1103    out before touching this crap again.
1104 */
FMA(E a,E b,E c)1105 static __inline__ E FMA(E a, E b, E c)
1106 {
1107      E x = a * b;
1108      x = x + c;
1109      return x;
1110 }
1111 
FMS(E a,E b,E c)1112 static __inline__ E FMS(E a, E b, E c)
1113 {
1114      E x = a * b;
1115      x = x - c;
1116      return x;
1117 }
1118 
FNMA(E a,E b,E c)1119 static __inline__ E FNMA(E a, E b, E c)
1120 {
1121      E x = a * b;
1122      x = - (x + c);
1123      return x;
1124 }
1125 
FNMS(E a,E b,E c)1126 static __inline__ E FNMS(E a, E b, E c)
1127 {
1128      E x = a * b;
1129      x = - (x - c);
1130      return x;
1131 }
1132 #else
1133 #define FMA(a, b, c) (((a) * (b)) + (c))
1134 #define FMS(a, b, c) (((a) * (b)) - (c))
1135 #define FNMA(a, b, c) (- (((a) * (b)) + (c)))
1136 #define FNMS(a, b, c) ((c) - ((a) * (b)))
1137 #endif
1138 
1139 #ifdef __cplusplus
1140 }  /* extern "C" */
1141 #endif /* __cplusplus */
1142 
1143 #endif /* __IFFTW_H__ */
1144