1 /*
2 * Copyright (c) 2003, 2007-14 Matteo Frigo
3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18 *
19 */
20
21
22 /* FFTW internal header file */
23 #ifndef __IFFTW_H__
24 #define __IFFTW_H__
25
26 #include "config.h"
27
28 #include <stdlib.h> /* size_t */
29 #include <stdarg.h> /* va_list */
30 #include <stddef.h> /* ptrdiff_t */
31 #include <limits.h> /* INT_MAX */
32
33 #if HAVE_SYS_TYPES_H
34 # include <sys/types.h>
35 #endif
36
37 #if HAVE_STDINT_H
38 # include <stdint.h> /* uintptr_t, maybe */
39 #endif
40
41 #if HAVE_INTTYPES_H
42 # include <inttypes.h> /* uintptr_t, maybe */
43 #endif
44
45 #ifdef __cplusplus
46 extern "C"
47 {
48 #endif /* __cplusplus */
49
50 /* Windows annoyances -- since tests/hook.c uses some internal
51 FFTW functions, we need to given them the dllexport attribute
52 under Windows when compiling as a DLL (see api/fftw3.h). */
53 #if defined(FFTW_EXTERN)
54 # define IFFTW_EXTERN FFTW_EXTERN
55 #elif (defined(FFTW_DLL) || defined(DLL_EXPORT)) \
56 && (defined(_WIN32) || defined(__WIN32__))
57 # define IFFTW_EXTERN extern __declspec(dllexport)
58 #else
59 # define IFFTW_EXTERN extern
60 #endif
61
62 /* determine precision and name-mangling scheme */
63 #define CONCAT(prefix, name) prefix ## name
64 #if defined(FFTW_SINGLE)
65 typedef float R;
66 # define X(name) CONCAT(fftwf_, name)
67 #elif defined(FFTW_LDOUBLE)
68 typedef long double R;
69 # define X(name) CONCAT(fftwl_, name)
70 # define TRIGREAL_IS_LONG_DOUBLE
71 #elif defined(FFTW_QUAD)
72 typedef __float128 R;
73 # define X(name) CONCAT(fftwq_, name)
74 # define TRIGREAL_IS_QUAD
75 #else
76 typedef double R;
77 # define X(name) CONCAT(fftw_, name)
78 #endif
79
80 /*
81 integral type large enough to contain a stride (what ``int'' should
82 have been in the first place.
83 */
84 typedef ptrdiff_t INT;
85
86 /* dummy use of unused parameters to silence compiler warnings */
87 #define UNUSED(x) (void)x
88
89 #define NELEM(array) ((sizeof(array) / sizeof((array)[0])))
90
91 #define FFT_SIGN (-1) /* sign convention for forward transforms */
92 extern void X(extract_reim)(int sign, R *c, R **r, R **i);
93
94 #define REGISTER_SOLVER(p, s) X(solver_register)(p, s)
95
96 #define STRINGIZEx(x) #x
97 #define STRINGIZE(x) STRINGIZEx(x)
98 #define CIMPLIES(ante, post) (!(ante) || (post))
99
100 /* define HAVE_SIMD if any simd extensions are supported */
101 #if defined(HAVE_SSE) || defined(HAVE_SSE2) || \
102 defined(HAVE_AVX) || defined(HAVE_AVX_128_FMA) || \
103 defined(HAVE_AVX2) || defined(HAVE_AVX512) || \
104 defined(HAVE_KCVI) || \
105 defined(HAVE_ALTIVEC) || defined(HAVE_VSX) || \
106 defined(HAVE_MIPS_PS) || \
107 defined(HAVE_GENERIC_SIMD128) || defined(HAVE_GENERIC_SIMD256)
108 #define HAVE_SIMD 1
109 #else
110 #define HAVE_SIMD 0
111 #endif
112
113 extern int X(have_simd_sse2)(void);
114 extern int X(have_simd_avx)(void);
115 extern int X(have_simd_avx_128_fma)(void);
116 extern int X(have_simd_avx2)(void);
117 extern int X(have_simd_avx2_128)(void);
118 extern int X(have_simd_avx512)(void);
119 extern int X(have_simd_altivec)(void);
120 extern int X(have_simd_vsx)(void);
121 extern int X(have_simd_neon)(void);
122
123 /* forward declarations */
124 typedef struct problem_s problem;
125 typedef struct plan_s plan;
126 typedef struct solver_s solver;
127 typedef struct planner_s planner;
128 typedef struct printer_s printer;
129 typedef struct scanner_s scanner;
130
131 /*-----------------------------------------------------------------------*/
132 /* alloca: */
133 #if HAVE_SIMD
134 # if defined(HAVE_KCVI) || defined(HAVE_AVX512)
135 # define MIN_ALIGNMENT 64
136 # elif defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_GENERIC_SIMD256)
137 # define MIN_ALIGNMENT 32 /* best alignment for AVX, conservative for
138 * everything else */
139 # else
140 /* Note that we cannot use 32-byte alignment for all SIMD. For
141 example, MacOS X malloc is 16-byte aligned, but there was no
142 posix_memalign in MacOS X until version 10.6. */
143 # define MIN_ALIGNMENT 16
144 # endif
145 #endif
146
147 #if defined(HAVE_ALLOCA) && defined(FFTW_ENABLE_ALLOCA)
148 /* use alloca if available */
149
150 #ifndef alloca
151 #ifdef __GNUC__
152 # define alloca __builtin_alloca
153 #else
154 # ifdef _MSC_VER
155 # include <malloc.h>
156 # define alloca _alloca
157 # else
158 # if HAVE_ALLOCA_H
159 # include <alloca.h>
160 # else
161 # ifdef _AIX
162 #pragma alloca
163 # else
164 # ifndef alloca /* predefined by HP cc +Olibcalls */
165 void *alloca(size_t);
166 # endif
167 # endif
168 # endif
169 # endif
170 #endif
171 #endif
172
173 # ifdef MIN_ALIGNMENT
174 # define STACK_MALLOC(T, p, n) \
175 { \
176 p = (T)alloca((n) + MIN_ALIGNMENT); \
177 p = (T)(((uintptr_t)p + (MIN_ALIGNMENT - 1)) & \
178 (~(uintptr_t)(MIN_ALIGNMENT - 1))); \
179 }
180 # define STACK_FREE(n)
181 # else /* HAVE_ALLOCA && !defined(MIN_ALIGNMENT) */
182 # define STACK_MALLOC(T, p, n) p = (T)alloca(n)
183 # define STACK_FREE(n)
184 # endif
185
186 #else /* ! HAVE_ALLOCA */
187 /* use malloc instead of alloca */
188 # define STACK_MALLOC(T, p, n) p = (T)MALLOC(n, OTHER)
189 # define STACK_FREE(n) X(ifree)(n)
190 #endif /* ! HAVE_ALLOCA */
191
192 /* allocation of buffers. If these grow too large use malloc(), else
193 use STACK_MALLOC (hopefully reducing to alloca()). */
194
195 /* 64KiB ought to be enough for anybody */
196 #define MAX_STACK_ALLOC ((size_t)64 * 1024)
197
198 #define BUF_ALLOC(T, p, n) \
199 { \
200 if (n < MAX_STACK_ALLOC) { \
201 STACK_MALLOC(T, p, n); \
202 } else { \
203 p = (T)MALLOC(n, BUFFERS); \
204 } \
205 }
206
207 #define BUF_FREE(p, n) \
208 { \
209 if (n < MAX_STACK_ALLOC) { \
210 STACK_FREE(p); \
211 } else { \
212 X(ifree)(p); \
213 } \
214 }
215
216 /*-----------------------------------------------------------------------*/
217 /* define uintptr_t if it is not already defined */
218
219 #ifndef HAVE_UINTPTR_T
220 # if SIZEOF_VOID_P == 0
221 # error sizeof void* is unknown!
222 # elif SIZEOF_UNSIGNED_INT == SIZEOF_VOID_P
223 typedef unsigned int uintptr_t;
224 # elif SIZEOF_UNSIGNED_LONG == SIZEOF_VOID_P
225 typedef unsigned long uintptr_t;
226 # elif SIZEOF_UNSIGNED_LONG_LONG == SIZEOF_VOID_P
227 typedef unsigned long long uintptr_t;
228 # else
229 # error no unsigned integer type matches void* sizeof!
230 # endif
231 #endif
232
233 /*-----------------------------------------------------------------------*/
234 /* We can do an optimization for copying pairs of (aligned) floats
235 when in single precision if 2*float = double. */
236
237 #define FFTW_2R_IS_DOUBLE (defined(FFTW_SINGLE) \
238 && SIZEOF_FLOAT != 0 \
239 && SIZEOF_DOUBLE == 2*SIZEOF_FLOAT)
240
241 #define DOUBLE_ALIGNED(p) ((((uintptr_t)(p)) % sizeof(double)) == 0)
242
243 /*-----------------------------------------------------------------------*/
244 /* assert.c: */
245 IFFTW_EXTERN void X(assertion_failed)(const char *s,
246 int line, const char *file);
247
248 /* always check */
249 #define CK(ex) \
250 (void)((ex) || (X(assertion_failed)(#ex, __LINE__, __FILE__), 0))
251
252 #ifdef FFTW_DEBUG
253 /* check only if debug enabled */
254 #define A(ex) \
255 (void)((ex) || (X(assertion_failed)(#ex, __LINE__, __FILE__), 0))
256 #else
257 #define A(ex) /* nothing */
258 #endif
259
260 extern void X(debug)(const char *format, ...);
261 #define D X(debug)
262
263 /*-----------------------------------------------------------------------*/
264 /* kalloc.c: */
265 extern void *X(kernel_malloc)(size_t n);
266 extern void X(kernel_free)(void *p);
267
268 /*-----------------------------------------------------------------------*/
269 /* alloc.c: */
270
271 /* objects allocated by malloc, for statistical purposes */
272 enum malloc_tag {
273 EVERYTHING,
274 PLANS,
275 SOLVERS,
276 PROBLEMS,
277 BUFFERS,
278 HASHT,
279 TENSORS,
280 PLANNERS,
281 SLVDESCS,
282 TWIDDLES,
283 STRIDES,
284 OTHER,
285 MALLOC_WHAT_LAST /* must be last */
286 };
287
288 IFFTW_EXTERN void X(ifree)(void *ptr);
289 extern void X(ifree0)(void *ptr);
290
291 IFFTW_EXTERN void *X(malloc_plain)(size_t sz);
292 #define MALLOC(n, what) X(malloc_plain)(n)
293
294 /*-----------------------------------------------------------------------*/
295 /* low-resolution clock */
296
297 #ifdef FAKE_CRUDE_TIME
298 typedef int crude_time;
299 #else
300 # if TIME_WITH_SYS_TIME
301 # include <sys/time.h>
302 # include <time.h>
303 # else
304 # if HAVE_SYS_TIME_H
305 # include <sys/time.h>
306 # else
307 # include <time.h>
308 # endif
309 # endif
310
311 # ifdef HAVE_BSDGETTIMEOFDAY
312 # ifndef HAVE_GETTIMEOFDAY
313 # define gettimeofday BSDgettimeofday
314 # define HAVE_GETTIMEOFDAY 1
315 # endif
316 # endif
317
318 # if defined(HAVE_GETTIMEOFDAY)
319 typedef struct timeval crude_time;
320 # else
321 typedef clock_t crude_time;
322 # endif
323 #endif /* else FAKE_CRUDE_TIME */
324
325 crude_time X(get_crude_time)(void);
326 double X(elapsed_since)(const planner *plnr, const problem *p,
327 crude_time t0); /* time in seconds since t0 */
328
329 /*-----------------------------------------------------------------------*/
330 /* ops.c: */
331 /*
332 * ops counter. The total number of additions is add + fma
333 * and the total number of multiplications is mul + fma.
334 * Total flops = add + mul + 2 * fma
335 */
336 typedef struct {
337 double add;
338 double mul;
339 double fma;
340 double other;
341 } opcnt;
342
343 void X(ops_zero)(opcnt *dst);
344 void X(ops_other)(INT o, opcnt *dst);
345 void X(ops_cpy)(const opcnt *src, opcnt *dst);
346
347 void X(ops_add)(const opcnt *a, const opcnt *b, opcnt *dst);
348 void X(ops_add2)(const opcnt *a, opcnt *dst);
349
350 /* dst = m * a + b */
351 void X(ops_madd)(INT m, const opcnt *a, const opcnt *b, opcnt *dst);
352
353 /* dst += m * a */
354 void X(ops_madd2)(INT m, const opcnt *a, opcnt *dst);
355
356
357 /*-----------------------------------------------------------------------*/
358 /* minmax.c: */
359 INT X(imax)(INT a, INT b);
360 INT X(imin)(INT a, INT b);
361
362 /*-----------------------------------------------------------------------*/
363 /* iabs.c: */
364 INT X(iabs)(INT a);
365
366 /* inline version */
367 #define IABS(x) (((x) < 0) ? (0 - (x)) : (x))
368
369 /*-----------------------------------------------------------------------*/
370 /* md5.c */
371
372 #if SIZEOF_UNSIGNED_INT >= 4
373 typedef unsigned int md5uint;
374 #else
375 typedef unsigned long md5uint; /* at least 32 bits as per C standard */
376 #endif
377
378 typedef md5uint md5sig[4];
379
380 typedef struct {
381 md5sig s; /* state and signature */
382
383 /* fields not meant to be used outside md5.c: */
384 unsigned char c[64]; /* stuff not yet processed */
385 unsigned l; /* total length. Should be 64 bits long, but this is
386 good enough for us */
387 } md5;
388
389 void X(md5begin)(md5 *p);
390 void X(md5putb)(md5 *p, const void *d_, size_t len);
391 void X(md5puts)(md5 *p, const char *s);
392 void X(md5putc)(md5 *p, unsigned char c);
393 void X(md5int)(md5 *p, int i);
394 void X(md5INT)(md5 *p, INT i);
395 void X(md5unsigned)(md5 *p, unsigned i);
396 void X(md5end)(md5 *p);
397
398 /*-----------------------------------------------------------------------*/
399 /* tensor.c: */
400 #define STRUCT_HACK_KR
401 #undef STRUCT_HACK_C99
402
403 typedef struct {
404 INT n;
405 INT is; /* input stride */
406 INT os; /* output stride */
407 } iodim;
408
409 typedef struct {
410 int rnk;
411 #if defined(STRUCT_HACK_KR)
412 iodim dims[1];
413 #elif defined(STRUCT_HACK_C99)
414 iodim dims[];
415 #else
416 iodim *dims;
417 #endif
418 } tensor;
419
420 /*
421 Definition of rank -infinity.
422 This definition has the property that if you want rank 0 or 1,
423 you can simply test for rank <= 1. This is a common case.
424
425 A tensor of rank -infinity has size 0.
426 */
427 #define RNK_MINFTY INT_MAX
428 #define FINITE_RNK(rnk) ((rnk) != RNK_MINFTY)
429
430 typedef enum { INPLACE_IS, INPLACE_OS } inplace_kind;
431
432 tensor *X(mktensor)(int rnk);
433 tensor *X(mktensor_0d)(void);
434 tensor *X(mktensor_1d)(INT n, INT is, INT os);
435 tensor *X(mktensor_2d)(INT n0, INT is0, INT os0,
436 INT n1, INT is1, INT os1);
437 tensor *X(mktensor_3d)(INT n0, INT is0, INT os0,
438 INT n1, INT is1, INT os1,
439 INT n2, INT is2, INT os2);
440 tensor *X(mktensor_4d)(INT n0, INT is0, INT os0,
441 INT n1, INT is1, INT os1,
442 INT n2, INT is2, INT os2,
443 INT n3, INT is3, INT os3);
444 tensor *X(mktensor_5d)(INT n0, INT is0, INT os0,
445 INT n1, INT is1, INT os1,
446 INT n2, INT is2, INT os2,
447 INT n3, INT is3, INT os3,
448 INT n4, INT is4, INT os4);
449 INT X(tensor_sz)(const tensor *sz);
450 void X(tensor_md5)(md5 *p, const tensor *t);
451 INT X(tensor_max_index)(const tensor *sz);
452 INT X(tensor_min_istride)(const tensor *sz);
453 INT X(tensor_min_ostride)(const tensor *sz);
454 INT X(tensor_min_stride)(const tensor *sz);
455 int X(tensor_inplace_strides)(const tensor *sz);
456 int X(tensor_inplace_strides2)(const tensor *a, const tensor *b);
457 int X(tensor_strides_decrease)(const tensor *sz, const tensor *vecsz,
458 inplace_kind k);
459 tensor *X(tensor_copy)(const tensor *sz);
460 int X(tensor_kosherp)(const tensor *x);
461
462 tensor *X(tensor_copy_inplace)(const tensor *sz, inplace_kind k);
463 tensor *X(tensor_copy_except)(const tensor *sz, int except_dim);
464 tensor *X(tensor_copy_sub)(const tensor *sz, int start_dim, int rnk);
465 tensor *X(tensor_compress)(const tensor *sz);
466 tensor *X(tensor_compress_contiguous)(const tensor *sz);
467 tensor *X(tensor_append)(const tensor *a, const tensor *b);
468 void X(tensor_split)(const tensor *sz, tensor **a, int a_rnk, tensor **b);
469 int X(tensor_tornk1)(const tensor *t, INT *n, INT *is, INT *os);
470 void X(tensor_destroy)(tensor *sz);
471 void X(tensor_destroy2)(tensor *a, tensor *b);
472 void X(tensor_destroy4)(tensor *a, tensor *b, tensor *c, tensor *d);
473 void X(tensor_print)(const tensor *sz, printer *p);
474 int X(dimcmp)(const iodim *a, const iodim *b);
475 int X(tensor_equal)(const tensor *a, const tensor *b);
476 int X(tensor_inplace_locations)(const tensor *sz, const tensor *vecsz);
477
478 /*-----------------------------------------------------------------------*/
479 /* problem.c: */
480 enum {
481 /* a problem that cannot be solved */
482 PROBLEM_UNSOLVABLE,
483
484 PROBLEM_DFT,
485 PROBLEM_RDFT,
486 PROBLEM_RDFT2,
487
488 /* for mpi/ subdirectory */
489 PROBLEM_MPI_DFT,
490 PROBLEM_MPI_RDFT,
491 PROBLEM_MPI_RDFT2,
492 PROBLEM_MPI_TRANSPOSE,
493
494 PROBLEM_LAST
495 };
496
497 typedef struct {
498 int problem_kind;
499 void (*hash) (const problem *ego, md5 *p);
500 void (*zero) (const problem *ego);
501 void (*print) (const problem *ego, printer *p);
502 void (*destroy) (problem *ego);
503 } problem_adt;
504
505 struct problem_s {
506 const problem_adt *adt;
507 };
508
509 problem *X(mkproblem)(size_t sz, const problem_adt *adt);
510 void X(problem_destroy)(problem *ego);
511 problem *X(mkproblem_unsolvable)(void);
512
513 /*-----------------------------------------------------------------------*/
514 /* print.c */
515 struct printer_s {
516 void (*print)(printer *p, const char *format, ...);
517 void (*vprint)(printer *p, const char *format, va_list ap);
518 void (*putchr)(printer *p, char c);
519 void (*cleanup)(printer *p);
520 int indent;
521 int indent_incr;
522 };
523
524 printer *X(mkprinter)(size_t size,
525 void (*putchr)(printer *p, char c),
526 void (*cleanup)(printer *p));
527 IFFTW_EXTERN void X(printer_destroy)(printer *p);
528
529 /*-----------------------------------------------------------------------*/
530 /* scan.c */
531 struct scanner_s {
532 int (*scan)(scanner *sc, const char *format, ...);
533 int (*vscan)(scanner *sc, const char *format, va_list ap);
534 int (*getchr)(scanner *sc);
535 int ungotc;
536 };
537
538 scanner *X(mkscanner)(size_t size, int (*getchr)(scanner *sc));
539 void X(scanner_destroy)(scanner *sc);
540
541 /*-----------------------------------------------------------------------*/
542 /* plan.c: */
543
544 enum wakefulness {
545 SLEEPY,
546 AWAKE_ZERO,
547 AWAKE_SQRTN_TABLE,
548 AWAKE_SINCOS
549 };
550
551 typedef struct {
552 void (*solve)(const plan *ego, const problem *p);
553 void (*awake)(plan *ego, enum wakefulness wakefulness);
554 void (*print)(const plan *ego, printer *p);
555 void (*destroy)(plan *ego);
556 } plan_adt;
557
558 struct plan_s {
559 const plan_adt *adt;
560 opcnt ops;
561 double pcost;
562 enum wakefulness wakefulness; /* used for debugging only */
563 int could_prune_now_p;
564 };
565
566 plan *X(mkplan)(size_t size, const plan_adt *adt);
567 void X(plan_destroy_internal)(plan *ego);
568 IFFTW_EXTERN void X(plan_awake)(plan *ego, enum wakefulness wakefulness);
569 void X(plan_null_destroy)(plan *ego);
570
571 /*-----------------------------------------------------------------------*/
572 /* solver.c: */
573 typedef struct {
574 int problem_kind;
575 plan *(*mkplan)(const solver *ego, const problem *p, planner *plnr);
576 void (*destroy)(solver *ego);
577 } solver_adt;
578
579 struct solver_s {
580 const solver_adt *adt;
581 int refcnt;
582 };
583
584 solver *X(mksolver)(size_t size, const solver_adt *adt);
585 void X(solver_use)(solver *ego);
586 void X(solver_destroy)(solver *ego);
587 void X(solver_register)(planner *plnr, solver *s);
588
589 /* shorthand */
590 #define MKSOLVER(type, adt) (type *)X(mksolver)(sizeof(type), adt)
591
592 /*-----------------------------------------------------------------------*/
593 /* planner.c */
594
595 typedef struct slvdesc_s {
596 solver *slv;
597 const char *reg_nam;
598 unsigned nam_hash;
599 int reg_id;
600 int next_for_same_problem_kind;
601 } slvdesc;
602
603 typedef struct solution_s solution; /* opaque */
604
605 /* interpretation of L and U:
606
607 - if it returns a plan, the planner guarantees that all applicable
608 plans at least as impatient as U have been tried, and that each
609 plan in the solution is at least as impatient as L.
610
611 - if it returns 0, the planner guarantees to have tried all solvers
612 at least as impatient as L, and that none of them was applicable.
613
614 The structure is packed to fit into 64 bits.
615 */
616
617 typedef struct {
618 unsigned l:20;
619 unsigned hash_info:3;
620 # define BITS_FOR_TIMELIMIT 9
621 unsigned timelimit_impatience:BITS_FOR_TIMELIMIT;
622 unsigned u:20;
623
624 /* abstraction break: we store the solver here to pad the
625 structure to 64 bits. Otherwise, the struct is padded to 64
626 bits anyway, and another word is allocated for slvndx. */
627 # define BITS_FOR_SLVNDX 12
628 unsigned slvndx:BITS_FOR_SLVNDX;
629 } flags_t;
630
631 /* impatience flags */
632 enum {
633 BELIEVE_PCOST = 0x0001,
634 ESTIMATE = 0x0002,
635 NO_DFT_R2HC = 0x0004,
636 NO_SLOW = 0x0008,
637 NO_VRECURSE = 0x0010,
638 NO_INDIRECT_OP = 0x0020,
639 NO_LARGE_GENERIC = 0x0040,
640 NO_RANK_SPLITS = 0x0080,
641 NO_VRANK_SPLITS = 0x0100,
642 NO_NONTHREADED = 0x0200,
643 NO_BUFFERING = 0x0400,
644 NO_FIXED_RADIX_LARGE_N = 0x0800,
645 NO_DESTROY_INPUT = 0x1000,
646 NO_SIMD = 0x2000,
647 CONSERVE_MEMORY = 0x4000,
648 NO_DHT_R2HC = 0x8000,
649 NO_UGLY = 0x10000,
650 ALLOW_PRUNING = 0x20000
651 };
652
653 /* hashtable information */
654 enum {
655 BLESSING = 0x1u, /* save this entry */
656 H_VALID = 0x2u, /* valid hastable entry */
657 H_LIVE = 0x4u /* entry is nonempty, implies H_VALID */
658 };
659
660 #define PLNR_L(plnr) ((plnr)->flags.l)
661 #define PLNR_U(plnr) ((plnr)->flags.u)
662 #define PLNR_TIMELIMIT_IMPATIENCE(plnr) ((plnr)->flags.timelimit_impatience)
663
664 #define ESTIMATEP(plnr) (PLNR_U(plnr) & ESTIMATE)
665 #define BELIEVE_PCOSTP(plnr) (PLNR_U(plnr) & BELIEVE_PCOST)
666 #define ALLOW_PRUNINGP(plnr) (PLNR_U(plnr) & ALLOW_PRUNING)
667
668 #define NO_INDIRECT_OP_P(plnr) (PLNR_L(plnr) & NO_INDIRECT_OP)
669 #define NO_LARGE_GENERICP(plnr) (PLNR_L(plnr) & NO_LARGE_GENERIC)
670 #define NO_RANK_SPLITSP(plnr) (PLNR_L(plnr) & NO_RANK_SPLITS)
671 #define NO_VRANK_SPLITSP(plnr) (PLNR_L(plnr) & NO_VRANK_SPLITS)
672 #define NO_VRECURSEP(plnr) (PLNR_L(plnr) & NO_VRECURSE)
673 #define NO_DFT_R2HCP(plnr) (PLNR_L(plnr) & NO_DFT_R2HC)
674 #define NO_SLOWP(plnr) (PLNR_L(plnr) & NO_SLOW)
675 #define NO_UGLYP(plnr) (PLNR_L(plnr) & NO_UGLY)
676 #define NO_FIXED_RADIX_LARGE_NP(plnr) \
677 (PLNR_L(plnr) & NO_FIXED_RADIX_LARGE_N)
678 #define NO_NONTHREADEDP(plnr) \
679 ((PLNR_L(plnr) & NO_NONTHREADED) && (plnr)->nthr > 1)
680
681 #define NO_DESTROY_INPUTP(plnr) (PLNR_L(plnr) & NO_DESTROY_INPUT)
682 #define NO_SIMDP(plnr) (PLNR_L(plnr) & NO_SIMD)
683 #define CONSERVE_MEMORYP(plnr) (PLNR_L(plnr) & CONSERVE_MEMORY)
684 #define NO_DHT_R2HCP(plnr) (PLNR_L(plnr) & NO_DHT_R2HC)
685 #define NO_BUFFERINGP(plnr) (PLNR_L(plnr) & NO_BUFFERING)
686
687 typedef enum { FORGET_ACCURSED, FORGET_EVERYTHING } amnesia;
688
689 typedef enum {
690 /* WISDOM_NORMAL: planner may or may not use wisdom */
691 WISDOM_NORMAL,
692
693 /* WISDOM_ONLY: planner must use wisdom and must avoid searching */
694 WISDOM_ONLY,
695
696 /* WISDOM_IS_BOGUS: planner must return 0 as quickly as possible */
697 WISDOM_IS_BOGUS,
698
699 /* WISDOM_IGNORE_INFEASIBLE: planner ignores infeasible wisdom */
700 WISDOM_IGNORE_INFEASIBLE,
701
702 /* WISDOM_IGNORE_ALL: planner ignores all */
703 WISDOM_IGNORE_ALL
704 } wisdom_state_t;
705
706 typedef struct {
707 void (*register_solver)(planner *ego, solver *s);
708 plan *(*mkplan)(planner *ego, const problem *p);
709 void (*forget)(planner *ego, amnesia a);
710 void (*exprt)(planner *ego, printer *p); /* ``export'' is a reserved
711 word in C++. */
712 int (*imprt)(planner *ego, scanner *sc);
713 } planner_adt;
714
715 /* hash table of solutions */
716 typedef struct {
717 solution *solutions;
718 unsigned hashsiz, nelem;
719
720 /* statistics */
721 int lookup, succ_lookup, lookup_iter;
722 int insert, insert_iter, insert_unknown;
723 int nrehash;
724 } hashtab;
725
726 typedef enum { COST_SUM, COST_MAX } cost_kind;
727
728 struct planner_s {
729 const planner_adt *adt;
730 void (*hook)(struct planner_s *plnr, plan *pln,
731 const problem *p, int optimalp);
732 double (*cost_hook)(const problem *p, double t, cost_kind k);
733 int (*wisdom_ok_hook)(const problem *p, flags_t flags);
734 void (*nowisdom_hook)(const problem *p);
735 wisdom_state_t (*bogosity_hook)(wisdom_state_t state, const problem *p);
736
737 /* solver descriptors */
738 slvdesc *slvdescs;
739 unsigned nslvdesc, slvdescsiz;
740 const char *cur_reg_nam;
741 int cur_reg_id;
742 int slvdescs_for_problem_kind[PROBLEM_LAST];
743
744 wisdom_state_t wisdom_state;
745
746 hashtab htab_blessed;
747 hashtab htab_unblessed;
748
749 int nthr;
750 flags_t flags;
751
752 crude_time start_time;
753 double timelimit; /* elapsed_since(start_time) at which to bail out */
754 int timed_out; /* whether most recent search timed out */
755 int need_timeout_check;
756
757 /* various statistics */
758 int nplan; /* number of plans evaluated */
759 double pcost, epcost; /* total pcost of measured/estimated plans */
760 int nprob; /* number of problems evaluated */
761 };
762
763 planner *X(mkplanner)(void);
764 void X(planner_destroy)(planner *ego);
765
766 /*
767 Iterate over all solvers. Read:
768
769 @article{ baker93iterators,
770 author = "Henry G. Baker, Jr.",
771 title = "Iterators: Signs of Weakness in Object-Oriented Languages",
772 journal = "{ACM} {OOPS} Messenger",
773 volume = "4",
774 number = "3",
775 pages = "18--25"
776 }
777 */
778 #define FORALL_SOLVERS(ego, s, p, what) \
779 { \
780 unsigned _cnt; \
781 for (_cnt = 0; _cnt < ego->nslvdesc; ++_cnt) { \
782 slvdesc *p = ego->slvdescs + _cnt; \
783 solver *s = p->slv; \
784 what; \
785 } \
786 }
787
788 #define FORALL_SOLVERS_OF_KIND(kind, ego, s, p, what) \
789 { \
790 int _cnt = ego->slvdescs_for_problem_kind[kind]; \
791 while (_cnt >= 0) { \
792 slvdesc *p = ego->slvdescs + _cnt; \
793 solver *s = p->slv; \
794 what; \
795 _cnt = p->next_for_same_problem_kind; \
796 } \
797 }
798
799
800 /* make plan, destroy problem */
801 plan *X(mkplan_d)(planner *ego, problem *p);
802 plan *X(mkplan_f_d)(planner *ego, problem *p,
803 unsigned l_set, unsigned u_set, unsigned u_reset);
804
805 /*-----------------------------------------------------------------------*/
806 /* stride.c: */
807
808 /* If PRECOMPUTE_ARRAY_INDICES is defined, precompute all strides. */
809 #if (defined(__i386__) || defined(__x86_64__) || _M_IX86 >= 500) && !defined(FFTW_LDOUBLE)
810 #define PRECOMPUTE_ARRAY_INDICES
811 #endif
812
813 extern const INT X(an_INT_guaranteed_to_be_zero);
814
815 #ifdef PRECOMPUTE_ARRAY_INDICES
816 typedef INT *stride;
817 #define WS(stride, i) (stride[i])
818 extern stride X(mkstride)(INT n, INT s);
819 void X(stride_destroy)(stride p);
820 /* hackery to prevent the compiler from copying the strides array
821 onto the stack */
822 #define MAKE_VOLATILE_STRIDE(nptr, x) (x) = (x) + X(an_INT_guaranteed_to_be_zero)
823 #else
824
825 typedef INT stride;
826 #define WS(stride, i) (stride * i)
827 #define fftwf_mkstride(n, stride) stride
828 #define fftw_mkstride(n, stride) stride
829 #define fftwl_mkstride(n, stride) stride
830 #define fftwf_stride_destroy(p) ((void) p)
831 #define fftw_stride_destroy(p) ((void) p)
832 #define fftwl_stride_destroy(p) ((void) p)
833
834 /* hackery to prevent the compiler from ``optimizing'' induction
835 variables in codelet loops. The problem is that for each K and for
836 each expression of the form P[I + STRIDE * K] in a loop, most
837 compilers will try to lift an induction variable PK := &P[I + STRIDE * K].
838 For large values of K this behavior overflows the
839 register set, which is likely worse than doing the index computation
840 in the first place.
841
842 If we guess that there are more than
843 ESTIMATED_AVAILABLE_INDEX_REGISTERS such pointers, we deliberately confuse
844 the compiler by setting STRIDE ^= ZERO, where ZERO is a value guaranteed to
845 be 0, but the compiler does not know this.
846
847 16 registers ought to be enough for anybody, or so the amd64 and ARM ISA's
848 seem to imply.
849 */
850 #define ESTIMATED_AVAILABLE_INDEX_REGISTERS 16
851 #define MAKE_VOLATILE_STRIDE(nptr, x) \
852 (nptr <= ESTIMATED_AVAILABLE_INDEX_REGISTERS ? \
853 0 : \
854 ((x) = (x) ^ X(an_INT_guaranteed_to_be_zero)))
855 #endif /* PRECOMPUTE_ARRAY_INDICES */
856
857 /*-----------------------------------------------------------------------*/
858 /* solvtab.c */
859
860 struct solvtab_s { void (*reg)(planner *); const char *reg_nam; };
861 typedef struct solvtab_s solvtab[];
862 void X(solvtab_exec)(const solvtab tbl, planner *p);
863 #define SOLVTAB(s) { s, STRINGIZE(s) }
864 #define SOLVTAB_END { 0, 0 }
865
866 /*-----------------------------------------------------------------------*/
867 /* pickdim.c */
868 int X(pickdim)(int which_dim, const int *buddies, size_t nbuddies,
869 const tensor *sz, int oop, int *dp);
870
871 /*-----------------------------------------------------------------------*/
872 /* twiddle.c */
873 /* little language to express twiddle factors computation */
874 enum { TW_COS = 0, TW_SIN = 1, TW_CEXP = 2, TW_NEXT = 3,
875 TW_FULL = 4, TW_HALF = 5 };
876
877 typedef struct {
878 unsigned char op;
879 signed char v;
880 short i;
881 } tw_instr;
882
883 typedef struct twid_s {
884 R *W; /* array of twiddle factors */
885 INT n, r, m; /* transform order, radix, # twiddle rows */
886 int refcnt;
887 const tw_instr *instr;
888 struct twid_s *cdr;
889 enum wakefulness wakefulness;
890 } twid;
891
892 INT X(twiddle_length)(INT r, const tw_instr *p);
893 void X(twiddle_awake)(enum wakefulness wakefulness,
894 twid **pp, const tw_instr *instr, INT n, INT r, INT m);
895
896 /*-----------------------------------------------------------------------*/
897 /* trig.c */
898 #if defined(TRIGREAL_IS_LONG_DOUBLE)
899 typedef long double trigreal;
900 #elif defined(TRIGREAL_IS_QUAD)
901 typedef __float128 trigreal;
902 #else
903 typedef double trigreal;
904 #endif
905
906 typedef struct triggen_s triggen;
907
908 struct triggen_s {
909 void (*cexp)(triggen *t, INT m, R *result);
910 void (*cexpl)(triggen *t, INT m, trigreal *result);
911 void (*rotate)(triggen *p, INT m, R xr, R xi, R *res);
912
913 INT twshft;
914 INT twradix;
915 INT twmsk;
916 trigreal *W0, *W1;
917 INT n;
918 };
919
920 triggen *X(mktriggen)(enum wakefulness wakefulness, INT n);
921 void X(triggen_destroy)(triggen *p);
922
923 /*-----------------------------------------------------------------------*/
924 /* primes.c: */
925
926 #define MULMOD(x, y, p) \
927 (((x) <= 92681 - (y)) ? ((x) * (y)) % (p) : X(safe_mulmod)(x, y, p))
928
929 INT X(safe_mulmod)(INT x, INT y, INT p);
930 INT X(power_mod)(INT n, INT m, INT p);
931 INT X(find_generator)(INT p);
932 INT X(first_divisor)(INT n);
933 int X(is_prime)(INT n);
934 INT X(next_prime)(INT n);
935 int X(factors_into)(INT n, const INT *primes);
936 int X(factors_into_small_primes)(INT n);
937 INT X(choose_radix)(INT r, INT n);
938 INT X(isqrt)(INT n);
939 INT X(modulo)(INT a, INT n);
940
941 #define GENERIC_MIN_BAD 173 /* min prime for which generic becomes bad */
942
943 /* thresholds below which certain solvers are considered SLOW. These are guesses
944 believed to be conservative */
945 #define GENERIC_MAX_SLOW 16
946 #define RADER_MAX_SLOW 32
947 #define BLUESTEIN_MAX_SLOW 24
948
949 /*-----------------------------------------------------------------------*/
950 /* rader.c: */
951 typedef struct rader_tls rader_tl;
952
953 void X(rader_tl_insert)(INT k1, INT k2, INT k3, R *W, rader_tl **tl);
954 R *X(rader_tl_find)(INT k1, INT k2, INT k3, rader_tl *t);
955 void X(rader_tl_delete)(R *W, rader_tl **tl);
956
957 /*-----------------------------------------------------------------------*/
958 /* copy/transposition routines */
959
960 /* lower bound to the cache size, for tiled routines */
961 #define CACHESIZE 8192
962
963 INT X(compute_tilesz)(INT vl, int how_many_tiles_in_cache);
964
965 void X(tile2d)(INT n0l, INT n0u, INT n1l, INT n1u, INT tilesz,
966 void (*f)(INT n0l, INT n0u, INT n1l, INT n1u, void *args),
967 void *args);
968 void X(cpy1d)(R *I, R *O, INT n0, INT is0, INT os0, INT vl);
969 void X(zero1d_pair)(R *O0, R *O1, INT n0, INT os0);
970 void X(cpy2d)(R *I, R *O,
971 INT n0, INT is0, INT os0,
972 INT n1, INT is1, INT os1,
973 INT vl);
974 void X(cpy2d_ci)(R *I, R *O,
975 INT n0, INT is0, INT os0,
976 INT n1, INT is1, INT os1,
977 INT vl);
978 void X(cpy2d_co)(R *I, R *O,
979 INT n0, INT is0, INT os0,
980 INT n1, INT is1, INT os1,
981 INT vl);
982 void X(cpy2d_tiled)(R *I, R *O,
983 INT n0, INT is0, INT os0,
984 INT n1, INT is1, INT os1,
985 INT vl);
986 void X(cpy2d_tiledbuf)(R *I, R *O,
987 INT n0, INT is0, INT os0,
988 INT n1, INT is1, INT os1,
989 INT vl);
990 void X(cpy2d_pair)(R *I0, R *I1, R *O0, R *O1,
991 INT n0, INT is0, INT os0,
992 INT n1, INT is1, INT os1);
993 void X(cpy2d_pair_ci)(R *I0, R *I1, R *O0, R *O1,
994 INT n0, INT is0, INT os0,
995 INT n1, INT is1, INT os1);
996 void X(cpy2d_pair_co)(R *I0, R *I1, R *O0, R *O1,
997 INT n0, INT is0, INT os0,
998 INT n1, INT is1, INT os1);
999
1000 void X(transpose)(R *I, INT n, INT s0, INT s1, INT vl);
1001 void X(transpose_tiled)(R *I, INT n, INT s0, INT s1, INT vl);
1002 void X(transpose_tiledbuf)(R *I, INT n, INT s0, INT s1, INT vl);
1003
1004 typedef void (*transpose_func)(R *I, INT n, INT s0, INT s1, INT vl);
1005 typedef void (*cpy2d_func)(R *I, R *O,
1006 INT n0, INT is0, INT os0,
1007 INT n1, INT is1, INT os1,
1008 INT vl);
1009
1010 /*-----------------------------------------------------------------------*/
1011 /* misc stuff */
1012 void X(null_awake)(plan *ego, enum wakefulness wakefulness);
1013 double X(iestimate_cost)(const planner *, const plan *, const problem *);
1014
1015 #ifdef FFTW_RANDOM_ESTIMATOR
1016 extern unsigned X(random_estimate_seed);
1017 #endif
1018
1019 double X(measure_execution_time)(const planner *plnr,
1020 plan *pln, const problem *p);
1021 IFFTW_EXTERN int X(ialignment_of)(R *p);
1022 unsigned X(hash)(const char *s);
1023 INT X(nbuf)(INT n, INT vl, INT maxnbuf);
1024 int X(nbuf_redundant)(INT n, INT vl, size_t which,
1025 const INT *maxnbuf, size_t nmaxnbuf);
1026 INT X(bufdist)(INT n, INT vl);
1027 int X(toobig)(INT n);
1028 int X(ct_uglyp)(INT min_n, INT v, INT n, INT r);
1029
1030 #if HAVE_SIMD
1031 R *X(taint)(R *p, INT s);
1032 R *X(join_taint)(R *p1, R *p2);
1033 #define TAINT(p, s) X(taint)(p, s)
1034 #define UNTAINT(p) ((R *) (((uintptr_t) (p)) & ~(uintptr_t)3))
1035 #define TAINTOF(p) (((uintptr_t)(p)) & 3)
1036 #define JOIN_TAINT(p1, p2) X(join_taint)(p1, p2)
1037 #else
1038 #define TAINT(p, s) (p)
1039 #define UNTAINT(p) (p)
1040 #define TAINTOF(p) 0
1041 #define JOIN_TAINT(p1, p2) p1
1042 #endif
1043
1044 #define ASSERT_ALIGNED_DOUBLE /*unused, legacy*/
1045
1046 /*-----------------------------------------------------------------------*/
1047 /* macros used in codelets to reduce source code size */
1048
1049 typedef R E; /* internal precision of codelets. */
1050
1051 #if defined(FFTW_LDOUBLE)
1052 # define K(x) ((E) x##L)
1053 #elif defined(FFTW_QUAD)
1054 # define K(x) ((E) x##Q)
1055 #else
1056 # define K(x) ((E) x)
1057 #endif
1058 #define DK(name, value) const E name = K(value)
1059
1060 /* FMA macros */
1061
1062 #if defined(__GNUC__) && (defined(__powerpc__) || defined(__ppc__) || defined(_POWER))
1063 /* The obvious expression a * b + c does not work. If both x = a * b
1064 + c and y = a * b - c appear in the source, gcc computes t = a * b,
1065 x = t + c, y = t - c, thus destroying the fma.
1066
1067 This peculiar coding seems to do the right thing on all of
1068 gcc-2.95, gcc-3.1, gcc-3.2, and gcc-3.3. It does the right thing
1069 on gcc-3.4 -fno-web (because the ``web'' pass splits the variable
1070 `x' for the single-assignment form).
1071
1072 However, gcc-4.0 is a formidable adversary which succeeds in
1073 pessimizing two fma's into one multiplication and two additions.
1074 It does it very early in the game---before the optimization passes
1075 even start. The only real workaround seems to use fake inline asm
1076 such as
1077
1078 asm ("# confuse gcc %0" : "=f"(a) : "0"(a));
1079 return a * b + c;
1080
1081 in each of the FMA, FMS, FNMA, and FNMS functions. However, this
1082 does not solve the problem either, because two equal asm statements
1083 count as a common subexpression! One must use *different* fake asm
1084 statements:
1085
1086 in FMA:
1087 asm ("# confuse gcc for fma %0" : "=f"(a) : "0"(a));
1088
1089 in FMS:
1090 asm ("# confuse gcc for fms %0" : "=f"(a) : "0"(a));
1091
1092 etc.
1093
1094 After these changes, gcc recalcitrantly generates the fma that was
1095 in the source to begin with. However, the extra asm() cruft
1096 confuses other passes of gcc, notably the instruction scheduler.
1097 (Of course, one could also generate the fma directly via inline
1098 asm, but this confuses the scheduler even more.)
1099
1100 Steven and I have submitted more than one bug report to the gcc
1101 mailing list over the past few years, to no effect. Thus, I give
1102 up. gcc-4.0 can go to hell. I'll wait at least until gcc-4.3 is
1103 out before touching this crap again.
1104 */
FMA(E a,E b,E c)1105 static __inline__ E FMA(E a, E b, E c)
1106 {
1107 E x = a * b;
1108 x = x + c;
1109 return x;
1110 }
1111
FMS(E a,E b,E c)1112 static __inline__ E FMS(E a, E b, E c)
1113 {
1114 E x = a * b;
1115 x = x - c;
1116 return x;
1117 }
1118
FNMA(E a,E b,E c)1119 static __inline__ E FNMA(E a, E b, E c)
1120 {
1121 E x = a * b;
1122 x = - (x + c);
1123 return x;
1124 }
1125
FNMS(E a,E b,E c)1126 static __inline__ E FNMS(E a, E b, E c)
1127 {
1128 E x = a * b;
1129 x = - (x - c);
1130 return x;
1131 }
1132 #else
1133 #define FMA(a, b, c) (((a) * (b)) + (c))
1134 #define FMS(a, b, c) (((a) * (b)) - (c))
1135 #define FNMA(a, b, c) (- (((a) * (b)) + (c)))
1136 #define FNMS(a, b, c) ((c) - ((a) * (b)))
1137 #endif
1138
1139 #ifdef __cplusplus
1140 } /* extern "C" */
1141 #endif /* __cplusplus */
1142
1143 #endif /* __IFFTW_H__ */
1144