1 /*
2 This file is part of pocketfft.
3 
4 Copyright (C) 2010-2020 Max-Planck-Society
5 Copyright (C) 2019-2020 Peter Bell
6 
7 For the odd-sized DCT-IV transforms:
8   Copyright (C) 2003, 2007-14 Matteo Frigo
9   Copyright (C) 2003, 2007-14 Massachusetts Institute of Technology
10 
11 Authors: Martin Reinecke, Peter Bell
12 
13 All rights reserved.
14 
15 Redistribution and use in source and binary forms, with or without modification,
16 are permitted provided that the following conditions are met:
17 
18 * Redistributions of source code must retain the above copyright notice, this
19   list of conditions and the following disclaimer.
20 * Redistributions in binary form must reproduce the above copyright notice, this
21   list of conditions and the following disclaimer in the documentation and/or
22   other materials provided with the distribution.
23 * Neither the name of the copyright holder nor the names of its contributors may
24   be used to endorse or promote products derived from this software without
25   specific prior written permission.
26 
27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
28 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
29 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
30 DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
31 ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
32 (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
33 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
34 ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
35 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
36 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
37 */
38 
39 #ifndef POCKETFFT_HDRONLY_H
40 #define POCKETFFT_HDRONLY_H
41 
42 #ifndef __cplusplus
43 #error This file is C++ and requires a C++ compiler.
44 #endif
45 
46 #if !(__cplusplus >= 201103L || _MSVC_LANG+0L >= 201103L)
47 #error This file requires at least C++11 support.
48 #endif
49 
50 #ifndef POCKETFFT_CACHE_SIZE
51 #define POCKETFFT_CACHE_SIZE 16
52 #endif
53 
54 #include <cmath>
55 #include <cstring>
56 #include <cstdlib>
57 #include <stdexcept>
58 #include <memory>
59 #include <vector>
60 #include <complex>
61 #if POCKETFFT_CACHE_SIZE!=0
62 #include <array>
63 #include <mutex>
64 #endif
65 
66 #ifndef POCKETFFT_NO_MULTITHREADING
67 #include <mutex>
68 #include <condition_variable>
69 #include <thread>
70 #include <queue>
71 #include <atomic>
72 #include <functional>
73 #include <new>
74 
75 #ifdef POCKETFFT_PTHREADS
76 #  include <pthread.h>
77 #endif
78 #endif
79 
80 #if defined(__GNUC__)
81 #define POCKETFFT_NOINLINE __attribute__((noinline))
82 #define POCKETFFT_RESTRICT __restrict__
83 #elif defined(_MSC_VER)
84 #define POCKETFFT_NOINLINE __declspec(noinline)
85 #define POCKETFFT_RESTRICT __restrict
86 #else
87 #define POCKETFFT_NOINLINE
88 #define POCKETFFT_RESTRICT
89 #endif
90 
91 namespace pocketfft {
92 
93 namespace detail {
94 using std::size_t;
95 using std::ptrdiff_t;
96 
97 // Always use std:: for <cmath> functions
98 template <typename T> T cos(T) = delete;
99 template <typename T> T sin(T) = delete;
100 template <typename T> T sqrt(T) = delete;
101 
102 using shape_t = std::vector<size_t>;
103 using stride_t = std::vector<ptrdiff_t>;
104 
105 constexpr bool FORWARD  = true,
106                BACKWARD = false;
107 
108 // only enable vector support for gcc>=5.0 and clang>=5.0
109 #ifndef POCKETFFT_NO_VECTORS
110 #define POCKETFFT_NO_VECTORS
111 #if defined(__INTEL_COMPILER)
112 // do nothing. This is necessary because this compiler also sets __GNUC__.
113 #elif defined(__clang__)
114 // AppleClang has their own version numbering
115 #ifdef __apple_build_version__
116 #  if (__clang_major__ > 9) || (__clang_major__ == 9 && __clang_minor__ >= 1)
117 #     undef POCKETFFT_NO_VECTORS
118 #  endif
119 #elif __clang_major__ >= 5
120 #  undef POCKETFFT_NO_VECTORS
121 #endif
122 #elif defined(__GNUC__)
123 #if __GNUC__>=5
124 #undef POCKETFFT_NO_VECTORS
125 #endif
126 #endif
127 #endif
128 
129 template<typename T> struct VLEN { static constexpr size_t val=1; };
130 
131 #ifndef POCKETFFT_NO_VECTORS
132 #if (defined(__AVX512F__))
133 template<> struct VLEN<float> { static constexpr size_t val=16; };
134 template<> struct VLEN<double> { static constexpr size_t val=8; };
135 #elif (defined(__AVX__))
136 template<> struct VLEN<float> { static constexpr size_t val=8; };
137 template<> struct VLEN<double> { static constexpr size_t val=4; };
138 #elif (defined(__SSE2__))
139 template<> struct VLEN<float> { static constexpr size_t val=4; };
140 template<> struct VLEN<double> { static constexpr size_t val=2; };
141 #elif (defined(__VSX__))
142 template<> struct VLEN<float> { static constexpr size_t val=4; };
143 template<> struct VLEN<double> { static constexpr size_t val=2; };
144 #elif (defined(__ARM_NEON__) || defined(__ARM_NEON))
145 template<> struct VLEN<float> { static constexpr size_t val=4; };
146 template<> struct VLEN<double> { static constexpr size_t val=2; };
147 #else
148 #define POCKETFFT_NO_VECTORS
149 #endif
150 #endif
151 
152 #if __cplusplus >= 201703L
153 inline void *aligned_alloc(size_t align, size_t size)
154   {
155   void *ptr = ::aligned_alloc(align,size);
156   if (!ptr) throw std::bad_alloc();
157   return ptr;
158   }
159 inline void aligned_dealloc(void *ptr)
160     { free(ptr); }
161 #else // portable emulation
162 inline void *aligned_alloc(size_t align, size_t size)
163   {
164   align = std::max(align, alignof(max_align_t));
165   void *ptr = malloc(size+align);
166   if (!ptr) throw std::bad_alloc();
167   void *res = reinterpret_cast<void *>
168     ((reinterpret_cast<uintptr_t>(ptr) & ~(uintptr_t(align-1))) + uintptr_t(align));
169   (reinterpret_cast<void**>(res))[-1] = ptr;
170   return res;
171   }
172 inline void aligned_dealloc(void *ptr)
173   { if (ptr) free((reinterpret_cast<void**>(ptr))[-1]); }
174 #endif
175 
176 template<typename T> class arr
177   {
178   private:
179     T *p;
180     size_t sz;
181 
182 #if defined(POCKETFFT_NO_VECTORS)
183     static T *ralloc(size_t num)
184       {
185       if (num==0) return nullptr;
186       void *res = malloc(num*sizeof(T));
187       if (!res) throw std::bad_alloc();
188       return reinterpret_cast<T *>(res);
189       }
190     static void dealloc(T *ptr)
191       { free(ptr); }
192 #else
193     static T *ralloc(size_t num)
194       {
195       if (num==0) return nullptr;
196       void *ptr = aligned_alloc(64, num*sizeof(T));
197       return static_cast<T*>(ptr);
198       }
199     static void dealloc(T *ptr)
200       { aligned_dealloc(ptr); }
201 #endif
202 
203   public:
204     arr() : p(0), sz(0) {}
205     arr(size_t n) : p(ralloc(n)), sz(n) {}
206     arr(arr &&other)
207       : p(other.p), sz(other.sz)
208       { other.p=nullptr; other.sz=0; }
209     ~arr() { dealloc(p); }
210 
211     void resize(size_t n)
212       {
213       if (n==sz) return;
214       dealloc(p);
215       p = ralloc(n);
216       sz = n;
217       }
218 
219     T &operator[](size_t idx) { return p[idx]; }
220     const T &operator[](size_t idx) const { return p[idx]; }
221 
222     T *data() { return p; }
223     const T *data() const { return p; }
224 
225     size_t size() const { return sz; }
226   };
227 
228 template<typename T> struct cmplx {
229   T r, i;
230   cmplx() {}
231   cmplx(T r_, T i_) : r(r_), i(i_) {}
232   void Set(T r_, T i_) { r=r_; i=i_; }
233   void Set(T r_) { r=r_; i=T(0); }
234   cmplx &operator+= (const cmplx &other)
235     { r+=other.r; i+=other.i; return *this; }
236   template<typename T2>cmplx &operator*= (T2 other)
237     { r*=other; i*=other; return *this; }
238   template<typename T2>cmplx &operator*= (const cmplx<T2> &other)
239     {
240     T tmp = r*other.r - i*other.i;
241     i = r*other.i + i*other.r;
242     r = tmp;
243     return *this;
244     }
245   template<typename T2>cmplx &operator+= (const cmplx<T2> &other)
246     { r+=other.r; i+=other.i; return *this; }
247   template<typename T2>cmplx &operator-= (const cmplx<T2> &other)
248     { r-=other.r; i-=other.i; return *this; }
249   template<typename T2> auto operator* (const T2 &other) const
250     -> cmplx<decltype(r*other)>
251     { return {r*other, i*other}; }
252   template<typename T2> auto operator+ (const cmplx<T2> &other) const
253     -> cmplx<decltype(r+other.r)>
254     { return {r+other.r, i+other.i}; }
255   template<typename T2> auto operator- (const cmplx<T2> &other) const
256     -> cmplx<decltype(r+other.r)>
257     { return {r-other.r, i-other.i}; }
258   template<typename T2> auto operator* (const cmplx<T2> &other) const
259     -> cmplx<decltype(r+other.r)>
260     { return {r*other.r-i*other.i, r*other.i + i*other.r}; }
261   template<bool fwd, typename T2> auto special_mul (const cmplx<T2> &other) const
262     -> cmplx<decltype(r+other.r)>
263     {
264     using Tres = cmplx<decltype(r+other.r)>;
265     return fwd ? Tres(r*other.r+i*other.i, i*other.r-r*other.i)
266                : Tres(r*other.r-i*other.i, r*other.i+i*other.r);
267     }
268 };
269 template<typename T> inline void PM(T &a, T &b, T c, T d)
270   { a=c+d; b=c-d; }
271 template<typename T> inline void PMINPLACE(T &a, T &b)
272   { T t = a; a+=b; b=t-b; }
273 template<typename T> inline void MPINPLACE(T &a, T &b)
274   { T t = a; a-=b; b=t+b; }
275 template<typename T> cmplx<T> conj(const cmplx<T> &a)
276   { return {a.r, -a.i}; }
277 template<bool fwd, typename T, typename T2> void special_mul (const cmplx<T> &v1, const cmplx<T2> &v2, cmplx<T> &res)
278   {
279   res = fwd ? cmplx<T>(v1.r*v2.r+v1.i*v2.i, v1.i*v2.r-v1.r*v2.i)
280             : cmplx<T>(v1.r*v2.r-v1.i*v2.i, v1.r*v2.i+v1.i*v2.r);
281   }
282 
283 template<typename T> void ROT90(cmplx<T> &a)
284   { auto tmp_=a.r; a.r=-a.i; a.i=tmp_; }
285 template<bool fwd, typename T> void ROTX90(cmplx<T> &a)
286   { auto tmp_= fwd ? -a.r : a.r; a.r = fwd ? a.i : -a.i; a.i=tmp_; }
287 
288 //
289 // twiddle factor section
290 //
291 template<typename T> class sincos_2pibyn
292   {
293   private:
294     using Thigh = typename std::conditional<(sizeof(T)>sizeof(double)), T, double>::type;
295     size_t N, mask, shift;
296     arr<cmplx<Thigh>> v1, v2;
297 
298     static cmplx<Thigh> calc(size_t x, size_t n, Thigh ang)
299       {
300       x<<=3;
301       if (x<4*n) // first half
302         {
303         if (x<2*n) // first quadrant
304           {
305           if (x<n) return cmplx<Thigh>(std::cos(Thigh(x)*ang), std::sin(Thigh(x)*ang));
306           return cmplx<Thigh>(std::sin(Thigh(2*n-x)*ang), std::cos(Thigh(2*n-x)*ang));
307           }
308         else // second quadrant
309           {
310           x-=2*n;
311           if (x<n) return cmplx<Thigh>(-std::sin(Thigh(x)*ang), std::cos(Thigh(x)*ang));
312           return cmplx<Thigh>(-std::cos(Thigh(2*n-x)*ang), std::sin(Thigh(2*n-x)*ang));
313           }
314         }
315       else
316         {
317         x=8*n-x;
318         if (x<2*n) // third quadrant
319           {
320           if (x<n) return cmplx<Thigh>(std::cos(Thigh(x)*ang), -std::sin(Thigh(x)*ang));
321           return cmplx<Thigh>(std::sin(Thigh(2*n-x)*ang), -std::cos(Thigh(2*n-x)*ang));
322           }
323         else // fourth quadrant
324           {
325           x-=2*n;
326           if (x<n) return cmplx<Thigh>(-std::sin(Thigh(x)*ang), -std::cos(Thigh(x)*ang));
327           return cmplx<Thigh>(-std::cos(Thigh(2*n-x)*ang), -std::sin(Thigh(2*n-x)*ang));
328           }
329         }
330       }
331 
332   public:
333     POCKETFFT_NOINLINE sincos_2pibyn(size_t n)
334       : N(n)
335       {
336       constexpr auto pi = 3.141592653589793238462643383279502884197L;
337       Thigh ang = Thigh(0.25L*pi/n);
338       size_t nval = (n+2)/2;
339       shift = 1;
340       while((size_t(1)<<shift)*(size_t(1)<<shift) < nval) ++shift;
341       mask = (size_t(1)<<shift)-1;
342       v1.resize(mask+1);
343       v1[0].Set(Thigh(1), Thigh(0));
344       for (size_t i=1; i<v1.size(); ++i)
345         v1[i]=calc(i,n,ang);
346       v2.resize((nval+mask)/(mask+1));
347       v2[0].Set(Thigh(1), Thigh(0));
348       for (size_t i=1; i<v2.size(); ++i)
349         v2[i]=calc(i*(mask+1),n,ang);
350       }
351 
352     cmplx<T> operator[](size_t idx) const
353       {
354       if (2*idx<=N)
355         {
356         auto x1=v1[idx&mask], x2=v2[idx>>shift];
357         return cmplx<T>(T(x1.r*x2.r-x1.i*x2.i), T(x1.r*x2.i+x1.i*x2.r));
358         }
359       idx = N-idx;
360       auto x1=v1[idx&mask], x2=v2[idx>>shift];
361       return cmplx<T>(T(x1.r*x2.r-x1.i*x2.i), -T(x1.r*x2.i+x1.i*x2.r));
362       }
363   };
364 
365 struct util // hack to avoid duplicate symbols
366   {
367   static POCKETFFT_NOINLINE size_t largest_prime_factor (size_t n)
368     {
369     size_t res=1;
370     while ((n&1)==0)
371       { res=2; n>>=1; }
372     for (size_t x=3; x*x<=n; x+=2)
373       while ((n%x)==0)
374         { res=x; n/=x; }
375     if (n>1) res=n;
376     return res;
377     }
378 
379   static POCKETFFT_NOINLINE double cost_guess (size_t n)
380     {
381     constexpr double lfp=1.1; // penalty for non-hardcoded larger factors
382     size_t ni=n;
383     double result=0.;
384     while ((n&1)==0)
385       { result+=2; n>>=1; }
386     for (size_t x=3; x*x<=n; x+=2)
387       while ((n%x)==0)
388         {
389         result+= (x<=5) ? double(x) : lfp*double(x); // penalize larger prime factors
390         n/=x;
391         }
392     if (n>1) result+=(n<=5) ? double(n) : lfp*double(n);
393     return result*double(ni);
394     }
395 
396   /* returns the smallest composite of 2, 3, 5, 7 and 11 which is >= n */
397   static POCKETFFT_NOINLINE size_t good_size_cmplx(size_t n)
398     {
399     if (n<=12) return n;
400 
401     size_t bestfac=2*n;
402     for (size_t f11=1; f11<bestfac; f11*=11)
403       for (size_t f117=f11; f117<bestfac; f117*=7)
404         for (size_t f1175=f117; f1175<bestfac; f1175*=5)
405           {
406           size_t x=f1175;
407           while (x<n) x*=2;
408           for (;;)
409             {
410             if (x<n)
411               x*=3;
412             else if (x>n)
413               {
414               if (x<bestfac) bestfac=x;
415               if (x&1) break;
416               x>>=1;
417               }
418             else
419               return n;
420             }
421           }
422     return bestfac;
423     }
424 
425   /* returns the smallest composite of 2, 3, 5 which is >= n */
426   static POCKETFFT_NOINLINE size_t good_size_real(size_t n)
427     {
428     if (n<=6) return n;
429 
430     size_t bestfac=2*n;
431     for (size_t f5=1; f5<bestfac; f5*=5)
432       {
433       size_t x = f5;
434       while (x<n) x *= 2;
435       for (;;)
436         {
437         if (x<n)
438           x*=3;
439         else if (x>n)
440           {
441           if (x<bestfac) bestfac=x;
442           if (x&1) break;
443           x>>=1;
444           }
445         else
446           return n;
447         }
448       }
449     return bestfac;
450     }
451 
452   static size_t prod(const shape_t &shape)
453     {
454     size_t res=1;
455     for (auto sz: shape)
456       res*=sz;
457     return res;
458     }
459 
460   static POCKETFFT_NOINLINE void sanity_check(const shape_t &shape,
461     const stride_t &stride_in, const stride_t &stride_out, bool inplace)
462     {
463     auto ndim = shape.size();
464     if (ndim<1) throw std::runtime_error("ndim must be >= 1");
465     if ((stride_in.size()!=ndim) || (stride_out.size()!=ndim))
466       throw std::runtime_error("stride dimension mismatch");
467     if (inplace && (stride_in!=stride_out))
468       throw std::runtime_error("stride mismatch");
469     }
470 
471   static POCKETFFT_NOINLINE void sanity_check(const shape_t &shape,
472     const stride_t &stride_in, const stride_t &stride_out, bool inplace,
473     const shape_t &axes)
474     {
475     sanity_check(shape, stride_in, stride_out, inplace);
476     auto ndim = shape.size();
477     shape_t tmp(ndim,0);
478     for (auto ax : axes)
479       {
480       if (ax>=ndim) throw std::invalid_argument("bad axis number");
481       if (++tmp[ax]>1) throw std::invalid_argument("axis specified repeatedly");
482       }
483     }
484 
485   static POCKETFFT_NOINLINE void sanity_check(const shape_t &shape,
486     const stride_t &stride_in, const stride_t &stride_out, bool inplace,
487     size_t axis)
488     {
489     sanity_check(shape, stride_in, stride_out, inplace);
490     if (axis>=shape.size()) throw std::invalid_argument("bad axis number");
491     }
492 
493 #ifdef POCKETFFT_NO_MULTITHREADING
494   static size_t thread_count (size_t /*nthreads*/, const shape_t &/*shape*/,
495     size_t /*axis*/, size_t /*vlen*/)
496     { return 1; }
497 #else
498   static size_t thread_count (size_t nthreads, const shape_t &shape,
499     size_t axis, size_t vlen)
500     {
501     if (nthreads==1) return 1;
502     size_t size = prod(shape);
503     size_t parallel = size / (shape[axis] * vlen);
504     if (shape[axis] < 1000)
505       parallel /= 4;
506     size_t max_threads = nthreads == 0 ?
507       std::thread::hardware_concurrency() : nthreads;
508     return std::max(size_t(1), std::min(parallel, max_threads));
509     }
510 #endif
511   };
512 
513 namespace threading {
514 
515 #ifdef POCKETFFT_NO_MULTITHREADING
516 
517 constexpr inline size_t thread_id() { return 0; }
518 constexpr inline size_t num_threads() { return 1; }
519 
520 template <typename Func>
521 void thread_map(size_t /* nthreads */, Func f)
522   { f(); }
523 
524 #else
525 
526 inline size_t &thread_id()
527   {
528   static thread_local size_t thread_id_=0;
529   return thread_id_;
530   }
531 inline size_t &num_threads()
532   {
533   static thread_local size_t num_threads_=1;
534   return num_threads_;
535   }
536 static const size_t max_threads = std::max(1u, std::thread::hardware_concurrency());
537 
538 class latch
539   {
540     std::atomic<size_t> num_left_;
541     std::mutex mut_;
542     std::condition_variable completed_;
543     using lock_t = std::unique_lock<std::mutex>;
544 
545   public:
546     latch(size_t n): num_left_(n) {}
547 
548     void count_down()
549       {
550       lock_t lock(mut_);
551       if (--num_left_)
552         return;
553       completed_.notify_all();
554       }
555 
556     void wait()
557       {
558       lock_t lock(mut_);
559       completed_.wait(lock, [this]{ return is_ready(); });
560       }
561     bool is_ready() { return num_left_ == 0; }
562   };
563 
564 template <typename T> class concurrent_queue
565   {
566     std::queue<T> q_;
567     std::mutex mut_;
568     std::atomic<size_t> size_;
569     using lock_t = std::lock_guard<std::mutex>;
570 
571   public:
572 
573     void push(T val)
574       {
575       lock_t lock(mut_);
576       ++size_;
577       q_.push(std::move(val));
578       }
579 
580     bool try_pop(T &val)
581       {
582       if (size_ == 0) return false;
583       lock_t lock(mut_);
584       // Queue might have been emptied while we acquired the lock
585       if (q_.empty()) return false;
586 
587       val = std::move(q_.front());
588       --size_;
589       q_.pop();
590       return true;
591       }
592 
593     bool empty() const { return size_==0; }
594   };
595 
596 // C++ allocator with support for over-aligned types
597 template <typename T> struct aligned_allocator
598   {
599   using value_type = T;
600   template <class U>
601   aligned_allocator(const aligned_allocator<U>&) {}
602   aligned_allocator() = default;
603 
604   T *allocate(size_t n)
605     {
606     void* mem = aligned_alloc(alignof(T), n*sizeof(T));
607     return static_cast<T*>(mem);
608     }
609 
610   void deallocate(T *p, size_t /*n*/)
611     { aligned_dealloc(p); }
612   };
613 
614 class thread_pool
615   {
616     // A reasonable guess, probably close enough for most hardware
617     static constexpr size_t cache_line_size = 64;
618     struct alignas(cache_line_size) worker
619       {
620       std::thread thread;
621       std::condition_variable work_ready;
622       std::mutex mut;
623       std::atomic_flag busy_flag = ATOMIC_FLAG_INIT;
624       std::function<void()> work;
625 
626       void worker_main(
627         std::atomic<bool> &shutdown_flag,
628         std::atomic<size_t> &unscheduled_tasks,
629         concurrent_queue<std::function<void()>> &overflow_work)
630         {
631         using lock_t = std::unique_lock<std::mutex>;
632         bool expect_work = true;
633         while (!shutdown_flag || expect_work)
634           {
635           std::function<void()> local_work;
636           if (expect_work || unscheduled_tasks == 0)
637             {
638             lock_t lock(mut);
639             // Wait until there is work to be executed
640             work_ready.wait(lock, [&]{ return (work || shutdown_flag); });
641             local_work.swap(work);
642             expect_work = false;
643             }
644 
645           bool marked_busy = false;
646           if (local_work)
647             {
648             marked_busy = true;
649             local_work();
650             }
651 
652           if (!overflow_work.empty())
653             {
654             if (!marked_busy && busy_flag.test_and_set())
655               {
656               expect_work = true;
657               continue;
658               }
659             marked_busy = true;
660 
661             while (overflow_work.try_pop(local_work))
662               {
663               --unscheduled_tasks;
664               local_work();
665               }
666             }
667 
668           if (marked_busy) busy_flag.clear();
669           }
670         }
671       };
672 
673     concurrent_queue<std::function<void()>> overflow_work_;
674     std::mutex mut_;
675     std::vector<worker, aligned_allocator<worker>> workers_;
676     std::atomic<bool> shutdown_;
677     std::atomic<size_t> unscheduled_tasks_;
678     using lock_t = std::lock_guard<std::mutex>;
679 
680     void create_threads()
681       {
682       lock_t lock(mut_);
683       size_t nthreads=workers_.size();
684       for (size_t i=0; i<nthreads; ++i)
685         {
686         try
687           {
688           auto *worker = &workers_[i];
689           worker->busy_flag.clear();
690           worker->work = nullptr;
691           worker->thread = std::thread([worker, this]
692             {
693             worker->worker_main(shutdown_, unscheduled_tasks_, overflow_work_);
694             });
695           }
696         catch (...)
697           {
698           shutdown_locked();
699           throw;
700           }
701         }
702       }
703 
704     void shutdown_locked()
705       {
706       shutdown_ = true;
707       for (auto &worker : workers_)
708         worker.work_ready.notify_all();
709 
710       for (auto &worker : workers_)
711         if (worker.thread.joinable())
712           worker.thread.join();
713       }
714 
715   public:
716     explicit thread_pool(size_t nthreads):
717       workers_(nthreads)
718       { create_threads(); }
719 
720     thread_pool(): thread_pool(max_threads) {}
721 
722     ~thread_pool() { shutdown(); }
723 
724     void submit(std::function<void()> work)
725       {
726       lock_t lock(mut_);
727       if (shutdown_)
728         throw std::runtime_error("Work item submitted after shutdown");
729 
730       ++unscheduled_tasks_;
731 
732       // First check for any idle workers and wake those
733       for (auto &worker : workers_)
734         if (!worker.busy_flag.test_and_set())
735           {
736           --unscheduled_tasks_;
737           {
738           lock_t lock(worker.mut);
739           worker.work = std::move(work);
740           }
741           worker.work_ready.notify_one();
742           return;
743           }
744 
745       // If no workers were idle, push onto the overflow queue for later
746       overflow_work_.push(std::move(work));
747       }
748 
749     void shutdown()
750       {
751       lock_t lock(mut_);
752       shutdown_locked();
753       }
754 
755     void restart()
756       {
757       shutdown_ = false;
758       create_threads();
759       }
760   };
761 
762 inline thread_pool & get_pool()
763   {
764   static thread_pool pool;
765 #ifdef POCKETFFT_PTHREADS
766   static std::once_flag f;
767   std::call_once(f,
768     []{
769     pthread_atfork(
770       +[]{ get_pool().shutdown(); },  // prepare
771       +[]{ get_pool().restart(); },   // parent
772       +[]{ get_pool().restart(); }    // child
773       );
774     });
775 #endif
776 
777   return pool;
778   }
779 
780 /** Map a function f over nthreads */
781 template <typename Func>
782 void thread_map(size_t nthreads, Func f)
783   {
784   if (nthreads == 0)
785     nthreads = max_threads;
786 
787   if (nthreads == 1)
788     { f(); return; }
789 
790   auto & pool = get_pool();
791   latch counter(nthreads);
792   std::exception_ptr ex;
793   std::mutex ex_mut;
794   for (size_t i=0; i<nthreads; ++i)
795     {
796     pool.submit(
797       [&f, &counter, &ex, &ex_mut, i, nthreads] {
798       thread_id() = i;
799       num_threads() = nthreads;
800       try { f(); }
801       catch (...)
802         {
803         std::lock_guard<std::mutex> lock(ex_mut);
804         ex = std::current_exception();
805         }
806       counter.count_down();
807       });
808     }
809   counter.wait();
810   if (ex)
811     std::rethrow_exception(ex);
812   }
813 
814 #endif
815 
816 }
817 
818 //
819 // complex FFTPACK transforms
820 //
821 
822 template<typename T0> class cfftp
823   {
824   private:
825     struct fctdata
826       {
827       size_t fct;
828       cmplx<T0> *tw, *tws;
829       };
830 
831     size_t length;
832     arr<cmplx<T0>> mem;
833     std::vector<fctdata> fact;
834 
835     void add_factor(size_t factor)
836       { fact.push_back({factor, nullptr, nullptr}); }
837 
838 template<bool fwd, typename T> void pass2 (size_t ido, size_t l1,
839   const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
840   const cmplx<T0> * POCKETFFT_RESTRICT wa) const
841   {
842   auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T&
843     { return ch[a+ido*(b+l1*c)]; };
844   auto CC = [cc,ido](size_t a, size_t b, size_t c) -> const T&
845     { return cc[a+ido*(b+2*c)]; };
846   auto WA = [wa, ido](size_t x, size_t i)
847     { return wa[i-1+x*(ido-1)]; };
848 
849   if (ido==1)
850     for (size_t k=0; k<l1; ++k)
851       {
852       CH(0,k,0) = CC(0,0,k)+CC(0,1,k);
853       CH(0,k,1) = CC(0,0,k)-CC(0,1,k);
854       }
855   else
856     for (size_t k=0; k<l1; ++k)
857       {
858       CH(0,k,0) = CC(0,0,k)+CC(0,1,k);
859       CH(0,k,1) = CC(0,0,k)-CC(0,1,k);
860       for (size_t i=1; i<ido; ++i)
861         {
862         CH(i,k,0) = CC(i,0,k)+CC(i,1,k);
863         special_mul<fwd>(CC(i,0,k)-CC(i,1,k),WA(0,i),CH(i,k,1));
864         }
865       }
866   }
867 
868 #define POCKETFFT_PREP3(idx) \
869         T t0 = CC(idx,0,k), t1, t2; \
870         PM (t1,t2,CC(idx,1,k),CC(idx,2,k)); \
871         CH(idx,k,0)=t0+t1;
872 #define POCKETFFT_PARTSTEP3a(u1,u2,twr,twi) \
873         { \
874         T ca=t0+t1*twr; \
875         T cb{-t2.i*twi, t2.r*twi}; \
876         PM(CH(0,k,u1),CH(0,k,u2),ca,cb) ;\
877         }
878 #define POCKETFFT_PARTSTEP3b(u1,u2,twr,twi) \
879         { \
880         T ca=t0+t1*twr; \
881         T cb{-t2.i*twi, t2.r*twi}; \
882         special_mul<fwd>(ca+cb,WA(u1-1,i),CH(i,k,u1)); \
883         special_mul<fwd>(ca-cb,WA(u2-1,i),CH(i,k,u2)); \
884         }
885 template<bool fwd, typename T> void pass3 (size_t ido, size_t l1,
886   const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
887   const cmplx<T0> * POCKETFFT_RESTRICT wa) const
888   {
889   constexpr T0 tw1r=-0.5,
890                tw1i= (fwd ? -1: 1) * T0(0.8660254037844386467637231707529362L);
891 
892   auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T&
893     { return ch[a+ido*(b+l1*c)]; };
894   auto CC = [cc,ido](size_t a, size_t b, size_t c) -> const T&
895     { return cc[a+ido*(b+3*c)]; };
896   auto WA = [wa, ido](size_t x, size_t i)
897     { return wa[i-1+x*(ido-1)]; };
898 
899   if (ido==1)
900     for (size_t k=0; k<l1; ++k)
901       {
902       POCKETFFT_PREP3(0)
903       POCKETFFT_PARTSTEP3a(1,2,tw1r,tw1i)
904       }
905   else
906     for (size_t k=0; k<l1; ++k)
907       {
908       {
909       POCKETFFT_PREP3(0)
910       POCKETFFT_PARTSTEP3a(1,2,tw1r,tw1i)
911       }
912       for (size_t i=1; i<ido; ++i)
913         {
914         POCKETFFT_PREP3(i)
915         POCKETFFT_PARTSTEP3b(1,2,tw1r,tw1i)
916         }
917       }
918   }
919 
920 #undef POCKETFFT_PARTSTEP3b
921 #undef POCKETFFT_PARTSTEP3a
922 #undef POCKETFFT_PREP3
923 
924 template<bool fwd, typename T> void pass4 (size_t ido, size_t l1,
925   const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
926   const cmplx<T0> * POCKETFFT_RESTRICT wa) const
927   {
928   auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T&
929     { return ch[a+ido*(b+l1*c)]; };
930   auto CC = [cc,ido](size_t a, size_t b, size_t c) -> const T&
931     { return cc[a+ido*(b+4*c)]; };
932   auto WA = [wa, ido](size_t x, size_t i)
933     { return wa[i-1+x*(ido-1)]; };
934 
935   if (ido==1)
936     for (size_t k=0; k<l1; ++k)
937       {
938       T t1, t2, t3, t4;
939       PM(t2,t1,CC(0,0,k),CC(0,2,k));
940       PM(t3,t4,CC(0,1,k),CC(0,3,k));
941       ROTX90<fwd>(t4);
942       PM(CH(0,k,0),CH(0,k,2),t2,t3);
943       PM(CH(0,k,1),CH(0,k,3),t1,t4);
944       }
945   else
946     for (size_t k=0; k<l1; ++k)
947       {
948       {
949       T t1, t2, t3, t4;
950       PM(t2,t1,CC(0,0,k),CC(0,2,k));
951       PM(t3,t4,CC(0,1,k),CC(0,3,k));
952       ROTX90<fwd>(t4);
953       PM(CH(0,k,0),CH(0,k,2),t2,t3);
954       PM(CH(0,k,1),CH(0,k,3),t1,t4);
955       }
956       for (size_t i=1; i<ido; ++i)
957         {
958         T t1, t2, t3, t4;
959         T cc0=CC(i,0,k), cc1=CC(i,1,k),cc2=CC(i,2,k),cc3=CC(i,3,k);
960         PM(t2,t1,cc0,cc2);
961         PM(t3,t4,cc1,cc3);
962         ROTX90<fwd>(t4);
963         CH(i,k,0) = t2+t3;
964         special_mul<fwd>(t1+t4,WA(0,i),CH(i,k,1));
965         special_mul<fwd>(t2-t3,WA(1,i),CH(i,k,2));
966         special_mul<fwd>(t1-t4,WA(2,i),CH(i,k,3));
967         }
968       }
969   }
970 
971 #define POCKETFFT_PREP5(idx) \
972         T t0 = CC(idx,0,k), t1, t2, t3, t4; \
973         PM (t1,t4,CC(idx,1,k),CC(idx,4,k)); \
974         PM (t2,t3,CC(idx,2,k),CC(idx,3,k)); \
975         CH(idx,k,0).r=t0.r+t1.r+t2.r; \
976         CH(idx,k,0).i=t0.i+t1.i+t2.i;
977 
978 #define POCKETFFT_PARTSTEP5a(u1,u2,twar,twbr,twai,twbi) \
979         { \
980         T ca,cb; \
981         ca.r=t0.r+twar*t1.r+twbr*t2.r; \
982         ca.i=t0.i+twar*t1.i+twbr*t2.i; \
983         cb.i=twai*t4.r twbi*t3.r; \
984         cb.r=-(twai*t4.i twbi*t3.i); \
985         PM(CH(0,k,u1),CH(0,k,u2),ca,cb); \
986         }
987 
988 #define POCKETFFT_PARTSTEP5b(u1,u2,twar,twbr,twai,twbi) \
989         { \
990         T ca,cb,da,db; \
991         ca.r=t0.r+twar*t1.r+twbr*t2.r; \
992         ca.i=t0.i+twar*t1.i+twbr*t2.i; \
993         cb.i=twai*t4.r twbi*t3.r; \
994         cb.r=-(twai*t4.i twbi*t3.i); \
995         special_mul<fwd>(ca+cb,WA(u1-1,i),CH(i,k,u1)); \
996         special_mul<fwd>(ca-cb,WA(u2-1,i),CH(i,k,u2)); \
997         }
998 template<bool fwd, typename T> void pass5 (size_t ido, size_t l1,
999   const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
1000   const cmplx<T0> * POCKETFFT_RESTRICT wa) const
1001   {
1002   constexpr T0 tw1r= T0(0.3090169943749474241022934171828191L),
1003                tw1i= (fwd ? -1: 1) * T0(0.9510565162951535721164393333793821L),
1004                tw2r= T0(-0.8090169943749474241022934171828191L),
1005                tw2i= (fwd ? -1: 1) * T0(0.5877852522924731291687059546390728L);
1006 
1007   auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T&
1008     { return ch[a+ido*(b+l1*c)]; };
1009   auto CC = [cc,ido](size_t a, size_t b, size_t c) -> const T&
1010     { return cc[a+ido*(b+5*c)]; };
1011   auto WA = [wa, ido](size_t x, size_t i)
1012     { return wa[i-1+x*(ido-1)]; };
1013 
1014   if (ido==1)
1015     for (size_t k=0; k<l1; ++k)
1016       {
1017       POCKETFFT_PREP5(0)
1018       POCKETFFT_PARTSTEP5a(1,4,tw1r,tw2r,+tw1i,+tw2i)
1019       POCKETFFT_PARTSTEP5a(2,3,tw2r,tw1r,+tw2i,-tw1i)
1020       }
1021   else
1022     for (size_t k=0; k<l1; ++k)
1023       {
1024       {
1025       POCKETFFT_PREP5(0)
1026       POCKETFFT_PARTSTEP5a(1,4,tw1r,tw2r,+tw1i,+tw2i)
1027       POCKETFFT_PARTSTEP5a(2,3,tw2r,tw1r,+tw2i,-tw1i)
1028       }
1029       for (size_t i=1; i<ido; ++i)
1030         {
1031         POCKETFFT_PREP5(i)
1032         POCKETFFT_PARTSTEP5b(1,4,tw1r,tw2r,+tw1i,+tw2i)
1033         POCKETFFT_PARTSTEP5b(2,3,tw2r,tw1r,+tw2i,-tw1i)
1034         }
1035       }
1036   }
1037 
1038 #undef POCKETFFT_PARTSTEP5b
1039 #undef POCKETFFT_PARTSTEP5a
1040 #undef POCKETFFT_PREP5
1041 
1042 #define POCKETFFT_PREP7(idx) \
1043         T t1 = CC(idx,0,k), t2, t3, t4, t5, t6, t7; \
1044         PM (t2,t7,CC(idx,1,k),CC(idx,6,k)); \
1045         PM (t3,t6,CC(idx,2,k),CC(idx,5,k)); \
1046         PM (t4,t5,CC(idx,3,k),CC(idx,4,k)); \
1047         CH(idx,k,0).r=t1.r+t2.r+t3.r+t4.r; \
1048         CH(idx,k,0).i=t1.i+t2.i+t3.i+t4.i;
1049 
1050 #define POCKETFFT_PARTSTEP7a0(u1,u2,x1,x2,x3,y1,y2,y3,out1,out2) \
1051         { \
1052         T ca,cb; \
1053         ca.r=t1.r+x1*t2.r+x2*t3.r+x3*t4.r; \
1054         ca.i=t1.i+x1*t2.i+x2*t3.i+x3*t4.i; \
1055         cb.i=y1*t7.r y2*t6.r y3*t5.r; \
1056         cb.r=-(y1*t7.i y2*t6.i y3*t5.i); \
1057         PM(out1,out2,ca,cb); \
1058         }
1059 #define POCKETFFT_PARTSTEP7a(u1,u2,x1,x2,x3,y1,y2,y3) \
1060         POCKETFFT_PARTSTEP7a0(u1,u2,x1,x2,x3,y1,y2,y3,CH(0,k,u1),CH(0,k,u2))
1061 #define POCKETFFT_PARTSTEP7(u1,u2,x1,x2,x3,y1,y2,y3) \
1062         { \
1063         T da,db; \
1064         POCKETFFT_PARTSTEP7a0(u1,u2,x1,x2,x3,y1,y2,y3,da,db) \
1065         special_mul<fwd>(da,WA(u1-1,i),CH(i,k,u1)); \
1066         special_mul<fwd>(db,WA(u2-1,i),CH(i,k,u2)); \
1067         }
1068 
1069 template<bool fwd, typename T> void pass7(size_t ido, size_t l1,
1070   const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
1071   const cmplx<T0> * POCKETFFT_RESTRICT wa) const
1072   {
1073   constexpr T0 tw1r= T0(0.6234898018587335305250048840042398L),
1074                tw1i= (fwd ? -1 : 1) * T0(0.7818314824680298087084445266740578L),
1075                tw2r= T0(-0.2225209339563144042889025644967948L),
1076                tw2i= (fwd ? -1 : 1) * T0(0.9749279121818236070181316829939312L),
1077                tw3r= T0(-0.9009688679024191262361023195074451L),
1078                tw3i= (fwd ? -1 : 1) * T0(0.433883739117558120475768332848359L);
1079 
1080   auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T&
1081     { return ch[a+ido*(b+l1*c)]; };
1082   auto CC = [cc,ido](size_t a, size_t b, size_t c) -> const T&
1083     { return cc[a+ido*(b+7*c)]; };
1084   auto WA = [wa, ido](size_t x, size_t i)
1085     { return wa[i-1+x*(ido-1)]; };
1086 
1087   if (ido==1)
1088     for (size_t k=0; k<l1; ++k)
1089       {
1090       POCKETFFT_PREP7(0)
1091       POCKETFFT_PARTSTEP7a(1,6,tw1r,tw2r,tw3r,+tw1i,+tw2i,+tw3i)
1092       POCKETFFT_PARTSTEP7a(2,5,tw2r,tw3r,tw1r,+tw2i,-tw3i,-tw1i)
1093       POCKETFFT_PARTSTEP7a(3,4,tw3r,tw1r,tw2r,+tw3i,-tw1i,+tw2i)
1094       }
1095   else
1096     for (size_t k=0; k<l1; ++k)
1097       {
1098       {
1099       POCKETFFT_PREP7(0)
1100       POCKETFFT_PARTSTEP7a(1,6,tw1r,tw2r,tw3r,+tw1i,+tw2i,+tw3i)
1101       POCKETFFT_PARTSTEP7a(2,5,tw2r,tw3r,tw1r,+tw2i,-tw3i,-tw1i)
1102       POCKETFFT_PARTSTEP7a(3,4,tw3r,tw1r,tw2r,+tw3i,-tw1i,+tw2i)
1103       }
1104       for (size_t i=1; i<ido; ++i)
1105         {
1106         POCKETFFT_PREP7(i)
1107         POCKETFFT_PARTSTEP7(1,6,tw1r,tw2r,tw3r,+tw1i,+tw2i,+tw3i)
1108         POCKETFFT_PARTSTEP7(2,5,tw2r,tw3r,tw1r,+tw2i,-tw3i,-tw1i)
1109         POCKETFFT_PARTSTEP7(3,4,tw3r,tw1r,tw2r,+tw3i,-tw1i,+tw2i)
1110         }
1111       }
1112   }
1113 
1114 #undef POCKETFFT_PARTSTEP7
1115 #undef POCKETFFT_PARTSTEP7a0
1116 #undef POCKETFFT_PARTSTEP7a
1117 #undef POCKETFFT_PREP7
1118 
1119 template <bool fwd, typename T> void ROTX45(T &a) const
1120   {
1121   constexpr T0 hsqt2=T0(0.707106781186547524400844362104849L);
1122   if (fwd)
1123     { auto tmp_=a.r; a.r=hsqt2*(a.r+a.i); a.i=hsqt2*(a.i-tmp_); }
1124   else
1125     { auto tmp_=a.r; a.r=hsqt2*(a.r-a.i); a.i=hsqt2*(a.i+tmp_); }
1126   }
1127 template <bool fwd, typename T> void ROTX135(T &a) const
1128   {
1129   constexpr T0 hsqt2=T0(0.707106781186547524400844362104849L);
1130   if (fwd)
1131     { auto tmp_=a.r; a.r=hsqt2*(a.i-a.r); a.i=hsqt2*(-tmp_-a.i); }
1132   else
1133     { auto tmp_=a.r; a.r=hsqt2*(-a.r-a.i); a.i=hsqt2*(tmp_-a.i); }
1134   }
1135 
1136 template<bool fwd, typename T> void pass8 (size_t ido, size_t l1,
1137   const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
1138   const cmplx<T0> * POCKETFFT_RESTRICT wa) const
1139   {
1140   auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T&
1141     { return ch[a+ido*(b+l1*c)]; };
1142   auto CC = [cc,ido](size_t a, size_t b, size_t c) -> const T&
1143     { return cc[a+ido*(b+8*c)]; };
1144   auto WA = [wa, ido](size_t x, size_t i)
1145     { return wa[i-1+x*(ido-1)]; };
1146 
1147   if (ido==1)
1148     for (size_t k=0; k<l1; ++k)
1149       {
1150       T a0, a1, a2, a3, a4, a5, a6, a7;
1151       PM(a1,a5,CC(0,1,k),CC(0,5,k));
1152       PM(a3,a7,CC(0,3,k),CC(0,7,k));
1153       PMINPLACE(a1,a3);
1154       ROTX90<fwd>(a3);
1155 
1156       ROTX90<fwd>(a7);
1157       PMINPLACE(a5,a7);
1158       ROTX45<fwd>(a5);
1159       ROTX135<fwd>(a7);
1160 
1161       PM(a0,a4,CC(0,0,k),CC(0,4,k));
1162       PM(a2,a6,CC(0,2,k),CC(0,6,k));
1163       PM(CH(0,k,0),CH(0,k,4),a0+a2,a1);
1164       PM(CH(0,k,2),CH(0,k,6),a0-a2,a3);
1165       ROTX90<fwd>(a6);
1166       PM(CH(0,k,1),CH(0,k,5),a4+a6,a5);
1167       PM(CH(0,k,3),CH(0,k,7),a4-a6,a7);
1168       }
1169   else
1170     for (size_t k=0; k<l1; ++k)
1171       {
1172       {
1173       T a0, a1, a2, a3, a4, a5, a6, a7;
1174       PM(a1,a5,CC(0,1,k),CC(0,5,k));
1175       PM(a3,a7,CC(0,3,k),CC(0,7,k));
1176       PMINPLACE(a1,a3);
1177       ROTX90<fwd>(a3);
1178 
1179       ROTX90<fwd>(a7);
1180       PMINPLACE(a5,a7);
1181       ROTX45<fwd>(a5);
1182       ROTX135<fwd>(a7);
1183 
1184       PM(a0,a4,CC(0,0,k),CC(0,4,k));
1185       PM(a2,a6,CC(0,2,k),CC(0,6,k));
1186       PM(CH(0,k,0),CH(0,k,4),a0+a2,a1);
1187       PM(CH(0,k,2),CH(0,k,6),a0-a2,a3);
1188       ROTX90<fwd>(a6);
1189       PM(CH(0,k,1),CH(0,k,5),a4+a6,a5);
1190       PM(CH(0,k,3),CH(0,k,7),a4-a6,a7);
1191       }
1192       for (size_t i=1; i<ido; ++i)
1193         {
1194         T a0, a1, a2, a3, a4, a5, a6, a7;
1195         PM(a1,a5,CC(i,1,k),CC(i,5,k));
1196         PM(a3,a7,CC(i,3,k),CC(i,7,k));
1197         ROTX90<fwd>(a7);
1198         PMINPLACE(a1,a3);
1199         ROTX90<fwd>(a3);
1200         PMINPLACE(a5,a7);
1201         ROTX45<fwd>(a5);
1202         ROTX135<fwd>(a7);
1203         PM(a0,a4,CC(i,0,k),CC(i,4,k));
1204         PM(a2,a6,CC(i,2,k),CC(i,6,k));
1205         PMINPLACE(a0,a2);
1206         CH(i,k,0) = a0+a1;
1207         special_mul<fwd>(a0-a1,WA(3,i),CH(i,k,4));
1208         special_mul<fwd>(a2+a3,WA(1,i),CH(i,k,2));
1209         special_mul<fwd>(a2-a3,WA(5,i),CH(i,k,6));
1210         ROTX90<fwd>(a6);
1211         PMINPLACE(a4,a6);
1212         special_mul<fwd>(a4+a5,WA(0,i),CH(i,k,1));
1213         special_mul<fwd>(a4-a5,WA(4,i),CH(i,k,5));
1214         special_mul<fwd>(a6+a7,WA(2,i),CH(i,k,3));
1215         special_mul<fwd>(a6-a7,WA(6,i),CH(i,k,7));
1216         }
1217       }
1218    }
1219 
1220 
1221 #define POCKETFFT_PREP11(idx) \
1222         T t1 = CC(idx,0,k), t2, t3, t4, t5, t6, t7, t8, t9, t10, t11; \
1223         PM (t2,t11,CC(idx,1,k),CC(idx,10,k)); \
1224         PM (t3,t10,CC(idx,2,k),CC(idx, 9,k)); \
1225         PM (t4,t9 ,CC(idx,3,k),CC(idx, 8,k)); \
1226         PM (t5,t8 ,CC(idx,4,k),CC(idx, 7,k)); \
1227         PM (t6,t7 ,CC(idx,5,k),CC(idx, 6,k)); \
1228         CH(idx,k,0).r=t1.r+t2.r+t3.r+t4.r+t5.r+t6.r; \
1229         CH(idx,k,0).i=t1.i+t2.i+t3.i+t4.i+t5.i+t6.i;
1230 
1231 #define POCKETFFT_PARTSTEP11a0(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5,out1,out2) \
1232         { \
1233         T ca = t1 + t2*x1 + t3*x2 + t4*x3 + t5*x4 +t6*x5, \
1234           cb; \
1235         cb.i=y1*t11.r y2*t10.r y3*t9.r y4*t8.r y5*t7.r; \
1236         cb.r=-(y1*t11.i y2*t10.i y3*t9.i y4*t8.i y5*t7.i ); \
1237         PM(out1,out2,ca,cb); \
1238         }
1239 #define POCKETFFT_PARTSTEP11a(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5) \
1240         POCKETFFT_PARTSTEP11a0(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5,CH(0,k,u1),CH(0,k,u2))
1241 #define POCKETFFT_PARTSTEP11(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5) \
1242         { \
1243         T da,db; \
1244         POCKETFFT_PARTSTEP11a0(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5,da,db) \
1245         special_mul<fwd>(da,WA(u1-1,i),CH(i,k,u1)); \
1246         special_mul<fwd>(db,WA(u2-1,i),CH(i,k,u2)); \
1247         }
1248 
1249 template<bool fwd, typename T> void pass11 (size_t ido, size_t l1,
1250   const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
1251   const cmplx<T0> * POCKETFFT_RESTRICT wa) const
1252   {
1253   constexpr T0 tw1r= T0(0.8412535328311811688618116489193677L),
1254                tw1i= (fwd ? -1 : 1) * T0(0.5406408174555975821076359543186917L),
1255                tw2r= T0(0.4154150130018864255292741492296232L),
1256                tw2i= (fwd ? -1 : 1) * T0(0.9096319953545183714117153830790285L),
1257                tw3r= T0(-0.1423148382732851404437926686163697L),
1258                tw3i= (fwd ? -1 : 1) * T0(0.9898214418809327323760920377767188L),
1259                tw4r= T0(-0.6548607339452850640569250724662936L),
1260                tw4i= (fwd ? -1 : 1) * T0(0.7557495743542582837740358439723444L),
1261                tw5r= T0(-0.9594929736144973898903680570663277L),
1262                tw5i= (fwd ? -1 : 1) * T0(0.2817325568414296977114179153466169L);
1263 
1264   auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T&
1265     { return ch[a+ido*(b+l1*c)]; };
1266   auto CC = [cc,ido](size_t a, size_t b, size_t c) -> const T&
1267     { return cc[a+ido*(b+11*c)]; };
1268   auto WA = [wa, ido](size_t x, size_t i)
1269     { return wa[i-1+x*(ido-1)]; };
1270 
1271   if (ido==1)
1272     for (size_t k=0; k<l1; ++k)
1273       {
1274       POCKETFFT_PREP11(0)
1275       POCKETFFT_PARTSTEP11a(1,10,tw1r,tw2r,tw3r,tw4r,tw5r,+tw1i,+tw2i,+tw3i,+tw4i,+tw5i)
1276       POCKETFFT_PARTSTEP11a(2, 9,tw2r,tw4r,tw5r,tw3r,tw1r,+tw2i,+tw4i,-tw5i,-tw3i,-tw1i)
1277       POCKETFFT_PARTSTEP11a(3, 8,tw3r,tw5r,tw2r,tw1r,tw4r,+tw3i,-tw5i,-tw2i,+tw1i,+tw4i)
1278       POCKETFFT_PARTSTEP11a(4, 7,tw4r,tw3r,tw1r,tw5r,tw2r,+tw4i,-tw3i,+tw1i,+tw5i,-tw2i)
1279       POCKETFFT_PARTSTEP11a(5, 6,tw5r,tw1r,tw4r,tw2r,tw3r,+tw5i,-tw1i,+tw4i,-tw2i,+tw3i)
1280       }
1281   else
1282     for (size_t k=0; k<l1; ++k)
1283       {
1284       {
1285       POCKETFFT_PREP11(0)
1286       POCKETFFT_PARTSTEP11a(1,10,tw1r,tw2r,tw3r,tw4r,tw5r,+tw1i,+tw2i,+tw3i,+tw4i,+tw5i)
1287       POCKETFFT_PARTSTEP11a(2, 9,tw2r,tw4r,tw5r,tw3r,tw1r,+tw2i,+tw4i,-tw5i,-tw3i,-tw1i)
1288       POCKETFFT_PARTSTEP11a(3, 8,tw3r,tw5r,tw2r,tw1r,tw4r,+tw3i,-tw5i,-tw2i,+tw1i,+tw4i)
1289       POCKETFFT_PARTSTEP11a(4, 7,tw4r,tw3r,tw1r,tw5r,tw2r,+tw4i,-tw3i,+tw1i,+tw5i,-tw2i)
1290       POCKETFFT_PARTSTEP11a(5, 6,tw5r,tw1r,tw4r,tw2r,tw3r,+tw5i,-tw1i,+tw4i,-tw2i,+tw3i)
1291       }
1292       for (size_t i=1; i<ido; ++i)
1293         {
1294         POCKETFFT_PREP11(i)
1295         POCKETFFT_PARTSTEP11(1,10,tw1r,tw2r,tw3r,tw4r,tw5r,+tw1i,+tw2i,+tw3i,+tw4i,+tw5i)
1296         POCKETFFT_PARTSTEP11(2, 9,tw2r,tw4r,tw5r,tw3r,tw1r,+tw2i,+tw4i,-tw5i,-tw3i,-tw1i)
1297         POCKETFFT_PARTSTEP11(3, 8,tw3r,tw5r,tw2r,tw1r,tw4r,+tw3i,-tw5i,-tw2i,+tw1i,+tw4i)
1298         POCKETFFT_PARTSTEP11(4, 7,tw4r,tw3r,tw1r,tw5r,tw2r,+tw4i,-tw3i,+tw1i,+tw5i,-tw2i)
1299         POCKETFFT_PARTSTEP11(5, 6,tw5r,tw1r,tw4r,tw2r,tw3r,+tw5i,-tw1i,+tw4i,-tw2i,+tw3i)
1300         }
1301       }
1302   }
1303 
1304 #undef PARTSTEP11
1305 #undef PARTSTEP11a0
1306 #undef PARTSTEP11a
1307 #undef POCKETFFT_PREP11
1308 
1309 template<bool fwd, typename T> void passg (size_t ido, size_t ip,
1310   size_t l1, T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
1311   const cmplx<T0> * POCKETFFT_RESTRICT wa,
1312   const cmplx<T0> * POCKETFFT_RESTRICT csarr) const
1313   {
1314   const size_t cdim=ip;
1315   size_t ipph = (ip+1)/2;
1316   size_t idl1 = ido*l1;
1317 
1318   auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T&
1319     { return ch[a+ido*(b+l1*c)]; };
1320   auto CC = [cc,ido,cdim](size_t a, size_t b, size_t c) -> const T&
1321     { return cc[a+ido*(b+cdim*c)]; };
1322   auto CX = [cc, ido, l1](size_t a, size_t b, size_t c) -> T&
1323     { return cc[a+ido*(b+l1*c)]; };
1324   auto CX2 = [cc, idl1](size_t a, size_t b) -> T&
1325     { return cc[a+idl1*b]; };
1326   auto CH2 = [ch, idl1](size_t a, size_t b) -> const T&
1327     { return ch[a+idl1*b]; };
1328 
1329   arr<cmplx<T0>> wal(ip);
1330   wal[0] = cmplx<T0>(1., 0.);
1331   for (size_t i=1; i<ip; ++i)
1332     wal[i]=cmplx<T0>(csarr[i].r,fwd ? -csarr[i].i : csarr[i].i);
1333 
1334   for (size_t k=0; k<l1; ++k)
1335     for (size_t i=0; i<ido; ++i)
1336       CH(i,k,0) = CC(i,0,k);
1337   for (size_t j=1, jc=ip-1; j<ipph; ++j, --jc)
1338     for (size_t k=0; k<l1; ++k)
1339       for (size_t i=0; i<ido; ++i)
1340         PM(CH(i,k,j),CH(i,k,jc),CC(i,j,k),CC(i,jc,k));
1341   for (size_t k=0; k<l1; ++k)
1342     for (size_t i=0; i<ido; ++i)
1343       {
1344       T tmp = CH(i,k,0);
1345       for (size_t j=1; j<ipph; ++j)
1346         tmp+=CH(i,k,j);
1347       CX(i,k,0) = tmp;
1348       }
1349   for (size_t l=1, lc=ip-1; l<ipph; ++l, --lc)
1350     {
1351     // j=0
1352     for (size_t ik=0; ik<idl1; ++ik)
1353       {
1354       CX2(ik,l).r = CH2(ik,0).r+wal[l].r*CH2(ik,1).r+wal[2*l].r*CH2(ik,2).r;
1355       CX2(ik,l).i = CH2(ik,0).i+wal[l].r*CH2(ik,1).i+wal[2*l].r*CH2(ik,2).i;
1356       CX2(ik,lc).r=-wal[l].i*CH2(ik,ip-1).i-wal[2*l].i*CH2(ik,ip-2).i;
1357       CX2(ik,lc).i=wal[l].i*CH2(ik,ip-1).r+wal[2*l].i*CH2(ik,ip-2).r;
1358       }
1359 
1360     size_t iwal=2*l;
1361     size_t j=3, jc=ip-3;
1362     for (; j<ipph-1; j+=2, jc-=2)
1363       {
1364       iwal+=l; if (iwal>ip) iwal-=ip;
1365       cmplx<T0> xwal=wal[iwal];
1366       iwal+=l; if (iwal>ip) iwal-=ip;
1367       cmplx<T0> xwal2=wal[iwal];
1368       for (size_t ik=0; ik<idl1; ++ik)
1369         {
1370         CX2(ik,l).r += CH2(ik,j).r*xwal.r+CH2(ik,j+1).r*xwal2.r;
1371         CX2(ik,l).i += CH2(ik,j).i*xwal.r+CH2(ik,j+1).i*xwal2.r;
1372         CX2(ik,lc).r -= CH2(ik,jc).i*xwal.i+CH2(ik,jc-1).i*xwal2.i;
1373         CX2(ik,lc).i += CH2(ik,jc).r*xwal.i+CH2(ik,jc-1).r*xwal2.i;
1374         }
1375       }
1376     for (; j<ipph; ++j, --jc)
1377       {
1378       iwal+=l; if (iwal>ip) iwal-=ip;
1379       cmplx<T0> xwal=wal[iwal];
1380       for (size_t ik=0; ik<idl1; ++ik)
1381         {
1382         CX2(ik,l).r += CH2(ik,j).r*xwal.r;
1383         CX2(ik,l).i += CH2(ik,j).i*xwal.r;
1384         CX2(ik,lc).r -= CH2(ik,jc).i*xwal.i;
1385         CX2(ik,lc).i += CH2(ik,jc).r*xwal.i;
1386         }
1387       }
1388     }
1389 
1390   // shuffling and twiddling
1391   if (ido==1)
1392     for (size_t j=1, jc=ip-1; j<ipph; ++j, --jc)
1393       for (size_t ik=0; ik<idl1; ++ik)
1394         {
1395         T t1=CX2(ik,j), t2=CX2(ik,jc);
1396         PM(CX2(ik,j),CX2(ik,jc),t1,t2);
1397         }
1398   else
1399     {
1400     for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc)
1401       for (size_t k=0; k<l1; ++k)
1402         {
1403         T t1=CX(0,k,j), t2=CX(0,k,jc);
1404         PM(CX(0,k,j),CX(0,k,jc),t1,t2);
1405         for (size_t i=1; i<ido; ++i)
1406           {
1407           T x1, x2;
1408           PM(x1,x2,CX(i,k,j),CX(i,k,jc));
1409           size_t idij=(j-1)*(ido-1)+i-1;
1410           special_mul<fwd>(x1,wa[idij],CX(i,k,j));
1411           idij=(jc-1)*(ido-1)+i-1;
1412           special_mul<fwd>(x2,wa[idij],CX(i,k,jc));
1413           }
1414         }
1415     }
1416   }
1417 
1418 template<bool fwd, typename T> void pass_all(T c[], T0 fct) const
1419   {
1420   if (length==1) { c[0]*=fct; return; }
1421   size_t l1=1;
1422   arr<T> ch(length);
1423   T *p1=c, *p2=ch.data();
1424 
1425   for(size_t k1=0; k1<fact.size(); k1++)
1426     {
1427     size_t ip=fact[k1].fct;
1428     size_t l2=ip*l1;
1429     size_t ido = length/l2;
1430     if     (ip==4)
1431       pass4<fwd> (ido, l1, p1, p2, fact[k1].tw);
1432     else if(ip==8)
1433       pass8<fwd>(ido, l1, p1, p2, fact[k1].tw);
1434     else if(ip==2)
1435       pass2<fwd>(ido, l1, p1, p2, fact[k1].tw);
1436     else if(ip==3)
1437       pass3<fwd> (ido, l1, p1, p2, fact[k1].tw);
1438     else if(ip==5)
1439       pass5<fwd> (ido, l1, p1, p2, fact[k1].tw);
1440     else if(ip==7)
1441       pass7<fwd> (ido, l1, p1, p2, fact[k1].tw);
1442     else if(ip==11)
1443       pass11<fwd> (ido, l1, p1, p2, fact[k1].tw);
1444     else
1445       {
1446       passg<fwd>(ido, ip, l1, p1, p2, fact[k1].tw, fact[k1].tws);
1447       std::swap(p1,p2);
1448       }
1449     std::swap(p1,p2);
1450     l1=l2;
1451     }
1452   if (p1!=c)
1453     {
1454     if (fct!=1.)
1455       for (size_t i=0; i<length; ++i)
1456         c[i] = ch[i]*fct;
1457     else
1458       memcpy (c,p1,length*sizeof(T));
1459     }
1460   else
1461     if (fct!=1.)
1462       for (size_t i=0; i<length; ++i)
1463         c[i] *= fct;
1464   }
1465 
1466   public:
1467     template<typename T> void exec(T c[], T0 fct, bool fwd) const
1468       { fwd ? pass_all<true>(c, fct) : pass_all<false>(c, fct); }
1469 
1470   private:
1471     POCKETFFT_NOINLINE void factorize()
1472       {
1473       size_t len=length;
1474       while ((len&7)==0)
1475         { add_factor(8); len>>=3; }
1476       while ((len&3)==0)
1477         { add_factor(4); len>>=2; }
1478       if ((len&1)==0)
1479         {
1480         len>>=1;
1481         // factor 2 should be at the front of the factor list
1482         add_factor(2);
1483         std::swap(fact[0].fct, fact.back().fct);
1484         }
1485       for (size_t divisor=3; divisor*divisor<=len; divisor+=2)
1486         while ((len%divisor)==0)
1487           {
1488           add_factor(divisor);
1489           len/=divisor;
1490           }
1491       if (len>1) add_factor(len);
1492       }
1493 
1494     size_t twsize() const
1495       {
1496       size_t twsize=0, l1=1;
1497       for (size_t k=0; k<fact.size(); ++k)
1498         {
1499         size_t ip=fact[k].fct, ido= length/(l1*ip);
1500         twsize+=(ip-1)*(ido-1);
1501         if (ip>11)
1502           twsize+=ip;
1503         l1*=ip;
1504         }
1505       return twsize;
1506       }
1507 
1508     void comp_twiddle()
1509       {
1510       sincos_2pibyn<T0> twiddle(length);
1511       size_t l1=1;
1512       size_t memofs=0;
1513       for (size_t k=0; k<fact.size(); ++k)
1514         {
1515         size_t ip=fact[k].fct, ido=length/(l1*ip);
1516         fact[k].tw=mem.data()+memofs;
1517         memofs+=(ip-1)*(ido-1);
1518         for (size_t j=1; j<ip; ++j)
1519           for (size_t i=1; i<ido; ++i)
1520             fact[k].tw[(j-1)*(ido-1)+i-1] = twiddle[j*l1*i];
1521         if (ip>11)
1522           {
1523           fact[k].tws=mem.data()+memofs;
1524           memofs+=ip;
1525           for (size_t j=0; j<ip; ++j)
1526             fact[k].tws[j] = twiddle[j*l1*ido];
1527           }
1528         l1*=ip;
1529         }
1530       }
1531 
1532   public:
1533     POCKETFFT_NOINLINE cfftp(size_t length_)
1534       : length(length_)
1535       {
1536       if (length==0) throw std::runtime_error("zero-length FFT requested");
1537       if (length==1) return;
1538       factorize();
1539       mem.resize(twsize());
1540       comp_twiddle();
1541       }
1542   };
1543 
1544 //
1545 // real-valued FFTPACK transforms
1546 //
1547 
1548 template<typename T0> class rfftp
1549   {
1550   private:
1551     struct fctdata
1552       {
1553       size_t fct;
1554       T0 *tw, *tws;
1555       };
1556 
1557     size_t length;
1558     arr<T0> mem;
1559     std::vector<fctdata> fact;
1560 
1561     void add_factor(size_t factor)
1562       { fact.push_back({factor, nullptr, nullptr}); }
1563 
1564 /* (a+ib) = conj(c+id) * (e+if) */
1565 template<typename T1, typename T2, typename T3> inline void MULPM
1566   (T1 &a, T1 &b, T2 c, T2 d, T3 e, T3 f) const
1567   {  a=c*e+d*f; b=c*f-d*e; }
1568 
1569 template<typename T> void radf2 (size_t ido, size_t l1,
1570   const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
1571   const T0 * POCKETFFT_RESTRICT wa) const
1572   {
1573   auto WA = [wa,ido](size_t x, size_t i) { return wa[i+x*(ido-1)]; };
1574   auto CC = [cc,ido,l1](size_t a, size_t b, size_t c) -> const T&
1575     { return cc[a+ido*(b+l1*c)]; };
1576   auto CH = [ch,ido](size_t a, size_t b, size_t c) -> T&
1577     { return ch[a+ido*(b+2*c)]; };
1578 
1579   for (size_t k=0; k<l1; k++)
1580     PM (CH(0,0,k),CH(ido-1,1,k),CC(0,k,0),CC(0,k,1));
1581   if ((ido&1)==0)
1582     for (size_t k=0; k<l1; k++)
1583       {
1584       CH(    0,1,k) = -CC(ido-1,k,1);
1585       CH(ido-1,0,k) =  CC(ido-1,k,0);
1586       }
1587   if (ido<=2) return;
1588   for (size_t k=0; k<l1; k++)
1589     for (size_t i=2; i<ido; i+=2)
1590       {
1591       size_t ic=ido-i;
1592       T tr2, ti2;
1593       MULPM (tr2,ti2,WA(0,i-2),WA(0,i-1),CC(i-1,k,1),CC(i,k,1));
1594       PM (CH(i-1,0,k),CH(ic-1,1,k),CC(i-1,k,0),tr2);
1595       PM (CH(i  ,0,k),CH(ic  ,1,k),ti2,CC(i  ,k,0));
1596       }
1597   }
1598 
1599 // a2=a+b; b2=i*(b-a);
1600 #define POCKETFFT_REARRANGE(rx, ix, ry, iy) \
1601   {\
1602   auto t1=rx+ry, t2=ry-rx, t3=ix+iy, t4=ix-iy; \
1603   rx=t1; ix=t3; ry=t4; iy=t2; \
1604   }
1605 
1606 template<typename T> void radf3(size_t ido, size_t l1,
1607   const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
1608   const T0 * POCKETFFT_RESTRICT wa) const
1609   {
1610   constexpr T0 taur=-0.5, taui=T0(0.8660254037844386467637231707529362L);
1611 
1612   auto WA = [wa,ido](size_t x, size_t i) { return wa[i+x*(ido-1)]; };
1613   auto CC = [cc,ido,l1](size_t a, size_t b, size_t c) -> const T&
1614     { return cc[a+ido*(b+l1*c)]; };
1615   auto CH = [ch,ido](size_t a, size_t b, size_t c) -> T&
1616     { return ch[a+ido*(b+3*c)]; };
1617 
1618   for (size_t k=0; k<l1; k++)
1619     {
1620     T cr2=CC(0,k,1)+CC(0,k,2);
1621     CH(0,0,k) = CC(0,k,0)+cr2;
1622     CH(0,2,k) = taui*(CC(0,k,2)-CC(0,k,1));
1623     CH(ido-1,1,k) = CC(0,k,0)+taur*cr2;
1624     }
1625   if (ido==1) return;
1626   for (size_t k=0; k<l1; k++)
1627     for (size_t i=2; i<ido; i+=2)
1628       {
1629       size_t ic=ido-i;
1630       T di2, di3, dr2, dr3;
1631       MULPM (dr2,di2,WA(0,i-2),WA(0,i-1),CC(i-1,k,1),CC(i,k,1)); // d2=conj(WA0)*CC1
1632       MULPM (dr3,di3,WA(1,i-2),WA(1,i-1),CC(i-1,k,2),CC(i,k,2)); // d3=conj(WA1)*CC2
1633       POCKETFFT_REARRANGE(dr2, di2, dr3, di3);
1634       CH(i-1,0,k) = CC(i-1,k,0)+dr2; // c add
1635       CH(i  ,0,k) = CC(i  ,k,0)+di2;
1636       T tr2 = CC(i-1,k,0)+taur*dr2; // c add
1637       T ti2 = CC(i  ,k,0)+taur*di2;
1638       T tr3 = taui*dr3;  // t3 = taui*i*(d3-d2)?
1639       T ti3 = taui*di3;
1640       PM(CH(i-1,2,k),CH(ic-1,1,k),tr2,tr3); // PM(i) = t2+t3
1641       PM(CH(i  ,2,k),CH(ic  ,1,k),ti3,ti2); // PM(ic) = conj(t2-t3)
1642       }
1643   }
1644 
1645 template<typename T> void radf4(size_t ido, size_t l1,
1646   const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
1647   const T0 * POCKETFFT_RESTRICT wa) const
1648   {
1649   constexpr T0 hsqt2=T0(0.707106781186547524400844362104849L);
1650 
1651   auto WA = [wa,ido](size_t x, size_t i) { return wa[i+x*(ido-1)]; };
1652   auto CC = [cc,ido,l1](size_t a, size_t b, size_t c) -> const T&
1653     { return cc[a+ido*(b+l1*c)]; };
1654   auto CH = [ch,ido](size_t a, size_t b, size_t c) -> T&
1655     { return ch[a+ido*(b+4*c)]; };
1656 
1657   for (size_t k=0; k<l1; k++)
1658     {
1659     T tr1,tr2;
1660     PM (tr1,CH(0,2,k),CC(0,k,3),CC(0,k,1));
1661     PM (tr2,CH(ido-1,1,k),CC(0,k,0),CC(0,k,2));
1662     PM (CH(0,0,k),CH(ido-1,3,k),tr2,tr1);
1663     }
1664   if ((ido&1)==0)
1665     for (size_t k=0; k<l1; k++)
1666       {
1667       T ti1=-hsqt2*(CC(ido-1,k,1)+CC(ido-1,k,3));
1668       T tr1= hsqt2*(CC(ido-1,k,1)-CC(ido-1,k,3));
1669       PM (CH(ido-1,0,k),CH(ido-1,2,k),CC(ido-1,k,0),tr1);
1670       PM (CH(    0,3,k),CH(    0,1,k),ti1,CC(ido-1,k,2));
1671       }
1672   if (ido<=2) return;
1673   for (size_t k=0; k<l1; k++)
1674     for (size_t i=2; i<ido; i+=2)
1675       {
1676       size_t ic=ido-i;
1677       T ci2, ci3, ci4, cr2, cr3, cr4, ti1, ti2, ti3, ti4, tr1, tr2, tr3, tr4;
1678       MULPM(cr2,ci2,WA(0,i-2),WA(0,i-1),CC(i-1,k,1),CC(i,k,1));
1679       MULPM(cr3,ci3,WA(1,i-2),WA(1,i-1),CC(i-1,k,2),CC(i,k,2));
1680       MULPM(cr4,ci4,WA(2,i-2),WA(2,i-1),CC(i-1,k,3),CC(i,k,3));
1681       PM(tr1,tr4,cr4,cr2);
1682       PM(ti1,ti4,ci2,ci4);
1683       PM(tr2,tr3,CC(i-1,k,0),cr3);
1684       PM(ti2,ti3,CC(i  ,k,0),ci3);
1685       PM(CH(i-1,0,k),CH(ic-1,3,k),tr2,tr1);
1686       PM(CH(i  ,0,k),CH(ic  ,3,k),ti1,ti2);
1687       PM(CH(i-1,2,k),CH(ic-1,1,k),tr3,ti4);
1688       PM(CH(i  ,2,k),CH(ic  ,1,k),tr4,ti3);
1689       }
1690   }
1691 
1692 template<typename T> void radf5(size_t ido, size_t l1,
1693   const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
1694   const T0 * POCKETFFT_RESTRICT wa) const
1695   {
1696   constexpr T0 tr11= T0(0.3090169943749474241022934171828191L),
1697                ti11= T0(0.9510565162951535721164393333793821L),
1698                tr12= T0(-0.8090169943749474241022934171828191L),
1699                ti12= T0(0.5877852522924731291687059546390728L);
1700 
1701   auto WA = [wa,ido](size_t x, size_t i) { return wa[i+x*(ido-1)]; };
1702   auto CC = [cc,ido,l1](size_t a, size_t b, size_t c) -> const T&
1703     { return cc[a+ido*(b+l1*c)]; };
1704   auto CH = [ch,ido](size_t a, size_t b, size_t c) -> T&
1705     { return ch[a+ido*(b+5*c)]; };
1706 
1707   for (size_t k=0; k<l1; k++)
1708     {
1709     T cr2, cr3, ci4, ci5;
1710     PM (cr2,ci5,CC(0,k,4),CC(0,k,1));
1711     PM (cr3,ci4,CC(0,k,3),CC(0,k,2));
1712     CH(0,0,k)=CC(0,k,0)+cr2+cr3;
1713     CH(ido-1,1,k)=CC(0,k,0)+tr11*cr2+tr12*cr3;
1714     CH(0,2,k)=ti11*ci5+ti12*ci4;
1715     CH(ido-1,3,k)=CC(0,k,0)+tr12*cr2+tr11*cr3;
1716     CH(0,4,k)=ti12*ci5-ti11*ci4;
1717     }
1718   if (ido==1) return;
1719   for (size_t k=0; k<l1;++k)
1720     for (size_t i=2, ic=ido-2; i<ido; i+=2, ic-=2)
1721       {
1722       T di2, di3, di4, di5, dr2, dr3, dr4, dr5;
1723       MULPM (dr2,di2,WA(0,i-2),WA(0,i-1),CC(i-1,k,1),CC(i,k,1));
1724       MULPM (dr3,di3,WA(1,i-2),WA(1,i-1),CC(i-1,k,2),CC(i,k,2));
1725       MULPM (dr4,di4,WA(2,i-2),WA(2,i-1),CC(i-1,k,3),CC(i,k,3));
1726       MULPM (dr5,di5,WA(3,i-2),WA(3,i-1),CC(i-1,k,4),CC(i,k,4));
1727       POCKETFFT_REARRANGE(dr2, di2, dr5, di5);
1728       POCKETFFT_REARRANGE(dr3, di3, dr4, di4);
1729       CH(i-1,0,k)=CC(i-1,k,0)+dr2+dr3;
1730       CH(i  ,0,k)=CC(i  ,k,0)+di2+di3;
1731       T tr2=CC(i-1,k,0)+tr11*dr2+tr12*dr3;
1732       T ti2=CC(i  ,k,0)+tr11*di2+tr12*di3;
1733       T tr3=CC(i-1,k,0)+tr12*dr2+tr11*dr3;
1734       T ti3=CC(i  ,k,0)+tr12*di2+tr11*di3;
1735       T tr5 = ti11*dr5 + ti12*dr4;
1736       T ti5 = ti11*di5 + ti12*di4;
1737       T tr4 = ti12*dr5 - ti11*dr4;
1738       T ti4 = ti12*di5 - ti11*di4;
1739       PM(CH(i-1,2,k),CH(ic-1,1,k),tr2,tr5);
1740       PM(CH(i  ,2,k),CH(ic  ,1,k),ti5,ti2);
1741       PM(CH(i-1,4,k),CH(ic-1,3,k),tr3,tr4);
1742       PM(CH(i  ,4,k),CH(ic  ,3,k),ti4,ti3);
1743       }
1744   }
1745 
1746 #undef POCKETFFT_REARRANGE
1747 
1748 template<typename T> void radfg(size_t ido, size_t ip, size_t l1,
1749   T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
1750   const T0 * POCKETFFT_RESTRICT wa, const T0 * POCKETFFT_RESTRICT csarr) const
1751   {
1752   const size_t cdim=ip;
1753   size_t ipph=(ip+1)/2;
1754   size_t idl1 = ido*l1;
1755 
1756   auto CC = [cc,ido,cdim](size_t a, size_t b, size_t c) -> T&
1757     { return cc[a+ido*(b+cdim*c)]; };
1758   auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> const T&
1759     { return ch[a+ido*(b+l1*c)]; };
1760   auto C1 = [cc,ido,l1] (size_t a, size_t b, size_t c) -> T&
1761     { return cc[a+ido*(b+l1*c)]; };
1762   auto C2 = [cc,idl1] (size_t a, size_t b) -> T&
1763     { return cc[a+idl1*b]; };
1764   auto CH2 = [ch,idl1] (size_t a, size_t b) -> T&
1765     { return ch[a+idl1*b]; };
1766 
1767   if (ido>1)
1768     {
1769     for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc)              // 114
1770       {
1771       size_t is=(j-1)*(ido-1),
1772              is2=(jc-1)*(ido-1);
1773       for (size_t k=0; k<l1; ++k)                            // 113
1774         {
1775         size_t idij=is;
1776         size_t idij2=is2;
1777         for (size_t i=1; i<=ido-2; i+=2)                      // 112
1778           {
1779           T t1=C1(i,k,j ), t2=C1(i+1,k,j ),
1780             t3=C1(i,k,jc), t4=C1(i+1,k,jc);
1781           T x1=wa[idij]*t1 + wa[idij+1]*t2,
1782             x2=wa[idij]*t2 - wa[idij+1]*t1,
1783             x3=wa[idij2]*t3 + wa[idij2+1]*t4,
1784             x4=wa[idij2]*t4 - wa[idij2+1]*t3;
1785           PM(C1(i,k,j),C1(i+1,k,jc),x3,x1);
1786           PM(C1(i+1,k,j),C1(i,k,jc),x2,x4);
1787           idij+=2;
1788           idij2+=2;
1789           }
1790         }
1791       }
1792     }
1793 
1794   for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc)                // 123
1795     for (size_t k=0; k<l1; ++k)                              // 122
1796       MPINPLACE(C1(0,k,jc), C1(0,k,j));
1797 
1798 //everything in C
1799 //memset(ch,0,ip*l1*ido*sizeof(double));
1800 
1801   for (size_t l=1,lc=ip-1; l<ipph; ++l,--lc)                 // 127
1802     {
1803     for (size_t ik=0; ik<idl1; ++ik)                         // 124
1804       {
1805       CH2(ik,l ) = C2(ik,0)+csarr[2*l]*C2(ik,1)+csarr[4*l]*C2(ik,2);
1806       CH2(ik,lc) = csarr[2*l+1]*C2(ik,ip-1)+csarr[4*l+1]*C2(ik,ip-2);
1807       }
1808     size_t iang = 2*l;
1809     size_t j=3, jc=ip-3;
1810     for (; j<ipph-3; j+=4,jc-=4)              // 126
1811       {
1812       iang+=l; if (iang>=ip) iang-=ip;
1813       T0 ar1=csarr[2*iang], ai1=csarr[2*iang+1];
1814       iang+=l; if (iang>=ip) iang-=ip;
1815       T0 ar2=csarr[2*iang], ai2=csarr[2*iang+1];
1816       iang+=l; if (iang>=ip) iang-=ip;
1817       T0 ar3=csarr[2*iang], ai3=csarr[2*iang+1];
1818       iang+=l; if (iang>=ip) iang-=ip;
1819       T0 ar4=csarr[2*iang], ai4=csarr[2*iang+1];
1820       for (size_t ik=0; ik<idl1; ++ik)                       // 125
1821         {
1822         CH2(ik,l ) += ar1*C2(ik,j )+ar2*C2(ik,j +1)
1823                      +ar3*C2(ik,j +2)+ar4*C2(ik,j +3);
1824         CH2(ik,lc) += ai1*C2(ik,jc)+ai2*C2(ik,jc-1)
1825                      +ai3*C2(ik,jc-2)+ai4*C2(ik,jc-3);
1826         }
1827       }
1828     for (; j<ipph-1; j+=2,jc-=2)              // 126
1829       {
1830       iang+=l; if (iang>=ip) iang-=ip;
1831       T0 ar1=csarr[2*iang], ai1=csarr[2*iang+1];
1832       iang+=l; if (iang>=ip) iang-=ip;
1833       T0 ar2=csarr[2*iang], ai2=csarr[2*iang+1];
1834       for (size_t ik=0; ik<idl1; ++ik)                       // 125
1835         {
1836         CH2(ik,l ) += ar1*C2(ik,j )+ar2*C2(ik,j +1);
1837         CH2(ik,lc) += ai1*C2(ik,jc)+ai2*C2(ik,jc-1);
1838         }
1839       }
1840     for (; j<ipph; ++j,--jc)              // 126
1841       {
1842       iang+=l; if (iang>=ip) iang-=ip;
1843       T0 ar=csarr[2*iang], ai=csarr[2*iang+1];
1844       for (size_t ik=0; ik<idl1; ++ik)                       // 125
1845         {
1846         CH2(ik,l ) += ar*C2(ik,j );
1847         CH2(ik,lc) += ai*C2(ik,jc);
1848         }
1849       }
1850     }
1851   for (size_t ik=0; ik<idl1; ++ik)                         // 101
1852     CH2(ik,0) = C2(ik,0);
1853   for (size_t j=1; j<ipph; ++j)                              // 129
1854     for (size_t ik=0; ik<idl1; ++ik)                         // 128
1855       CH2(ik,0) += C2(ik,j);
1856 
1857 // everything in CH at this point!
1858 //memset(cc,0,ip*l1*ido*sizeof(double));
1859 
1860   for (size_t k=0; k<l1; ++k)                                // 131
1861     for (size_t i=0; i<ido; ++i)                             // 130
1862       CC(i,0,k) = CH(i,k,0);
1863 
1864   for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc)                // 137
1865     {
1866     size_t j2=2*j-1;
1867     for (size_t k=0; k<l1; ++k)                              // 136
1868       {
1869       CC(ido-1,j2,k) = CH(0,k,j);
1870       CC(0,j2+1,k) = CH(0,k,jc);
1871       }
1872     }
1873 
1874   if (ido==1) return;
1875 
1876   for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc)                // 140
1877     {
1878     size_t j2=2*j-1;
1879     for(size_t k=0; k<l1; ++k)                               // 139
1880       for(size_t i=1, ic=ido-i-2; i<=ido-2; i+=2, ic-=2)      // 138
1881         {
1882         CC(i   ,j2+1,k) = CH(i  ,k,j )+CH(i  ,k,jc);
1883         CC(ic  ,j2  ,k) = CH(i  ,k,j )-CH(i  ,k,jc);
1884         CC(i+1 ,j2+1,k) = CH(i+1,k,j )+CH(i+1,k,jc);
1885         CC(ic+1,j2  ,k) = CH(i+1,k,jc)-CH(i+1,k,j );
1886         }
1887     }
1888   }
1889 
1890 template<typename T> void radb2(size_t ido, size_t l1,
1891   const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
1892   const T0 * POCKETFFT_RESTRICT wa) const
1893   {
1894   auto WA = [wa,ido](size_t x, size_t i) { return wa[i+x*(ido-1)]; };
1895   auto CC = [cc,ido](size_t a, size_t b, size_t c) -> const T&
1896     { return cc[a+ido*(b+2*c)]; };
1897   auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T&
1898     { return ch[a+ido*(b+l1*c)]; };
1899 
1900   for (size_t k=0; k<l1; k++)
1901     PM (CH(0,k,0),CH(0,k,1),CC(0,0,k),CC(ido-1,1,k));
1902   if ((ido&1)==0)
1903     for (size_t k=0; k<l1; k++)
1904       {
1905       CH(ido-1,k,0) = 2*CC(ido-1,0,k);
1906       CH(ido-1,k,1) =-2*CC(0    ,1,k);
1907       }
1908   if (ido<=2) return;
1909   for (size_t k=0; k<l1;++k)
1910     for (size_t i=2; i<ido; i+=2)
1911       {
1912       size_t ic=ido-i;
1913       T ti2, tr2;
1914       PM (CH(i-1,k,0),tr2,CC(i-1,0,k),CC(ic-1,1,k));
1915       PM (ti2,CH(i  ,k,0),CC(i  ,0,k),CC(ic  ,1,k));
1916       MULPM (CH(i,k,1),CH(i-1,k,1),WA(0,i-2),WA(0,i-1),ti2,tr2);
1917       }
1918   }
1919 
1920 template<typename T> void radb3(size_t ido, size_t l1,
1921   const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
1922   const T0 * POCKETFFT_RESTRICT wa) const
1923   {
1924   constexpr T0 taur=-0.5, taui=T0(0.8660254037844386467637231707529362L);
1925 
1926   auto WA = [wa,ido](size_t x, size_t i) { return wa[i+x*(ido-1)]; };
1927   auto CC = [cc,ido](size_t a, size_t b, size_t c) -> const T&
1928     { return cc[a+ido*(b+3*c)]; };
1929   auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T&
1930     { return ch[a+ido*(b+l1*c)]; };
1931 
1932   for (size_t k=0; k<l1; k++)
1933     {
1934     T tr2=2*CC(ido-1,1,k);
1935     T cr2=CC(0,0,k)+taur*tr2;
1936     CH(0,k,0)=CC(0,0,k)+tr2;
1937     T ci3=2*taui*CC(0,2,k);
1938     PM (CH(0,k,2),CH(0,k,1),cr2,ci3);
1939     }
1940   if (ido==1) return;
1941   for (size_t k=0; k<l1; k++)
1942     for (size_t i=2, ic=ido-2; i<ido; i+=2, ic-=2)
1943       {
1944       T tr2=CC(i-1,2,k)+CC(ic-1,1,k); // t2=CC(I) + conj(CC(ic))
1945       T ti2=CC(i  ,2,k)-CC(ic  ,1,k);
1946       T cr2=CC(i-1,0,k)+taur*tr2;     // c2=CC +taur*t2
1947       T ci2=CC(i  ,0,k)+taur*ti2;
1948       CH(i-1,k,0)=CC(i-1,0,k)+tr2;         // CH=CC+t2
1949       CH(i  ,k,0)=CC(i  ,0,k)+ti2;
1950       T cr3=taui*(CC(i-1,2,k)-CC(ic-1,1,k));// c3=taui*(CC(i)-conj(CC(ic)))
1951       T ci3=taui*(CC(i  ,2,k)+CC(ic  ,1,k));
1952       T di2, di3, dr2, dr3;
1953       PM(dr3,dr2,cr2,ci3); // d2= (cr2-ci3, ci2+cr3) = c2+i*c3
1954       PM(di2,di3,ci2,cr3); // d3= (cr2+ci3, ci2-cr3) = c2-i*c3
1955       MULPM(CH(i,k,1),CH(i-1,k,1),WA(0,i-2),WA(0,i-1),di2,dr2); // ch = WA*d2
1956       MULPM(CH(i,k,2),CH(i-1,k,2),WA(1,i-2),WA(1,i-1),di3,dr3);
1957       }
1958   }
1959 
1960 template<typename T> void radb4(size_t ido, size_t l1,
1961   const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
1962   const T0 * POCKETFFT_RESTRICT wa) const
1963   {
1964   constexpr T0 sqrt2=T0(1.414213562373095048801688724209698L);
1965 
1966   auto WA = [wa,ido](size_t x, size_t i) { return wa[i+x*(ido-1)]; };
1967   auto CC = [cc,ido](size_t a, size_t b, size_t c) -> const T&
1968     { return cc[a+ido*(b+4*c)]; };
1969   auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T&
1970     { return ch[a+ido*(b+l1*c)]; };
1971 
1972   for (size_t k=0; k<l1; k++)
1973     {
1974     T tr1, tr2;
1975     PM (tr2,tr1,CC(0,0,k),CC(ido-1,3,k));
1976     T tr3=2*CC(ido-1,1,k);
1977     T tr4=2*CC(0,2,k);
1978     PM (CH(0,k,0),CH(0,k,2),tr2,tr3);
1979     PM (CH(0,k,3),CH(0,k,1),tr1,tr4);
1980     }
1981   if ((ido&1)==0)
1982     for (size_t k=0; k<l1; k++)
1983       {
1984       T tr1,tr2,ti1,ti2;
1985       PM (ti1,ti2,CC(0    ,3,k),CC(0    ,1,k));
1986       PM (tr2,tr1,CC(ido-1,0,k),CC(ido-1,2,k));
1987       CH(ido-1,k,0)=tr2+tr2;
1988       CH(ido-1,k,1)=sqrt2*(tr1-ti1);
1989       CH(ido-1,k,2)=ti2+ti2;
1990       CH(ido-1,k,3)=-sqrt2*(tr1+ti1);
1991       }
1992   if (ido<=2) return;
1993   for (size_t k=0; k<l1;++k)
1994     for (size_t i=2; i<ido; i+=2)
1995       {
1996       T ci2, ci3, ci4, cr2, cr3, cr4, ti1, ti2, ti3, ti4, tr1, tr2, tr3, tr4;
1997       size_t ic=ido-i;
1998       PM (tr2,tr1,CC(i-1,0,k),CC(ic-1,3,k));
1999       PM (ti1,ti2,CC(i  ,0,k),CC(ic  ,3,k));
2000       PM (tr4,ti3,CC(i  ,2,k),CC(ic  ,1,k));
2001       PM (tr3,ti4,CC(i-1,2,k),CC(ic-1,1,k));
2002       PM (CH(i-1,k,0),cr3,tr2,tr3);
2003       PM (CH(i  ,k,0),ci3,ti2,ti3);
2004       PM (cr4,cr2,tr1,tr4);
2005       PM (ci2,ci4,ti1,ti4);
2006       MULPM (CH(i,k,1),CH(i-1,k,1),WA(0,i-2),WA(0,i-1),ci2,cr2);
2007       MULPM (CH(i,k,2),CH(i-1,k,2),WA(1,i-2),WA(1,i-1),ci3,cr3);
2008       MULPM (CH(i,k,3),CH(i-1,k,3),WA(2,i-2),WA(2,i-1),ci4,cr4);
2009       }
2010   }
2011 
2012 template<typename T> void radb5(size_t ido, size_t l1,
2013   const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
2014   const T0 * POCKETFFT_RESTRICT wa) const
2015   {
2016   constexpr T0 tr11= T0(0.3090169943749474241022934171828191L),
2017                ti11= T0(0.9510565162951535721164393333793821L),
2018                tr12= T0(-0.8090169943749474241022934171828191L),
2019                ti12= T0(0.5877852522924731291687059546390728L);
2020 
2021   auto WA = [wa,ido](size_t x, size_t i) { return wa[i+x*(ido-1)]; };
2022   auto CC = [cc,ido](size_t a, size_t b, size_t c) -> const T&
2023     { return cc[a+ido*(b+5*c)]; };
2024   auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T&
2025     { return ch[a+ido*(b+l1*c)]; };
2026 
2027   for (size_t k=0; k<l1; k++)
2028     {
2029     T ti5=CC(0,2,k)+CC(0,2,k);
2030     T ti4=CC(0,4,k)+CC(0,4,k);
2031     T tr2=CC(ido-1,1,k)+CC(ido-1,1,k);
2032     T tr3=CC(ido-1,3,k)+CC(ido-1,3,k);
2033     CH(0,k,0)=CC(0,0,k)+tr2+tr3;
2034     T cr2=CC(0,0,k)+tr11*tr2+tr12*tr3;
2035     T cr3=CC(0,0,k)+tr12*tr2+tr11*tr3;
2036     T ci4, ci5;
2037     MULPM(ci5,ci4,ti5,ti4,ti11,ti12);
2038     PM(CH(0,k,4),CH(0,k,1),cr2,ci5);
2039     PM(CH(0,k,3),CH(0,k,2),cr3,ci4);
2040     }
2041   if (ido==1) return;
2042   for (size_t k=0; k<l1;++k)
2043     for (size_t i=2, ic=ido-2; i<ido; i+=2, ic-=2)
2044       {
2045       T tr2, tr3, tr4, tr5, ti2, ti3, ti4, ti5;
2046       PM(tr2,tr5,CC(i-1,2,k),CC(ic-1,1,k));
2047       PM(ti5,ti2,CC(i  ,2,k),CC(ic  ,1,k));
2048       PM(tr3,tr4,CC(i-1,4,k),CC(ic-1,3,k));
2049       PM(ti4,ti3,CC(i  ,4,k),CC(ic  ,3,k));
2050       CH(i-1,k,0)=CC(i-1,0,k)+tr2+tr3;
2051       CH(i  ,k,0)=CC(i  ,0,k)+ti2+ti3;
2052       T cr2=CC(i-1,0,k)+tr11*tr2+tr12*tr3;
2053       T ci2=CC(i  ,0,k)+tr11*ti2+tr12*ti3;
2054       T cr3=CC(i-1,0,k)+tr12*tr2+tr11*tr3;
2055       T ci3=CC(i  ,0,k)+tr12*ti2+tr11*ti3;
2056       T ci4, ci5, cr5, cr4;
2057       MULPM(cr5,cr4,tr5,tr4,ti11,ti12);
2058       MULPM(ci5,ci4,ti5,ti4,ti11,ti12);
2059       T dr2, dr3, dr4, dr5, di2, di3, di4, di5;
2060       PM(dr4,dr3,cr3,ci4);
2061       PM(di3,di4,ci3,cr4);
2062       PM(dr5,dr2,cr2,ci5);
2063       PM(di2,di5,ci2,cr5);
2064       MULPM(CH(i,k,1),CH(i-1,k,1),WA(0,i-2),WA(0,i-1),di2,dr2);
2065       MULPM(CH(i,k,2),CH(i-1,k,2),WA(1,i-2),WA(1,i-1),di3,dr3);
2066       MULPM(CH(i,k,3),CH(i-1,k,3),WA(2,i-2),WA(2,i-1),di4,dr4);
2067       MULPM(CH(i,k,4),CH(i-1,k,4),WA(3,i-2),WA(3,i-1),di5,dr5);
2068       }
2069   }
2070 
2071 template<typename T> void radbg(size_t ido, size_t ip, size_t l1,
2072   T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
2073   const T0 * POCKETFFT_RESTRICT wa, const T0 * POCKETFFT_RESTRICT csarr) const
2074   {
2075   const size_t cdim=ip;
2076   size_t ipph=(ip+1)/ 2;
2077   size_t idl1 = ido*l1;
2078 
2079   auto CC = [cc,ido,cdim](size_t a, size_t b, size_t c) -> const T&
2080     { return cc[a+ido*(b+cdim*c)]; };
2081   auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T&
2082     { return ch[a+ido*(b+l1*c)]; };
2083   auto C1 = [cc,ido,l1](size_t a, size_t b, size_t c) -> const T&
2084     { return cc[a+ido*(b+l1*c)]; };
2085   auto C2 = [cc,idl1](size_t a, size_t b) -> T&
2086     { return cc[a+idl1*b]; };
2087   auto CH2 = [ch,idl1](size_t a, size_t b) -> T&
2088     { return ch[a+idl1*b]; };
2089 
2090   for (size_t k=0; k<l1; ++k)        // 102
2091     for (size_t i=0; i<ido; ++i)     // 101
2092       CH(i,k,0) = CC(i,0,k);
2093   for (size_t j=1, jc=ip-1; j<ipph; ++j, --jc)   // 108
2094     {
2095     size_t j2=2*j-1;
2096     for (size_t k=0; k<l1; ++k)
2097       {
2098       CH(0,k,j ) = 2*CC(ido-1,j2,k);
2099       CH(0,k,jc) = 2*CC(0,j2+1,k);
2100       }
2101     }
2102 
2103   if (ido!=1)
2104     {
2105     for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc)   // 111
2106       {
2107       size_t j2=2*j-1;
2108       for (size_t k=0; k<l1; ++k)
2109         for (size_t i=1, ic=ido-i-2; i<=ido-2; i+=2, ic-=2)      // 109
2110           {
2111           CH(i  ,k,j ) = CC(i  ,j2+1,k)+CC(ic  ,j2,k);
2112           CH(i  ,k,jc) = CC(i  ,j2+1,k)-CC(ic  ,j2,k);
2113           CH(i+1,k,j ) = CC(i+1,j2+1,k)-CC(ic+1,j2,k);
2114           CH(i+1,k,jc) = CC(i+1,j2+1,k)+CC(ic+1,j2,k);
2115           }
2116       }
2117     }
2118   for (size_t l=1,lc=ip-1; l<ipph; ++l,--lc)
2119     {
2120     for (size_t ik=0; ik<idl1; ++ik)
2121       {
2122       C2(ik,l ) = CH2(ik,0)+csarr[2*l]*CH2(ik,1)+csarr[4*l]*CH2(ik,2);
2123       C2(ik,lc) = csarr[2*l+1]*CH2(ik,ip-1)+csarr[4*l+1]*CH2(ik,ip-2);
2124       }
2125     size_t iang=2*l;
2126     size_t j=3,jc=ip-3;
2127     for(; j<ipph-3; j+=4,jc-=4)
2128       {
2129       iang+=l; if(iang>ip) iang-=ip;
2130       T0 ar1=csarr[2*iang], ai1=csarr[2*iang+1];
2131       iang+=l; if(iang>ip) iang-=ip;
2132       T0 ar2=csarr[2*iang], ai2=csarr[2*iang+1];
2133       iang+=l; if(iang>ip) iang-=ip;
2134       T0 ar3=csarr[2*iang], ai3=csarr[2*iang+1];
2135       iang+=l; if(iang>ip) iang-=ip;
2136       T0 ar4=csarr[2*iang], ai4=csarr[2*iang+1];
2137       for (size_t ik=0; ik<idl1; ++ik)
2138         {
2139         C2(ik,l ) += ar1*CH2(ik,j )+ar2*CH2(ik,j +1)
2140                     +ar3*CH2(ik,j +2)+ar4*CH2(ik,j +3);
2141         C2(ik,lc) += ai1*CH2(ik,jc)+ai2*CH2(ik,jc-1)
2142                     +ai3*CH2(ik,jc-2)+ai4*CH2(ik,jc-3);
2143         }
2144       }
2145     for(; j<ipph-1; j+=2,jc-=2)
2146       {
2147       iang+=l; if(iang>ip) iang-=ip;
2148       T0 ar1=csarr[2*iang], ai1=csarr[2*iang+1];
2149       iang+=l; if(iang>ip) iang-=ip;
2150       T0 ar2=csarr[2*iang], ai2=csarr[2*iang+1];
2151       for (size_t ik=0; ik<idl1; ++ik)
2152         {
2153         C2(ik,l ) += ar1*CH2(ik,j )+ar2*CH2(ik,j +1);
2154         C2(ik,lc) += ai1*CH2(ik,jc)+ai2*CH2(ik,jc-1);
2155         }
2156       }
2157     for(; j<ipph; ++j,--jc)
2158       {
2159       iang+=l; if(iang>ip) iang-=ip;
2160       T0 war=csarr[2*iang], wai=csarr[2*iang+1];
2161       for (size_t ik=0; ik<idl1; ++ik)
2162         {
2163         C2(ik,l ) += war*CH2(ik,j );
2164         C2(ik,lc) += wai*CH2(ik,jc);
2165         }
2166       }
2167     }
2168   for (size_t j=1; j<ipph; ++j)
2169     for (size_t ik=0; ik<idl1; ++ik)
2170       CH2(ik,0) += CH2(ik,j);
2171   for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc)   // 124
2172     for (size_t k=0; k<l1; ++k)
2173       PM(CH(0,k,jc),CH(0,k,j),C1(0,k,j),C1(0,k,jc));
2174 
2175   if (ido==1) return;
2176 
2177   for (size_t j=1, jc=ip-1; j<ipph; ++j, --jc)  // 127
2178     for (size_t k=0; k<l1; ++k)
2179       for (size_t i=1; i<=ido-2; i+=2)
2180         {
2181         CH(i  ,k,j ) = C1(i  ,k,j)-C1(i+1,k,jc);
2182         CH(i  ,k,jc) = C1(i  ,k,j)+C1(i+1,k,jc);
2183         CH(i+1,k,j ) = C1(i+1,k,j)+C1(i  ,k,jc);
2184         CH(i+1,k,jc) = C1(i+1,k,j)-C1(i  ,k,jc);
2185         }
2186 
2187 // All in CH
2188 
2189   for (size_t j=1; j<ip; ++j)
2190     {
2191     size_t is = (j-1)*(ido-1);
2192     for (size_t k=0; k<l1; ++k)
2193       {
2194       size_t idij = is;
2195       for (size_t i=1; i<=ido-2; i+=2)
2196         {
2197         T t1=CH(i,k,j), t2=CH(i+1,k,j);
2198         CH(i  ,k,j) = wa[idij]*t1-wa[idij+1]*t2;
2199         CH(i+1,k,j) = wa[idij]*t2+wa[idij+1]*t1;
2200         idij+=2;
2201         }
2202       }
2203     }
2204   }
2205 
2206     template<typename T> void copy_and_norm(T *c, T *p1, size_t n, T0 fct) const
2207       {
2208       if (p1!=c)
2209         {
2210         if (fct!=1.)
2211           for (size_t i=0; i<n; ++i)
2212             c[i] = fct*p1[i];
2213         else
2214           memcpy (c,p1,n*sizeof(T));
2215         }
2216       else
2217         if (fct!=1.)
2218           for (size_t i=0; i<n; ++i)
2219             c[i] *= fct;
2220       }
2221 
2222   public:
2223     template<typename T> void exec(T c[], T0 fct, bool r2hc) const
2224       {
2225       if (length==1) { c[0]*=fct; return; }
2226       size_t n=length, nf=fact.size();
2227       arr<T> ch(n);
2228       T *p1=c, *p2=ch.data();
2229 
2230       if (r2hc)
2231         for(size_t k1=0, l1=n; k1<nf;++k1)
2232           {
2233           size_t k=nf-k1-1;
2234           size_t ip=fact[k].fct;
2235           size_t ido=n / l1;
2236           l1 /= ip;
2237           if(ip==4)
2238             radf4(ido, l1, p1, p2, fact[k].tw);
2239           else if(ip==2)
2240             radf2(ido, l1, p1, p2, fact[k].tw);
2241           else if(ip==3)
2242             radf3(ido, l1, p1, p2, fact[k].tw);
2243           else if(ip==5)
2244             radf5(ido, l1, p1, p2, fact[k].tw);
2245           else
2246             { radfg(ido, ip, l1, p1, p2, fact[k].tw, fact[k].tws); std::swap (p1,p2); }
2247           std::swap (p1,p2);
2248           }
2249       else
2250         for(size_t k=0, l1=1; k<nf; k++)
2251           {
2252           size_t ip = fact[k].fct,
2253                  ido= n/(ip*l1);
2254           if(ip==4)
2255             radb4(ido, l1, p1, p2, fact[k].tw);
2256           else if(ip==2)
2257             radb2(ido, l1, p1, p2, fact[k].tw);
2258           else if(ip==3)
2259             radb3(ido, l1, p1, p2, fact[k].tw);
2260           else if(ip==5)
2261             radb5(ido, l1, p1, p2, fact[k].tw);
2262           else
2263             radbg(ido, ip, l1, p1, p2, fact[k].tw, fact[k].tws);
2264           std::swap (p1,p2);
2265           l1*=ip;
2266           }
2267 
2268       copy_and_norm(c,p1,n,fct);
2269       }
2270 
2271   private:
2272     void factorize()
2273       {
2274       size_t len=length;
2275       while ((len%4)==0)
2276         { add_factor(4); len>>=2; }
2277       if ((len%2)==0)
2278         {
2279         len>>=1;
2280         // factor 2 should be at the front of the factor list
2281         add_factor(2);
2282         std::swap(fact[0].fct, fact.back().fct);
2283         }
2284       for (size_t divisor=3; divisor*divisor<=len; divisor+=2)
2285         while ((len%divisor)==0)
2286           {
2287           add_factor(divisor);
2288           len/=divisor;
2289           }
2290       if (len>1) add_factor(len);
2291       }
2292 
2293     size_t twsize() const
2294       {
2295       size_t twsz=0, l1=1;
2296       for (size_t k=0; k<fact.size(); ++k)
2297         {
2298         size_t ip=fact[k].fct, ido=length/(l1*ip);
2299         twsz+=(ip-1)*(ido-1);
2300         if (ip>5) twsz+=2*ip;
2301         l1*=ip;
2302         }
2303       return twsz;
2304       }
2305 
2306     void comp_twiddle()
2307       {
2308       sincos_2pibyn<T0> twid(length);
2309       size_t l1=1;
2310       T0 *ptr=mem.data();
2311       for (size_t k=0; k<fact.size(); ++k)
2312         {
2313         size_t ip=fact[k].fct, ido=length/(l1*ip);
2314         if (k<fact.size()-1) // last factor doesn't need twiddles
2315           {
2316           fact[k].tw=ptr; ptr+=(ip-1)*(ido-1);
2317           for (size_t j=1; j<ip; ++j)
2318             for (size_t i=1; i<=(ido-1)/2; ++i)
2319               {
2320               fact[k].tw[(j-1)*(ido-1)+2*i-2] = twid[j*l1*i].r;
2321               fact[k].tw[(j-1)*(ido-1)+2*i-1] = twid[j*l1*i].i;
2322               }
2323           }
2324         if (ip>5) // special factors required by *g functions
2325           {
2326           fact[k].tws=ptr; ptr+=2*ip;
2327           fact[k].tws[0] = 1.;
2328           fact[k].tws[1] = 0.;
2329           for (size_t i=2, ic=2*ip-2; i<=ic; i+=2, ic-=2)
2330             {
2331             fact[k].tws[i  ] = twid[i/2*(length/ip)].r;
2332             fact[k].tws[i+1] = twid[i/2*(length/ip)].i;
2333             fact[k].tws[ic]   = twid[i/2*(length/ip)].r;
2334             fact[k].tws[ic+1] = -twid[i/2*(length/ip)].i;
2335             }
2336           }
2337         l1*=ip;
2338         }
2339       }
2340 
2341   public:
2342     POCKETFFT_NOINLINE rfftp(size_t length_)
2343       : length(length_)
2344       {
2345       if (length==0) throw std::runtime_error("zero-length FFT requested");
2346       if (length==1) return;
2347       factorize();
2348       mem.resize(twsize());
2349       comp_twiddle();
2350       }
2351 };
2352 
2353 //
2354 // complex Bluestein transforms
2355 //
2356 
2357 template<typename T0> class fftblue
2358   {
2359   private:
2360     size_t n, n2;
2361     cfftp<T0> plan;
2362     arr<cmplx<T0>> mem;
2363     cmplx<T0> *bk, *bkf;
2364 
2365     template<bool fwd, typename T> void fft(cmplx<T> c[], T0 fct) const
2366       {
2367       arr<cmplx<T>> akf(n2);
2368 
2369       /* initialize a_k and FFT it */
2370       for (size_t m=0; m<n; ++m)
2371         special_mul<fwd>(c[m],bk[m],akf[m]);
2372       auto zero = akf[0]*T0(0);
2373       for (size_t m=n; m<n2; ++m)
2374         akf[m]=zero;
2375 
2376       plan.exec (akf.data(),1.,true);
2377 
2378       /* do the convolution */
2379       akf[0] = akf[0].template special_mul<!fwd>(bkf[0]);
2380       for (size_t m=1; m<(n2+1)/2; ++m)
2381         {
2382         akf[m] = akf[m].template special_mul<!fwd>(bkf[m]);
2383         akf[n2-m] = akf[n2-m].template special_mul<!fwd>(bkf[m]);
2384         }
2385       if ((n2&1)==0)
2386         akf[n2/2] = akf[n2/2].template special_mul<!fwd>(bkf[n2/2]);
2387 
2388       /* inverse FFT */
2389       plan.exec (akf.data(),1.,false);
2390 
2391       /* multiply by b_k */
2392       for (size_t m=0; m<n; ++m)
2393         c[m] = akf[m].template special_mul<fwd>(bk[m])*fct;
2394       }
2395 
2396   public:
2397     POCKETFFT_NOINLINE fftblue(size_t length)
2398       : n(length), n2(util::good_size_cmplx(n*2-1)), plan(n2), mem(n+n2/2+1),
2399         bk(mem.data()), bkf(mem.data()+n)
2400       {
2401       /* initialize b_k */
2402       sincos_2pibyn<T0> tmp(2*n);
2403       bk[0].Set(1, 0);
2404 
2405       size_t coeff=0;
2406       for (size_t m=1; m<n; ++m)
2407         {
2408         coeff+=2*m-1;
2409         if (coeff>=2*n) coeff-=2*n;
2410         bk[m] = tmp[coeff];
2411         }
2412 
2413       /* initialize the zero-padded, Fourier transformed b_k. Add normalisation. */
2414       arr<cmplx<T0>> tbkf(n2);
2415       T0 xn2 = T0(1)/T0(n2);
2416       tbkf[0] = bk[0]*xn2;
2417       for (size_t m=1; m<n; ++m)
2418         tbkf[m] = tbkf[n2-m] = bk[m]*xn2;
2419       for (size_t m=n;m<=(n2-n);++m)
2420         tbkf[m].Set(0.,0.);
2421       plan.exec(tbkf.data(),1.,true);
2422       for (size_t i=0; i<n2/2+1; ++i)
2423         bkf[i] = tbkf[i];
2424       }
2425 
2426     template<typename T> void exec(cmplx<T> c[], T0 fct, bool fwd) const
2427       { fwd ? fft<true>(c,fct) : fft<false>(c,fct); }
2428 
2429     template<typename T> void exec_r(T c[], T0 fct, bool fwd)
2430       {
2431       arr<cmplx<T>> tmp(n);
2432       if (fwd)
2433         {
2434         auto zero = T0(0)*c[0];
2435         for (size_t m=0; m<n; ++m)
2436           tmp[m].Set(c[m], zero);
2437         fft<true>(tmp.data(),fct);
2438         c[0] = tmp[0].r;
2439         memcpy (c+1, tmp.data()+1, (n-1)*sizeof(T));
2440         }
2441       else
2442         {
2443         tmp[0].Set(c[0],c[0]*0);
2444         memcpy (reinterpret_cast<void *>(tmp.data()+1),
2445                 reinterpret_cast<void *>(c+1), (n-1)*sizeof(T));
2446         if ((n&1)==0) tmp[n/2].i=T0(0)*c[0];
2447         for (size_t m=1; 2*m<n; ++m)
2448           tmp[n-m].Set(tmp[m].r, -tmp[m].i);
2449         fft<false>(tmp.data(),fct);
2450         for (size_t m=0; m<n; ++m)
2451           c[m] = tmp[m].r;
2452         }
2453       }
2454   };
2455 
2456 //
2457 // flexible (FFTPACK/Bluestein) complex 1D transform
2458 //
2459 
2460 template<typename T0> class pocketfft_c
2461   {
2462   private:
2463     std::unique_ptr<cfftp<T0>> packplan;
2464     std::unique_ptr<fftblue<T0>> blueplan;
2465     size_t len;
2466 
2467   public:
2468     POCKETFFT_NOINLINE pocketfft_c(size_t length)
2469       : len(length)
2470       {
2471       if (length==0) throw std::runtime_error("zero-length FFT requested");
2472       size_t tmp = (length<50) ? 0 : util::largest_prime_factor(length);
2473       if (tmp*tmp <= length)
2474         {
2475         packplan=std::unique_ptr<cfftp<T0>>(new cfftp<T0>(length));
2476         return;
2477         }
2478       double comp1 = util::cost_guess(length);
2479       double comp2 = 2*util::cost_guess(util::good_size_cmplx(2*length-1));
2480       comp2*=1.5; /* fudge factor that appears to give good overall performance */
2481       if (comp2<comp1) // use Bluestein
2482         blueplan=std::unique_ptr<fftblue<T0>>(new fftblue<T0>(length));
2483       else
2484         packplan=std::unique_ptr<cfftp<T0>>(new cfftp<T0>(length));
2485       }
2486 
2487     template<typename T> POCKETFFT_NOINLINE void exec(cmplx<T> c[], T0 fct, bool fwd) const
2488       { packplan ? packplan->exec(c,fct,fwd) : blueplan->exec(c,fct,fwd); }
2489 
2490     size_t length() const { return len; }
2491   };
2492 
2493 //
2494 // flexible (FFTPACK/Bluestein) real-valued 1D transform
2495 //
2496 
2497 template<typename T0> class pocketfft_r
2498   {
2499   private:
2500     std::unique_ptr<rfftp<T0>> packplan;
2501     std::unique_ptr<fftblue<T0>> blueplan;
2502     size_t len;
2503 
2504   public:
2505     POCKETFFT_NOINLINE pocketfft_r(size_t length)
2506       : len(length)
2507       {
2508       if (length==0) throw std::runtime_error("zero-length FFT requested");
2509       size_t tmp = (length<50) ? 0 : util::largest_prime_factor(length);
2510       if (tmp*tmp <= length)
2511         {
2512         packplan=std::unique_ptr<rfftp<T0>>(new rfftp<T0>(length));
2513         return;
2514         }
2515       double comp1 = 0.5*util::cost_guess(length);
2516       double comp2 = 2*util::cost_guess(util::good_size_cmplx(2*length-1));
2517       comp2*=1.5; /* fudge factor that appears to give good overall performance */
2518       if (comp2<comp1) // use Bluestein
2519         blueplan=std::unique_ptr<fftblue<T0>>(new fftblue<T0>(length));
2520       else
2521         packplan=std::unique_ptr<rfftp<T0>>(new rfftp<T0>(length));
2522       }
2523 
2524     template<typename T> POCKETFFT_NOINLINE void exec(T c[], T0 fct, bool fwd) const
2525       { packplan ? packplan->exec(c,fct,fwd) : blueplan->exec_r(c,fct,fwd); }
2526 
2527     size_t length() const { return len; }
2528   };
2529 
2530 
2531 //
2532 // sine/cosine transforms
2533 //
2534 
2535 template<typename T0> class T_dct1
2536   {
2537   private:
2538     pocketfft_r<T0> fftplan;
2539 
2540   public:
2541     POCKETFFT_NOINLINE T_dct1(size_t length)
2542       : fftplan(2*(length-1)) {}
2543 
2544     template<typename T> POCKETFFT_NOINLINE void exec(T c[], T0 fct, bool ortho,
2545       int /*type*/, bool /*cosine*/) const
2546       {
2547       constexpr T0 sqrt2=T0(1.414213562373095048801688724209698L);
2548       size_t N=fftplan.length(), n=N/2+1;
2549       if (ortho)
2550         { c[0]*=sqrt2; c[n-1]*=sqrt2; }
2551       arr<T> tmp(N);
2552       tmp[0] = c[0];
2553       for (size_t i=1; i<n; ++i)
2554         tmp[i] = tmp[N-i] = c[i];
2555       fftplan.exec(tmp.data(), fct, true);
2556       c[0] = tmp[0];
2557       for (size_t i=1; i<n; ++i)
2558         c[i] = tmp[2*i-1];
2559       if (ortho)
2560         { c[0]*=sqrt2*T0(0.5); c[n-1]*=sqrt2*T0(0.5); }
2561       }
2562 
2563     size_t length() const { return fftplan.length()/2+1; }
2564   };
2565 
2566 template<typename T0> class T_dst1
2567   {
2568   private:
2569     pocketfft_r<T0> fftplan;
2570 
2571   public:
2572     POCKETFFT_NOINLINE T_dst1(size_t length)
2573       : fftplan(2*(length+1)) {}
2574 
2575     template<typename T> POCKETFFT_NOINLINE void exec(T c[], T0 fct,
2576       bool /*ortho*/, int /*type*/, bool /*cosine*/) const
2577       {
2578       size_t N=fftplan.length(), n=N/2-1;
2579       arr<T> tmp(N);
2580       tmp[0] = tmp[n+1] = c[0]*0;
2581       for (size_t i=0; i<n; ++i)
2582         { tmp[i+1]=c[i]; tmp[N-1-i]=-c[i]; }
2583       fftplan.exec(tmp.data(), fct, true);
2584       for (size_t i=0; i<n; ++i)
2585         c[i] = -tmp[2*i+2];
2586       }
2587 
2588     size_t length() const { return fftplan.length()/2-1; }
2589   };
2590 
2591 template<typename T0> class T_dcst23
2592   {
2593   private:
2594     pocketfft_r<T0> fftplan;
2595     std::vector<T0> twiddle;
2596 
2597   public:
2598     POCKETFFT_NOINLINE T_dcst23(size_t length)
2599       : fftplan(length), twiddle(length)
2600       {
2601       sincos_2pibyn<T0> tw(4*length);
2602       for (size_t i=0; i<length; ++i)
2603         twiddle[i] = tw[i+1].r;
2604       }
2605 
2606     template<typename T> POCKETFFT_NOINLINE void exec(T c[], T0 fct, bool ortho,
2607       int type, bool cosine) const
2608       {
2609       constexpr T0 sqrt2=T0(1.414213562373095048801688724209698L);
2610       size_t N=length();
2611       size_t NS2 = (N+1)/2;
2612       if (type==2)
2613         {
2614         if (!cosine)
2615           for (size_t k=1; k<N; k+=2)
2616             c[k] = -c[k];
2617         c[0] *= 2;
2618         if ((N&1)==0) c[N-1]*=2;
2619         for (size_t k=1; k<N-1; k+=2)
2620           MPINPLACE(c[k+1], c[k]);
2621         fftplan.exec(c, fct, false);
2622         for (size_t k=1, kc=N-1; k<NS2; ++k, --kc)
2623           {
2624           T t1 = twiddle[k-1]*c[kc]+twiddle[kc-1]*c[k];
2625           T t2 = twiddle[k-1]*c[k]-twiddle[kc-1]*c[kc];
2626           c[k] = T0(0.5)*(t1+t2); c[kc]=T0(0.5)*(t1-t2);
2627           }
2628         if ((N&1)==0)
2629           c[NS2] *= twiddle[NS2-1];
2630         if (!cosine)
2631           for (size_t k=0, kc=N-1; k<kc; ++k, --kc)
2632             std::swap(c[k], c[kc]);
2633         if (ortho) c[0]*=sqrt2*T0(0.5);
2634         }
2635       else
2636         {
2637         if (ortho) c[0]*=sqrt2;
2638         if (!cosine)
2639           for (size_t k=0, kc=N-1; k<NS2; ++k, --kc)
2640             std::swap(c[k], c[kc]);
2641         for (size_t k=1, kc=N-1; k<NS2; ++k, --kc)
2642           {
2643           T t1=c[k]+c[kc], t2=c[k]-c[kc];
2644           c[k] = twiddle[k-1]*t2+twiddle[kc-1]*t1;
2645           c[kc]= twiddle[k-1]*t1-twiddle[kc-1]*t2;
2646           }
2647         if ((N&1)==0)
2648           c[NS2] *= 2*twiddle[NS2-1];
2649         fftplan.exec(c, fct, true);
2650         for (size_t k=1; k<N-1; k+=2)
2651           MPINPLACE(c[k], c[k+1]);
2652         if (!cosine)
2653           for (size_t k=1; k<N; k+=2)
2654             c[k] = -c[k];
2655         }
2656       }
2657 
2658     size_t length() const { return fftplan.length(); }
2659   };
2660 
2661 template<typename T0> class T_dcst4
2662   {
2663   private:
2664     size_t N;
2665     std::unique_ptr<pocketfft_c<T0>> fft;
2666     std::unique_ptr<pocketfft_r<T0>> rfft;
2667     arr<cmplx<T0>> C2;
2668 
2669   public:
2670     POCKETFFT_NOINLINE T_dcst4(size_t length)
2671       : N(length),
2672         fft((N&1) ? nullptr : new pocketfft_c<T0>(N/2)),
2673         rfft((N&1)? new pocketfft_r<T0>(N) : nullptr),
2674         C2((N&1) ? 0 : N/2)
2675       {
2676       if ((N&1)==0)
2677         {
2678         sincos_2pibyn<T0> tw(16*N);
2679         for (size_t i=0; i<N/2; ++i)
2680           C2[i] = conj(tw[8*i+1]);
2681         }
2682       }
2683 
2684     template<typename T> POCKETFFT_NOINLINE void exec(T c[], T0 fct,
2685       bool /*ortho*/, int /*type*/, bool cosine) const
2686       {
2687       size_t n2 = N/2;
2688       if (!cosine)
2689         for (size_t k=0, kc=N-1; k<n2; ++k, --kc)
2690           std::swap(c[k], c[kc]);
2691       if (N&1)
2692         {
2693         // The following code is derived from the FFTW3 function apply_re11()
2694         // and is released under the 3-clause BSD license with friendly
2695         // permission of Matteo Frigo and Steven G. Johnson.
2696 
2697         arr<T> y(N);
2698         {
2699         size_t i=0, m=n2;
2700         for (; m<N; ++i, m+=4)
2701           y[i] = c[m];
2702         for (; m<2*N; ++i, m+=4)
2703           y[i] = -c[2*N-m-1];
2704         for (; m<3*N; ++i, m+=4)
2705           y[i] = -c[m-2*N];
2706         for (; m<4*N; ++i, m+=4)
2707           y[i] = c[4*N-m-1];
2708         for (; i<N; ++i, m+=4)
2709           y[i] = c[m-4*N];
2710         }
2711         rfft->exec(y.data(), fct, true);
2712         {
2713         auto SGN = [](size_t i)
2714            {
2715            constexpr T0 sqrt2=T0(1.414213562373095048801688724209698L);
2716            return (i&2) ? -sqrt2 : sqrt2;
2717            };
2718         c[n2] = y[0]*SGN(n2+1);
2719         size_t i=0, i1=1, k=1;
2720         for (; k<n2; ++i, ++i1, k+=2)
2721           {
2722           c[i    ] = y[2*k-1]*SGN(i1)     + y[2*k  ]*SGN(i);
2723           c[N -i1] = y[2*k-1]*SGN(N -i)   - y[2*k  ]*SGN(N -i1);
2724           c[n2-i1] = y[2*k+1]*SGN(n2-i)   - y[2*k+2]*SGN(n2-i1);
2725           c[n2+i1] = y[2*k+1]*SGN(n2+i+2) + y[2*k+2]*SGN(n2+i1);
2726           }
2727         if (k == n2)
2728           {
2729           c[i   ] = y[2*k-1]*SGN(i+1) + y[2*k]*SGN(i);
2730           c[N-i1] = y[2*k-1]*SGN(i+2) + y[2*k]*SGN(i1);
2731           }
2732         }
2733 
2734         // FFTW-derived code ends here
2735         }
2736       else
2737         {
2738         // even length algorithm from
2739         // https://www.appletonaudio.com/blog/2013/derivation-of-fast-dct-4-algorithm-based-on-dft/
2740         arr<cmplx<T>> y(n2);
2741         for(size_t i=0; i<n2; ++i)
2742           {
2743           y[i].Set(c[2*i],c[N-1-2*i]);
2744           y[i] *= C2[i];
2745           }
2746         fft->exec(y.data(), fct, true);
2747         for(size_t i=0, ic=n2-1; i<n2; ++i, --ic)
2748           {
2749           c[2*i  ] =  2*(y[i ].r*C2[i ].r-y[i ].i*C2[i ].i);
2750           c[2*i+1] = -2*(y[ic].i*C2[ic].r+y[ic].r*C2[ic].i);
2751           }
2752         }
2753       if (!cosine)
2754         for (size_t k=1; k<N; k+=2)
2755           c[k] = -c[k];
2756       }
2757 
2758     size_t length() const { return N; }
2759   };
2760 
2761 
2762 //
2763 // multi-D infrastructure
2764 //
2765 
2766 template<typename T> std::shared_ptr<T> get_plan(size_t length)
2767   {
2768 #if POCKETFFT_CACHE_SIZE==0
2769   return std::make_shared<T>(length);
2770 #else
2771   constexpr size_t nmax=POCKETFFT_CACHE_SIZE;
2772   static std::array<std::shared_ptr<T>, nmax> cache;
2773   static std::array<size_t, nmax> last_access{{0}};
2774   static size_t access_counter = 0;
2775   static std::mutex mut;
2776 
2777   auto find_in_cache = [&]() -> std::shared_ptr<T>
2778     {
2779     for (size_t i=0; i<nmax; ++i)
2780       if (cache[i] && (cache[i]->length()==length))
2781         {
2782         // no need to update if this is already the most recent entry
2783         if (last_access[i]!=access_counter)
2784           {
2785           last_access[i] = ++access_counter;
2786           // Guard against overflow
2787           if (access_counter == 0)
2788             last_access.fill(0);
2789           }
2790         return cache[i];
2791         }
2792 
2793     return nullptr;
2794     };
2795 
2796   {
2797   std::lock_guard<std::mutex> lock(mut);
2798   auto p = find_in_cache();
2799   if (p) return p;
2800   }
2801   auto plan = std::make_shared<T>(length);
2802   {
2803   std::lock_guard<std::mutex> lock(mut);
2804   auto p = find_in_cache();
2805   if (p) return p;
2806 
2807   size_t lru = 0;
2808   for (size_t i=1; i<nmax; ++i)
2809     if (last_access[i] < last_access[lru])
2810       lru = i;
2811 
2812   cache[lru] = plan;
2813   last_access[lru] = ++access_counter;
2814   }
2815   return plan;
2816 #endif
2817   }
2818 
2819 class arr_info
2820   {
2821   protected:
2822     shape_t shp;
2823     stride_t str;
2824 
2825   public:
2826     arr_info(const shape_t &shape_, const stride_t &stride_)
2827       : shp(shape_), str(stride_) {}
2828     size_t ndim() const { return shp.size(); }
2829     size_t size() const { return util::prod(shp); }
2830     const shape_t &shape() const { return shp; }
2831     size_t shape(size_t i) const { return shp[i]; }
2832     const stride_t &stride() const { return str; }
2833     const ptrdiff_t &stride(size_t i) const { return str[i]; }
2834   };
2835 
2836 template<typename T> class cndarr: public arr_info
2837   {
2838   protected:
2839     const char *d;
2840 
2841   public:
2842     cndarr(const void *data_, const shape_t &shape_, const stride_t &stride_)
2843       : arr_info(shape_, stride_),
2844         d(reinterpret_cast<const char *>(data_)) {}
2845     const T &operator[](ptrdiff_t ofs) const
2846       { return *reinterpret_cast<const T *>(d+ofs); }
2847   };
2848 
2849 template<typename T> class ndarr: public cndarr<T>
2850   {
2851   public:
2852     ndarr(void *data_, const shape_t &shape_, const stride_t &stride_)
2853       : cndarr<T>::cndarr(const_cast<const void *>(data_), shape_, stride_)
2854       {}
2855     T &operator[](ptrdiff_t ofs)
2856       { return *reinterpret_cast<T *>(const_cast<char *>(cndarr<T>::d+ofs)); }
2857   };
2858 
2859 template<size_t N> class multi_iter
2860   {
2861   private:
2862     shape_t pos;
2863     const arr_info &iarr, &oarr;
2864     ptrdiff_t p_ii, p_i[N], str_i, p_oi, p_o[N], str_o;
2865     size_t idim, rem;
2866 
2867     void advance_i()
2868       {
2869       for (int i_=int(pos.size())-1; i_>=0; --i_)
2870         {
2871         auto i = size_t(i_);
2872         if (i==idim) continue;
2873         p_ii += iarr.stride(i);
2874         p_oi += oarr.stride(i);
2875         if (++pos[i] < iarr.shape(i))
2876           return;
2877         pos[i] = 0;
2878         p_ii -= ptrdiff_t(iarr.shape(i))*iarr.stride(i);
2879         p_oi -= ptrdiff_t(oarr.shape(i))*oarr.stride(i);
2880         }
2881       }
2882 
2883   public:
2884     multi_iter(const arr_info &iarr_, const arr_info &oarr_, size_t idim_)
2885       : pos(iarr_.ndim(), 0), iarr(iarr_), oarr(oarr_), p_ii(0),
2886         str_i(iarr.stride(idim_)), p_oi(0), str_o(oarr.stride(idim_)),
2887         idim(idim_), rem(iarr.size()/iarr.shape(idim))
2888       {
2889       auto nshares = threading::num_threads();
2890       if (nshares==1) return;
2891       if (nshares==0) throw std::runtime_error("can't run with zero threads");
2892       auto myshare = threading::thread_id();
2893       if (myshare>=nshares) throw std::runtime_error("impossible share requested");
2894       size_t nbase = rem/nshares;
2895       size_t additional = rem%nshares;
2896       size_t lo = myshare*nbase + ((myshare<additional) ? myshare : additional);
2897       size_t hi = lo+nbase+(myshare<additional);
2898       size_t todo = hi-lo;
2899 
2900       size_t chunk = rem;
2901       for (size_t i=0; i<pos.size(); ++i)
2902         {
2903         if (i==idim) continue;
2904         chunk /= iarr.shape(i);
2905         size_t n_advance = lo/chunk;
2906         pos[i] += n_advance;
2907         p_ii += ptrdiff_t(n_advance)*iarr.stride(i);
2908         p_oi += ptrdiff_t(n_advance)*oarr.stride(i);
2909         lo -= n_advance*chunk;
2910         }
2911       rem = todo;
2912       }
2913     void advance(size_t n)
2914       {
2915       if (rem<n) throw std::runtime_error("underrun");
2916       for (size_t i=0; i<n; ++i)
2917         {
2918         p_i[i] = p_ii;
2919         p_o[i] = p_oi;
2920         advance_i();
2921         }
2922       rem -= n;
2923       }
2924     ptrdiff_t iofs(size_t i) const { return p_i[0] + ptrdiff_t(i)*str_i; }
2925     ptrdiff_t iofs(size_t j, size_t i) const { return p_i[j] + ptrdiff_t(i)*str_i; }
2926     ptrdiff_t oofs(size_t i) const { return p_o[0] + ptrdiff_t(i)*str_o; }
2927     ptrdiff_t oofs(size_t j, size_t i) const { return p_o[j] + ptrdiff_t(i)*str_o; }
2928     size_t length_in() const { return iarr.shape(idim); }
2929     size_t length_out() const { return oarr.shape(idim); }
2930     ptrdiff_t stride_in() const { return str_i; }
2931     ptrdiff_t stride_out() const { return str_o; }
2932     size_t remaining() const { return rem; }
2933   };
2934 
2935 class simple_iter
2936   {
2937   private:
2938     shape_t pos;
2939     const arr_info &arr;
2940     ptrdiff_t p;
2941     size_t rem;
2942 
2943   public:
2944     simple_iter(const arr_info &arr_)
2945       : pos(arr_.ndim(), 0), arr(arr_), p(0), rem(arr_.size()) {}
2946     void advance()
2947       {
2948       --rem;
2949       for (int i_=int(pos.size())-1; i_>=0; --i_)
2950         {
2951         auto i = size_t(i_);
2952         p += arr.stride(i);
2953         if (++pos[i] < arr.shape(i))
2954           return;
2955         pos[i] = 0;
2956         p -= ptrdiff_t(arr.shape(i))*arr.stride(i);
2957         }
2958       }
2959     ptrdiff_t ofs() const { return p; }
2960     size_t remaining() const { return rem; }
2961   };
2962 
2963 class rev_iter
2964   {
2965   private:
2966     shape_t pos;
2967     const arr_info &arr;
2968     std::vector<char> rev_axis;
2969     std::vector<char> rev_jump;
2970     size_t last_axis, last_size;
2971     shape_t shp;
2972     ptrdiff_t p, rp;
2973     size_t rem;
2974 
2975   public:
2976     rev_iter(const arr_info &arr_, const shape_t &axes)
2977       : pos(arr_.ndim(), 0), arr(arr_), rev_axis(arr_.ndim(), 0),
2978         rev_jump(arr_.ndim(), 1), p(0), rp(0)
2979       {
2980       for (auto ax: axes)
2981         rev_axis[ax]=1;
2982       last_axis = axes.back();
2983       last_size = arr.shape(last_axis)/2 + 1;
2984       shp = arr.shape();
2985       shp[last_axis] = last_size;
2986       rem=1;
2987       for (auto i: shp)
2988         rem *= i;
2989       }
2990     void advance()
2991       {
2992       --rem;
2993       for (int i_=int(pos.size())-1; i_>=0; --i_)
2994         {
2995         auto i = size_t(i_);
2996         p += arr.stride(i);
2997         if (!rev_axis[i])
2998           rp += arr.stride(i);
2999         else
3000           {
3001           rp -= arr.stride(i);
3002           if (rev_jump[i])
3003             {
3004             rp += ptrdiff_t(arr.shape(i))*arr.stride(i);
3005             rev_jump[i] = 0;
3006             }
3007           }
3008         if (++pos[i] < shp[i])
3009           return;
3010         pos[i] = 0;
3011         p -= ptrdiff_t(shp[i])*arr.stride(i);
3012         if (rev_axis[i])
3013           {
3014           rp -= ptrdiff_t(arr.shape(i)-shp[i])*arr.stride(i);
3015           rev_jump[i] = 1;
3016           }
3017         else
3018           rp -= ptrdiff_t(shp[i])*arr.stride(i);
3019         }
3020       }
3021     ptrdiff_t ofs() const { return p; }
3022     ptrdiff_t rev_ofs() const { return rp; }
3023     size_t remaining() const { return rem; }
3024   };
3025 
3026 template<typename T> struct VTYPE {};
3027 template <typename T> using vtype_t = typename VTYPE<T>::type;
3028 
3029 #ifndef POCKETFFT_NO_VECTORS
3030 template<> struct VTYPE<float>
3031   {
3032   using type = float __attribute__ ((vector_size (VLEN<float>::val*sizeof(float))));
3033   };
3034 template<> struct VTYPE<double>
3035   {
3036   using type = double __attribute__ ((vector_size (VLEN<double>::val*sizeof(double))));
3037   };
3038 template<> struct VTYPE<long double>
3039   {
3040   using type = long double __attribute__ ((vector_size (VLEN<long double>::val*sizeof(long double))));
3041   };
3042 #endif
3043 
3044 template<typename T> arr<char> alloc_tmp(const shape_t &shape,
3045   size_t axsize, size_t elemsize)
3046   {
3047   auto othersize = util::prod(shape)/axsize;
3048   auto tmpsize = axsize*((othersize>=VLEN<T>::val) ? VLEN<T>::val : 1);
3049   return arr<char>(tmpsize*elemsize);
3050   }
3051 template<typename T> arr<char> alloc_tmp(const shape_t &shape,
3052   const shape_t &axes, size_t elemsize)
3053   {
3054   size_t fullsize=util::prod(shape);
3055   size_t tmpsize=0;
3056   for (size_t i=0; i<axes.size(); ++i)
3057     {
3058     auto axsize = shape[axes[i]];
3059     auto othersize = fullsize/axsize;
3060     auto sz = axsize*((othersize>=VLEN<T>::val) ? VLEN<T>::val : 1);
3061     if (sz>tmpsize) tmpsize=sz;
3062     }
3063   return arr<char>(tmpsize*elemsize);
3064   }
3065 
3066 template <typename T, size_t vlen> void copy_input(const multi_iter<vlen> &it,
3067   const cndarr<cmplx<T>> &src, cmplx<vtype_t<T>> *POCKETFFT_RESTRICT dst)
3068   {
3069   for (size_t i=0; i<it.length_in(); ++i)
3070     for (size_t j=0; j<vlen; ++j)
3071       {
3072       dst[i].r[j] = src[it.iofs(j,i)].r;
3073       dst[i].i[j] = src[it.iofs(j,i)].i;
3074       }
3075   }
3076 
3077 template <typename T, size_t vlen> void copy_input(const multi_iter<vlen> &it,
3078   const cndarr<T> &src, vtype_t<T> *POCKETFFT_RESTRICT dst)
3079   {
3080   for (size_t i=0; i<it.length_in(); ++i)
3081     for (size_t j=0; j<vlen; ++j)
3082       dst[i][j] = src[it.iofs(j,i)];
3083   }
3084 
3085 template <typename T, size_t vlen> void copy_input(const multi_iter<vlen> &it,
3086   const cndarr<T> &src, T *POCKETFFT_RESTRICT dst)
3087   {
3088   if (dst == &src[it.iofs(0)]) return;  // in-place
3089   for (size_t i=0; i<it.length_in(); ++i)
3090     dst[i] = src[it.iofs(i)];
3091   }
3092 
3093 template<typename T, size_t vlen> void copy_output(const multi_iter<vlen> &it,
3094   const cmplx<vtype_t<T>> *POCKETFFT_RESTRICT src, ndarr<cmplx<T>> &dst)
3095   {
3096   for (size_t i=0; i<it.length_out(); ++i)
3097     for (size_t j=0; j<vlen; ++j)
3098       dst[it.oofs(j,i)].Set(src[i].r[j],src[i].i[j]);
3099   }
3100 
3101 template<typename T, size_t vlen> void copy_output(const multi_iter<vlen> &it,
3102   const vtype_t<T> *POCKETFFT_RESTRICT src, ndarr<T> &dst)
3103   {
3104   for (size_t i=0; i<it.length_out(); ++i)
3105     for (size_t j=0; j<vlen; ++j)
3106       dst[it.oofs(j,i)] = src[i][j];
3107   }
3108 
3109 template<typename T, size_t vlen> void copy_output(const multi_iter<vlen> &it,
3110   const T *POCKETFFT_RESTRICT src, ndarr<T> &dst)
3111   {
3112   if (src == &dst[it.oofs(0)]) return;  // in-place
3113   for (size_t i=0; i<it.length_out(); ++i)
3114     dst[it.oofs(i)] = src[i];
3115   }
3116 
3117 template <typename T> struct add_vec { using type = vtype_t<T>; };
3118 template <typename T> struct add_vec<cmplx<T>>
3119   { using type = cmplx<vtype_t<T>>; };
3120 template <typename T> using add_vec_t = typename add_vec<T>::type;
3121 
3122 template<typename Tplan, typename T, typename T0, typename Exec>
3123 POCKETFFT_NOINLINE void general_nd(const cndarr<T> &in, ndarr<T> &out,
3124   const shape_t &axes, T0 fct, size_t nthreads, const Exec & exec,
3125   const bool allow_inplace=true)
3126   {
3127   std::shared_ptr<Tplan> plan;
3128 
3129   for (size_t iax=0; iax<axes.size(); ++iax)
3130     {
3131     size_t len=in.shape(axes[iax]);
3132     if ((!plan) || (len!=plan->length()))
3133       plan = get_plan<Tplan>(len);
3134 
3135     threading::thread_map(
3136       util::thread_count(nthreads, in.shape(), axes[iax], VLEN<T>::val),
3137       [&] {
3138         constexpr auto vlen = VLEN<T0>::val;
3139         auto storage = alloc_tmp<T0>(in.shape(), len, sizeof(T));
3140         const auto &tin(iax==0? in : out);
3141         multi_iter<vlen> it(tin, out, axes[iax]);
3142 #ifndef POCKETFFT_NO_VECTORS
3143         if (vlen>1)
3144           while (it.remaining()>=vlen)
3145             {
3146             it.advance(vlen);
3147             auto tdatav = reinterpret_cast<add_vec_t<T> *>(storage.data());
3148             exec(it, tin, out, tdatav, *plan, fct);
3149             }
3150 #endif
3151         while (it.remaining()>0)
3152           {
3153           it.advance(1);
3154           auto buf = allow_inplace && it.stride_out() == sizeof(T) ?
3155             &out[it.oofs(0)] : reinterpret_cast<T *>(storage.data());
3156           exec(it, tin, out, buf, *plan, fct);
3157           }
3158       });  // end of parallel region
3159     fct = T0(1); // factor has been applied, use 1 for remaining axes
3160     }
3161   }
3162 
3163 struct ExecC2C
3164   {
3165   bool forward;
3166 
3167   template <typename T0, typename T, size_t vlen> void operator () (
3168     const multi_iter<vlen> &it, const cndarr<cmplx<T0>> &in,
3169     ndarr<cmplx<T0>> &out, T * buf, const pocketfft_c<T0> &plan, T0 fct) const
3170     {
3171     copy_input(it, in, buf);
3172     plan.exec(buf, fct, forward);
3173     copy_output(it, buf, out);
3174     }
3175   };
3176 
3177 template <typename T, size_t vlen> void copy_hartley(const multi_iter<vlen> &it,
3178   const vtype_t<T> *POCKETFFT_RESTRICT src, ndarr<T> &dst)
3179   {
3180   for (size_t j=0; j<vlen; ++j)
3181     dst[it.oofs(j,0)] = src[0][j];
3182   size_t i=1, i1=1, i2=it.length_out()-1;
3183   for (i=1; i<it.length_out()-1; i+=2, ++i1, --i2)
3184     for (size_t j=0; j<vlen; ++j)
3185       {
3186         dst[it.oofs(j,i1)] = src[i][j]+src[i+1][j];
3187         dst[it.oofs(j,i2)] = src[i][j]-src[i+1][j];
3188       }
3189   if (i<it.length_out())
3190     for (size_t j=0; j<vlen; ++j)
3191       dst[it.oofs(j,i1)] = src[i][j];
3192   }
3193 
3194 template <typename T, size_t vlen> void copy_hartley(const multi_iter<vlen> &it,
3195   const T *POCKETFFT_RESTRICT src, ndarr<T> &dst)
3196   {
3197   dst[it.oofs(0)] = src[0];
3198   size_t i=1, i1=1, i2=it.length_out()-1;
3199   for (i=1; i<it.length_out()-1; i+=2, ++i1, --i2)
3200     {
3201     dst[it.oofs(i1)] = src[i]+src[i+1];
3202     dst[it.oofs(i2)] = src[i]-src[i+1];
3203     }
3204   if (i<it.length_out())
3205     dst[it.oofs(i1)] = src[i];
3206   }
3207 
3208 struct ExecHartley
3209   {
3210   template <typename T0, typename T, size_t vlen> void operator () (
3211     const multi_iter<vlen> &it, const cndarr<T0> &in, ndarr<T0> &out,
3212     T * buf, const pocketfft_r<T0> &plan, T0 fct) const
3213     {
3214     copy_input(it, in, buf);
3215     plan.exec(buf, fct, true);
3216     copy_hartley(it, buf, out);
3217     }
3218   };
3219 
3220 struct ExecDcst
3221   {
3222   bool ortho;
3223   int type;
3224   bool cosine;
3225 
3226   template <typename T0, typename T, typename Tplan, size_t vlen>
3227   void operator () (const multi_iter<vlen> &it, const cndarr<T0> &in,
3228     ndarr<T0> &out, T * buf, const Tplan &plan, T0 fct) const
3229     {
3230     copy_input(it, in, buf);
3231     plan.exec(buf, fct, ortho, type, cosine);
3232     copy_output(it, buf, out);
3233     }
3234   };
3235 
3236 template<typename T> POCKETFFT_NOINLINE void general_r2c(
3237   const cndarr<T> &in, ndarr<cmplx<T>> &out, size_t axis, bool forward, T fct,
3238   size_t nthreads)
3239   {
3240   auto plan = get_plan<pocketfft_r<T>>(in.shape(axis));
3241   size_t len=in.shape(axis);
3242   threading::thread_map(
3243     util::thread_count(nthreads, in.shape(), axis, VLEN<T>::val),
3244     [&] {
3245     constexpr auto vlen = VLEN<T>::val;
3246     auto storage = alloc_tmp<T>(in.shape(), len, sizeof(T));
3247     multi_iter<vlen> it(in, out, axis);
3248 #ifndef POCKETFFT_NO_VECTORS
3249     if (vlen>1)
3250       while (it.remaining()>=vlen)
3251         {
3252         it.advance(vlen);
3253         auto tdatav = reinterpret_cast<vtype_t<T> *>(storage.data());
3254         copy_input(it, in, tdatav);
3255         plan->exec(tdatav, fct, true);
3256         for (size_t j=0; j<vlen; ++j)
3257           out[it.oofs(j,0)].Set(tdatav[0][j]);
3258         size_t i=1, ii=1;
3259         if (forward)
3260           for (; i<len-1; i+=2, ++ii)
3261             for (size_t j=0; j<vlen; ++j)
3262               out[it.oofs(j,ii)].Set(tdatav[i][j], tdatav[i+1][j]);
3263         else
3264           for (; i<len-1; i+=2, ++ii)
3265             for (size_t j=0; j<vlen; ++j)
3266               out[it.oofs(j,ii)].Set(tdatav[i][j], -tdatav[i+1][j]);
3267         if (i<len)
3268           for (size_t j=0; j<vlen; ++j)
3269             out[it.oofs(j,ii)].Set(tdatav[i][j]);
3270         }
3271 #endif
3272     while (it.remaining()>0)
3273       {
3274       it.advance(1);
3275       auto tdata = reinterpret_cast<T *>(storage.data());
3276       copy_input(it, in, tdata);
3277       plan->exec(tdata, fct, true);
3278       out[it.oofs(0)].Set(tdata[0]);
3279       size_t i=1, ii=1;
3280       if (forward)
3281         for (; i<len-1; i+=2, ++ii)
3282           out[it.oofs(ii)].Set(tdata[i], tdata[i+1]);
3283       else
3284         for (; i<len-1; i+=2, ++ii)
3285           out[it.oofs(ii)].Set(tdata[i], -tdata[i+1]);
3286       if (i<len)
3287         out[it.oofs(ii)].Set(tdata[i]);
3288       }
3289     });  // end of parallel region
3290   }
3291 template<typename T> POCKETFFT_NOINLINE void general_c2r(
3292   const cndarr<cmplx<T>> &in, ndarr<T> &out, size_t axis, bool forward, T fct,
3293   size_t nthreads)
3294   {
3295   auto plan = get_plan<pocketfft_r<T>>(out.shape(axis));
3296   size_t len=out.shape(axis);
3297   threading::thread_map(
3298     util::thread_count(nthreads, in.shape(), axis, VLEN<T>::val),
3299     [&] {
3300       constexpr auto vlen = VLEN<T>::val;
3301       auto storage = alloc_tmp<T>(out.shape(), len, sizeof(T));
3302       multi_iter<vlen> it(in, out, axis);
3303 #ifndef POCKETFFT_NO_VECTORS
3304       if (vlen>1)
3305         while (it.remaining()>=vlen)
3306           {
3307           it.advance(vlen);
3308           auto tdatav = reinterpret_cast<vtype_t<T> *>(storage.data());
3309           for (size_t j=0; j<vlen; ++j)
3310             tdatav[0][j]=in[it.iofs(j,0)].r;
3311           {
3312           size_t i=1, ii=1;
3313           if (forward)
3314             for (; i<len-1; i+=2, ++ii)
3315               for (size_t j=0; j<vlen; ++j)
3316                 {
3317                 tdatav[i  ][j] =  in[it.iofs(j,ii)].r;
3318                 tdatav[i+1][j] = -in[it.iofs(j,ii)].i;
3319                 }
3320           else
3321             for (; i<len-1; i+=2, ++ii)
3322               for (size_t j=0; j<vlen; ++j)
3323                 {
3324                 tdatav[i  ][j] = in[it.iofs(j,ii)].r;
3325                 tdatav[i+1][j] = in[it.iofs(j,ii)].i;
3326                 }
3327           if (i<len)
3328             for (size_t j=0; j<vlen; ++j)
3329               tdatav[i][j] = in[it.iofs(j,ii)].r;
3330           }
3331           plan->exec(tdatav, fct, false);
3332           copy_output(it, tdatav, out);
3333           }
3334 #endif
3335       while (it.remaining()>0)
3336         {
3337         it.advance(1);
3338         auto tdata = reinterpret_cast<T *>(storage.data());
3339         tdata[0]=in[it.iofs(0)].r;
3340         {
3341         size_t i=1, ii=1;
3342         if (forward)
3343           for (; i<len-1; i+=2, ++ii)
3344             {
3345             tdata[i  ] =  in[it.iofs(ii)].r;
3346             tdata[i+1] = -in[it.iofs(ii)].i;
3347             }
3348         else
3349           for (; i<len-1; i+=2, ++ii)
3350             {
3351             tdata[i  ] = in[it.iofs(ii)].r;
3352             tdata[i+1] = in[it.iofs(ii)].i;
3353             }
3354         if (i<len)
3355           tdata[i] = in[it.iofs(ii)].r;
3356         }
3357         plan->exec(tdata, fct, false);
3358         copy_output(it, tdata, out);
3359         }
3360     });  // end of parallel region
3361   }
3362 
3363 struct ExecR2R
3364   {
3365   bool r2c, forward;
3366 
3367   template <typename T0, typename T, size_t vlen> void operator () (
3368     const multi_iter<vlen> &it, const cndarr<T0> &in, ndarr<T0> &out, T * buf,
3369     const pocketfft_r<T0> &plan, T0 fct) const
3370     {
3371     copy_input(it, in, buf);
3372     if ((!r2c) && forward)
3373       for (size_t i=2; i<it.length_out(); i+=2)
3374         buf[i] = -buf[i];
3375     plan.exec(buf, fct, forward);
3376     if (r2c && (!forward))
3377       for (size_t i=2; i<it.length_out(); i+=2)
3378         buf[i] = -buf[i];
3379     copy_output(it, buf, out);
3380     }
3381   };
3382 
3383 template<typename T> void c2c(const shape_t &shape, const stride_t &stride_in,
3384   const stride_t &stride_out, const shape_t &axes, bool forward,
3385   const std::complex<T> *data_in, std::complex<T> *data_out, T fct,
3386   size_t nthreads=1)
3387   {
3388   if (util::prod(shape)==0) return;
3389   util::sanity_check(shape, stride_in, stride_out, data_in==data_out, axes);
3390   cndarr<cmplx<T>> ain(data_in, shape, stride_in);
3391   ndarr<cmplx<T>> aout(data_out, shape, stride_out);
3392   general_nd<pocketfft_c<T>>(ain, aout, axes, fct, nthreads, ExecC2C{forward});
3393   }
3394 
3395 template<typename T> void dct(const shape_t &shape,
3396   const stride_t &stride_in, const stride_t &stride_out, const shape_t &axes,
3397   int type, const T *data_in, T *data_out, T fct, bool ortho, size_t nthreads=1)
3398   {
3399   if ((type<1) || (type>4)) throw std::invalid_argument("invalid DCT type");
3400   if (util::prod(shape)==0) return;
3401   util::sanity_check(shape, stride_in, stride_out, data_in==data_out, axes);
3402   cndarr<T> ain(data_in, shape, stride_in);
3403   ndarr<T> aout(data_out, shape, stride_out);
3404   const ExecDcst exec{ortho, type, true};
3405   if (type==1)
3406     general_nd<T_dct1<T>>(ain, aout, axes, fct, nthreads, exec);
3407   else if (type==4)
3408     general_nd<T_dcst4<T>>(ain, aout, axes, fct, nthreads, exec);
3409   else
3410     general_nd<T_dcst23<T>>(ain, aout, axes, fct, nthreads, exec);
3411   }
3412 
3413 template<typename T> void dst(const shape_t &shape,
3414   const stride_t &stride_in, const stride_t &stride_out, const shape_t &axes,
3415   int type, const T *data_in, T *data_out, T fct, bool ortho, size_t nthreads=1)
3416   {
3417   if ((type<1) || (type>4)) throw std::invalid_argument("invalid DST type");
3418   if (util::prod(shape)==0) return;
3419   util::sanity_check(shape, stride_in, stride_out, data_in==data_out, axes);
3420   cndarr<T> ain(data_in, shape, stride_in);
3421   ndarr<T> aout(data_out, shape, stride_out);
3422   const ExecDcst exec{ortho, type, false};
3423   if (type==1)
3424     general_nd<T_dst1<T>>(ain, aout, axes, fct, nthreads, exec);
3425   else if (type==4)
3426     general_nd<T_dcst4<T>>(ain, aout, axes, fct, nthreads, exec);
3427   else
3428     general_nd<T_dcst23<T>>(ain, aout, axes, fct, nthreads, exec);
3429   }
3430 
3431 template<typename T> void r2c(const shape_t &shape_in,
3432   const stride_t &stride_in, const stride_t &stride_out, size_t axis,
3433   bool forward, const T *data_in, std::complex<T> *data_out, T fct,
3434   size_t nthreads=1)
3435   {
3436   if (util::prod(shape_in)==0) return;
3437   util::sanity_check(shape_in, stride_in, stride_out, false, axis);
3438   cndarr<T> ain(data_in, shape_in, stride_in);
3439   shape_t shape_out(shape_in);
3440   shape_out[axis] = shape_in[axis]/2 + 1;
3441   ndarr<cmplx<T>> aout(data_out, shape_out, stride_out);
3442   general_r2c(ain, aout, axis, forward, fct, nthreads);
3443   }
3444 
3445 template<typename T> void r2c(const shape_t &shape_in,
3446   const stride_t &stride_in, const stride_t &stride_out, const shape_t &axes,
3447   bool forward, const T *data_in, std::complex<T> *data_out, T fct,
3448   size_t nthreads=1)
3449   {
3450   if (util::prod(shape_in)==0) return;
3451   util::sanity_check(shape_in, stride_in, stride_out, false, axes);
3452   r2c(shape_in, stride_in, stride_out, axes.back(), forward, data_in, data_out,
3453     fct, nthreads);
3454   if (axes.size()==1) return;
3455 
3456   shape_t shape_out(shape_in);
3457   shape_out[axes.back()] = shape_in[axes.back()]/2 + 1;
3458   auto newaxes = shape_t{axes.begin(), --axes.end()};
3459   c2c(shape_out, stride_out, stride_out, newaxes, forward, data_out, data_out,
3460     T(1), nthreads);
3461   }
3462 
3463 template<typename T> void c2r(const shape_t &shape_out,
3464   const stride_t &stride_in, const stride_t &stride_out, size_t axis,
3465   bool forward, const std::complex<T> *data_in, T *data_out, T fct,
3466   size_t nthreads=1)
3467   {
3468   if (util::prod(shape_out)==0) return;
3469   util::sanity_check(shape_out, stride_in, stride_out, false, axis);
3470   shape_t shape_in(shape_out);
3471   shape_in[axis] = shape_out[axis]/2 + 1;
3472   cndarr<cmplx<T>> ain(data_in, shape_in, stride_in);
3473   ndarr<T> aout(data_out, shape_out, stride_out);
3474   general_c2r(ain, aout, axis, forward, fct, nthreads);
3475   }
3476 
3477 template<typename T> void c2r(const shape_t &shape_out,
3478   const stride_t &stride_in, const stride_t &stride_out, const shape_t &axes,
3479   bool forward, const std::complex<T> *data_in, T *data_out, T fct,
3480   size_t nthreads=1)
3481   {
3482   if (util::prod(shape_out)==0) return;
3483   if (axes.size()==1)
3484     return c2r(shape_out, stride_in, stride_out, axes[0], forward,
3485       data_in, data_out, fct, nthreads);
3486   util::sanity_check(shape_out, stride_in, stride_out, false, axes);
3487   auto shape_in = shape_out;
3488   shape_in[axes.back()] = shape_out[axes.back()]/2 + 1;
3489   auto nval = util::prod(shape_in);
3490   stride_t stride_inter(shape_in.size());
3491   stride_inter.back() = sizeof(cmplx<T>);
3492   for (int i=int(shape_in.size())-2; i>=0; --i)
3493     stride_inter[size_t(i)] =
3494       stride_inter[size_t(i+1)]*ptrdiff_t(shape_in[size_t(i+1)]);
3495   arr<std::complex<T>> tmp(nval);
3496   auto newaxes = shape_t{axes.begin(), --axes.end()};
3497   c2c(shape_in, stride_in, stride_inter, newaxes, forward, data_in, tmp.data(),
3498     T(1), nthreads);
3499   c2r(shape_out, stride_inter, stride_out, axes.back(), forward,
3500     tmp.data(), data_out, fct, nthreads);
3501   }
3502 
3503 template<typename T> void r2r_fftpack(const shape_t &shape,
3504   const stride_t &stride_in, const stride_t &stride_out, const shape_t &axes,
3505   bool real2hermitian, bool forward, const T *data_in, T *data_out, T fct,
3506   size_t nthreads=1)
3507   {
3508   if (util::prod(shape)==0) return;
3509   util::sanity_check(shape, stride_in, stride_out, data_in==data_out, axes);
3510   cndarr<T> ain(data_in, shape, stride_in);
3511   ndarr<T> aout(data_out, shape, stride_out);
3512   general_nd<pocketfft_r<T>>(ain, aout, axes, fct, nthreads,
3513     ExecR2R{real2hermitian, forward});
3514   }
3515 
3516 template<typename T> void r2r_separable_hartley(const shape_t &shape,
3517   const stride_t &stride_in, const stride_t &stride_out, const shape_t &axes,
3518   const T *data_in, T *data_out, T fct, size_t nthreads=1)
3519   {
3520   if (util::prod(shape)==0) return;
3521   util::sanity_check(shape, stride_in, stride_out, data_in==data_out, axes);
3522   cndarr<T> ain(data_in, shape, stride_in);
3523   ndarr<T> aout(data_out, shape, stride_out);
3524   general_nd<pocketfft_r<T>>(ain, aout, axes, fct, nthreads, ExecHartley{},
3525     false);
3526   }
3527 
3528 template<typename T> void r2r_genuine_hartley(const shape_t &shape,
3529   const stride_t &stride_in, const stride_t &stride_out, const shape_t &axes,
3530   const T *data_in, T *data_out, T fct, size_t nthreads=1)
3531   {
3532   if (util::prod(shape)==0) return;
3533   if (axes.size()==1)
3534     return r2r_separable_hartley(shape, stride_in, stride_out, axes, data_in,
3535       data_out, fct, nthreads);
3536   util::sanity_check(shape, stride_in, stride_out, data_in==data_out, axes);
3537   shape_t tshp(shape);
3538   tshp[axes.back()] = tshp[axes.back()]/2+1;
3539   arr<std::complex<T>> tdata(util::prod(tshp));
3540   stride_t tstride(shape.size());
3541   tstride.back()=sizeof(std::complex<T>);
3542   for (size_t i=tstride.size()-1; i>0; --i)
3543     tstride[i-1]=tstride[i]*ptrdiff_t(tshp[i]);
3544   r2c(shape, stride_in, tstride, axes, true, data_in, tdata.data(), fct, nthreads);
3545   cndarr<cmplx<T>> atmp(tdata.data(), tshp, tstride);
3546   ndarr<T> aout(data_out, shape, stride_out);
3547   simple_iter iin(atmp);
3548   rev_iter iout(aout, axes);
3549   while(iin.remaining()>0)
3550     {
3551     auto v = atmp[iin.ofs()];
3552     aout[iout.ofs()] = v.r+v.i;
3553     aout[iout.rev_ofs()] = v.r-v.i;
3554     iin.advance(); iout.advance();
3555     }
3556   }
3557 
3558 } // namespace detail
3559 
3560 using detail::FORWARD;
3561 using detail::BACKWARD;
3562 using detail::shape_t;
3563 using detail::stride_t;
3564 using detail::c2c;
3565 using detail::c2r;
3566 using detail::r2c;
3567 using detail::r2r_fftpack;
3568 using detail::r2r_separable_hartley;
3569 using detail::r2r_genuine_hartley;
3570 using detail::dct;
3571 using detail::dst;
3572 
3573 } // namespace pocketfft
3574 
3575 #undef POCKETFFT_NOINLINE
3576 #undef POCKETFFT_RESTRICT
3577 
3578 #endif // POCKETFFT_HDRONLY_H
3579