1 /*
2 This file is part of pocketfft.
3 
4 Copyright (C) 2010-2021 Max-Planck-Society
5 Copyright (C) 2019 Peter Bell
6 
7 Authors: Martin Reinecke, Peter Bell
8 
9 All rights reserved.
10 
11 Redistribution and use in source and binary forms, with or without modification,
12 are permitted provided that the following conditions are met:
13 
14 * Redistributions of source code must retain the above copyright notice, this
15   list of conditions and the following disclaimer.
16 * Redistributions in binary form must reproduce the above copyright notice, this
17   list of conditions and the following disclaimer in the documentation and/or
18   other materials provided with the distribution.
19 * Neither the name of the copyright holder nor the names of its contributors may
20   be used to endorse or promote products derived from this software without
21   specific prior written permission.
22 
23 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
24 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
25 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
26 DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
27 ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
28 (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
29 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
30 ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
31 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
32 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33 */
34 
35 #ifndef DUCC0_FFT1D_H
36 #define DUCC0_FFT1D_H
37 
38 #include <memory>
39 #include <cstddef>
40 #include <algorithm>
41 #include <functional>
42 #include <type_traits>
43 #include <utility>
44 #include <vector>
45 #include <typeinfo>
46 #include <typeindex>
47 #include "ducc0/infra/useful_macros.h"
48 #include "ducc0/math/cmplx.h"
49 #include "ducc0/infra/error_handling.h"
50 #include "ducc0/infra/aligned_array.h"
51 #include "ducc0/infra/simd.h"
52 #include "ducc0/infra/threading.h"
53 #include "ducc0/math/unity_roots.h"
54 
55 //#define DUCC0_USE_PROPER_HARTLEY_CONVENTION
56 
57 namespace ducc0 {
58 
59 namespace detail_fft {
60 
61 using namespace std;
62 
63 template<typename T> constexpr inline size_t fft1d_simdlen
64   = min<size_t>(8, native_simd<T>::size());
65 template<> constexpr inline size_t fft1d_simdlen<double>
66   = min<size_t>(4, native_simd<double>::size());
67 template<> constexpr inline size_t fft1d_simdlen<float>
68   = min<size_t>(8, native_simd<float>::size());
69 template<typename T> using fft1d_simd = typename simd_select<T,fft1d_simdlen<T>>::type;
70 template<typename T> constexpr inline bool fft1d_simd_exists = (fft1d_simdlen<T> > 1);
71 
72 // Always use std:: for <cmath> functions
73 template <typename T> T cos(T) = delete;
74 template <typename T> T sin(T) = delete;
75 template <typename T> T sqrt(T) = delete;
76 
77 template<typename T> inline void PM(T &a, T &b, T c, T d)
78   { a=c+d; b=c-d; }
79 template<typename T> inline void PMINPLACE(T &a, T &b)
80   { T t = a; a+=b; b=t-b; }
81 template<typename T> inline void MPINPLACE(T &a, T &b)
82   { T t = a; a-=b; b=t+b; }
83 template<bool fwd, typename T, typename T2> void special_mul (const Cmplx<T> &v1, const Cmplx<T2> &v2, Cmplx<T> &res)
84   {
85   res = fwd ? Cmplx<T>(v1.r*v2.r+v1.i*v2.i, v1.i*v2.r-v1.r*v2.i)
86             : Cmplx<T>(v1.r*v2.r-v1.i*v2.i, v1.r*v2.i+v1.i*v2.r);
87   }
88 
89 template<bool fwd, typename T> void ROTX90(Cmplx<T> &a)
90   { auto tmp_= fwd ? -a.r : a.r; a.r = fwd ? a.i : -a.i; a.i=tmp_; }
91 
92 template<typename T> inline auto tidx() { return type_index(typeid(T)); }
93 
94 struct util1d // hack to avoid duplicate symbols
95   {
96   /* returns the smallest composite of 2, 3, 5, 7 and 11 which is >= n */
97   DUCC0_NOINLINE static size_t good_size_cmplx(size_t n)
98     {
99     if (n<=12) return n;
100 
101     size_t bestfac=2*n;
102     for (size_t f11=1; f11<bestfac; f11*=11)
103       for (size_t f117=f11; f117<bestfac; f117*=7)
104         for (size_t f1175=f117; f1175<bestfac; f1175*=5)
105           {
106           size_t x=f1175;
107           while (x<n) x*=2;
108           for (;;)
109             {
110             if (x<n)
111               x*=3;
112             else if (x>n)
113               {
114               if (x<bestfac) bestfac=x;
115               if (x&1) break;
116               x>>=1;
117               }
118             else
119               return n;
120             }
121           }
122     return bestfac;
123     }
124 
125   /* returns the smallest composite of 2, 3, 5 which is >= n */
126   DUCC0_NOINLINE static size_t good_size_real(size_t n)
127     {
128     if (n<=6) return n;
129 
130     size_t bestfac=2*n;
131     for (size_t f5=1; f5<bestfac; f5*=5)
132       {
133       size_t x = f5;
134       while (x<n) x *= 2;
135       for (;;)
136         {
137         if (x<n)
138           x*=3;
139         else if (x>n)
140           {
141           if (x<bestfac) bestfac=x;
142           if (x&1) break;
143           x>>=1;
144           }
145         else
146           return n;
147         }
148       }
149     return bestfac;
150     }
151 
152   DUCC0_NOINLINE static vector<size_t> prime_factors(size_t N)
153     {
154     MR_assert(N>0, "need a positive number");
155     vector<size_t> factors;
156     while ((N&1)==0)
157       { N>>=1; factors.push_back(2); }
158     for (size_t divisor=3; divisor*divisor<=N; divisor+=2)
159     while ((N%divisor)==0)
160       {
161       factors.push_back(divisor);
162       N/=divisor;
163       }
164     if (N>1) factors.push_back(N);
165     return factors;
166     }
167   };
168 
169 template<typename T> using Troots = shared_ptr<const UnityRoots<T,Cmplx<T>>>;
170 
171 // T: "type", f/c: "float/complex", s/v: "scalar/vector"
172 template <typename Tfs> class cfftpass
173   {
174   public:
175     virtual ~cfftpass(){}
176     using Tcs = Cmplx<Tfs>;
177 
178     // number of Tcd values required as scratch space during "exec"
179     // will be provided in "buf"
180     virtual size_t bufsize() const = 0;
181     virtual bool needs_copy() const = 0;
182     virtual void *exec(const type_index &ti, void *in, void *copy, void *buf,
183       bool fwd, size_t nthreads=1) const = 0;
184 
185     static vector<size_t> factorize(size_t N)
186       {
187       MR_assert(N>0, "need a positive number");
188       vector<size_t> factors;
189       factors.reserve(15);
190       while ((N&3)==0)
191         { factors.push_back(4); N>>=2; }
192       if ((N&1)==0)
193         {
194         N>>=1;
195         // factor 2 should be at the front of the factor list
196         factors.push_back(2);
197         swap(factors[0], factors.back());
198         }
199       for (size_t divisor=3; divisor*divisor<=N; divisor+=2)
200       while ((N%divisor)==0)
201         {
202         factors.push_back(divisor);
203         N/=divisor;
204         }
205       if (N>1) factors.push_back(N);
206       return factors;
207       }
208 
209     static shared_ptr<cfftpass> make_pass(size_t l1, size_t ido, size_t ip,
210       const Troots<Tfs> &roots, bool vectorize=false);
211     static shared_ptr<cfftpass> make_pass(size_t ip, bool vectorize=false)
212       {
213       return make_pass(1,1,ip,make_shared<UnityRoots<Tfs,Cmplx<Tfs>>>(ip),
214         vectorize);
215       }
216   };
217 
218 #define POCKETFFT_EXEC_DISPATCH \
219     virtual void *exec(const type_index &ti, void *in, void *copy, void *buf, \
220       bool fwd, size_t nthreads=1) const \
221       { \
222       static const auto tics = tidx<Tcs *>(); \
223       if (ti==tics) \
224         { \
225         auto in1 = static_cast<Tcs *>(in); \
226         auto copy1 = static_cast<Tcs *>(copy); \
227         auto buf1 = static_cast<Tcs *>(buf); \
228         return fwd ? exec_<true>(in1, copy1, buf1, nthreads) \
229                    : exec_<false>(in1, copy1, buf1, nthreads); \
230         } \
231       if constexpr (fft1d_simdlen<Tfs> > 1) \
232         if constexpr (simd_exists<Tfs, fft1d_simdlen<Tfs>>) \
233           { \
234           using Tfv = typename simd_select<Tfs, fft1d_simdlen<Tfs>>::type; \
235           using Tcv = Cmplx<Tfv>; \
236           static const auto ticv = tidx<Tcv *>(); \
237           if (ti==ticv) \
238             {  \
239             auto in1 = static_cast<Tcv *>(in); \
240             auto copy1 = static_cast<Tcv *>(copy); \
241             auto buf1 = static_cast<Tcv *>(buf); \
242             return fwd ? exec_<true>(in1, copy1, buf1, nthreads) \
243                        : exec_<false>(in1, copy1, buf1, nthreads); \
244             } \
245           } \
246       if constexpr (fft1d_simdlen<Tfs> > 2) \
247         if constexpr (simd_exists<Tfs, fft1d_simdlen<Tfs>/2>) \
248           { \
249           using Tfv = typename simd_select<Tfs, fft1d_simdlen<Tfs>/2>::type; \
250           using Tcv = Cmplx<Tfv>; \
251           static const auto ticv = tidx<Tcv *>(); \
252           if (ti==ticv) \
253             {  \
254             auto in1 = static_cast<Tcv *>(in); \
255             auto copy1 = static_cast<Tcv *>(copy); \
256             auto buf1 = static_cast<Tcv *>(buf); \
257             return fwd ? exec_<true>(in1, copy1, buf1, nthreads) \
258                        : exec_<false>(in1, copy1, buf1, nthreads); \
259             } \
260           } \
261       if constexpr (fft1d_simdlen<Tfs> > 4) \
262         if constexpr (simd_exists<Tfs, fft1d_simdlen<Tfs>/4>) \
263           { \
264           using Tfv = typename simd_select<Tfs, fft1d_simdlen<Tfs>/4>::type; \
265           using Tcv = Cmplx<Tfv>; \
266           static const auto ticv = tidx<Tcv *>(); \
267           if (ti==ticv) \
268             {  \
269             auto in1 = static_cast<Tcv *>(in); \
270             auto copy1 = static_cast<Tcv *>(copy); \
271             auto buf1 = static_cast<Tcv *>(buf); \
272             return fwd ? exec_<true>(in1, copy1, buf1, nthreads) \
273                        : exec_<false>(in1, copy1, buf1, nthreads); \
274             } \
275           } \
276       if constexpr (fft1d_simdlen<Tfs> > 8) \
277         if constexpr (simd_exists<Tfs, fft1d_simdlen<Tfs>/8>) \
278           { \
279           using Tfv = typename simd_select<Tfs, fft1d_simdlen<Tfs>/8>::type; \
280           using Tcv = Cmplx<Tfv>; \
281           static const auto ticv = tidx<Tcv *>(); \
282           if (ti==ticv) \
283             {  \
284             auto in1 = static_cast<Tcv *>(in); \
285             auto copy1 = static_cast<Tcv *>(copy); \
286             auto buf1 = static_cast<Tcv *>(buf); \
287             return fwd ? exec_<true>(in1, copy1, buf1, nthreads) \
288                        : exec_<false>(in1, copy1, buf1, nthreads); \
289             } \
290           } \
291       MR_fail("impossible vector length requested"); \
292       }
293 
294 template<typename T> using Tcpass = shared_ptr<cfftpass<T>>;
295 
296 template <typename Tfs> class cfftp1: public cfftpass<Tfs>
297   {
298   public:
299     cfftp1() {}
300     virtual size_t bufsize() const { return 0; }
301     virtual bool needs_copy() const { return false; }
302 
303     virtual void *exec(const type_index & /*ti*/, void * in, void * /*copy*/,
304       void * /*buf*/, bool /*fwd*/, size_t /*nthreads*/) const
305       { return in; }
306   };
307 
308 template <typename Tfs> class cfftp2: public cfftpass<Tfs>
309   {
310   private:
311     using typename cfftpass<Tfs>::Tcs;
312 
313     size_t l1, ido;
314     static constexpr size_t ip=2;
315     quick_array<Tcs> wa;
316 
317     auto WA(size_t i) const
318       { return wa[i-1]; }
319 
320     template<bool fwd, typename Tcd> Tcd *exec_ (const Tcd * DUCC0_RESTRICT cc,
321       Tcd * DUCC0_RESTRICT ch, Tcd * /*buf*/, size_t /*nthreads*/) const
322       {
323       if (ido==1)
324         {
325         auto CH = [ch,this](size_t b, size_t c) -> Tcd&
326           { return ch[b+l1*c]; };
327         auto CC = [cc](size_t b, size_t c) -> const Tcd&
328           { return cc[b+ip*c]; };
329         for (size_t k=0; k<l1; ++k)
330           {
331           CH(k,0) = CC(0,k)+CC(1,k);
332           CH(k,1) = CC(0,k)-CC(1,k);
333           }
334         return ch;
335         }
336       else
337         {
338         auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tcd&
339           { return ch[a+ido*(b+l1*c)]; };
340         auto CC = [cc,this](size_t a, size_t b, size_t c) -> const Tcd&
341           { return cc[a+ido*(b+ip*c)]; };
342         for (size_t k=0; k<l1; ++k)
343           {
344           CH(0,k,0) = CC(0,0,k)+CC(0,1,k);
345           CH(0,k,1) = CC(0,0,k)-CC(0,1,k);
346           for (size_t i=1; i<ido; ++i)
347             {
348             CH(i,k,0) = CC(i,0,k)+CC(i,1,k);
349             special_mul<fwd>(CC(i,0,k)-CC(i,1,k),WA(i),CH(i,k,1));
350             }
351           }
352         return ch;
353         }
354       }
355 
356   public:
357     cfftp2(size_t l1_, size_t ido_, const Troots<Tfs> &roots)
358       : l1(l1_), ido(ido_), wa((ip-1)*(ido-1))
359       {
360       size_t N=ip*l1*ido;
361       size_t rfct = roots->size()/N;
362       MR_assert(roots->size()==N*rfct, "mismatch");
363       for (size_t i=1; i<ido; ++i)
364         wa[i-1] = (*roots)[rfct*l1*i];
365       }
366 
367     virtual size_t bufsize() const { return 0; }
368     virtual bool needs_copy() const { return true; }
369 
370     POCKETFFT_EXEC_DISPATCH
371   };
372 
373 template <typename Tfs> class cfftp3: public cfftpass<Tfs>
374   {
375   private:
376     using typename cfftpass<Tfs>::Tcs;
377 
378     size_t l1, ido;
379     static constexpr size_t ip=3;
380     quick_array<Tcs> wa;
381 
382     auto WA(size_t x, size_t i) const
383       { return wa[x+(i-1)*(ip-1)]; }
384 
385     template<bool fwd, typename Tcd> Tcd *exec_
386       (const Tcd * DUCC0_RESTRICT cc, Tcd * DUCC0_RESTRICT ch, Tcd * /*buf*/,
387       size_t /*nthreads*/) const
388       {
389       constexpr Tfs tw1r=-0.5,
390                     tw1i= (fwd ? -1: 1) * Tfs(0.8660254037844386467637231707529362L);
391 
392       auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tcd&
393         { return ch[a+ido*(b+l1*c)]; };
394       auto CC = [cc,this](size_t a, size_t b, size_t c) -> const Tcd&
395         { return cc[a+ido*(b+ip*c)]; };
396 
397 #define POCKETFFT_PREP3(idx) \
398         Tcd t0 = CC(idx,0,k), t1, t2; \
399         PM (t1,t2,CC(idx,1,k),CC(idx,2,k)); \
400         CH(idx,k,0)=t0+t1;
401 #define POCKETFFT_PARTSTEP3a(u1,u2,twr,twi) \
402         { \
403         Tcd ca=t0+t1*twr; \
404         Tcd cb{-t2.i*twi, t2.r*twi}; \
405         PM(CH(0,k,u1),CH(0,k,u2),ca,cb) ;\
406         }
407 #define POCKETFFT_PARTSTEP3b(u1,u2,twr,twi) \
408         { \
409         Tcd ca=t0+t1*twr; \
410         Tcd cb{-t2.i*twi, t2.r*twi}; \
411         special_mul<fwd>(ca+cb,WA(u1-1,i),CH(i,k,u1)); \
412         special_mul<fwd>(ca-cb,WA(u2-1,i),CH(i,k,u2)); \
413         }
414 
415       if (ido==1)
416         for (size_t k=0; k<l1; ++k)
417           {
418           POCKETFFT_PREP3(0)
419           POCKETFFT_PARTSTEP3a(1,2,tw1r,tw1i)
420           }
421       else
422         for (size_t k=0; k<l1; ++k)
423           {
424           {
425           POCKETFFT_PREP3(0)
426           POCKETFFT_PARTSTEP3a(1,2,tw1r,tw1i)
427           }
428           for (size_t i=1; i<ido; ++i)
429             {
430             POCKETFFT_PREP3(i)
431             POCKETFFT_PARTSTEP3b(1,2,tw1r,tw1i)
432             }
433           }
434 
435 #undef POCKETFFT_PARTSTEP3b
436 #undef POCKETFFT_PARTSTEP3a
437 #undef POCKETFFT_PREP3
438 
439       return ch;
440       }
441 
442   public:
443     cfftp3(size_t l1_, size_t ido_, const Troots<Tfs> &roots)
444       : l1(l1_), ido(ido_), wa((ip-1)*(ido-1))
445       {
446       size_t N=ip*l1*ido;
447       size_t rfct = roots->size()/N;
448       MR_assert(roots->size()==N*rfct, "mismatch");
449       for (size_t i=1; i<ido; ++i)
450         for (size_t j=1; j<ip; ++j)
451           wa[(j-1)+(i-1)*(ip-1)] = (*roots)[rfct*j*l1*i];
452       }
453 
454     virtual size_t bufsize() const { return 0; }
455     virtual bool needs_copy() const { return true; }
456 
457     POCKETFFT_EXEC_DISPATCH
458   };
459 
460 template <typename Tfs> class cfftp4: public cfftpass<Tfs>
461   {
462   private:
463     using typename cfftpass<Tfs>::Tcs;
464 
465     size_t l1, ido;
466     static constexpr size_t ip=4;
467     quick_array<Tcs> wa;
468 
469     auto WA(size_t x, size_t i) const
470       { return wa[x+(i-1)*(ip-1)]; }
471 
472     template<bool fwd, typename Tcd> Tcd *exec_
473       (const Tcd * DUCC0_RESTRICT cc, Tcd * DUCC0_RESTRICT ch, Tcd * /*buf*/,
474       size_t /*nthreads*/) const
475       {
476       if (ido==1)
477         {
478         auto CH = [ch,this](size_t b, size_t c) -> Tcd&
479           { return ch[b+l1*c]; };
480         auto CC = [cc](size_t b, size_t c) -> const Tcd&
481           { return cc[b+ip*c]; };
482         for (size_t k=0; k<l1; ++k)
483           {
484           Tcd t1, t2, t3, t4;
485           PM(t2,t1,CC(0,k),CC(2,k));
486           PM(t3,t4,CC(1,k),CC(3,k));
487           ROTX90<fwd>(t4);
488           PM(CH(k,0),CH(k,2),t2,t3);
489           PM(CH(k,1),CH(k,3),t1,t4);
490           }
491         }
492       else
493         {
494         auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tcd&
495           { return ch[a+ido*(b+l1*c)]; };
496         auto CC = [cc,this](size_t a, size_t b, size_t c) -> const Tcd&
497           { return cc[a+ido*(b+ip*c)]; };
498         for (size_t k=0; k<l1; ++k)
499           {
500           {
501           Tcd t1, t2, t3, t4;
502           PM(t2,t1,CC(0,0,k),CC(0,2,k));
503           PM(t3,t4,CC(0,1,k),CC(0,3,k));
504           ROTX90<fwd>(t4);
505           PM(CH(0,k,0),CH(0,k,2),t2,t3);
506           PM(CH(0,k,1),CH(0,k,3),t1,t4);
507           }
508           for (size_t i=1; i<ido; ++i)
509             {
510             Tcd t1, t2, t3, t4;
511             Tcd cc0=CC(i,0,k), cc1=CC(i,1,k),cc2=CC(i,2,k),cc3=CC(i,3,k);
512             PM(t2,t1,cc0,cc2);
513             PM(t3,t4,cc1,cc3);
514             ROTX90<fwd>(t4);
515             CH(i,k,0) = t2+t3;
516             special_mul<fwd>(t1+t4,WA(0,i),CH(i,k,1));
517             special_mul<fwd>(t2-t3,WA(1,i),CH(i,k,2));
518             special_mul<fwd>(t1-t4,WA(2,i),CH(i,k,3));
519             }
520           }
521         }
522       return ch;
523       }
524 
525   public:
526     cfftp4(size_t l1_, size_t ido_, const Troots<Tfs> &roots)
527       : l1(l1_), ido(ido_), wa((ip-1)*(ido-1))
528       {
529       size_t N=ip*l1*ido;
530       size_t rfct = roots->size()/N;
531       MR_assert(roots->size()==N*rfct, "mismatch");
532       for (size_t i=1; i<ido; ++i)
533         for (size_t j=1; j<ip; ++j)
534           wa[(j-1)+(i-1)*(ip-1)] = (*roots)[rfct*j*l1*i];
535       }
536 
537     virtual size_t bufsize() const { return 0; }
538     virtual bool needs_copy() const { return true; }
539 
540     POCKETFFT_EXEC_DISPATCH
541   };
542 
543 template <typename Tfs> class cfftp5: public cfftpass<Tfs>
544   {
545   private:
546     using typename cfftpass<Tfs>::Tcs;
547 
548     size_t l1, ido;
549     static constexpr size_t ip=5;
550     quick_array<Tcs> wa;
551 
552     auto WA(size_t x, size_t i) const
553       { return wa[x+(i-1)*(ip-1)]; }
554 
555     template<bool fwd, typename Tcd> Tcd *exec_
556       (const Tcd * DUCC0_RESTRICT cc, Tcd * DUCC0_RESTRICT ch, Tcd * /*buf*/,
557       size_t /*nthreads*/) const
558       {
559       constexpr Tfs tw1r= Tfs(0.3090169943749474241022934171828191L),
560                     tw1i= (fwd ? -1: 1) * Tfs(0.9510565162951535721164393333793821L),
561                     tw2r= Tfs(-0.8090169943749474241022934171828191L),
562                     tw2i= (fwd ? -1: 1) * Tfs(0.5877852522924731291687059546390728L);
563 
564       auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tcd&
565         { return ch[a+ido*(b+l1*c)]; };
566       auto CC = [cc,this](size_t a, size_t b, size_t c) -> const Tcd&
567         { return cc[a+ido*(b+ip*c)]; };
568 
569 #define POCKETFFT_PREP5(idx) \
570         Tcd t0 = CC(idx,0,k), t1, t2, t3, t4; \
571         PM (t1,t4,CC(idx,1,k),CC(idx,4,k)); \
572         PM (t2,t3,CC(idx,2,k),CC(idx,3,k)); \
573         CH(idx,k,0).r=t0.r+t1.r+t2.r; \
574         CH(idx,k,0).i=t0.i+t1.i+t2.i;
575 
576 #define POCKETFFT_PARTSTEP5a(u1,u2,twar,twbr,twai,twbi) \
577         { \
578         Tcd ca,cb; \
579         ca.r=t0.r+twar*t1.r+twbr*t2.r; \
580         ca.i=t0.i+twar*t1.i+twbr*t2.i; \
581         cb.i=twai*t4.r twbi*t3.r; \
582         cb.r=-(twai*t4.i twbi*t3.i); \
583         PM(CH(0,k,u1),CH(0,k,u2),ca,cb); \
584         }
585 
586 #define POCKETFFT_PARTSTEP5b(u1,u2,twar,twbr,twai,twbi) \
587         { \
588         Tcd ca,cb,da,db; \
589         ca.r=t0.r+twar*t1.r+twbr*t2.r; \
590         ca.i=t0.i+twar*t1.i+twbr*t2.i; \
591         cb.i=twai*t4.r twbi*t3.r; \
592         cb.r=-(twai*t4.i twbi*t3.i); \
593         special_mul<fwd>(ca+cb,WA(u1-1,i),CH(i,k,u1)); \
594         special_mul<fwd>(ca-cb,WA(u2-1,i),CH(i,k,u2)); \
595         }
596 
597       if (ido==1)
598         for (size_t k=0; k<l1; ++k)
599           {
600           POCKETFFT_PREP5(0)
601           POCKETFFT_PARTSTEP5a(1,4,tw1r,tw2r,+tw1i,+tw2i)
602           POCKETFFT_PARTSTEP5a(2,3,tw2r,tw1r,+tw2i,-tw1i)
603           }
604       else
605         for (size_t k=0; k<l1; ++k)
606           {
607           {
608           POCKETFFT_PREP5(0)
609           POCKETFFT_PARTSTEP5a(1,4,tw1r,tw2r,+tw1i,+tw2i)
610           POCKETFFT_PARTSTEP5a(2,3,tw2r,tw1r,+tw2i,-tw1i)
611           }
612           for (size_t i=1; i<ido; ++i)
613             {
614             POCKETFFT_PREP5(i)
615             POCKETFFT_PARTSTEP5b(1,4,tw1r,tw2r,+tw1i,+tw2i)
616             POCKETFFT_PARTSTEP5b(2,3,tw2r,tw1r,+tw2i,-tw1i)
617             }
618           }
619 
620 #undef POCKETFFT_PARTSTEP5b
621 #undef POCKETFFT_PARTSTEP5a
622 #undef POCKETFFT_PREP5
623 
624       return ch;
625       }
626 
627   public:
628     cfftp5(size_t l1_, size_t ido_, const Troots<Tfs> &roots)
629       : l1(l1_), ido(ido_), wa((ip-1)*(ido-1))
630       {
631       size_t N=ip*l1*ido;
632       auto rfct = roots->size()/N;
633       MR_assert(roots->size()==N*rfct, "mismatch");
634       for (size_t i=1; i<ido; ++i)
635         for (size_t j=1; j<ip; ++j)
636           wa[(j-1)+(i-1)*(ip-1)] = (*roots)[rfct*j*l1*i];
637       }
638 
639     virtual size_t bufsize() const { return 0; }
640     virtual bool needs_copy() const { return true; }
641 
642     POCKETFFT_EXEC_DISPATCH
643   };
644 
645 template <typename Tfs> class cfftp7: public cfftpass<Tfs>
646   {
647   private:
648     using typename cfftpass<Tfs>::Tcs;
649 
650     size_t l1, ido;
651     static constexpr size_t ip=7;
652     quick_array<Tcs> wa;
653 
654     auto WA(size_t x, size_t i) const
655       { return wa[x+(i-1)*(ip-1)]; }
656 
657     template<bool fwd, typename Tcd> Tcd *exec_
658       (const Tcd * DUCC0_RESTRICT cc, Tcd * DUCC0_RESTRICT ch, Tcd * /*buf*/,
659       size_t /*nthreads*/) const
660       {
661       constexpr Tfs tw1r= Tfs(0.6234898018587335305250048840042398L),
662                     tw1i= (fwd ? -1 : 1) * Tfs(0.7818314824680298087084445266740578L),
663                     tw2r= Tfs(-0.2225209339563144042889025644967948L),
664                     tw2i= (fwd ? -1 : 1) * Tfs(0.9749279121818236070181316829939312L),
665                     tw3r= Tfs(-0.9009688679024191262361023195074451L),
666                     tw3i= (fwd ? -1 : 1) * Tfs(0.433883739117558120475768332848359L);
667 
668       auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tcd&
669         { return ch[a+ido*(b+l1*c)]; };
670       auto CC = [cc,this](size_t a, size_t b, size_t c) -> const Tcd&
671         { return cc[a+ido*(b+ip*c)]; };
672 
673 #define POCKETFFT_PREP7(idx) \
674         Tcd t1 = CC(idx,0,k), t2, t3, t4, t5, t6, t7; \
675         PM (t2,t7,CC(idx,1,k),CC(idx,6,k)); \
676         PM (t3,t6,CC(idx,2,k),CC(idx,5,k)); \
677         PM (t4,t5,CC(idx,3,k),CC(idx,4,k)); \
678         CH(idx,k,0).r=t1.r+t2.r+t3.r+t4.r; \
679         CH(idx,k,0).i=t1.i+t2.i+t3.i+t4.i;
680 
681 #define POCKETFFT_PARTSTEP7a0(u1,u2,x1,x2,x3,y1,y2,y3,out1,out2) \
682         { \
683         Tcd ca,cb; \
684         ca.r=t1.r+x1*t2.r+x2*t3.r+x3*t4.r; \
685         ca.i=t1.i+x1*t2.i+x2*t3.i+x3*t4.i; \
686         cb.i=y1*t7.r y2*t6.r y3*t5.r; \
687         cb.r=-(y1*t7.i y2*t6.i y3*t5.i); \
688         PM(out1,out2,ca,cb); \
689         }
690 #define POCKETFFT_PARTSTEP7a(u1,u2,x1,x2,x3,y1,y2,y3) \
691         POCKETFFT_PARTSTEP7a0(u1,u2,x1,x2,x3,y1,y2,y3,CH(0,k,u1),CH(0,k,u2))
692 #define POCKETFFT_PARTSTEP7(u1,u2,x1,x2,x3,y1,y2,y3) \
693         { \
694         Tcd da,db; \
695         POCKETFFT_PARTSTEP7a0(u1,u2,x1,x2,x3,y1,y2,y3,da,db) \
696         special_mul<fwd>(da,WA(u1-1,i),CH(i,k,u1)); \
697         special_mul<fwd>(db,WA(u2-1,i),CH(i,k,u2)); \
698         }
699 
700       if (ido==1)
701         for (size_t k=0; k<l1; ++k)
702           {
703           POCKETFFT_PREP7(0)
704           POCKETFFT_PARTSTEP7a(1,6,tw1r,tw2r,tw3r,+tw1i,+tw2i,+tw3i)
705           POCKETFFT_PARTSTEP7a(2,5,tw2r,tw3r,tw1r,+tw2i,-tw3i,-tw1i)
706           POCKETFFT_PARTSTEP7a(3,4,tw3r,tw1r,tw2r,+tw3i,-tw1i,+tw2i)
707           }
708       else
709         for (size_t k=0; k<l1; ++k)
710           {
711           {
712           POCKETFFT_PREP7(0)
713           POCKETFFT_PARTSTEP7a(1,6,tw1r,tw2r,tw3r,+tw1i,+tw2i,+tw3i)
714           POCKETFFT_PARTSTEP7a(2,5,tw2r,tw3r,tw1r,+tw2i,-tw3i,-tw1i)
715           POCKETFFT_PARTSTEP7a(3,4,tw3r,tw1r,tw2r,+tw3i,-tw1i,+tw2i)
716           }
717           for (size_t i=1; i<ido; ++i)
718             {
719             POCKETFFT_PREP7(i)
720             POCKETFFT_PARTSTEP7(1,6,tw1r,tw2r,tw3r,+tw1i,+tw2i,+tw3i)
721             POCKETFFT_PARTSTEP7(2,5,tw2r,tw3r,tw1r,+tw2i,-tw3i,-tw1i)
722             POCKETFFT_PARTSTEP7(3,4,tw3r,tw1r,tw2r,+tw3i,-tw1i,+tw2i)
723             }
724           }
725 
726 #undef POCKETFFT_PARTSTEP7
727 #undef POCKETFFT_PARTSTEP7a0
728 #undef POCKETFFT_PARTSTEP7a
729 #undef POCKETFFT_PREP7
730 
731       return ch;
732       }
733 
734   public:
735     cfftp7(size_t l1_, size_t ido_, const Troots<Tfs> &roots)
736       : l1(l1_), ido(ido_), wa((ip-1)*(ido-1))
737       {
738       size_t N=ip*l1*ido;
739       auto rfct = roots->size()/N;
740       MR_assert(roots->size()==N*rfct, "mismatch");
741       for (size_t i=1; i<ido; ++i)
742         for (size_t j=1; j<ip; ++j)
743           wa[(j-1)+(i-1)*(ip-1)] = (*roots)[rfct*j*l1*i];
744       }
745 
746     virtual size_t bufsize() const { return 0; }
747     virtual bool needs_copy() const { return true; }
748 
749     POCKETFFT_EXEC_DISPATCH
750   };
751 
752 template <typename Tfs> class cfftp11: public cfftpass<Tfs>
753   {
754   private:
755     using typename cfftpass<Tfs>::Tcs;
756 
757     size_t l1, ido;
758     static constexpr size_t ip=11;
759     quick_array<Tcs> wa;
760 
761     auto WA(size_t x, size_t i) const
762       { return wa[x+(i-1)*(ip-1)]; }
763 
764     template<bool fwd, typename Tcd> [[gnu::hot]] Tcd *exec_
765       (const Tcd * DUCC0_RESTRICT cc, Tcd * DUCC0_RESTRICT ch, Tcd * /*buf*/,
766       size_t /*nthreads*/) const
767       {
768       constexpr Tfs tw1r= Tfs(0.8412535328311811688618116489193677L),
769                     tw1i= (fwd ? -1 : 1) * Tfs(0.5406408174555975821076359543186917L),
770                     tw2r= Tfs(0.4154150130018864255292741492296232L),
771                     tw2i= (fwd ? -1 : 1) * Tfs(0.9096319953545183714117153830790285L),
772                     tw3r= Tfs(-0.1423148382732851404437926686163697L),
773                     tw3i= (fwd ? -1 : 1) * Tfs(0.9898214418809327323760920377767188L),
774                     tw4r= Tfs(-0.6548607339452850640569250724662936L),
775                     tw4i= (fwd ? -1 : 1) * Tfs(0.7557495743542582837740358439723444L),
776                     tw5r= Tfs(-0.9594929736144973898903680570663277L),
777                     tw5i= (fwd ? -1 : 1) * Tfs(0.2817325568414296977114179153466169L);
778 
779       auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tcd&
780         { return ch[a+ido*(b+l1*c)]; };
781       auto CC = [cc,this](size_t a, size_t b, size_t c) -> const Tcd&
782         { return cc[a+ido*(b+ip*c)]; };
783 
784 #define POCKETFFT_PREP11(idx) \
785         Tcd t1 = CC(idx,0,k), t2, t3, t4, t5, t6, t7, t8, t9, t10, t11; \
786         PM (t2,t11,CC(idx,1,k),CC(idx,10,k)); \
787         PM (t3,t10,CC(idx,2,k),CC(idx, 9,k)); \
788         PM (t4,t9 ,CC(idx,3,k),CC(idx, 8,k)); \
789         PM (t5,t8 ,CC(idx,4,k),CC(idx, 7,k)); \
790         PM (t6,t7 ,CC(idx,5,k),CC(idx, 6,k)); \
791         CH(idx,k,0).r=t1.r+t2.r+t3.r+t4.r+t5.r+t6.r; \
792         CH(idx,k,0).i=t1.i+t2.i+t3.i+t4.i+t5.i+t6.i;
793 
794 #define POCKETFFT_PARTSTEP11a0(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5,out1,out2) \
795         { \
796         Tcd ca = t1 + t2*x1 + t3*x2 + t4*x3 + t5*x4 +t6*x5, \
797             cb; \
798         cb.i=y1*t11.r y2*t10.r y3*t9.r y4*t8.r y5*t7.r; \
799         cb.r=-(y1*t11.i y2*t10.i y3*t9.i y4*t8.i y5*t7.i ); \
800         PM(out1,out2,ca,cb); \
801         }
802 #define POCKETFFT_PARTSTEP11a(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5) \
803         POCKETFFT_PARTSTEP11a0(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5,CH(0,k,u1),CH(0,k,u2))
804 #define POCKETFFT_PARTSTEP11(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5) \
805         { \
806         Tcd da,db; \
807         POCKETFFT_PARTSTEP11a0(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5,da,db) \
808         special_mul<fwd>(da,WA(u1-1,i),CH(i,k,u1)); \
809         special_mul<fwd>(db,WA(u2-1,i),CH(i,k,u2)); \
810         }
811 
812       if (ido==1)
813         for (size_t k=0; k<l1; ++k)
814           {
815           POCKETFFT_PREP11(0)
816           POCKETFFT_PARTSTEP11a(1,10,tw1r,tw2r,tw3r,tw4r,tw5r,+tw1i,+tw2i,+tw3i,+tw4i,+tw5i)
817           POCKETFFT_PARTSTEP11a(2, 9,tw2r,tw4r,tw5r,tw3r,tw1r,+tw2i,+tw4i,-tw5i,-tw3i,-tw1i)
818           POCKETFFT_PARTSTEP11a(3, 8,tw3r,tw5r,tw2r,tw1r,tw4r,+tw3i,-tw5i,-tw2i,+tw1i,+tw4i)
819           POCKETFFT_PARTSTEP11a(4, 7,tw4r,tw3r,tw1r,tw5r,tw2r,+tw4i,-tw3i,+tw1i,+tw5i,-tw2i)
820           POCKETFFT_PARTSTEP11a(5, 6,tw5r,tw1r,tw4r,tw2r,tw3r,+tw5i,-tw1i,+tw4i,-tw2i,+tw3i)
821           }
822       else
823         for (size_t k=0; k<l1; ++k)
824           {
825           {
826           POCKETFFT_PREP11(0)
827           POCKETFFT_PARTSTEP11a(1,10,tw1r,tw2r,tw3r,tw4r,tw5r,+tw1i,+tw2i,+tw3i,+tw4i,+tw5i)
828           POCKETFFT_PARTSTEP11a(2, 9,tw2r,tw4r,tw5r,tw3r,tw1r,+tw2i,+tw4i,-tw5i,-tw3i,-tw1i)
829           POCKETFFT_PARTSTEP11a(3, 8,tw3r,tw5r,tw2r,tw1r,tw4r,+tw3i,-tw5i,-tw2i,+tw1i,+tw4i)
830           POCKETFFT_PARTSTEP11a(4, 7,tw4r,tw3r,tw1r,tw5r,tw2r,+tw4i,-tw3i,+tw1i,+tw5i,-tw2i)
831           POCKETFFT_PARTSTEP11a(5, 6,tw5r,tw1r,tw4r,tw2r,tw3r,+tw5i,-tw1i,+tw4i,-tw2i,+tw3i)
832           }
833           for (size_t i=1; i<ido; ++i)
834             {
835             POCKETFFT_PREP11(i)
836             POCKETFFT_PARTSTEP11(1,10,tw1r,tw2r,tw3r,tw4r,tw5r,+tw1i,+tw2i,+tw3i,+tw4i,+tw5i)
837             POCKETFFT_PARTSTEP11(2, 9,tw2r,tw4r,tw5r,tw3r,tw1r,+tw2i,+tw4i,-tw5i,-tw3i,-tw1i)
838             POCKETFFT_PARTSTEP11(3, 8,tw3r,tw5r,tw2r,tw1r,tw4r,+tw3i,-tw5i,-tw2i,+tw1i,+tw4i)
839             POCKETFFT_PARTSTEP11(4, 7,tw4r,tw3r,tw1r,tw5r,tw2r,+tw4i,-tw3i,+tw1i,+tw5i,-tw2i)
840             POCKETFFT_PARTSTEP11(5, 6,tw5r,tw1r,tw4r,tw2r,tw3r,+tw5i,-tw1i,+tw4i,-tw2i,+tw3i)
841             }
842           }
843 
844 #undef POCKETFFT_PARTSTEP11
845 #undef POCKETFFT_PARTSTEP11a0
846 #undef POCKETFFT_PARTSTEP11a
847 #undef POCKETFFT_PREP11
848       return ch;
849       }
850 
851   public:
852     cfftp11(size_t l1_, size_t ido_, const Troots<Tfs> &roots)
853       : l1(l1_), ido(ido_), wa((ip-1)*(ido-1))
854       {
855       size_t N=ip*l1*ido;
856       auto rfct = roots->size()/N;
857       MR_assert(roots->size()==N*rfct, "mismatch");
858       for (size_t i=1; i<ido; ++i)
859         for (size_t j=1; j<ip; ++j)
860           wa[(j-1)+(i-1)*(ip-1)] = (*roots)[rfct*j*l1*i];
861       }
862 
863     virtual size_t bufsize() const { return 0; }
864     virtual bool needs_copy() const { return true; }
865 
866     POCKETFFT_EXEC_DISPATCH
867   };
868 
869 template <typename Tfs> class cfftpg: public cfftpass<Tfs>
870   {
871   private:
872     using typename cfftpass<Tfs>::Tcs;
873 
874     size_t l1, ido;
875     size_t ip;
876     quick_array<Tcs> wa;
877     quick_array<Tcs> csarr;
878 
879     auto WA(size_t x, size_t i) const
880       { return wa[i-1+x*(ido-1)]; }
881 
882     template<bool fwd, typename Tcd> Tcd *exec_
883       (Tcd * DUCC0_RESTRICT cc, Tcd * DUCC0_RESTRICT ch, Tcd * /*buf*/, size_t /*nthreads*/) const
884       {
885       size_t ipph = (ip+1)/2;
886       size_t idl1 = ido*l1;
887 
888       auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tcd&
889         { return ch[a+ido*(b+l1*c)]; };
890       auto CC = [cc,this](size_t a, size_t b, size_t c) -> const Tcd&
891         { return cc[a+ido*(b+ip*c)]; };
892       auto CX = [cc,this](size_t a, size_t b, size_t c) -> Tcd&
893         { return cc[a+ido*(b+l1*c)]; };
894       auto CX2 = [cc, idl1](size_t a, size_t b) -> Tcd&
895         { return cc[a+idl1*b]; };
896       auto CH2 = [ch, idl1](size_t a, size_t b) -> const Tcd&
897         { return ch[a+idl1*b]; };
898 
899       for (size_t k=0; k<l1; ++k)
900         for (size_t i=0; i<ido; ++i)
901           CH(i,k,0) = CC(i,0,k);
902       for (size_t j=1, jc=ip-1; j<ipph; ++j, --jc)
903         for (size_t k=0; k<l1; ++k)
904           for (size_t i=0; i<ido; ++i)
905             PM(CH(i,k,j),CH(i,k,jc),CC(i,j,k),CC(i,jc,k));
906       for (size_t k=0; k<l1; ++k)
907         for (size_t i=0; i<ido; ++i)
908           {
909           Tcd tmp = CH(i,k,0);
910           for (size_t j=1; j<ipph; ++j)
911             tmp+=CH(i,k,j);
912           CX(i,k,0) = tmp;
913           }
914       for (size_t l=1, lc=ip-1; l<ipph; ++l, --lc)
915         {
916         // j=0,1,2
917         {
918         auto wal  = fwd ? csarr[  l].conj() : csarr[  l];
919         auto wal2 = fwd ? csarr[2*l].conj() : csarr[2*l];
920         for (size_t ik=0; ik<idl1; ++ik)
921           {
922           CX2(ik,l ).r = CH2(ik,0).r+wal.r*CH2(ik,1).r+wal2.r*CH2(ik,2).r;
923           CX2(ik,l ).i = CH2(ik,0).i+wal.r*CH2(ik,1).i+wal2.r*CH2(ik,2).i;
924           CX2(ik,lc).r =-wal.i*CH2(ik,ip-1).i-wal2.i*CH2(ik,ip-2).i;
925           CX2(ik,lc).i = wal.i*CH2(ik,ip-1).r+wal2.i*CH2(ik,ip-2).r;
926           }
927         }
928 
929         size_t iwal=2*l;
930         size_t j=3, jc=ip-3;
931         for (; j<ipph-1; j+=2, jc-=2)
932           {
933           iwal+=l; if (iwal>ip) iwal-=ip;
934           Tcs xwal=fwd ? csarr[iwal].conj() : csarr[iwal];
935           iwal+=l; if (iwal>ip) iwal-=ip;
936           Tcs xwal2=fwd ? csarr[iwal].conj() : csarr[iwal];
937           for (size_t ik=0; ik<idl1; ++ik)
938             {
939             CX2(ik,l).r += CH2(ik,j).r*xwal.r+CH2(ik,j+1).r*xwal2.r;
940             CX2(ik,l).i += CH2(ik,j).i*xwal.r+CH2(ik,j+1).i*xwal2.r;
941             CX2(ik,lc).r -= CH2(ik,jc).i*xwal.i+CH2(ik,jc-1).i*xwal2.i;
942             CX2(ik,lc).i += CH2(ik,jc).r*xwal.i+CH2(ik,jc-1).r*xwal2.i;
943             }
944           }
945         for (; j<ipph; ++j, --jc)
946           {
947           iwal+=l; if (iwal>ip) iwal-=ip;
948           Tcs xwal=fwd ? csarr[iwal].conj() : csarr[iwal];
949           for (size_t ik=0; ik<idl1; ++ik)
950             {
951             CX2(ik,l).r += CH2(ik,j).r*xwal.r;
952             CX2(ik,l).i += CH2(ik,j).i*xwal.r;
953             CX2(ik,lc).r -= CH2(ik,jc).i*xwal.i;
954             CX2(ik,lc).i += CH2(ik,jc).r*xwal.i;
955             }
956           }
957         }
958 
959       // shuffling and twiddling
960       if (ido==1)
961         for (size_t j=1, jc=ip-1; j<ipph; ++j, --jc)
962           for (size_t ik=0; ik<idl1; ++ik)
963             {
964             Tcd t1=CX2(ik,j), t2=CX2(ik,jc);
965             PM(CX2(ik,j),CX2(ik,jc),t1,t2);
966             }
967       else
968         {
969         for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc)
970           for (size_t k=0; k<l1; ++k)
971             {
972             Tcd t1=CX(0,k,j), t2=CX(0,k,jc);
973             PM(CX(0,k,j),CX(0,k,jc),t1,t2);
974             for (size_t i=1; i<ido; ++i)
975               {
976               Tcd x1, x2;
977               PM(x1,x2,CX(i,k,j),CX(i,k,jc));
978               size_t idij=(j-1)*(ido-1)+i-1;
979               special_mul<fwd>(x1,wa[idij],CX(i,k,j));
980               idij=(jc-1)*(ido-1)+i-1;
981               special_mul<fwd>(x2,wa[idij],CX(i,k,jc));
982               }
983             }
984         }
985       return cc;
986       }
987 
988   public:
989     cfftpg(size_t l1_, size_t ido_, size_t ip_, const Troots<Tfs> &roots)
990       : l1(l1_), ido(ido_), ip(ip_), wa((ip-1)*(ido-1)), csarr(ip)
991       {
992       MR_assert((ip&1)&&(ip>=5), "need an odd number >=5");
993       size_t N=ip*l1*ido;
994       auto rfct = roots->size()/N;
995       MR_assert(roots->size()==N*rfct, "mismatch");
996       for (size_t j=1; j<ip; ++j)
997         for (size_t i=1; i<ido; ++i)
998           wa[(j-1)*(ido-1)+i-1] = (*roots)[rfct*j*l1*i];
999       for (size_t i=0; i<ip; ++i)
1000         csarr[i] = (*roots)[rfct*ido*l1*i];
1001       }
1002 
1003     virtual size_t bufsize() const { return 0; }
1004     virtual bool needs_copy() const { return true; }
1005 
1006     POCKETFFT_EXEC_DISPATCH
1007   };
1008 
1009 template <typename Tfs> class cfftpblue: public cfftpass<Tfs>
1010   {
1011   private:
1012     using typename cfftpass<Tfs>::Tcs;
1013 
1014     const size_t l1, ido, ip;
1015     const size_t ip2;
1016     const Tcpass<Tfs> subplan;
1017     quick_array<Tcs> wa, bk, bkf;
1018     size_t bufsz;
1019     bool need_cpy;
1020 
1021     auto WA(size_t x, size_t i) const
1022       { return wa[i-1+x*(ido-1)]; }
1023 
1024     template<bool fwd, typename Tcd> Tcd *exec_
1025       (Tcd * DUCC0_RESTRICT cc, Tcd * DUCC0_RESTRICT ch,
1026        Tcd * DUCC0_RESTRICT buf, size_t nthreads) const
1027       {
1028       static const auto ti=tidx<Tcd *>();
1029       Tcd *akf = &buf[0];
1030       Tcd *akf2 = &buf[ip2];
1031       Tcd *subbuf = &buf[2*ip2];
1032 
1033       auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tcd&
1034         { return ch[a+ido*(b+l1*c)]; };
1035       auto CC = [cc,this](size_t a, size_t b, size_t c) -> Tcd&
1036         { return cc[a+ido*(b+ip*c)]; };
1037 
1038 //FIXME: parallelize here?
1039       for (size_t k=0; k<l1; ++k)
1040         for (size_t i=0; i<ido; ++i)
1041           {
1042           /* initialize a_k and FFT it */
1043           for (size_t m=0; m<ip; ++m)
1044             special_mul<fwd>(CC(i,m,k),bk[m],akf[m]);
1045           auto zero = akf[0]*Tfs(0);
1046           for (size_t m=ip; m<ip2; ++m)
1047             akf[m]=zero;
1048 
1049           auto res = static_cast<Tcd *>(subplan->exec(ti,akf,akf2,
1050             subbuf, true, nthreads));
1051 
1052           /* do the convolution */
1053           res[0] = res[0].template special_mul<!fwd>(bkf[0]);
1054           for (size_t m=1; m<(ip2+1)/2; ++m)
1055             {
1056             res[m] = res[m].template special_mul<!fwd>(bkf[m]);
1057             res[ip2-m] = res[ip2-m].template special_mul<!fwd>(bkf[m]);
1058             }
1059           if ((ip2&1)==0)
1060             res[ip2/2] = res[ip2/2].template special_mul<!fwd>(bkf[ip2/2]);
1061 
1062           /* inverse FFT */
1063           res = static_cast<Tcd *>(subplan->exec(ti, res,
1064             (res==akf) ? akf2 : akf, subbuf, false, nthreads));
1065 
1066           /* multiply by b_k and write to output buffer */
1067           if (l1>1)
1068             {
1069             if (i==0)
1070               for (size_t m=0; m<ip; ++m)
1071                 CH(0,k,m) = res[m].template special_mul<fwd>(bk[m]);
1072             else
1073               {
1074               CH(i,k,0) = res[0].template special_mul<fwd>(bk[0]);
1075               for (size_t m=1; m<ip; ++m)
1076                 CH(i,k,m) = res[m].template special_mul<fwd>(bk[m]*WA(m-1,i));
1077               }
1078             }
1079           else
1080             {
1081             if (i==0)
1082               for (size_t m=0; m<ip; ++m)
1083                 CC(0,m,0) = res[m].template special_mul<fwd>(bk[m]);
1084             else
1085               {
1086               CC(i,0,0) = res[0].template special_mul<fwd>(bk[0]);
1087               for (size_t m=1; m<ip; ++m)
1088                 CC(i,m,0) = res[m].template special_mul<fwd>(bk[m]*WA(m-1,i));
1089               }
1090             }
1091           }
1092 
1093       return (l1>1) ? ch : cc;
1094       }
1095 
1096   public:
1097     cfftpblue(size_t l1_, size_t ido_, size_t ip_, const Troots<Tfs> &roots,
1098       bool vectorize=false)
1099       : l1(l1_), ido(ido_), ip(ip_), ip2(util1d::good_size_cmplx(ip*2-1)),
1100         subplan(cfftpass<Tfs>::make_pass(ip2, vectorize)), wa((ip-1)*(ido-1)),
1101         bk(ip), bkf(ip2/2+1)
1102       {
1103       size_t N=ip*l1*ido;
1104       auto rfct = roots->size()/N;
1105       MR_assert(roots->size()==N*rfct, "mismatch");
1106       for (size_t j=1; j<ip; ++j)
1107         for (size_t i=1; i<ido; ++i)
1108           wa[(j-1)*(ido-1)+i-1] = (*roots)[rfct*j*l1*i];
1109 
1110       /* initialize b_k */
1111       bk[0].Set(1, 0);
1112       size_t coeff=0;
1113       auto roots2 = ((roots->size()/(2*ip))*2*ip==roots->size()) ?
1114                     roots : make_shared<const UnityRoots<Tfs,Tcs>>(2*ip);
1115       size_t rfct2 = roots2->size()/(2*ip);
1116       for (size_t m=1; m<ip; ++m)
1117         {
1118         coeff+=2*m-1;
1119         if (coeff>=2*ip) coeff-=2*ip;
1120         bk[m] = (*roots2)[coeff*rfct2];
1121         }
1122 
1123       /* initialize the zero-padded, Fourier transformed b_k. Add normalisation. */
1124       quick_array<Tcs> tbkf(ip2), tbkf2(ip2);
1125       Tfs xn2 = Tfs(1)/Tfs(ip2);
1126       tbkf[0] = bk[0]*xn2;
1127       for (size_t m=1; m<ip; ++m)
1128         tbkf[m] = tbkf[ip2-m] = bk[m]*xn2;
1129       for (size_t m=ip;m<=(ip2-ip);++m)
1130         tbkf[m].Set(0.,0.);
1131       quick_array<Tcs> buf(subplan->bufsize());
1132       static const auto tics=tidx<Tcs *>();
1133       auto res = static_cast<Tcs *>(subplan->exec(tics, tbkf.data(),
1134         tbkf2.data(), buf.data(), true));
1135       for (size_t i=0; i<ip2/2+1; ++i)
1136         bkf[i] = res[i];
1137 
1138       need_cpy = l1>1;
1139       bufsz = ip2*(1+subplan->needs_copy()) + subplan->bufsize();
1140       }
1141 
1142     virtual size_t bufsize() const { return bufsz; }
1143     virtual bool needs_copy() const { return need_cpy; }
1144 
1145     POCKETFFT_EXEC_DISPATCH
1146   };
1147 
1148 template <typename Tfs> class cfft_multipass: public cfftpass<Tfs>
1149   {
1150   private:
1151     using typename cfftpass<Tfs>::Tcs;
1152     static constexpr size_t bunchsize=8;
1153 
1154     const size_t l1, ido;
1155     size_t ip;
1156     vector<Tcpass<Tfs>> passes;
1157     size_t bufsz;
1158     bool need_cpy;
1159     size_t rfct;
1160     Troots<Tfs> myroots;
1161 
1162 // FIXME split into sub-functions. This is too long!
1163     template<bool fwd, typename T> Cmplx<T> *exec_(Cmplx<T> *cc, Cmplx<T> *ch,
1164       Cmplx<T> *buf, size_t nthreads) const
1165       {
1166       using Tc = Cmplx<T>;
1167       if ((l1==1) && (ido==1)) // no chance at vectorizing
1168         {
1169         static const auto tic=tidx<Tc *>();
1170         Tc *p1=cc, *p2=ch;
1171         for(const auto &pass: passes)
1172           {
1173           auto res = static_cast<Tc *>(pass->exec(tic, p1, p2, buf,
1174             fwd, nthreads));
1175           if (res==p2) swap (p1,p2);
1176           }
1177         return p1;
1178         }
1179       else
1180         {
1181         if constexpr(is_same<T,Tfs>::value && fft1d_simd_exists<Tfs>) // we can vectorize!
1182           {
1183           using Tfv = fft1d_simd<Tfs>;
1184           using Tcv = Cmplx<Tfv>;
1185           constexpr size_t vlen = Tfv::size();
1186           size_t nvtrans = (l1*ido + vlen-1)/vlen;
1187           static const auto ticv = tidx<Tcv *>();
1188 
1189           if (ido==1)
1190             {
1191             auto CH = [ch,this](size_t b, size_t c) -> Tc&
1192               { return ch[b+l1*c]; };
1193             auto CC = [cc,this](size_t b, size_t c) -> Tc&
1194               { return cc[b+ip*c]; };
1195 
1196             execStatic(nvtrans, nthreads, 0, [&](auto &sched)
1197               {
1198               quick_array<Tcv> tbuf(2*ip+bufsize());
1199               auto cc2 = &tbuf[0];
1200               auto ch2 = &tbuf[ip];
1201               auto buf2 = &tbuf[2*ip];
1202 
1203               while (auto rng=sched.getNext())
1204                 for(auto itrans=rng.lo; itrans<rng.hi; ++itrans)
1205                   {
1206                   for (size_t n=0; n<vlen; ++n)
1207                     for (size_t m=0; m<ip; ++m)
1208                       {
1209                       size_t k = min(l1-1, itrans*vlen+n);
1210                       cc2[m].r[n] = CC(m,k).r;
1211                       cc2[m].i[n] = CC(m,k).i;
1212                       }
1213 
1214                   Tcv *p1=cc2, *p2=ch2;
1215                   for(const auto &pass: passes)
1216                     {
1217                     auto res = static_cast<Tcv *>(pass->exec(ticv,
1218                       p1, p2, buf2, fwd));
1219                     if (res==p2) swap (p1,p2);
1220                     }
1221 
1222                   for (size_t m=0; m<ip; ++m)
1223                     for (size_t n=0; n<vlen; ++n)
1224                       {
1225                       auto k = min(l1-1, itrans*vlen+n);
1226                       CH(k,m) = { p1[m].r[n], p1[m].i[n] };
1227                       }
1228                   }
1229               });
1230             return ch;
1231             }
1232 
1233           if (l1==1)
1234             {
1235             auto CC = [cc,this](size_t a, size_t b) -> Tc&
1236               { return cc[a+ido*b]; };
1237 
1238             execStatic(nvtrans, nthreads, 0, [&](auto &sched)
1239               {
1240               quick_array<Tcv> tbuf(2*ip+bufsize());
1241               auto cc2 = &tbuf[0];
1242               auto ch2 = &tbuf[ip];
1243               auto buf2 = &tbuf[2*ip];
1244 
1245               while (auto rng=sched.getNext())
1246                 for(auto itrans=rng.lo; itrans<rng.hi; ++itrans)
1247                   {
1248                   for (size_t m=0; m<ip; ++m)
1249                     for (size_t n=0; n<vlen; ++n)
1250                       {
1251                       size_t i = min(ido-1, itrans*vlen+n);
1252                       cc2[m].r[n] = CC(i,m).r;
1253                       cc2[m].i[n] = CC(i,m).i;
1254                       }
1255 
1256                   Tcv *p1=cc2, *p2=ch2;
1257                   for(const auto &pass: passes)
1258                     {
1259                     auto res = static_cast<Tcv *>(pass->exec(ticv,
1260                       p1, p2, buf2, fwd));
1261                     if (res==p2) swap (p1,p2);
1262                     }
1263 
1264                   for (size_t m=0; m<ip; ++m)
1265                     for (size_t n=0; n<vlen; ++n)
1266                       {
1267                       auto i = itrans*vlen+n;
1268                       if (i >= ido) break;
1269                       if (i==0)
1270                         CC(0,m) = { p1[m].r[n], p1[m].i[n] };
1271                       else
1272                         {
1273                         if (m==0)
1274                           CC(i,0) = { p1[0].r[n], p1[0].i[n] } ;
1275                         else
1276                           CC(i,m) = Tcs(p1[m].r[n],p1[m].i[n]).template special_mul<fwd>((*myroots)[rfct*m*i]);
1277                         }
1278                       }
1279                   }
1280               });
1281             return cc;
1282             }
1283 
1284 MR_fail("must not get here");
1285 #if 0
1286 //FIXME this code path is currently unused
1287           quick_array<Tcv> tbuf(2*ip+bufsize());
1288           auto cc2 = &tbuf[0];
1289           auto ch2 = &tbuf[ip];
1290           auto buf2 = &tbuf[2*ip];
1291 
1292           auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tc&
1293             { return ch[a+ido*(b+l1*c)]; };
1294           auto CC = [cc,this](size_t a, size_t b, size_t c) -> Tc&
1295             { return cc[a+ido*(b+ip*c)]; };
1296 
1297 //FIXME parallelize?
1298           for (size_t itrans=0; itrans<nvtrans; ++itrans)
1299             {
1300             array<size_t, vlen> ix, kx;
1301             size_t ixcur = (itrans*vlen)%ido;
1302             size_t kxcur = (itrans*vlen)/ido;
1303             for (size_t n=0; n<vlen; ++n)
1304               {
1305               ix[n] = ixcur;
1306               kx[n] = min(l1-1,kxcur);
1307               if (++ixcur==ido)
1308                 {
1309                 ixcur=0;
1310                 ++kxcur;
1311                 }
1312               }
1313 
1314             for (size_t m=0; m<ip; ++m)
1315               for (size_t n=0; n<vlen; ++n)
1316                 {
1317                 cc2[m].r[n] = CC(ix[n],m,kx[n]).r;
1318                 cc2[m].i[n] = CC(ix[n],m,kx[n]).i;
1319                 }
1320 
1321             Tcv *p1=cc2, *p2=ch2;
1322             for(const auto &pass: passes)
1323               {
1324               auto res = static_cast<Tcv *>(pass->exec(ticv,
1325                 p1, p2, buf2, fwd));
1326               if (res==p2) swap (p1,p2);
1327               }
1328 
1329             for (size_t m=0; m<ip; ++m)
1330               for (size_t n=0; n<vlen; ++n)
1331                 {
1332                 auto i = ix[n];
1333                 auto k = kx[n];
1334                 if (itrans*vlen+n >= l1*ido) break;
1335                 if (i==0)
1336                   CH(0,k,m) = { p1[m].r[n], p1[m].i[n] };
1337                 else
1338                   {
1339                   if (m==0)
1340                     CH(i,k,0) = { p1[0].r[n], p1[0].i[n] } ;
1341                   else
1342                     CH(i,k,m) = Tcs(p1[m].r[n],p1[m].i[n]).template special_mul<fwd>((*myroots)[rfct*l1*m*i]);
1343                   }
1344                 }
1345             }
1346           return ch;
1347 #endif
1348           }
1349         else
1350           {
1351           static const auto tic = tidx<Cmplx<T> *>();
1352           if (ido==1)
1353             {
1354 // parallelize here!
1355             for (size_t n=0; n<l1; ++n)
1356               {
1357               Cmplx<T> *p1=&cc[n*ip], *p2=ch;
1358               Cmplx<T> *res = nullptr;
1359               for(const auto &pass: passes)
1360                 {
1361                 res = static_cast<Cmplx<T> *>(pass->exec(tic,
1362                   p1, p2, buf, fwd));
1363                 if (res==p2) swap (p1,p2);
1364                 }
1365               if (res != &cc[n*ip])
1366                 copy(res, res+ip, cc+n*ip);
1367               }
1368             // transpose
1369             size_t nbunch = (l1*ido + bunchsize-1)/bunchsize;
1370 // parallelize here!
1371             for (size_t ibunch=0; ibunch<nbunch; ++ibunch)
1372               {
1373               size_t ntrans = min(bunchsize, l1-ibunch*bunchsize);
1374               for (size_t m=0; m<ip; ++m)
1375                 for (size_t n=0; n<ntrans; ++n)
1376                   {
1377                   size_t itrans = ibunch*bunchsize + n;
1378                   ch[itrans+m*l1] = cc[m+itrans*ip];
1379                   }
1380               }
1381             return ch;
1382             }
1383           if (l1==1)
1384             {
1385             auto cc2 = &buf[0];
1386             auto ch2 = &buf[bunchsize*ip];
1387             auto buf2 = &buf[(bunchsize+1)*ip];
1388             size_t nbunch = (ido + bunchsize-1)/bunchsize;
1389 
1390             auto CC = [cc,this](size_t a, size_t b) -> Tc&
1391               { return cc[a+ido*b]; };
1392 
1393 // parallelize here!
1394              for (size_t ibunch=0; ibunch<nbunch; ++ibunch)
1395               {
1396               size_t ntrans = min(bunchsize, ido-ibunch*bunchsize);
1397 
1398               for (size_t m=0; m<ip; ++m)
1399                 for (size_t n=0; n<ntrans; ++n)
1400                   cc2[m+n*ip] = CC(n+ibunch*bunchsize,m);
1401 
1402               for (size_t n=0; n<ntrans; ++n)
1403                 {
1404                 auto i = n+ibunch*bunchsize;
1405                 Cmplx<T> *p1=&cc2[n*ip], *p2=ch2;
1406                 Cmplx<T> *res = nullptr;
1407                 for(const auto &pass: passes)
1408                   {
1409                   res = static_cast<Cmplx<T> *>(pass->exec(tic,
1410                     p1, p2, buf2, fwd));
1411                   if (res==p2) swap (p1,p2);
1412                   }
1413                 if (res==&cc2[n*ip]) // no copying necessary
1414                   {
1415                   if (i!=0)
1416                     {
1417                     for (size_t m=1; m<ip; ++m)
1418                       cc2[n*ip+m] = cc2[n*ip+m].template special_mul<fwd>((*myroots)[rfct*m*i]);
1419                     }
1420                   }
1421                 else
1422                   {
1423                   if (i==0)
1424                     for (size_t m=0; m<ip; ++m)
1425                       cc2[n*ip+m] = res[m];
1426                   else
1427                     {
1428                     cc2[n*ip] = res[0];
1429                     for (size_t m=1; m<ip; ++m)
1430                       cc2[n*ip+m] = res[m].template special_mul<fwd>((*myroots)[rfct*m*i]);
1431                     }
1432                   }
1433                 }
1434               for (size_t m=0; m<ip; ++m)
1435                 for (size_t n=0; n<ntrans; ++n)
1436                   CC(n+ibunch*bunchsize, m) = cc2[m+n*ip];
1437               }
1438             return cc;
1439             }
1440 
1441 MR_fail("must not get here");
1442 #if 0
1443 //FIXME this code path is currently unused
1444           auto cc2 = &buf[0];
1445           auto ch2 = &buf[bunchsize*ip];
1446           auto buf2 = &buf[(bunchsize+1)*ip];
1447           size_t nbunch = (l1*ido + bunchsize-1)/bunchsize;
1448 
1449           auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tc&
1450             { return ch[a+ido*(b+l1*c)]; };
1451           auto CC = [cc,this](size_t a, size_t b, size_t c) -> Tc&
1452             { return cc[a+ido*(b+ip*c)]; };
1453 
1454 // parallelize here!
1455           for (size_t ibunch=0; ibunch<nbunch; ++ibunch)
1456             {
1457             size_t ntrans = min(bunchsize, l1*ido-ibunch*bunchsize);
1458             array<size_t, bunchsize> ix, kx;
1459             size_t ixcur = (ibunch*bunchsize)%ido;
1460             size_t kxcur = (ibunch*bunchsize)/ido;
1461             for (size_t n=0; n<bunchsize; ++n)
1462               {
1463               ix[n] = ixcur;
1464               kx[n] = min(l1-1,kxcur);
1465               if (++ixcur==ido)
1466                 {
1467                 ixcur=0;
1468                 ++kxcur;
1469                 }
1470               }
1471             for (size_t m=0; m<ip; ++m)
1472               for (size_t n=0; n<ntrans; ++n)
1473                 cc2[m+n*ip] = CC(ix[n],m,kx[n]);
1474 
1475             for (size_t n=0; n<ntrans; ++n)
1476               {
1477               auto i = ix[n];
1478               Cmplx<T> *p1=&cc2[n*ip], *p2=ch2;
1479               Cmplx<T> *res = nullptr;
1480               for(const auto &pass: passes)
1481                 {
1482                 res = static_cast<Cmplx<T> *>(pass->exec(tic,
1483                   p1, p2, buf2, fwd));
1484                 if (res==p2) swap (p1,p2);
1485                 }
1486               if (res==&cc2[n*ip]) // no copying necessary
1487                 {
1488                 if (i!=0)
1489                   {
1490                   for (size_t m=1; m<ip; ++m)
1491                     cc2[n*ip+m] = cc2[n*ip+m].template special_mul<fwd>((*myroots)[rfct*l1*m*i]);
1492                   }
1493                 }
1494               else
1495                 {
1496                 if (i==0)
1497                   for (size_t m=0; m<ip; ++m)
1498                     cc2[n*ip+m] = res[m];
1499                 else
1500                   {
1501                   cc2[n*ip] = res[0];
1502                   for (size_t m=1; m<ip; ++m)
1503                     cc2[n*ip+m] = res[m].template special_mul<fwd>((*myroots)[rfct*l1*m*i]);
1504                   }
1505                 }
1506               }
1507             for (size_t m=0; m<ip; ++m)
1508               for (size_t n=0; n<ntrans; ++n)
1509                 CH(ix[n], kx[n], m) = cc2[m+n*ip];
1510             }
1511           return ch;
1512 #endif
1513           }
1514         }
1515       }
1516 
1517   public:
1518     cfft_multipass(size_t l1_, size_t ido_, size_t ip_,
1519       const Troots<Tfs> &roots, bool vectorize=false)
1520       : l1(l1_), ido(ido_), ip(ip_), bufsz(0), need_cpy(false),
1521         myroots(roots)
1522       {
1523       size_t N=ip*l1*ido;
1524       rfct = roots->size()/N;
1525       MR_assert(roots->size()==N*rfct, "mismatch");
1526 
1527       // FIXME TBD
1528 // do we need the vectorize flag at all?
1529       size_t lim = vectorize ? 10000 : 10000;
1530       if (ip<=lim)
1531         {
1532         auto factors = cfftpass<Tfs>::factorize(ip);
1533         size_t l1l=1;
1534         for (auto fct: factors)
1535           {
1536           passes.push_back(cfftpass<Tfs>::make_pass(l1l, ip/(fct*l1l), fct, roots, false));
1537           l1l*=fct;
1538           }
1539         }
1540       else
1541         {
1542         vector<size_t> packets(2,1);
1543         auto factors = util1d::prime_factors(ip);
1544         sort(factors.begin(), factors.end(), std::greater<size_t>());
1545         for (auto fct: factors)
1546           (packets[0]>packets[1]) ? packets[1]*=fct : packets[0]*=fct;
1547         size_t l1l=1;
1548         for (auto pkt: packets)
1549           {
1550           passes.push_back(cfftpass<Tfs>::make_pass(l1l, ip/(pkt*l1l), pkt, roots, false));
1551           l1l*=pkt;
1552           }
1553         }
1554       for (const auto &pass: passes)
1555         {
1556         bufsz = max(bufsz, pass->bufsize());
1557         need_cpy |= pass->needs_copy();
1558         }
1559       if ((l1!=1)||(ido!=1))
1560         {
1561         need_cpy=true;
1562         bufsz += (bunchsize+1)*ip;
1563         }
1564       }
1565 
1566     virtual size_t bufsize() const { return bufsz; }
1567     virtual bool needs_copy() const { return need_cpy; }
1568 
1569     POCKETFFT_EXEC_DISPATCH
1570   };
1571 
1572 #undef POCKETFFT_EXEC_DISPATCH
1573 
1574 #if 0  // leaving in for potential future use; but doesn't seem beneficial
1575 template <size_t vlen, typename Tfs> class cfftp_vecpass: public cfftpass<Tfs>
1576   {
1577   private:
1578     static_assert(simd_exists<Tfs, vlen>, "bad vlen");
1579     using typename cfftpass<Tfs>::Tcs;
1580     using Tfv=typename simd_select<Tfs, vlen>::type;
1581     using Tcv=Cmplx<Tfv>;
1582 
1583     size_t ip;
1584     Tcpass<Tfs> spass;
1585     Tcpass<Tfs> vpass;
1586     size_t bufsz;
1587 
1588     template<bool fwd> Tcs *exec_ (Tcs *cc,
1589       Tcs * /*ch*/, Tcs * /*buf*/, size_t nthreads) const
1590       {
1591       quick_array<Tcv> buf(2*ip+bufsz);
1592       auto * cc2 = buf.data();
1593       auto * ch2 = buf.data()+ip;
1594       auto * buf2 = buf.data()+2*ip;
1595       static const auto tics = tidx<Tcs *>();
1596 // run scalar pass
1597       auto res = static_cast<Tcs *>(spass->exec(tics, cc,
1598         reinterpret_cast<Tcs *>(ch2), reinterpret_cast<Tcs *>(buf2),
1599         fwd, nthreads));
1600 // arrange input in SIMD-friendly way
1601 // FIXME: swap loops?
1602       for (size_t i=0; i<ip/vlen; ++i)
1603         for (size_t j=0; j<vlen; ++j)
1604           {
1605           size_t idx = j*(ip/vlen) + i;
1606           cc2[i].r[j] = res[idx].r;
1607           cc2[i].i[j] = res[idx].i;
1608           }
1609 // run vector pass
1610       static const auto ticv = tidx<Tcv *>();
1611       auto res2 = static_cast<Tcv *>(vpass->exec(ticv,
1612         cc2, ch2, buf2, fwd, nthreads));
1613 // de-SIMDify
1614       for (size_t i=0; i<ip/vlen; ++i)
1615         for (size_t j=0; j<vlen; ++j)
1616           cc[i*vlen+j] = Tcs(res2[i].r[j], res2[i].i[j]);
1617 
1618       return cc;
1619       }
1620 
1621   public:
1622     cfftp_vecpass(size_t ip_, const Troots<Tfs> &roots)
1623       : ip(ip_), spass(cfftpass<Tfs>::make_pass(1, ip/vlen, vlen, roots)),
1624         vpass(cfftpass<Tfs>::make_pass(1, 1, ip/vlen, roots)), bufsz(0)
1625       {
1626       MR_assert((ip/vlen)*vlen==ip, "cannot vectorize this size");
1627       bufsz=2*ip+max(vpass->bufsize(),(spass->bufsize()+vlen-1)/vlen);
1628       }
1629     virtual size_t bufsize() const { return 0; }
1630     virtual bool needs_copy() const { return false; }
1631     virtual void *exec(const type_index &ti, void *in, void *copy, void *buf,
1632       bool fwd, size_t nthreads=1) const
1633       {
1634       static const auto tics = tidx<Tcs *>();
1635       MR_assert(ti==tics, "bad input type");
1636       auto in1 = static_cast<Tcs *>(in);
1637       auto copy1 = static_cast<Tcs *>(copy);
1638       auto buf1 = static_cast<Tcs *>(buf);
1639       return fwd ? exec_<true>(in1, copy1, buf1, nthreads)
1640                  : exec_<false>(in1, copy1, buf1, nthreads);
1641       }
1642   };
1643 #endif
1644 
1645 template<typename Tfs> Tcpass<Tfs> cfftpass<Tfs>::make_pass(size_t l1,
1646   size_t ido, size_t ip, const Troots<Tfs> &roots, bool vectorize)
1647   {
1648   MR_assert(ip>=1, "no zero-sized FFTs");
1649 #if 0
1650   if (vectorize && (ip>300) && (ip<32768) && (l1==1) && (ido==1))
1651     {
1652     constexpr auto vlen = native_simd<Tfs>::size();
1653     if constexpr(vlen>1)
1654       if ((ip&(vlen-1))==0)
1655         return make_shared<cfftp_vecpass<vlen,Tfs>>(ip, roots);
1656     }
1657 #endif
1658 
1659   if (ip==1) return make_shared<cfftp1<Tfs>>();
1660   auto factors=cfftpass<Tfs>::factorize(ip);
1661   if (factors.size()==1)
1662     {
1663     switch(ip)
1664       {
1665       case 2:
1666         return make_shared<cfftp2<Tfs>>(l1, ido, roots);
1667       case 3:
1668         return make_shared<cfftp3<Tfs>>(l1, ido, roots);
1669       case 4:
1670         return make_shared<cfftp4<Tfs>>(l1, ido, roots);
1671       case 5:
1672         return make_shared<cfftp5<Tfs>>(l1, ido, roots);
1673       case 7:
1674         return make_shared<cfftp7<Tfs>>(l1, ido, roots);
1675       case 11:
1676         return make_shared<cfftp11<Tfs>>(l1, ido, roots);
1677       default:
1678         if (ip<110)
1679           return make_shared<cfftpg<Tfs>>(l1, ido, ip, roots);
1680         else
1681           return make_shared<cfftpblue<Tfs>>(l1, ido, ip, roots, vectorize);
1682       }
1683     }
1684   else // more than one factor, need a multipass
1685     return make_shared<cfft_multipass<Tfs>>(l1, ido, ip, roots, vectorize);
1686   }
1687 
1688 template<typename Tfs> class pocketfft_c
1689   {
1690   private:
1691     size_t N;
1692     size_t critbuf;
1693     Tcpass<Tfs> plan;
1694 
1695   public:
1696     pocketfft_c(size_t n, bool vectorize=false)
1697       : N(n), critbuf(((N&1023)==0) ? 16 : 0),
1698         plan(cfftpass<Tfs>::make_pass(n,vectorize)) {}
1699     size_t length() const { return N; }
1700     size_t bufsize() const { return N*plan->needs_copy()+2*critbuf+plan->bufsize(); }
1701     template<typename Tfd> DUCC0_NOINLINE Cmplx<Tfd> *exec(Cmplx<Tfd> *in, Cmplx<Tfd> *buf,
1702       Tfs fct, bool fwd, size_t nthreads=1) const
1703       {
1704       static const auto tic = tidx<Cmplx<Tfd> *>();
1705       auto res = static_cast<Cmplx<Tfd> *>(plan->exec(tic,
1706         in, buf+critbuf+plan->bufsize(), buf+critbuf, fwd, nthreads));
1707       if (fct!=Tfs(1))
1708         for (size_t i=0; i<N; ++i) res[i]*=fct;
1709       return res;
1710       }
1711     template<typename Tfd> DUCC0_NOINLINE void exec_copyback(Cmplx<Tfd> *in, Cmplx<Tfd> *buf,
1712       Tfs fct, bool fwd, size_t nthreads=1) const
1713       {
1714       static const auto tic = tidx<Cmplx<Tfd> *>();
1715       auto res = static_cast<Cmplx<Tfd> *>(plan->exec(tic,
1716         in, buf, buf+N*plan->needs_copy(), fwd, nthreads));
1717       if (res==in)
1718         {
1719         if (fct!=Tfs(1))
1720           for (size_t i=0; i<N; ++i) in[i]*=fct;
1721         }
1722       else
1723         {
1724         if (fct!=Tfs(1))
1725           for (size_t i=0; i<N; ++i) in[i]=res[i]*fct;
1726         else
1727           copy_n(res, N, in);
1728         }
1729       }
1730     template<typename Tfd> DUCC0_NOINLINE void exec(Cmplx<Tfd> *in, Tfs fct, bool fwd, size_t nthreads=1) const
1731       {
1732       quick_array<Cmplx<Tfd>> buf(N*plan->needs_copy()+plan->bufsize());
1733       exec_copyback(in, buf.data(), fct, fwd, nthreads);
1734       }
1735   };
1736 
1737 template <typename Tfs> class rfftpass
1738   {
1739   public:
1740     virtual ~rfftpass(){}
1741 
1742     // number of Tfd values required as scratch space during "exec"
1743     // will be provided in "buf"
1744     virtual size_t bufsize() const = 0;
1745     virtual bool needs_copy() const = 0;
1746     virtual void *exec(const type_index &ti, void *in, void *copy, void *buf,
1747       bool fwd, size_t nthreads=1) const = 0;
1748 
1749     static vector<size_t> factorize(size_t N)
1750       {
1751       MR_assert(N>0, "need a positive number");
1752       vector<size_t> factors;
1753       while ((N&3)==0)
1754         { factors.push_back(4); N>>=2; }
1755       if ((N&1)==0)
1756         {
1757         N>>=1;
1758         // factor 2 should be at the front of the factor list
1759         factors.push_back(2);
1760         swap(factors[0], factors.back());
1761         }
1762       for (size_t divisor=3; divisor*divisor<=N; divisor+=2)
1763       while ((N%divisor)==0)
1764         {
1765         factors.push_back(divisor);
1766         N/=divisor;
1767         }
1768       if (N>1) factors.push_back(N);
1769       return factors;
1770       }
1771 
1772     static shared_ptr<rfftpass> make_pass(size_t l1, size_t ido, size_t ip,
1773        const Troots<Tfs> &roots, bool vectorize=false);
1774     static shared_ptr<rfftpass> make_pass(size_t ip, bool vectorize=false)
1775       {
1776       return make_pass(1,1,ip,make_shared<UnityRoots<Tfs,Cmplx<Tfs>>>(ip),
1777         vectorize);
1778       }
1779   };
1780 
1781 #define POCKETFFT_EXEC_DISPATCH \
1782     virtual void *exec(const type_index &ti, void *in, void *copy, void *buf, \
1783       bool fwd, size_t nthreads) const \
1784       { \
1785       static const auto tifs=tidx<Tfs *>(); \
1786       if (ti==tifs) \
1787         { \
1788         auto in1 = static_cast<Tfs *>(in); \
1789         auto copy1 = static_cast<Tfs *>(copy); \
1790         auto buf1 = static_cast<Tfs *>(buf); \
1791         return fwd ? exec_<true>(in1, copy1, buf1, nthreads) \
1792                    : exec_<false>(in1, copy1, buf1, nthreads); \
1793         } \
1794       if constexpr (fft1d_simdlen<Tfs> > 1) \
1795         if constexpr (simd_exists<Tfs, fft1d_simdlen<Tfs>>) \
1796           { \
1797           using Tfv = typename simd_select<Tfs, fft1d_simdlen<Tfs>>::type; \
1798           static const auto tifv=tidx<Tfv *>(); \
1799           if (ti==tifv) \
1800             {  \
1801             auto in1 = static_cast<Tfv *>(in); \
1802             auto copy1 = static_cast<Tfv *>(copy); \
1803             auto buf1 = static_cast<Tfv *>(buf); \
1804             return fwd ? exec_<true>(in1, copy1, buf1, nthreads) \
1805                        : exec_<false>(in1, copy1, buf1, nthreads); \
1806             } \
1807           } \
1808       if constexpr (fft1d_simdlen<Tfs> > 2) \
1809         if constexpr (simd_exists<Tfs, fft1d_simdlen<Tfs>/2>) \
1810           { \
1811           using Tfv = typename simd_select<Tfs, fft1d_simdlen<Tfs>/2>::type; \
1812           static const auto tifv=tidx<Tfv *>(); \
1813           if (ti==tifv) \
1814             {  \
1815             auto in1 = static_cast<Tfv *>(in); \
1816             auto copy1 = static_cast<Tfv *>(copy); \
1817             auto buf1 = static_cast<Tfv *>(buf); \
1818             return fwd ? exec_<true>(in1, copy1, buf1, nthreads) \
1819                        : exec_<false>(in1, copy1, buf1, nthreads); \
1820             } \
1821           } \
1822       if constexpr (fft1d_simdlen<Tfs> > 4) \
1823         if constexpr (simd_exists<Tfs, fft1d_simdlen<Tfs>/4>) \
1824           { \
1825           using Tfv = typename simd_select<Tfs, fft1d_simdlen<Tfs>/4>::type; \
1826           static const auto tifv=tidx<Tfv *>(); \
1827           if (ti==tifv) \
1828             {  \
1829             auto in1 = static_cast<Tfv *>(in); \
1830             auto copy1 = static_cast<Tfv *>(copy); \
1831             auto buf1 = static_cast<Tfv *>(buf); \
1832             return fwd ? exec_<true>(in1, copy1, buf1, nthreads) \
1833                        : exec_<false>(in1, copy1, buf1, nthreads); \
1834             } \
1835           } \
1836       if constexpr (fft1d_simdlen<Tfs> > 8) \
1837         if constexpr (simd_exists<Tfs, fft1d_simdlen<Tfs>/8>) \
1838           { \
1839           using Tfv = typename simd_select<Tfs, fft1d_simdlen<Tfs>/8>::type; \
1840           static const auto tifv=tidx<Tfv *>(); \
1841           if (ti==tifv) \
1842             {  \
1843             auto in1 = static_cast<Tfv *>(in); \
1844             auto copy1 = static_cast<Tfv *>(copy); \
1845             auto buf1 = static_cast<Tfv *>(buf); \
1846             return fwd ? exec_<true>(in1, copy1, buf1, nthreads) \
1847                        : exec_<false>(in1, copy1, buf1, nthreads); \
1848             } \
1849           } \
1850       MR_fail("impossible vector length requested"); \
1851       }
1852 
1853 template<typename T> using Trpass = shared_ptr<rfftpass<T>>;
1854 
1855 /* (a+ib) = conj(c+id) * (e+if) */
1856 template<typename T1, typename T2, typename T3> inline void MULPM
1857   (T1 &a, T1 &b, T2 c, T2 d, T3 e, T3 f)
1858   {  a=c*e+d*f; b=c*f-d*e; }
1859 
1860 template <typename Tfs> class rfftp1: public rfftpass<Tfs>
1861   {
1862   public:
1863     rfftp1() {}
1864     virtual size_t bufsize() const { return 0; }
1865     virtual bool needs_copy() const { return false; }
1866 
1867     virtual void *exec(const type_index & /*ti*/, void * in, void * /*copy*/,
1868       void * /*buf*/, bool /*fwd*/, size_t /*nthreads*/) const
1869       { return in; }
1870   };
1871 
1872 template <typename Tfs> class rfftp2: public rfftpass<Tfs>
1873   {
1874   private:
1875     size_t l1, ido;
1876     static constexpr size_t ip=2;
1877     quick_array<Tfs> wa;
1878 
1879     auto WA(size_t x, size_t i) const
1880       { return wa[i+x*(ido-1)]; }
1881 
1882     template<bool fwd, typename Tfd> Tfd *exec_ (Tfd * DUCC0_RESTRICT cc,
1883       Tfd * DUCC0_RESTRICT ch, Tfd * /*buf*/, size_t /*nthreads*/) const
1884       {
1885       if constexpr(fwd)
1886         {
1887         auto CC = [cc,this](size_t a, size_t b, size_t c) -> const Tfd&
1888           { return cc[a+ido*(b+l1*c)]; };
1889         auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tfd&
1890           { return ch[a+ido*(b+ip*c)]; };
1891         for (size_t k=0; k<l1; k++)
1892           PM (CH(0,0,k),CH(ido-1,1,k),CC(0,k,0),CC(0,k,1));
1893         if ((ido&1)==0)
1894           for (size_t k=0; k<l1; k++)
1895             {
1896             CH(    0,1,k) = -CC(ido-1,k,1);
1897             CH(ido-1,0,k) =  CC(ido-1,k,0);
1898             }
1899         if (ido<=2) return ch;
1900         for (size_t k=0; k<l1; k++)
1901           for (size_t i=2; i<ido; i+=2)
1902             {
1903             size_t ic=ido-i;
1904             Tfd tr2, ti2;
1905             MULPM (tr2,ti2,WA(0,i-2),WA(0,i-1),CC(i-1,k,1),CC(i,k,1));
1906             PM (CH(i-1,0,k),CH(ic-1,1,k),CC(i-1,k,0),tr2);
1907             PM (CH(i  ,0,k),CH(ic  ,1,k),ti2,CC(i  ,k,0));
1908             }
1909         }
1910       else
1911         {
1912         auto CC = [cc,this](size_t a, size_t b, size_t c) -> const Tfd&
1913           { return cc[a+ido*(b+ip*c)]; };
1914         auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tfd&
1915           { return ch[a+ido*(b+l1*c)]; };
1916 
1917         for (size_t k=0; k<l1; k++)
1918           PM (CH(0,k,0),CH(0,k,1),CC(0,0,k),CC(ido-1,1,k));
1919         if ((ido&1)==0)
1920           for (size_t k=0; k<l1; k++)
1921             {
1922             CH(ido-1,k,0) = Tfs( 2)*CC(ido-1,0,k);
1923             CH(ido-1,k,1) = Tfs(-2)*CC(0    ,1,k);
1924             }
1925         if (ido<=2) return ch;
1926         for (size_t k=0; k<l1;++k)
1927           for (size_t i=2; i<ido; i+=2)
1928             {
1929             size_t ic=ido-i;
1930             Tfd ti2, tr2;
1931             PM (CH(i-1,k,0),tr2,CC(i-1,0,k),CC(ic-1,1,k));
1932             PM (ti2,CH(i  ,k,0),CC(i  ,0,k),CC(ic  ,1,k));
1933             MULPM (CH(i,k,1),CH(i-1,k,1),WA(0,i-2),WA(0,i-1),ti2,tr2);
1934             }
1935         }
1936       return ch;
1937       }
1938 
1939   public:
1940     rfftp2(size_t l1_, size_t ido_, const Troots<Tfs> &roots)
1941       : l1(l1_), ido(ido_), wa((ip-1)*(ido-1))
1942       {
1943       size_t N=ip*l1*ido;
1944       size_t rfct = roots->size()/N;
1945       MR_assert(roots->size()==N*rfct, "mismatch");
1946       for (size_t j=1; j<ip; ++j)
1947         for (size_t i=1; i<=(ido-1)/2; ++i)
1948           {
1949           auto val = (*roots)[rfct*j*l1*i];
1950           wa[(j-1)*(ido-1)+2*i-2] = val.r;
1951           wa[(j-1)*(ido-1)+2*i-1] = val.i;
1952           }
1953       }
1954 
1955     virtual size_t bufsize() const { return 0; }
1956     virtual bool needs_copy() const { return true; }
1957 
1958     POCKETFFT_EXEC_DISPATCH
1959   };
1960 // a2=a+b; b2=i*(b-a);
1961 #define POCKETFFT_REARRANGE(rx, ix, ry, iy) \
1962   {\
1963   auto t1=rx+ry, t2=ry-rx, t3=ix+iy, t4=ix-iy; \
1964   rx=t1; ix=t3; ry=t4; iy=t2; \
1965   }
1966 
1967 template <typename Tfs> class rfftp3: public rfftpass<Tfs>
1968   {
1969   private:
1970     size_t l1, ido;
1971     static constexpr size_t ip=3;
1972     quick_array<Tfs> wa;
1973 
1974     auto WA(size_t x, size_t i) const
1975       { return wa[i+x*(ido-1)]; }
1976 
1977     template<bool fwd, typename Tfd> Tfd *exec_ (Tfd * DUCC0_RESTRICT cc,
1978       Tfd * DUCC0_RESTRICT ch, Tfd * /*buf*/, size_t /*nthreads*/) const
1979       {
1980       constexpr Tfs taur=Tfs(-0.5),
1981                     taui=Tfs(0.8660254037844386467637231707529362L);
1982       if constexpr(fwd)
1983         {
1984         auto CC = [cc,this](size_t a, size_t b, size_t c) -> const Tfd&
1985           { return cc[a+ido*(b+l1*c)]; };
1986         auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tfd&
1987           { return ch[a+ido*(b+ip*c)]; };
1988         for (size_t k=0; k<l1; k++)
1989           {
1990           Tfd cr2=CC(0,k,1)+CC(0,k,2);
1991           CH(0,0,k) = CC(0,k,0)+cr2;
1992           CH(0,2,k) = taui*(CC(0,k,2)-CC(0,k,1));
1993           CH(ido-1,1,k) = CC(0,k,0)+taur*cr2;
1994           }
1995         if (ido==1) return ch;
1996         for (size_t k=0; k<l1; k++)
1997           for (size_t i=2; i<ido; i+=2)
1998             {
1999             size_t ic=ido-i;
2000             Tfd di2, di3, dr2, dr3;
2001             MULPM (dr2,di2,WA(0,i-2),WA(0,i-1),CC(i-1,k,1),CC(i,k,1)); // d2=conj(WA0)*CC1
2002             MULPM (dr3,di3,WA(1,i-2),WA(1,i-1),CC(i-1,k,2),CC(i,k,2)); // d3=conj(WA1)*CC2
2003             POCKETFFT_REARRANGE(dr2, di2, dr3, di3);
2004             CH(i-1,0,k) = CC(i-1,k,0)+dr2; // c add
2005             CH(i  ,0,k) = CC(i  ,k,0)+di2;
2006             Tfd tr2 = CC(i-1,k,0)+taur*dr2; // c add
2007             Tfd ti2 = CC(i  ,k,0)+taur*di2;
2008             Tfd tr3 = taui*dr3;  // t3 = taui*i*(d3-d2)?
2009             Tfd ti3 = taui*di3;
2010             PM(CH(i-1,2,k),CH(ic-1,1,k),tr2,tr3); // PM(i) = t2+t3
2011             PM(CH(i  ,2,k),CH(ic  ,1,k),ti3,ti2); // PM(ic) = conj(t2-t3)
2012             }
2013         }
2014       else
2015         {
2016         auto CC = [cc,this](size_t a, size_t b, size_t c) -> const Tfd&
2017           { return cc[a+ido*(b+ip*c)]; };
2018         auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tfd&
2019           { return ch[a+ido*(b+l1*c)]; };
2020 
2021         for (size_t k=0; k<l1; k++)
2022           {
2023           Tfd tr2=Tfs(2)*CC(ido-1,1,k);
2024           Tfd cr2=CC(0,0,k)+taur*tr2;
2025           CH(0,k,0)=CC(0,0,k)+tr2;
2026           Tfd ci3=Tfs(2)*taui*CC(0,2,k);
2027           PM (CH(0,k,2),CH(0,k,1),cr2,ci3);
2028           }
2029         if (ido==1) return ch;
2030         for (size_t k=0; k<l1; k++)
2031           for (size_t i=2, ic=ido-2; i<ido; i+=2, ic-=2)
2032             {
2033             Tfd tr2=CC(i-1,2,k)+CC(ic-1,1,k); // t2=CC(I) + conj(CC(ic))
2034             Tfd ti2=CC(i  ,2,k)-CC(ic  ,1,k);
2035             Tfd cr2=CC(i-1,0,k)+taur*tr2;     // c2=CC +taur*t2
2036             Tfd ci2=CC(i  ,0,k)+taur*ti2;
2037             CH(i-1,k,0)=CC(i-1,0,k)+tr2;         // CH=CC+t2
2038             CH(i  ,k,0)=CC(i  ,0,k)+ti2;
2039             Tfd cr3=taui*(CC(i-1,2,k)-CC(ic-1,1,k));// c3=taui*(CC(i)-conj(CC(ic)))
2040             Tfd ci3=taui*(CC(i  ,2,k)+CC(ic  ,1,k));
2041             Tfd di2, di3, dr2, dr3;
2042             PM(dr3,dr2,cr2,ci3); // d2= (cr2-ci3, ci2+cr3) = c2+i*c3
2043             PM(di2,di3,ci2,cr3); // d3= (cr2+ci3, ci2-cr3) = c2-i*c3
2044             MULPM(CH(i,k,1),CH(i-1,k,1),WA(0,i-2),WA(0,i-1),di2,dr2); // ch = WA*d2
2045             MULPM(CH(i,k,2),CH(i-1,k,2),WA(1,i-2),WA(1,i-1),di3,dr3);
2046             }
2047         }
2048       return ch;
2049       }
2050 
2051   public:
2052     rfftp3(size_t l1_, size_t ido_, const Troots<Tfs> &roots)
2053       : l1(l1_), ido(ido_), wa((ip-1)*(ido-1))
2054       {
2055       MR_assert(ido&1, "ido must be odd");
2056       size_t N=ip*l1*ido;
2057       size_t rfct = roots->size()/N;
2058       MR_assert(roots->size()==N*rfct, "mismatch");
2059       for (size_t j=1; j<ip; ++j)
2060         for (size_t i=1; i<=(ido-1)/2; ++i)
2061           {
2062           auto val = (*roots)[rfct*j*l1*i];
2063           wa[(j-1)*(ido-1)+2*i-2] = val.r;
2064           wa[(j-1)*(ido-1)+2*i-1] = val.i;
2065           }
2066       }
2067 
2068     virtual size_t bufsize() const { return 0; }
2069     virtual bool needs_copy() const { return true; }
2070 
2071     POCKETFFT_EXEC_DISPATCH
2072   };
2073 
2074 template <typename Tfs> class rfftp4: public rfftpass<Tfs>
2075   {
2076   private:
2077     size_t l1, ido;
2078     static constexpr size_t ip=4;
2079     quick_array<Tfs> wa;
2080 
2081     auto WA(size_t x, size_t i) const
2082       { return wa[i+x*(ido-1)]; }
2083 
2084     template<bool fwd, typename Tfd> Tfd *exec_ (Tfd * DUCC0_RESTRICT cc,
2085       Tfd * DUCC0_RESTRICT ch, Tfd * /*buf*/, size_t /*nthreads*/) const
2086       {
2087       if constexpr(fwd)
2088         {
2089         constexpr Tfs hsqt2=Tfs(0.707106781186547524400844362104849L);
2090         auto CC = [cc,this](size_t a, size_t b, size_t c) -> const Tfd&
2091           { return cc[a+ido*(b+l1*c)]; };
2092         auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tfd&
2093           { return ch[a+ido*(b+ip*c)]; };
2094 
2095         for (size_t k=0; k<l1; k++)
2096           {
2097           Tfd tr1,tr2;
2098           PM (tr1,CH(0,2,k),CC(0,k,3),CC(0,k,1));
2099           PM (tr2,CH(ido-1,1,k),CC(0,k,0),CC(0,k,2));
2100           PM (CH(0,0,k),CH(ido-1,3,k),tr2,tr1);
2101           }
2102         if ((ido&1)==0)
2103           for (size_t k=0; k<l1; k++)
2104             {
2105             Tfd ti1=-hsqt2*(CC(ido-1,k,1)+CC(ido-1,k,3));
2106             Tfd tr1= hsqt2*(CC(ido-1,k,1)-CC(ido-1,k,3));
2107             PM (CH(ido-1,0,k),CH(ido-1,2,k),CC(ido-1,k,0),tr1);
2108             PM (CH(    0,3,k),CH(    0,1,k),ti1,CC(ido-1,k,2));
2109             }
2110         if (ido<=2) return ch;
2111         for (size_t k=0; k<l1; k++)
2112           for (size_t i=2; i<ido; i+=2)
2113             {
2114             size_t ic=ido-i;
2115             Tfd ci2, ci3, ci4, cr2, cr3, cr4, ti1, ti2, ti3, ti4, tr1, tr2, tr3, tr4;
2116             MULPM(cr2,ci2,WA(0,i-2),WA(0,i-1),CC(i-1,k,1),CC(i,k,1));
2117             MULPM(cr3,ci3,WA(1,i-2),WA(1,i-1),CC(i-1,k,2),CC(i,k,2));
2118             MULPM(cr4,ci4,WA(2,i-2),WA(2,i-1),CC(i-1,k,3),CC(i,k,3));
2119             PM(tr1,tr4,cr4,cr2);
2120             PM(ti1,ti4,ci2,ci4);
2121             PM(tr2,tr3,CC(i-1,k,0),cr3);
2122             PM(ti2,ti3,CC(i  ,k,0),ci3);
2123             PM(CH(i-1,0,k),CH(ic-1,3,k),tr2,tr1);
2124             PM(CH(i  ,0,k),CH(ic  ,3,k),ti1,ti2);
2125             PM(CH(i-1,2,k),CH(ic-1,1,k),tr3,ti4);
2126             PM(CH(i  ,2,k),CH(ic  ,1,k),tr4,ti3);
2127             }
2128         }
2129       else
2130         {
2131         constexpr Tfs sqrt2=Tfs(1.414213562373095048801688724209698L);
2132         auto CC = [cc,this](size_t a, size_t b, size_t c) -> const Tfd&
2133           { return cc[a+ido*(b+ip*c)]; };
2134         auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tfd&
2135           { return ch[a+ido*(b+l1*c)]; };
2136 
2137         for (size_t k=0; k<l1; k++)
2138           {
2139           Tfd tr1, tr2;
2140           PM (tr2,tr1,CC(0,0,k),CC(ido-1,3,k));
2141           Tfd tr3=Tfs(2)*CC(ido-1,1,k);
2142           Tfd tr4=Tfs(2)*CC(0,2,k);
2143           PM (CH(0,k,0),CH(0,k,2),tr2,tr3);
2144           PM (CH(0,k,3),CH(0,k,1),tr1,tr4);
2145           }
2146         if ((ido&1)==0)
2147           for (size_t k=0; k<l1; k++)
2148             {
2149             Tfd tr1,tr2,ti1,ti2;
2150             PM (ti1,ti2,CC(0    ,3,k),CC(0    ,1,k));
2151             PM (tr2,tr1,CC(ido-1,0,k),CC(ido-1,2,k));
2152             CH(ido-1,k,0)=tr2+tr2;
2153             CH(ido-1,k,1)=sqrt2*(tr1-ti1);
2154             CH(ido-1,k,2)=ti2+ti2;
2155             CH(ido-1,k,3)=-sqrt2*(tr1+ti1);
2156             }
2157         if (ido<=2) return ch;
2158         for (size_t k=0; k<l1;++k)
2159           for (size_t i=2; i<ido; i+=2)
2160             {
2161             Tfd ci2, ci3, ci4, cr2, cr3, cr4, ti1, ti2, ti3, ti4, tr1, tr2, tr3, tr4;
2162             size_t ic=ido-i;
2163             PM (tr2,tr1,CC(i-1,0,k),CC(ic-1,3,k));
2164             PM (ti1,ti2,CC(i  ,0,k),CC(ic  ,3,k));
2165             PM (tr4,ti3,CC(i  ,2,k),CC(ic  ,1,k));
2166             PM (tr3,ti4,CC(i-1,2,k),CC(ic-1,1,k));
2167             PM (CH(i-1,k,0),cr3,tr2,tr3);
2168             PM (CH(i  ,k,0),ci3,ti2,ti3);
2169             PM (cr4,cr2,tr1,tr4);
2170             PM (ci2,ci4,ti1,ti4);
2171             MULPM (CH(i,k,1),CH(i-1,k,1),WA(0,i-2),WA(0,i-1),ci2,cr2);
2172             MULPM (CH(i,k,2),CH(i-1,k,2),WA(1,i-2),WA(1,i-1),ci3,cr3);
2173             MULPM (CH(i,k,3),CH(i-1,k,3),WA(2,i-2),WA(2,i-1),ci4,cr4);
2174             }
2175         }
2176       return ch;
2177       }
2178 
2179   public:
2180     rfftp4(size_t l1_, size_t ido_, const Troots<Tfs> &roots)
2181       : l1(l1_), ido(ido_), wa((ip-1)*(ido-1))
2182       {
2183       size_t N=ip*l1*ido;
2184       size_t rfct = roots->size()/N;
2185       MR_assert(roots->size()==N*rfct, "mismatch");
2186       for (size_t j=1; j<ip; ++j)
2187         for (size_t i=1; i<=(ido-1)/2; ++i)
2188           {
2189           auto val = (*roots)[rfct*j*l1*i];
2190           wa[(j-1)*(ido-1)+2*i-2] = val.r;
2191           wa[(j-1)*(ido-1)+2*i-1] = val.i;
2192           }
2193       }
2194 
2195     virtual size_t bufsize() const { return 0; }
2196     virtual bool needs_copy() const { return true; }
2197 
2198     POCKETFFT_EXEC_DISPATCH
2199   };
2200 
2201 template <typename Tfs> class rfftp5: public rfftpass<Tfs>
2202   {
2203   private:
2204     size_t l1, ido;
2205     static constexpr size_t ip=5;
2206     quick_array<Tfs> wa;
2207 
2208     auto WA(size_t x, size_t i) const
2209       { return wa[i+x*(ido-1)]; }
2210 
2211     template<bool fwd, typename Tfd> Tfd *exec_ (Tfd * DUCC0_RESTRICT cc,
2212       Tfd * DUCC0_RESTRICT ch, Tfd * /*buf*/, size_t /*nthreads*/) const
2213       {
2214       constexpr Tfs tr11= Tfs(0.3090169943749474241022934171828191L),
2215                     ti11= Tfs(0.9510565162951535721164393333793821L),
2216                     tr12= Tfs(-0.8090169943749474241022934171828191L),
2217                     ti12= Tfs(0.5877852522924731291687059546390728L);
2218 
2219       if constexpr(fwd)
2220         {
2221         auto CC = [cc,this](size_t a, size_t b, size_t c) -> const Tfd&
2222           { return cc[a+ido*(b+l1*c)]; };
2223         auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tfd&
2224           { return ch[a+ido*(b+ip*c)]; };
2225 
2226         for (size_t k=0; k<l1; k++)
2227           {
2228           Tfd cr2, cr3, ci4, ci5;
2229           PM (cr2,ci5,CC(0,k,4),CC(0,k,1));
2230           PM (cr3,ci4,CC(0,k,3),CC(0,k,2));
2231           CH(0,0,k)=CC(0,k,0)+cr2+cr3;
2232           CH(ido-1,1,k)=CC(0,k,0)+tr11*cr2+tr12*cr3;
2233           CH(0,2,k)=ti11*ci5+ti12*ci4;
2234           CH(ido-1,3,k)=CC(0,k,0)+tr12*cr2+tr11*cr3;
2235           CH(0,4,k)=ti12*ci5-ti11*ci4;
2236           }
2237         if (ido==1) return ch;
2238         for (size_t k=0; k<l1;++k)
2239           for (size_t i=2, ic=ido-2; i<ido; i+=2, ic-=2)
2240             {
2241             Tfd di2, di3, di4, di5, dr2, dr3, dr4, dr5;
2242             MULPM (dr2,di2,WA(0,i-2),WA(0,i-1),CC(i-1,k,1),CC(i,k,1));
2243             MULPM (dr3,di3,WA(1,i-2),WA(1,i-1),CC(i-1,k,2),CC(i,k,2));
2244             MULPM (dr4,di4,WA(2,i-2),WA(2,i-1),CC(i-1,k,3),CC(i,k,3));
2245             MULPM (dr5,di5,WA(3,i-2),WA(3,i-1),CC(i-1,k,4),CC(i,k,4));
2246             POCKETFFT_REARRANGE(dr2, di2, dr5, di5);
2247             POCKETFFT_REARRANGE(dr3, di3, dr4, di4);
2248             CH(i-1,0,k)=CC(i-1,k,0)+dr2+dr3;
2249             CH(i  ,0,k)=CC(i  ,k,0)+di2+di3;
2250             Tfd tr2=CC(i-1,k,0)+tr11*dr2+tr12*dr3;
2251             Tfd ti2=CC(i  ,k,0)+tr11*di2+tr12*di3;
2252             Tfd tr3=CC(i-1,k,0)+tr12*dr2+tr11*dr3;
2253             Tfd ti3=CC(i  ,k,0)+tr12*di2+tr11*di3;
2254             Tfd tr5 = ti11*dr5 + ti12*dr4;
2255             Tfd ti5 = ti11*di5 + ti12*di4;
2256             Tfd tr4 = ti12*dr5 - ti11*dr4;
2257             Tfd ti4 = ti12*di5 - ti11*di4;
2258             PM(CH(i-1,2,k),CH(ic-1,1,k),tr2,tr5);
2259             PM(CH(i  ,2,k),CH(ic  ,1,k),ti5,ti2);
2260             PM(CH(i-1,4,k),CH(ic-1,3,k),tr3,tr4);
2261             PM(CH(i  ,4,k),CH(ic  ,3,k),ti4,ti3);
2262             }
2263         }
2264       else
2265         {
2266         auto CC = [cc,this](size_t a, size_t b, size_t c) -> const Tfd&
2267           { return cc[a+ido*(b+ip*c)]; };
2268         auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tfd&
2269           { return ch[a+ido*(b+l1*c)]; };
2270 
2271         for (size_t k=0; k<l1; k++)
2272           {
2273           Tfd ti5=CC(0,2,k)+CC(0,2,k);
2274           Tfd ti4=CC(0,4,k)+CC(0,4,k);
2275           Tfd tr2=CC(ido-1,1,k)+CC(ido-1,1,k);
2276           Tfd tr3=CC(ido-1,3,k)+CC(ido-1,3,k);
2277           CH(0,k,0)=CC(0,0,k)+tr2+tr3;
2278           Tfd cr2=CC(0,0,k)+tr11*tr2+tr12*tr3;
2279           Tfd cr3=CC(0,0,k)+tr12*tr2+tr11*tr3;
2280           Tfd ci4, ci5;
2281           MULPM(ci5,ci4,ti5,ti4,ti11,ti12);
2282           PM(CH(0,k,4),CH(0,k,1),cr2,ci5);
2283           PM(CH(0,k,3),CH(0,k,2),cr3,ci4);
2284           }
2285         if (ido==1) return ch;
2286         for (size_t k=0; k<l1;++k)
2287           for (size_t i=2, ic=ido-2; i<ido; i+=2, ic-=2)
2288             {
2289             Tfd tr2, tr3, tr4, tr5, ti2, ti3, ti4, ti5;
2290             PM(tr2,tr5,CC(i-1,2,k),CC(ic-1,1,k));
2291             PM(ti5,ti2,CC(i  ,2,k),CC(ic  ,1,k));
2292             PM(tr3,tr4,CC(i-1,4,k),CC(ic-1,3,k));
2293             PM(ti4,ti3,CC(i  ,4,k),CC(ic  ,3,k));
2294             CH(i-1,k,0)=CC(i-1,0,k)+tr2+tr3;
2295             CH(i  ,k,0)=CC(i  ,0,k)+ti2+ti3;
2296             Tfd cr2=CC(i-1,0,k)+tr11*tr2+tr12*tr3;
2297             Tfd ci2=CC(i  ,0,k)+tr11*ti2+tr12*ti3;
2298             Tfd cr3=CC(i-1,0,k)+tr12*tr2+tr11*tr3;
2299             Tfd ci3=CC(i  ,0,k)+tr12*ti2+tr11*ti3;
2300             Tfd ci4, ci5, cr5, cr4;
2301             MULPM(cr5,cr4,tr5,tr4,ti11,ti12);
2302             MULPM(ci5,ci4,ti5,ti4,ti11,ti12);
2303             Tfd dr2, dr3, dr4, dr5, di2, di3, di4, di5;
2304             PM(dr4,dr3,cr3,ci4);
2305             PM(di3,di4,ci3,cr4);
2306             PM(dr5,dr2,cr2,ci5);
2307             PM(di2,di5,ci2,cr5);
2308             MULPM(CH(i,k,1),CH(i-1,k,1),WA(0,i-2),WA(0,i-1),di2,dr2);
2309             MULPM(CH(i,k,2),CH(i-1,k,2),WA(1,i-2),WA(1,i-1),di3,dr3);
2310             MULPM(CH(i,k,3),CH(i-1,k,3),WA(2,i-2),WA(2,i-1),di4,dr4);
2311             MULPM(CH(i,k,4),CH(i-1,k,4),WA(3,i-2),WA(3,i-1),di5,dr5);
2312             }
2313         }
2314       return ch;
2315       }
2316 
2317   public:
2318     rfftp5(size_t l1_, size_t ido_, const Troots<Tfs> &roots)
2319       : l1(l1_), ido(ido_), wa((ip-1)*(ido-1))
2320       {
2321       MR_assert(ido&1, "ido must be odd");
2322       size_t N=ip*l1*ido;
2323       size_t rfct = roots->size()/N;
2324       MR_assert(roots->size()==N*rfct, "mismatch");
2325       for (size_t j=1; j<ip; ++j)
2326         for (size_t i=1; i<=(ido-1)/2; ++i)
2327           {
2328           auto val = (*roots)[rfct*j*l1*i];
2329           wa[(j-1)*(ido-1)+2*i-2] = val.r;
2330           wa[(j-1)*(ido-1)+2*i-1] = val.i;
2331           }
2332       }
2333 
2334     virtual size_t bufsize() const { return 0; }
2335     virtual bool needs_copy() const { return true; }
2336 
2337     POCKETFFT_EXEC_DISPATCH
2338   };
2339 
2340 template <typename Tfs> class rfftpg: public rfftpass<Tfs>
2341   {
2342   private:
2343     size_t l1, ido;
2344     size_t ip;
2345     quick_array<Tfs> wa, csarr;
2346 
2347     template<bool fwd, typename Tfd> Tfd *exec_ (Tfd * DUCC0_RESTRICT cc,
2348       Tfd * DUCC0_RESTRICT ch, Tfd * /*buf*/, size_t /*nthreads*/) const
2349       {
2350       if constexpr(fwd)
2351         {
2352         size_t ipph=(ip+1)/2;
2353         size_t idl1 = ido*l1;
2354 
2355         auto CC = [cc,this](size_t a, size_t b, size_t c) -> Tfd&
2356           { return cc[a+ido*(b+ip*c)]; };
2357         auto CH = [ch,this](size_t a, size_t b, size_t c) -> const Tfd&
2358           { return ch[a+ido*(b+l1*c)]; };
2359         auto C1 = [cc,this] (size_t a, size_t b, size_t c) -> Tfd&
2360           { return cc[a+ido*(b+l1*c)]; };
2361         auto C2 = [cc,idl1] (size_t a, size_t b) -> Tfd&
2362           { return cc[a+idl1*b]; };
2363         auto CH2 = [ch,idl1] (size_t a, size_t b) -> Tfd&
2364           { return ch[a+idl1*b]; };
2365 
2366         if (ido>1)
2367           {
2368           for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc)              // 114
2369             {
2370             size_t is=(j-1)*(ido-1),
2371                    is2=(jc-1)*(ido-1);
2372             for (size_t k=0; k<l1; ++k)                            // 113
2373               {
2374               size_t idij=is;
2375               size_t idij2=is2;
2376               for (size_t i=1; i<=ido-2; i+=2)                      // 112
2377                 {
2378                 Tfd t1=C1(i,k,j ), t2=C1(i+1,k,j ),
2379                     t3=C1(i,k,jc), t4=C1(i+1,k,jc);
2380                 Tfd x1=wa[idij]*t1 + wa[idij+1]*t2,
2381                     x2=wa[idij]*t2 - wa[idij+1]*t1,
2382                     x3=wa[idij2]*t3 + wa[idij2+1]*t4,
2383                     x4=wa[idij2]*t4 - wa[idij2+1]*t3;
2384                 PM(C1(i,k,j),C1(i+1,k,jc),x3,x1);
2385                 PM(C1(i+1,k,j),C1(i,k,jc),x2,x4);
2386                 idij+=2;
2387                 idij2+=2;
2388                 }
2389               }
2390             }
2391           }
2392 
2393         for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc)                // 123
2394           for (size_t k=0; k<l1; ++k)                              // 122
2395             MPINPLACE(C1(0,k,jc), C1(0,k,j));
2396 
2397       //everything in C
2398       //memset(ch,0,ip*l1*ido*sizeof(double));
2399 
2400         for (size_t l=1,lc=ip-1; l<ipph; ++l,--lc)                 // 127
2401           {
2402           for (size_t ik=0; ik<idl1; ++ik)                         // 124
2403             {
2404             CH2(ik,l ) = C2(ik,0)+csarr[2*l]*C2(ik,1)+csarr[4*l]*C2(ik,2);
2405             CH2(ik,lc) = csarr[2*l+1]*C2(ik,ip-1)+csarr[4*l+1]*C2(ik,ip-2);
2406             }
2407           size_t iang = 2*l;
2408           size_t j=3, jc=ip-3;
2409           for (; j<ipph-3; j+=4,jc-=4)              // 126
2410             {
2411             iang+=l; if (iang>=ip) iang-=ip;
2412             Tfs ar1=csarr[2*iang], ai1=csarr[2*iang+1];
2413             iang+=l; if (iang>=ip) iang-=ip;
2414             Tfs ar2=csarr[2*iang], ai2=csarr[2*iang+1];
2415             iang+=l; if (iang>=ip) iang-=ip;
2416             Tfs ar3=csarr[2*iang], ai3=csarr[2*iang+1];
2417             iang+=l; if (iang>=ip) iang-=ip;
2418             Tfs ar4=csarr[2*iang], ai4=csarr[2*iang+1];
2419             for (size_t ik=0; ik<idl1; ++ik)                       // 125
2420               {
2421               CH2(ik,l ) += ar1*C2(ik,j )+ar2*C2(ik,j +1)
2422                            +ar3*C2(ik,j +2)+ar4*C2(ik,j +3);
2423               CH2(ik,lc) += ai1*C2(ik,jc)+ai2*C2(ik,jc-1)
2424                            +ai3*C2(ik,jc-2)+ai4*C2(ik,jc-3);
2425               }
2426             }
2427           for (; j<ipph-1; j+=2,jc-=2)              // 126
2428             {
2429             iang+=l; if (iang>=ip) iang-=ip;
2430             Tfs ar1=csarr[2*iang], ai1=csarr[2*iang+1];
2431             iang+=l; if (iang>=ip) iang-=ip;
2432             Tfs ar2=csarr[2*iang], ai2=csarr[2*iang+1];
2433             for (size_t ik=0; ik<idl1; ++ik)                       // 125
2434               {
2435               CH2(ik,l ) += ar1*C2(ik,j )+ar2*C2(ik,j +1);
2436               CH2(ik,lc) += ai1*C2(ik,jc)+ai2*C2(ik,jc-1);
2437               }
2438             }
2439           for (; j<ipph; ++j,--jc)              // 126
2440             {
2441             iang+=l; if (iang>=ip) iang-=ip;
2442             Tfs ar=csarr[2*iang], ai=csarr[2*iang+1];
2443             for (size_t ik=0; ik<idl1; ++ik)                       // 125
2444               {
2445               CH2(ik,l ) += ar*C2(ik,j );
2446               CH2(ik,lc) += ai*C2(ik,jc);
2447               }
2448             }
2449           }
2450         for (size_t ik=0; ik<idl1; ++ik)                         // 101
2451           CH2(ik,0) = C2(ik,0);
2452         for (size_t j=1; j<ipph; ++j)                              // 129
2453           for (size_t ik=0; ik<idl1; ++ik)                         // 128
2454             CH2(ik,0) += C2(ik,j);
2455 
2456       // everything in CH at this point!
2457       //memset(cc,0,ip*l1*ido*sizeof(double));
2458 
2459         for (size_t k=0; k<l1; ++k)                                // 131
2460           for (size_t i=0; i<ido; ++i)                             // 130
2461             CC(i,0,k) = CH(i,k,0);
2462 
2463         for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc)                // 137
2464           {
2465           size_t j2=2*j-1;
2466           for (size_t k=0; k<l1; ++k)                              // 136
2467             {
2468             CC(ido-1,j2,k) = CH(0,k,j);
2469             CC(0,j2+1,k) = CH(0,k,jc);
2470             }
2471           }
2472 
2473         if (ido==1) return cc;
2474 
2475         for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc)                // 140
2476           {
2477           size_t j2=2*j-1;
2478           for(size_t k=0; k<l1; ++k)                               // 139
2479             for(size_t i=1, ic=ido-i-2; i<=ido-2; i+=2, ic-=2)      // 138
2480               {
2481               CC(i   ,j2+1,k) = CH(i  ,k,j )+CH(i  ,k,jc);
2482               CC(ic  ,j2  ,k) = CH(i  ,k,j )-CH(i  ,k,jc);
2483               CC(i+1 ,j2+1,k) = CH(i+1,k,j )+CH(i+1,k,jc);
2484               CC(ic+1,j2  ,k) = CH(i+1,k,jc)-CH(i+1,k,j );
2485               }
2486           }
2487         return cc;
2488         }
2489       else
2490         {
2491         size_t ipph=(ip+1)/ 2;
2492         size_t idl1 = ido*l1;
2493 
2494         auto CC = [cc,this](size_t a, size_t b, size_t c) -> const Tfd&
2495           { return cc[a+ido*(b+ip*c)]; };
2496         auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tfd&
2497           { return ch[a+ido*(b+l1*c)]; };
2498         auto C1 = [cc,this](size_t a, size_t b, size_t c) -> const Tfd&
2499           { return cc[a+ido*(b+l1*c)]; };
2500         auto C2 = [cc,idl1](size_t a, size_t b) -> Tfd&
2501           { return cc[a+idl1*b]; };
2502         auto CH2 = [ch,idl1](size_t a, size_t b) -> Tfd&
2503           { return ch[a+idl1*b]; };
2504 
2505         for (size_t k=0; k<l1; ++k)        // 102
2506           for (size_t i=0; i<ido; ++i)     // 101
2507             CH(i,k,0) = CC(i,0,k);
2508         for (size_t j=1, jc=ip-1; j<ipph; ++j, --jc)   // 108
2509           {
2510           size_t j2=2*j-1;
2511           for (size_t k=0; k<l1; ++k)
2512             {
2513             CH(0,k,j ) = Tfs(2)*CC(ido-1,j2,k);
2514             CH(0,k,jc) = Tfs(2)*CC(0,j2+1,k);
2515             }
2516           }
2517 
2518         if (ido!=1)
2519           {
2520           for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc)   // 111
2521             {
2522             size_t j2=2*j-1;
2523             for (size_t k=0; k<l1; ++k)
2524               for (size_t i=1, ic=ido-i-2; i<=ido-2; i+=2, ic-=2)      // 109
2525                 {
2526                 CH(i  ,k,j ) = CC(i  ,j2+1,k)+CC(ic  ,j2,k);
2527                 CH(i  ,k,jc) = CC(i  ,j2+1,k)-CC(ic  ,j2,k);
2528                 CH(i+1,k,j ) = CC(i+1,j2+1,k)-CC(ic+1,j2,k);
2529                 CH(i+1,k,jc) = CC(i+1,j2+1,k)+CC(ic+1,j2,k);
2530                 }
2531             }
2532           }
2533         for (size_t l=1,lc=ip-1; l<ipph; ++l,--lc)
2534           {
2535           for (size_t ik=0; ik<idl1; ++ik)
2536             {
2537             C2(ik,l ) = CH2(ik,0)+csarr[2*l]*CH2(ik,1)+csarr[4*l]*CH2(ik,2);
2538             C2(ik,lc) = csarr[2*l+1]*CH2(ik,ip-1)+csarr[4*l+1]*CH2(ik,ip-2);
2539             }
2540           size_t iang=2*l;
2541           size_t j=3,jc=ip-3;
2542           for(; j<ipph-3; j+=4,jc-=4)
2543             {
2544             iang+=l; if(iang>ip) iang-=ip;
2545             Tfs ar1=csarr[2*iang], ai1=csarr[2*iang+1];
2546             iang+=l; if(iang>ip) iang-=ip;
2547             Tfs ar2=csarr[2*iang], ai2=csarr[2*iang+1];
2548             iang+=l; if(iang>ip) iang-=ip;
2549             Tfs ar3=csarr[2*iang], ai3=csarr[2*iang+1];
2550             iang+=l; if(iang>ip) iang-=ip;
2551             Tfs ar4=csarr[2*iang], ai4=csarr[2*iang+1];
2552             for (size_t ik=0; ik<idl1; ++ik)
2553               {
2554               C2(ik,l ) += ar1*CH2(ik,j )+ar2*CH2(ik,j +1)
2555                           +ar3*CH2(ik,j +2)+ar4*CH2(ik,j +3);
2556               C2(ik,lc) += ai1*CH2(ik,jc)+ai2*CH2(ik,jc-1)
2557                           +ai3*CH2(ik,jc-2)+ai4*CH2(ik,jc-3);
2558               }
2559             }
2560           for(; j<ipph-1; j+=2,jc-=2)
2561             {
2562             iang+=l; if(iang>ip) iang-=ip;
2563             Tfs ar1=csarr[2*iang], ai1=csarr[2*iang+1];
2564             iang+=l; if(iang>ip) iang-=ip;
2565             Tfs ar2=csarr[2*iang], ai2=csarr[2*iang+1];
2566             for (size_t ik=0; ik<idl1; ++ik)
2567               {
2568               C2(ik,l ) += ar1*CH2(ik,j )+ar2*CH2(ik,j +1);
2569               C2(ik,lc) += ai1*CH2(ik,jc)+ai2*CH2(ik,jc-1);
2570               }
2571             }
2572           for(; j<ipph; ++j,--jc)
2573             {
2574             iang+=l; if(iang>ip) iang-=ip;
2575             Tfs war=csarr[2*iang], wai=csarr[2*iang+1];
2576             for (size_t ik=0; ik<idl1; ++ik)
2577               {
2578               C2(ik,l ) += war*CH2(ik,j );
2579               C2(ik,lc) += wai*CH2(ik,jc);
2580               }
2581             }
2582           }
2583         for (size_t j=1; j<ipph; ++j)
2584           for (size_t ik=0; ik<idl1; ++ik)
2585             CH2(ik,0) += CH2(ik,j);
2586         for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc)   // 124
2587           for (size_t k=0; k<l1; ++k)
2588             PM(CH(0,k,jc),CH(0,k,j),C1(0,k,j),C1(0,k,jc));
2589 
2590         if (ido==1) return ch;
2591 
2592         for (size_t j=1, jc=ip-1; j<ipph; ++j, --jc)  // 127
2593           for (size_t k=0; k<l1; ++k)
2594             for (size_t i=1; i<=ido-2; i+=2)
2595               {
2596               CH(i  ,k,j ) = C1(i  ,k,j)-C1(i+1,k,jc);
2597               CH(i  ,k,jc) = C1(i  ,k,j)+C1(i+1,k,jc);
2598               CH(i+1,k,j ) = C1(i+1,k,j)+C1(i  ,k,jc);
2599               CH(i+1,k,jc) = C1(i+1,k,j)-C1(i  ,k,jc);
2600               }
2601 
2602       // All in CH
2603 
2604         for (size_t j=1; j<ip; ++j)
2605           {
2606           size_t is = (j-1)*(ido-1);
2607           for (size_t k=0; k<l1; ++k)
2608             {
2609             size_t idij = is;
2610             for (size_t i=1; i<=ido-2; i+=2)
2611               {
2612               Tfd t1=CH(i,k,j), t2=CH(i+1,k,j);
2613               CH(i  ,k,j) = wa[idij]*t1-wa[idij+1]*t2;
2614               CH(i+1,k,j) = wa[idij]*t2+wa[idij+1]*t1;
2615               idij+=2;
2616               }
2617             }
2618           }
2619         return ch;
2620         }
2621       }
2622 
2623   public:
2624     rfftpg(size_t l1_, size_t ido_, size_t ip_, const Troots<Tfs> &roots)
2625       : l1(l1_), ido(ido_), ip(ip_), wa((ip-1)*(ido-1)), csarr(2*ip)
2626       {
2627       MR_assert(ido&1, "ido must be odd");
2628       size_t N=ip*l1*ido;
2629       size_t rfct = roots->size()/N;
2630       MR_assert(roots->size()==N*rfct, "mismatch");
2631       for (size_t j=1; j<ip; ++j)
2632         for (size_t i=1; i<=(ido-1)/2; ++i)
2633           {
2634           auto val = (*roots)[rfct*j*l1*i];
2635           wa[(j-1)*(ido-1)+2*i-2] = val.r;
2636           wa[(j-1)*(ido-1)+2*i-1] = val.i;
2637           }
2638       csarr[0] = Tfs(1);
2639       csarr[1] = Tfs(0);
2640       for (size_t i=2, ic=2*ip-2; i<=ic; i+=2, ic-=2)
2641         {
2642         auto val = (*roots)[i/2*rfct*(N/ip)];
2643         csarr[i   ] = val.r;
2644         csarr[i +1] = val.i;
2645         csarr[ic  ] = val.r;
2646         csarr[ic+1] = -val.i;
2647         }
2648       }
2649 
2650     virtual size_t bufsize() const { return 0; }
2651     virtual bool needs_copy() const { return true; }
2652 
2653     POCKETFFT_EXEC_DISPATCH
2654   };
2655 
2656 template <typename Tfs> class rfftpblue: public rfftpass<Tfs>
2657   {
2658   private:
2659     const size_t l1, ido, ip;
2660     quick_array<Tfs> wa;
2661     const Tcpass<Tfs> cplan;
2662     size_t bufsz;
2663     bool need_cpy;
2664 
2665     auto WA(size_t x, size_t i) const
2666       { return wa[i+x*(ido-1)]; }
2667 
2668     template<bool fwd, typename Tfd> Tfd *exec_
2669       (Tfd * DUCC0_RESTRICT cc, Tfd * DUCC0_RESTRICT ch,
2670        Tfd * DUCC0_RESTRICT buf_, size_t nthreads) const
2671       {
2672       using Tcd = Cmplx<Tfd>;
2673       auto buf = reinterpret_cast<Tcd *>(buf_);
2674       Tcd *cc2 = &buf[0];
2675       Tcd *ch2 = &buf[ip];
2676       Tcd *subbuf = &buf[2*ip];
2677       static const auto ticd = tidx<Tcd *>();
2678 
2679       if constexpr(fwd)
2680         {
2681         auto CC = [cc,this](size_t a, size_t b, size_t c) -> const Tfd&
2682           { return cc[a+ido*(b+l1*c)]; };
2683         auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tfd&
2684           { return ch[a+ido*(b+ip*c)]; };
2685 
2686         for (size_t k=0; k<l1; ++k)
2687           {
2688           // copy in
2689           for (size_t m=0; m<ip; ++m)
2690             cc2[m] = {CC(0,k,m),Tfd(0)};
2691           auto res = static_cast<Tcd *>(cplan->exec(ticd, cc2, ch2,
2692             subbuf, fwd, nthreads));
2693           // copy out
2694           CH(0,0,k) = res[0].r;
2695           for (size_t m=1; m<=ip/2; ++m)
2696             {
2697             CH(ido-1,2*m-1,k)=res[m].r;
2698             CH(0,2*m,k)=res[m].i;
2699             }
2700           }
2701         if (ido==1) return ch;
2702         size_t ipph = (ip+1)/2;
2703         for (size_t k=0; k<l1; ++k)
2704           for (size_t i=2, ic=ido-2; i<ido; i+=2, ic-=2)
2705             {
2706             // copy in
2707             cc2[0] = {CC(i-1,k,0),CC(i,k,0)};
2708             for (size_t m=1; m<ipph; ++m)
2709               {
2710               MULPM (cc2[m].r,cc2[m].i,WA(m-1,i-2),WA(m-1,i-1),CC(i-1,k,m),CC(i,k,m));
2711               MULPM (cc2[ip-m].r,cc2[ip-m].i,WA(ip-m-1,i-2),WA(ip-m-1,i-1),CC(i-1,k,ip-m),CC(i,k,ip-m));
2712               }
2713             auto res = static_cast<Tcd *>(cplan->exec(ticd, cc2, ch2,
2714               subbuf, fwd, nthreads));
2715             CH(i-1,0,k) = res[0].r;
2716             CH(i,0,k) = res[0].i;
2717             for (size_t m=1; m<ipph; ++m)
2718               {
2719               CH(i-1,2*m,k) = res[m].r;
2720               CH(ic-1,2*m-1,k) = res[ip-m].r;
2721               CH(i  ,2*m,k) = res[m].i;
2722               CH(ic  ,2*m-1,k) = -res[ip-m].i;
2723               }
2724             }
2725         }
2726       else
2727         {
2728         auto CC = [cc,this](size_t a, size_t b, size_t c) -> Tfd&
2729           { return cc[a+ido*(b+ip*c)]; };
2730         auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tfd&
2731           { return ch[a+ido*(b+l1*c)]; };
2732 
2733         for (size_t k=0; k<l1; k++)
2734           {
2735           cc2[0] = {CC(0,0,k), Tfd(0)};
2736           for (size_t m=1; m<=ip/2; ++m)
2737             {
2738             cc2[m] = {CC(ido-1,2*m-1,k),CC(0,2*m,k)};
2739             cc2[ip-m] = {CC(ido-1,2*m-1,k),-CC(0,2*m,k)};
2740             }
2741           auto res = static_cast<Tcd *>(cplan->exec(ticd, cc2, ch2,
2742             subbuf, fwd, nthreads));
2743           for (size_t m=0; m<ip; ++m)
2744             CH(0,k,m) = res[m].r;
2745           }
2746         if (ido==1) return ch;
2747         for (size_t k=0; k<l1; ++k)
2748           for (size_t i=2, ic=ido-2; i<ido; i+=2, ic-=2)
2749             {
2750             // copy in
2751             cc2[0] = {CC(i-1,0,k),CC(i,0,k)};
2752             for (size_t m=1; m<=ip/2; ++m)
2753               {
2754               cc2[m] = {CC(i-1,2*m,k),CC(i,2*m,k)};
2755               cc2[ip-m] = {CC(ic-1,2*m-1,k),-CC(ic,2*m-1,k)};
2756               }
2757             auto res = static_cast<Tcd *>(cplan->exec(ticd, cc2, ch2,
2758               subbuf, fwd, nthreads));
2759             CH(i-1,k,0) = res[0].r;
2760             CH(i,k,0) = res[0].i;
2761             for (size_t m=1; m<ip; ++m)
2762               {
2763               MULPM(CH(i-1,k,m),CH(i,k,m),WA(m-1,i-2),-WA(m-1,i-1),res[m].r,res[m].i);
2764               MULPM(CH(i-1,k,ip-m),CH(i,k,ip-m),WA(ip-m-1,i-2),-WA(ip-m-1,i-1),res[ip-m].r,res[ip-m].i);
2765               }
2766             }
2767         }
2768       return ch;
2769       }
2770 
2771   public:
2772     rfftpblue(size_t l1_, size_t ido_, size_t ip_, const Troots<Tfs> &roots, bool vectorize=false)
2773       : l1(l1_), ido(ido_), ip(ip_), wa((ip-1)*(ido-1)),
2774         cplan(cfftpass<Tfs>::make_pass(1,1,ip,roots,vectorize))
2775       {
2776       MR_assert(ip&1, "Bluestein length must be odd");
2777       MR_assert(ido&1, "ido must be odd");
2778       size_t N=ip*l1*ido;
2779       auto rfct = roots->size()/N;
2780       MR_assert(roots->size()==N*rfct, "mismatch");
2781       for (size_t j=1; j<ip; ++j)
2782         for (size_t i=1; i<=(ido-1)/2; ++i)
2783           {
2784           auto val = (*roots)[rfct*j*l1*i];
2785           wa[(j-1)*(ido-1)+2*i-2] = val.r;
2786           wa[(j-1)*(ido-1)+2*i-1] = val.i;
2787           }
2788       }
2789 
2790     virtual size_t bufsize() const { return 4*ip + 2*cplan->bufsize(); }
2791     virtual bool needs_copy() const { return true; }
2792 
2793     POCKETFFT_EXEC_DISPATCH
2794   };
2795 
2796 template <typename Tfs> class rfft_multipass: public rfftpass<Tfs>
2797   {
2798   private:
2799     const size_t l1, ido;
2800     size_t ip;
2801     vector<Trpass<Tfs>> passes;
2802     size_t bufsz;
2803     bool need_cpy;
2804     quick_array<Tfs> wa;
2805 
2806     auto WA(size_t x, size_t i) const
2807       { return wa[(i-1)*(ip-1)+x]; }
2808 
2809     template<bool fwd, typename Tfd> Tfd *exec_(Tfd *cc, Tfd *ch, Tfd *buf,
2810       size_t nthreads) const
2811       {
2812       static const auto tifd = tidx<Tfd *>();
2813       if ((l1==1) && (ido==1))
2814         {
2815         Tfd *p1=cc, *p2=ch;
2816         if constexpr (fwd)
2817           for (auto it=passes.rbegin(); it!=passes.rend(); ++it)
2818             {
2819             auto res = static_cast<Tfd *>((*it)->exec(tifd,
2820               p1, p2, buf, fwd, nthreads));
2821             if (res==p2) swap(p1,p2);
2822             }
2823         else
2824           for (const auto &pass: passes)
2825             {
2826             auto res = static_cast<Tfd *>(pass->exec(tifd,
2827               p1, p2, buf, fwd, nthreads));
2828             if (res==p2) swap(p1,p2);
2829             }
2830         return p1;
2831         }
2832       else
2833         MR_fail("not yet supported");
2834       }
2835 
2836   public:
2837     rfft_multipass(size_t l1_, size_t ido_, size_t ip_,
2838       const Troots<Tfs> &roots, bool /*vectorize*/=false)
2839       : l1(l1_), ido(ido_), ip(ip_), bufsz(0), need_cpy(false),
2840         wa((ip-1)*(ido-1))
2841       {
2842       size_t N=ip*l1*ido;
2843       auto rfct = roots->size()/N;
2844       MR_assert(roots->size()==N*rfct, "mismatch");
2845       for (size_t j=1; j<ip; ++j)
2846         for (size_t i=1; i<=(ido-1)/2; ++i)
2847           {
2848           auto val = (*roots)[rfct*j*l1*i];
2849           wa[(j-1)*(ido-1)+2*i-2] = val.r;
2850           wa[(j-1)*(ido-1)+2*i-1] = val.i;
2851           }
2852 
2853       auto factors = rfftpass<Tfs>::factorize(ip);
2854 
2855       size_t l1l=1;
2856       for (auto fct: factors)
2857         {
2858         passes.push_back(rfftpass<Tfs>::make_pass(l1l, ip/(fct*l1l), fct, roots));
2859         l1l*=fct;
2860         }
2861       for (const auto &pass: passes)
2862         {
2863         bufsz = max(bufsz, pass->bufsize());
2864         need_cpy |= pass->needs_copy();
2865         }
2866       if ((l1!=1)||(ido!=1))
2867         {
2868         need_cpy=true;
2869         bufsz += 2*ip;
2870         }
2871       }
2872 
2873     virtual size_t bufsize() const { return bufsz; }
2874     virtual bool needs_copy() const { return need_cpy; }
2875 
2876     POCKETFFT_EXEC_DISPATCH
2877   };
2878 
2879 template <typename Tfs> class rfftp_complexify: public rfftpass<Tfs>
2880   {
2881   private:
2882     size_t N;
2883     Troots<Tfs> roots;
2884     size_t rfct;
2885     Tcpass<Tfs> pass;
2886     size_t l1, ido;
2887     static constexpr size_t ip=2;
2888 
2889     template<bool fwd, typename Tfd> Tfd *exec_ (Tfd * DUCC0_RESTRICT cc,
2890       Tfd * DUCC0_RESTRICT ch, Tfd * buf, size_t nthreads) const
2891       {
2892       using Tcd = Cmplx<Tfd>;
2893       auto ccc = reinterpret_cast<Tcd *>(cc);
2894       auto cch = reinterpret_cast<Tcd *>(ch);
2895       auto cbuf = reinterpret_cast<Tcd *>(buf);
2896       static const auto ticd = tidx<Tcd *>();
2897       if constexpr(fwd)
2898         {
2899         auto res = static_cast<Tcd *>(pass->exec(ticd,
2900           ccc, cch, cbuf, true, nthreads));
2901         auto rres = (res==ccc) ? ch : cc;
2902         rres[0] = res[0].r+res[0].i;
2903 //FIXME: parallelize?
2904         for (size_t i=1, xi=N/2-1; i<=xi; ++i, --xi)
2905           {
2906           auto xe = res[i]+res[xi].conj();
2907           auto xo = Tcd(res[i].i+res[xi].i, res[xi].r-res[i].r)
2908                   * (*roots)[rfct*i].conj();
2909           rres[2*i-1] = Tfs(0.5)*(xe.r+xo.r);
2910           rres[2*i] = Tfs(0.5)*(xe.i+xo.i);
2911           rres[2*xi-1] = Tfs(0.5)*(xe.r-xo.r);
2912           rres[2*xi] = Tfs(0.5)*(xo.i-xe.i);
2913           }
2914         rres[N-1] = res[0].r-res[0].i;
2915         return rres;
2916         }
2917       else
2918         {
2919         cch[0] = Tcd(cc[0]+cc[N-1], cc[0]-cc[N-1]);
2920 //FIXME: parallelize?
2921         for (size_t i=1, xi=N/2-1; i<=xi; ++i, --xi)
2922           {
2923           Tcd t1 (cc[2*i-1], cc[2*i]);
2924           Tcd t2 (cc[2*xi-1], -cc[2*xi]);
2925           auto xe = t1+t2;
2926           auto xo = (t1-t2)*(*roots)[rfct*i];
2927           cch[i] = (xe + Tcd(-xo.i, xo.r));
2928           cch[xi] = (xe.conj() + Tcd(xo.i, xo.r));
2929           }
2930         auto res = static_cast<Tcd *>(pass->exec(ticd,
2931           cch, ccc, cbuf, false, nthreads));
2932         return (res==ccc) ? cc : ch;
2933         }
2934       }
2935 
2936   public:
2937     rfftp_complexify(size_t N_, const Troots<Tfs> &roots_, bool vectorize=false)
2938       : N(N_), roots(roots_), pass(cfftpass<Tfs>::make_pass(N/2, vectorize))
2939       {
2940       rfct = roots->size()/N;
2941       MR_assert(roots->size()==N*rfct, "mismatch");
2942       MR_assert((N&1)==0, "N must be even");
2943       }
2944 
2945     virtual size_t bufsize() const { return 2*pass->bufsize(); }
2946     virtual bool needs_copy() const { return true; }
2947 
2948     POCKETFFT_EXEC_DISPATCH
2949   };
2950 #undef POCKETFFT_EXEC_DISPATCH
2951 
2952 template<typename Tfs> Trpass<Tfs> rfftpass<Tfs>::make_pass(size_t l1,
2953   size_t ido, size_t ip, const Troots<Tfs> &roots, bool vectorize)
2954   {
2955   MR_assert(ip>=1, "no zero-sized FFTs");
2956   if (ip==1) return make_shared<rfftp1<Tfs>>();
2957   if ((ip>1000) && ((ip&1)==0))  // use complex transform
2958     return make_shared<rfftp_complexify<Tfs>>(ip, roots, vectorize);
2959   auto factors=rfftpass<Tfs>::factorize(ip);
2960   if (factors.size()==1)
2961     {
2962     switch(ip)
2963       {
2964       case 2:
2965         return make_shared<rfftp2<Tfs>>(l1, ido, roots);
2966       case 3:
2967         return make_shared<rfftp3<Tfs>>(l1, ido, roots);
2968       case 4:
2969         return make_shared<rfftp4<Tfs>>(l1, ido, roots);
2970       case 5:
2971         return make_shared<rfftp5<Tfs>>(l1, ido, roots);
2972       default:
2973         if (ip<135)
2974           return make_shared<rfftpg<Tfs>>(l1, ido, ip, roots);
2975         else
2976           return make_shared<rfftpblue<Tfs>>(l1, ido, ip, roots, vectorize);
2977       }
2978     }
2979   else // more than one factor, need a multipass
2980     return make_shared<rfft_multipass<Tfs>>(l1, ido, ip, roots, vectorize);
2981   }
2982 
2983 template<typename Tfs> class pocketfft_r
2984   {
2985   private:
2986     size_t N;
2987     Trpass<Tfs> plan;
2988 
2989   public:
2990     pocketfft_r(size_t n, bool vectorize=false)
2991       : N(n), plan(rfftpass<Tfs>::make_pass(n,vectorize)) {}
2992     size_t length() const { return N; }
2993     size_t bufsize() const { return N*plan->needs_copy()+plan->bufsize(); }
2994     template<typename Tfd> DUCC0_NOINLINE Tfd *exec(Tfd *in, Tfd *buf, Tfs fct,
2995       bool fwd, size_t nthreads=1) const
2996       {
2997       static const auto tifd = tidx<Tfd *>();
2998       auto res = static_cast<Tfd *>(plan->exec(tifd, in, buf,
2999         buf+N*plan->needs_copy(), fwd, nthreads));
3000       if (fct!=Tfs(1))
3001         for (size_t i=0; i<N; ++i) res[i]*=fct;
3002       return res;
3003       }
3004     template<typename Tfd> DUCC0_NOINLINE void exec_copyback(Tfd *in, Tfd *buf,
3005       Tfs fct, bool fwd, size_t nthreads=1) const
3006       {
3007       static const auto tifd = tidx<Tfd *>();
3008       auto res = static_cast<Tfd *>(plan->exec(tifd, in, buf,
3009         buf+N*plan->needs_copy(), fwd, nthreads));
3010       if (res==in)
3011         {
3012         if (fct!=Tfs(1))
3013           for (size_t i=0; i<N; ++i) in[i]*=fct;
3014         }
3015       else
3016         {
3017         if (fct!=Tfs(1))
3018           for (size_t i=0; i<N; ++i) in[i]=res[i]*fct;
3019         else
3020           copy_n(res, N, in);
3021         }
3022       }
3023     template<typename Tfd> DUCC0_NOINLINE void exec(Tfd *in, Tfs fct, bool fwd,
3024       size_t nthreads=1) const
3025       {
3026       quick_array<Tfd> buf(N*plan->needs_copy()+plan->bufsize());
3027       exec_copyback(in, buf.data(), fct, fwd, nthreads);
3028       }
3029   };
3030 
3031 template<typename Tfs> class pocketfft_hartley
3032   {
3033   private:
3034     size_t N;
3035     Trpass<Tfs> plan;
3036 
3037   public:
3038     pocketfft_hartley(size_t n, bool vectorize=false)
3039       : N(n), plan(rfftpass<Tfs>::make_pass(n,vectorize)) {}
3040     size_t length() const { return N; }
3041     size_t bufsize() const { return N+plan->bufsize(); }
3042     template<typename Tfd> DUCC0_NOINLINE Tfd *exec(Tfd *in, Tfd *buf, Tfs fct,
3043       size_t nthreads=1) const
3044       {
3045       static const auto tifd = tidx<Tfd *>();
3046       auto res = static_cast<Tfd *>(plan->exec(tifd,
3047         in, buf, buf+N, true, nthreads));
3048       auto res2 = (res==buf) ? in : buf;
3049       res2[0] = fct*res[0];
3050       size_t i=1, i1=1, i2=N-1;
3051       for (i=1; i<N-1; i+=2, ++i1, --i2)
3052         {
3053 #ifdef DUCC0_USE_PROPER_HARTLEY_CONVENTION
3054         res2[i1] = fct*(res[i]-res[i+1]);
3055         res2[i2] = fct*(res[i]+res[i+1]);
3056 #else
3057         res2[i1] = fct*(res[i]+res[i+1]);
3058         res2[i2] = fct*(res[i]-res[i+1]);
3059 #endif
3060         }
3061       if (i<N)
3062         res2[i1] = fct*res[i];
3063 
3064       return res2;
3065       }
3066     template<typename Tfd> DUCC0_NOINLINE void exec_copyback(Tfd *in, Tfd *buf,
3067       Tfs fct, size_t nthreads=1) const
3068       {
3069       auto res = exec(in, buf, fct, nthreads);
3070       if (res!=in)
3071         copy_n(res, N, in);
3072       }
3073     template<typename Tfd> DUCC0_NOINLINE void exec(Tfd *in, Tfs fct,
3074       size_t nthreads=1) const
3075       {
3076       quick_array<Tfd> buf(N+plan->bufsize());
3077       exec_copyback(in, buf.data(), fct, nthreads);
3078       }
3079   };
3080 
3081 // R2R transforms using FFTW's halfcomplex format
3082 template<typename Tfs> class pocketfft_fftw
3083   {
3084   private:
3085     size_t N;
3086     Trpass<Tfs> plan;
3087 
3088   public:
3089     pocketfft_fftw(size_t n, bool vectorize=false)
3090       : N(n), plan(rfftpass<Tfs>::make_pass(n,vectorize)) {}
3091     size_t length() const { return N; }
3092     size_t bufsize() const { return N+plan->bufsize(); }
3093     template<typename Tfd> DUCC0_NOINLINE Tfd *exec(Tfd *in, Tfd *buf, Tfs fct,
3094       bool fwd, size_t nthreads=1) const
3095       {
3096       static const auto tifd = tidx<Tfd *>();
3097       auto res = in;
3098       auto res2 = buf;
3099       if (!fwd) // go to FFTPACK halfcomplex order
3100         {
3101         res2[0] = fct*res[0];
3102         size_t i=1, i1=1, i2=N-1;
3103         for (i=1; i<N-1; i+=2, ++i1, --i2)
3104           {
3105           res2[i] = fct*res[i1];
3106           res2[i+1] = fct*res[i2];
3107           }
3108         if (i<N)
3109           res2[i] = fct*res[i1];
3110         swap(res, res2);
3111         }
3112       res = static_cast<Tfd *>(plan->exec(tifd,
3113         res, res2, buf+N, fwd, nthreads));
3114       if (!fwd) return res;
3115 
3116       // go to FFTW halfcomplex order
3117       res2 = (res==buf) ? in : buf;
3118       res2[0] = fct*res[0];
3119       size_t i=1, i1=1, i2=N-1;
3120       for (i=1; i<N-1; i+=2, ++i1, --i2)
3121         {
3122         res2[i1] = fct*res[i];
3123         res2[i2] = fct*res[i+1];
3124         }
3125       if (i<N)
3126         res2[i1] = fct*res[i];
3127 
3128       return res2;
3129       }
3130     template<typename Tfd> DUCC0_NOINLINE void exec_copyback(Tfd *in, Tfd *buf,
3131       Tfs fct, bool fwd, size_t nthreads=1) const
3132       {
3133       auto res = exec(in, buf, fct, fwd, nthreads);
3134       if (res!=in)
3135         copy_n(res, N, in);
3136       }
3137     template<typename Tfd> DUCC0_NOINLINE void exec(Tfd *in, Tfs fct, bool fwd,
3138       size_t nthreads=1) const
3139       {
3140       quick_array<Tfd> buf(N+plan->bufsize());
3141       exec_copyback(in, buf.data(), fct, fwd, nthreads);
3142       }
3143   };
3144 
3145 }
3146 
3147 using detail_fft::pocketfft_c;
3148 using detail_fft::pocketfft_r;
3149 using detail_fft::pocketfft_hartley;
3150 using detail_fft::pocketfft_fftw;
3151 inline size_t good_size_complex(size_t n)
3152   { return detail_fft::util1d::good_size_cmplx(n); }
3153 inline size_t good_size_real(size_t n)
3154   { return detail_fft::util1d::good_size_real(n); }
3155 
3156 }
3157 
3158 #endif
3159