1 /* 2 This file is part of pocketfft. 3 4 Copyright (C) 2010-2021 Max-Planck-Society 5 Copyright (C) 2019 Peter Bell 6 7 Authors: Martin Reinecke, Peter Bell 8 9 All rights reserved. 10 11 Redistribution and use in source and binary forms, with or without modification, 12 are permitted provided that the following conditions are met: 13 14 * Redistributions of source code must retain the above copyright notice, this 15 list of conditions and the following disclaimer. 16 * Redistributions in binary form must reproduce the above copyright notice, this 17 list of conditions and the following disclaimer in the documentation and/or 18 other materials provided with the distribution. 19 * Neither the name of the copyright holder nor the names of its contributors may 20 be used to endorse or promote products derived from this software without 21 specific prior written permission. 22 23 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 24 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 25 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 26 DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR 27 ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 28 (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 29 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 30 ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 31 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 32 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 33 */ 34 35 #ifndef DUCC0_FFT1D_H 36 #define DUCC0_FFT1D_H 37 38 #include <memory> 39 #include <cstddef> 40 #include <algorithm> 41 #include <functional> 42 #include <type_traits> 43 #include <utility> 44 #include <vector> 45 #include <typeinfo> 46 #include <typeindex> 47 #include "ducc0/infra/useful_macros.h" 48 #include "ducc0/math/cmplx.h" 49 #include "ducc0/infra/error_handling.h" 50 #include "ducc0/infra/aligned_array.h" 51 #include "ducc0/infra/simd.h" 52 #include "ducc0/infra/threading.h" 53 #include "ducc0/math/unity_roots.h" 54 55 //#define DUCC0_USE_PROPER_HARTLEY_CONVENTION 56 57 namespace ducc0 { 58 59 namespace detail_fft { 60 61 using namespace std; 62 63 template<typename T> constexpr inline size_t fft1d_simdlen 64 = min<size_t>(8, native_simd<T>::size()); 65 template<> constexpr inline size_t fft1d_simdlen<double> 66 = min<size_t>(4, native_simd<double>::size()); 67 template<> constexpr inline size_t fft1d_simdlen<float> 68 = min<size_t>(8, native_simd<float>::size()); 69 template<typename T> using fft1d_simd = typename simd_select<T,fft1d_simdlen<T>>::type; 70 template<typename T> constexpr inline bool fft1d_simd_exists = (fft1d_simdlen<T> > 1); 71 72 // Always use std:: for <cmath> functions 73 template <typename T> T cos(T) = delete; 74 template <typename T> T sin(T) = delete; 75 template <typename T> T sqrt(T) = delete; 76 77 template<typename T> inline void PM(T &a, T &b, T c, T d) 78 { a=c+d; b=c-d; } 79 template<typename T> inline void PMINPLACE(T &a, T &b) 80 { T t = a; a+=b; b=t-b; } 81 template<typename T> inline void MPINPLACE(T &a, T &b) 82 { T t = a; a-=b; b=t+b; } 83 template<bool fwd, typename T, typename T2> void special_mul (const Cmplx<T> &v1, const Cmplx<T2> &v2, Cmplx<T> &res) 84 { 85 res = fwd ? Cmplx<T>(v1.r*v2.r+v1.i*v2.i, v1.i*v2.r-v1.r*v2.i) 86 : Cmplx<T>(v1.r*v2.r-v1.i*v2.i, v1.r*v2.i+v1.i*v2.r); 87 } 88 89 template<bool fwd, typename T> void ROTX90(Cmplx<T> &a) 90 { auto tmp_= fwd ? -a.r : a.r; a.r = fwd ? a.i : -a.i; a.i=tmp_; } 91 92 template<typename T> inline auto tidx() { return type_index(typeid(T)); } 93 94 struct util1d // hack to avoid duplicate symbols 95 { 96 /* returns the smallest composite of 2, 3, 5, 7 and 11 which is >= n */ 97 DUCC0_NOINLINE static size_t good_size_cmplx(size_t n) 98 { 99 if (n<=12) return n; 100 101 size_t bestfac=2*n; 102 for (size_t f11=1; f11<bestfac; f11*=11) 103 for (size_t f117=f11; f117<bestfac; f117*=7) 104 for (size_t f1175=f117; f1175<bestfac; f1175*=5) 105 { 106 size_t x=f1175; 107 while (x<n) x*=2; 108 for (;;) 109 { 110 if (x<n) 111 x*=3; 112 else if (x>n) 113 { 114 if (x<bestfac) bestfac=x; 115 if (x&1) break; 116 x>>=1; 117 } 118 else 119 return n; 120 } 121 } 122 return bestfac; 123 } 124 125 /* returns the smallest composite of 2, 3, 5 which is >= n */ 126 DUCC0_NOINLINE static size_t good_size_real(size_t n) 127 { 128 if (n<=6) return n; 129 130 size_t bestfac=2*n; 131 for (size_t f5=1; f5<bestfac; f5*=5) 132 { 133 size_t x = f5; 134 while (x<n) x *= 2; 135 for (;;) 136 { 137 if (x<n) 138 x*=3; 139 else if (x>n) 140 { 141 if (x<bestfac) bestfac=x; 142 if (x&1) break; 143 x>>=1; 144 } 145 else 146 return n; 147 } 148 } 149 return bestfac; 150 } 151 152 DUCC0_NOINLINE static vector<size_t> prime_factors(size_t N) 153 { 154 MR_assert(N>0, "need a positive number"); 155 vector<size_t> factors; 156 while ((N&1)==0) 157 { N>>=1; factors.push_back(2); } 158 for (size_t divisor=3; divisor*divisor<=N; divisor+=2) 159 while ((N%divisor)==0) 160 { 161 factors.push_back(divisor); 162 N/=divisor; 163 } 164 if (N>1) factors.push_back(N); 165 return factors; 166 } 167 }; 168 169 template<typename T> using Troots = shared_ptr<const UnityRoots<T,Cmplx<T>>>; 170 171 // T: "type", f/c: "float/complex", s/v: "scalar/vector" 172 template <typename Tfs> class cfftpass 173 { 174 public: 175 virtual ~cfftpass(){} 176 using Tcs = Cmplx<Tfs>; 177 178 // number of Tcd values required as scratch space during "exec" 179 // will be provided in "buf" 180 virtual size_t bufsize() const = 0; 181 virtual bool needs_copy() const = 0; 182 virtual void *exec(const type_index &ti, void *in, void *copy, void *buf, 183 bool fwd, size_t nthreads=1) const = 0; 184 185 static vector<size_t> factorize(size_t N) 186 { 187 MR_assert(N>0, "need a positive number"); 188 vector<size_t> factors; 189 factors.reserve(15); 190 while ((N&3)==0) 191 { factors.push_back(4); N>>=2; } 192 if ((N&1)==0) 193 { 194 N>>=1; 195 // factor 2 should be at the front of the factor list 196 factors.push_back(2); 197 swap(factors[0], factors.back()); 198 } 199 for (size_t divisor=3; divisor*divisor<=N; divisor+=2) 200 while ((N%divisor)==0) 201 { 202 factors.push_back(divisor); 203 N/=divisor; 204 } 205 if (N>1) factors.push_back(N); 206 return factors; 207 } 208 209 static shared_ptr<cfftpass> make_pass(size_t l1, size_t ido, size_t ip, 210 const Troots<Tfs> &roots, bool vectorize=false); 211 static shared_ptr<cfftpass> make_pass(size_t ip, bool vectorize=false) 212 { 213 return make_pass(1,1,ip,make_shared<UnityRoots<Tfs,Cmplx<Tfs>>>(ip), 214 vectorize); 215 } 216 }; 217 218 #define POCKETFFT_EXEC_DISPATCH \ 219 virtual void *exec(const type_index &ti, void *in, void *copy, void *buf, \ 220 bool fwd, size_t nthreads=1) const \ 221 { \ 222 static const auto tics = tidx<Tcs *>(); \ 223 if (ti==tics) \ 224 { \ 225 auto in1 = static_cast<Tcs *>(in); \ 226 auto copy1 = static_cast<Tcs *>(copy); \ 227 auto buf1 = static_cast<Tcs *>(buf); \ 228 return fwd ? exec_<true>(in1, copy1, buf1, nthreads) \ 229 : exec_<false>(in1, copy1, buf1, nthreads); \ 230 } \ 231 if constexpr (fft1d_simdlen<Tfs> > 1) \ 232 if constexpr (simd_exists<Tfs, fft1d_simdlen<Tfs>>) \ 233 { \ 234 using Tfv = typename simd_select<Tfs, fft1d_simdlen<Tfs>>::type; \ 235 using Tcv = Cmplx<Tfv>; \ 236 static const auto ticv = tidx<Tcv *>(); \ 237 if (ti==ticv) \ 238 { \ 239 auto in1 = static_cast<Tcv *>(in); \ 240 auto copy1 = static_cast<Tcv *>(copy); \ 241 auto buf1 = static_cast<Tcv *>(buf); \ 242 return fwd ? exec_<true>(in1, copy1, buf1, nthreads) \ 243 : exec_<false>(in1, copy1, buf1, nthreads); \ 244 } \ 245 } \ 246 if constexpr (fft1d_simdlen<Tfs> > 2) \ 247 if constexpr (simd_exists<Tfs, fft1d_simdlen<Tfs>/2>) \ 248 { \ 249 using Tfv = typename simd_select<Tfs, fft1d_simdlen<Tfs>/2>::type; \ 250 using Tcv = Cmplx<Tfv>; \ 251 static const auto ticv = tidx<Tcv *>(); \ 252 if (ti==ticv) \ 253 { \ 254 auto in1 = static_cast<Tcv *>(in); \ 255 auto copy1 = static_cast<Tcv *>(copy); \ 256 auto buf1 = static_cast<Tcv *>(buf); \ 257 return fwd ? exec_<true>(in1, copy1, buf1, nthreads) \ 258 : exec_<false>(in1, copy1, buf1, nthreads); \ 259 } \ 260 } \ 261 if constexpr (fft1d_simdlen<Tfs> > 4) \ 262 if constexpr (simd_exists<Tfs, fft1d_simdlen<Tfs>/4>) \ 263 { \ 264 using Tfv = typename simd_select<Tfs, fft1d_simdlen<Tfs>/4>::type; \ 265 using Tcv = Cmplx<Tfv>; \ 266 static const auto ticv = tidx<Tcv *>(); \ 267 if (ti==ticv) \ 268 { \ 269 auto in1 = static_cast<Tcv *>(in); \ 270 auto copy1 = static_cast<Tcv *>(copy); \ 271 auto buf1 = static_cast<Tcv *>(buf); \ 272 return fwd ? exec_<true>(in1, copy1, buf1, nthreads) \ 273 : exec_<false>(in1, copy1, buf1, nthreads); \ 274 } \ 275 } \ 276 if constexpr (fft1d_simdlen<Tfs> > 8) \ 277 if constexpr (simd_exists<Tfs, fft1d_simdlen<Tfs>/8>) \ 278 { \ 279 using Tfv = typename simd_select<Tfs, fft1d_simdlen<Tfs>/8>::type; \ 280 using Tcv = Cmplx<Tfv>; \ 281 static const auto ticv = tidx<Tcv *>(); \ 282 if (ti==ticv) \ 283 { \ 284 auto in1 = static_cast<Tcv *>(in); \ 285 auto copy1 = static_cast<Tcv *>(copy); \ 286 auto buf1 = static_cast<Tcv *>(buf); \ 287 return fwd ? exec_<true>(in1, copy1, buf1, nthreads) \ 288 : exec_<false>(in1, copy1, buf1, nthreads); \ 289 } \ 290 } \ 291 MR_fail("impossible vector length requested"); \ 292 } 293 294 template<typename T> using Tcpass = shared_ptr<cfftpass<T>>; 295 296 template <typename Tfs> class cfftp1: public cfftpass<Tfs> 297 { 298 public: 299 cfftp1() {} 300 virtual size_t bufsize() const { return 0; } 301 virtual bool needs_copy() const { return false; } 302 303 virtual void *exec(const type_index & /*ti*/, void * in, void * /*copy*/, 304 void * /*buf*/, bool /*fwd*/, size_t /*nthreads*/) const 305 { return in; } 306 }; 307 308 template <typename Tfs> class cfftp2: public cfftpass<Tfs> 309 { 310 private: 311 using typename cfftpass<Tfs>::Tcs; 312 313 size_t l1, ido; 314 static constexpr size_t ip=2; 315 quick_array<Tcs> wa; 316 317 auto WA(size_t i) const 318 { return wa[i-1]; } 319 320 template<bool fwd, typename Tcd> Tcd *exec_ (const Tcd * DUCC0_RESTRICT cc, 321 Tcd * DUCC0_RESTRICT ch, Tcd * /*buf*/, size_t /*nthreads*/) const 322 { 323 if (ido==1) 324 { 325 auto CH = [ch,this](size_t b, size_t c) -> Tcd& 326 { return ch[b+l1*c]; }; 327 auto CC = [cc](size_t b, size_t c) -> const Tcd& 328 { return cc[b+ip*c]; }; 329 for (size_t k=0; k<l1; ++k) 330 { 331 CH(k,0) = CC(0,k)+CC(1,k); 332 CH(k,1) = CC(0,k)-CC(1,k); 333 } 334 return ch; 335 } 336 else 337 { 338 auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tcd& 339 { return ch[a+ido*(b+l1*c)]; }; 340 auto CC = [cc,this](size_t a, size_t b, size_t c) -> const Tcd& 341 { return cc[a+ido*(b+ip*c)]; }; 342 for (size_t k=0; k<l1; ++k) 343 { 344 CH(0,k,0) = CC(0,0,k)+CC(0,1,k); 345 CH(0,k,1) = CC(0,0,k)-CC(0,1,k); 346 for (size_t i=1; i<ido; ++i) 347 { 348 CH(i,k,0) = CC(i,0,k)+CC(i,1,k); 349 special_mul<fwd>(CC(i,0,k)-CC(i,1,k),WA(i),CH(i,k,1)); 350 } 351 } 352 return ch; 353 } 354 } 355 356 public: 357 cfftp2(size_t l1_, size_t ido_, const Troots<Tfs> &roots) 358 : l1(l1_), ido(ido_), wa((ip-1)*(ido-1)) 359 { 360 size_t N=ip*l1*ido; 361 size_t rfct = roots->size()/N; 362 MR_assert(roots->size()==N*rfct, "mismatch"); 363 for (size_t i=1; i<ido; ++i) 364 wa[i-1] = (*roots)[rfct*l1*i]; 365 } 366 367 virtual size_t bufsize() const { return 0; } 368 virtual bool needs_copy() const { return true; } 369 370 POCKETFFT_EXEC_DISPATCH 371 }; 372 373 template <typename Tfs> class cfftp3: public cfftpass<Tfs> 374 { 375 private: 376 using typename cfftpass<Tfs>::Tcs; 377 378 size_t l1, ido; 379 static constexpr size_t ip=3; 380 quick_array<Tcs> wa; 381 382 auto WA(size_t x, size_t i) const 383 { return wa[x+(i-1)*(ip-1)]; } 384 385 template<bool fwd, typename Tcd> Tcd *exec_ 386 (const Tcd * DUCC0_RESTRICT cc, Tcd * DUCC0_RESTRICT ch, Tcd * /*buf*/, 387 size_t /*nthreads*/) const 388 { 389 constexpr Tfs tw1r=-0.5, 390 tw1i= (fwd ? -1: 1) * Tfs(0.8660254037844386467637231707529362L); 391 392 auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tcd& 393 { return ch[a+ido*(b+l1*c)]; }; 394 auto CC = [cc,this](size_t a, size_t b, size_t c) -> const Tcd& 395 { return cc[a+ido*(b+ip*c)]; }; 396 397 #define POCKETFFT_PREP3(idx) \ 398 Tcd t0 = CC(idx,0,k), t1, t2; \ 399 PM (t1,t2,CC(idx,1,k),CC(idx,2,k)); \ 400 CH(idx,k,0)=t0+t1; 401 #define POCKETFFT_PARTSTEP3a(u1,u2,twr,twi) \ 402 { \ 403 Tcd ca=t0+t1*twr; \ 404 Tcd cb{-t2.i*twi, t2.r*twi}; \ 405 PM(CH(0,k,u1),CH(0,k,u2),ca,cb) ;\ 406 } 407 #define POCKETFFT_PARTSTEP3b(u1,u2,twr,twi) \ 408 { \ 409 Tcd ca=t0+t1*twr; \ 410 Tcd cb{-t2.i*twi, t2.r*twi}; \ 411 special_mul<fwd>(ca+cb,WA(u1-1,i),CH(i,k,u1)); \ 412 special_mul<fwd>(ca-cb,WA(u2-1,i),CH(i,k,u2)); \ 413 } 414 415 if (ido==1) 416 for (size_t k=0; k<l1; ++k) 417 { 418 POCKETFFT_PREP3(0) 419 POCKETFFT_PARTSTEP3a(1,2,tw1r,tw1i) 420 } 421 else 422 for (size_t k=0; k<l1; ++k) 423 { 424 { 425 POCKETFFT_PREP3(0) 426 POCKETFFT_PARTSTEP3a(1,2,tw1r,tw1i) 427 } 428 for (size_t i=1; i<ido; ++i) 429 { 430 POCKETFFT_PREP3(i) 431 POCKETFFT_PARTSTEP3b(1,2,tw1r,tw1i) 432 } 433 } 434 435 #undef POCKETFFT_PARTSTEP3b 436 #undef POCKETFFT_PARTSTEP3a 437 #undef POCKETFFT_PREP3 438 439 return ch; 440 } 441 442 public: 443 cfftp3(size_t l1_, size_t ido_, const Troots<Tfs> &roots) 444 : l1(l1_), ido(ido_), wa((ip-1)*(ido-1)) 445 { 446 size_t N=ip*l1*ido; 447 size_t rfct = roots->size()/N; 448 MR_assert(roots->size()==N*rfct, "mismatch"); 449 for (size_t i=1; i<ido; ++i) 450 for (size_t j=1; j<ip; ++j) 451 wa[(j-1)+(i-1)*(ip-1)] = (*roots)[rfct*j*l1*i]; 452 } 453 454 virtual size_t bufsize() const { return 0; } 455 virtual bool needs_copy() const { return true; } 456 457 POCKETFFT_EXEC_DISPATCH 458 }; 459 460 template <typename Tfs> class cfftp4: public cfftpass<Tfs> 461 { 462 private: 463 using typename cfftpass<Tfs>::Tcs; 464 465 size_t l1, ido; 466 static constexpr size_t ip=4; 467 quick_array<Tcs> wa; 468 469 auto WA(size_t x, size_t i) const 470 { return wa[x+(i-1)*(ip-1)]; } 471 472 template<bool fwd, typename Tcd> Tcd *exec_ 473 (const Tcd * DUCC0_RESTRICT cc, Tcd * DUCC0_RESTRICT ch, Tcd * /*buf*/, 474 size_t /*nthreads*/) const 475 { 476 if (ido==1) 477 { 478 auto CH = [ch,this](size_t b, size_t c) -> Tcd& 479 { return ch[b+l1*c]; }; 480 auto CC = [cc](size_t b, size_t c) -> const Tcd& 481 { return cc[b+ip*c]; }; 482 for (size_t k=0; k<l1; ++k) 483 { 484 Tcd t1, t2, t3, t4; 485 PM(t2,t1,CC(0,k),CC(2,k)); 486 PM(t3,t4,CC(1,k),CC(3,k)); 487 ROTX90<fwd>(t4); 488 PM(CH(k,0),CH(k,2),t2,t3); 489 PM(CH(k,1),CH(k,3),t1,t4); 490 } 491 } 492 else 493 { 494 auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tcd& 495 { return ch[a+ido*(b+l1*c)]; }; 496 auto CC = [cc,this](size_t a, size_t b, size_t c) -> const Tcd& 497 { return cc[a+ido*(b+ip*c)]; }; 498 for (size_t k=0; k<l1; ++k) 499 { 500 { 501 Tcd t1, t2, t3, t4; 502 PM(t2,t1,CC(0,0,k),CC(0,2,k)); 503 PM(t3,t4,CC(0,1,k),CC(0,3,k)); 504 ROTX90<fwd>(t4); 505 PM(CH(0,k,0),CH(0,k,2),t2,t3); 506 PM(CH(0,k,1),CH(0,k,3),t1,t4); 507 } 508 for (size_t i=1; i<ido; ++i) 509 { 510 Tcd t1, t2, t3, t4; 511 Tcd cc0=CC(i,0,k), cc1=CC(i,1,k),cc2=CC(i,2,k),cc3=CC(i,3,k); 512 PM(t2,t1,cc0,cc2); 513 PM(t3,t4,cc1,cc3); 514 ROTX90<fwd>(t4); 515 CH(i,k,0) = t2+t3; 516 special_mul<fwd>(t1+t4,WA(0,i),CH(i,k,1)); 517 special_mul<fwd>(t2-t3,WA(1,i),CH(i,k,2)); 518 special_mul<fwd>(t1-t4,WA(2,i),CH(i,k,3)); 519 } 520 } 521 } 522 return ch; 523 } 524 525 public: 526 cfftp4(size_t l1_, size_t ido_, const Troots<Tfs> &roots) 527 : l1(l1_), ido(ido_), wa((ip-1)*(ido-1)) 528 { 529 size_t N=ip*l1*ido; 530 size_t rfct = roots->size()/N; 531 MR_assert(roots->size()==N*rfct, "mismatch"); 532 for (size_t i=1; i<ido; ++i) 533 for (size_t j=1; j<ip; ++j) 534 wa[(j-1)+(i-1)*(ip-1)] = (*roots)[rfct*j*l1*i]; 535 } 536 537 virtual size_t bufsize() const { return 0; } 538 virtual bool needs_copy() const { return true; } 539 540 POCKETFFT_EXEC_DISPATCH 541 }; 542 543 template <typename Tfs> class cfftp5: public cfftpass<Tfs> 544 { 545 private: 546 using typename cfftpass<Tfs>::Tcs; 547 548 size_t l1, ido; 549 static constexpr size_t ip=5; 550 quick_array<Tcs> wa; 551 552 auto WA(size_t x, size_t i) const 553 { return wa[x+(i-1)*(ip-1)]; } 554 555 template<bool fwd, typename Tcd> Tcd *exec_ 556 (const Tcd * DUCC0_RESTRICT cc, Tcd * DUCC0_RESTRICT ch, Tcd * /*buf*/, 557 size_t /*nthreads*/) const 558 { 559 constexpr Tfs tw1r= Tfs(0.3090169943749474241022934171828191L), 560 tw1i= (fwd ? -1: 1) * Tfs(0.9510565162951535721164393333793821L), 561 tw2r= Tfs(-0.8090169943749474241022934171828191L), 562 tw2i= (fwd ? -1: 1) * Tfs(0.5877852522924731291687059546390728L); 563 564 auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tcd& 565 { return ch[a+ido*(b+l1*c)]; }; 566 auto CC = [cc,this](size_t a, size_t b, size_t c) -> const Tcd& 567 { return cc[a+ido*(b+ip*c)]; }; 568 569 #define POCKETFFT_PREP5(idx) \ 570 Tcd t0 = CC(idx,0,k), t1, t2, t3, t4; \ 571 PM (t1,t4,CC(idx,1,k),CC(idx,4,k)); \ 572 PM (t2,t3,CC(idx,2,k),CC(idx,3,k)); \ 573 CH(idx,k,0).r=t0.r+t1.r+t2.r; \ 574 CH(idx,k,0).i=t0.i+t1.i+t2.i; 575 576 #define POCKETFFT_PARTSTEP5a(u1,u2,twar,twbr,twai,twbi) \ 577 { \ 578 Tcd ca,cb; \ 579 ca.r=t0.r+twar*t1.r+twbr*t2.r; \ 580 ca.i=t0.i+twar*t1.i+twbr*t2.i; \ 581 cb.i=twai*t4.r twbi*t3.r; \ 582 cb.r=-(twai*t4.i twbi*t3.i); \ 583 PM(CH(0,k,u1),CH(0,k,u2),ca,cb); \ 584 } 585 586 #define POCKETFFT_PARTSTEP5b(u1,u2,twar,twbr,twai,twbi) \ 587 { \ 588 Tcd ca,cb,da,db; \ 589 ca.r=t0.r+twar*t1.r+twbr*t2.r; \ 590 ca.i=t0.i+twar*t1.i+twbr*t2.i; \ 591 cb.i=twai*t4.r twbi*t3.r; \ 592 cb.r=-(twai*t4.i twbi*t3.i); \ 593 special_mul<fwd>(ca+cb,WA(u1-1,i),CH(i,k,u1)); \ 594 special_mul<fwd>(ca-cb,WA(u2-1,i),CH(i,k,u2)); \ 595 } 596 597 if (ido==1) 598 for (size_t k=0; k<l1; ++k) 599 { 600 POCKETFFT_PREP5(0) 601 POCKETFFT_PARTSTEP5a(1,4,tw1r,tw2r,+tw1i,+tw2i) 602 POCKETFFT_PARTSTEP5a(2,3,tw2r,tw1r,+tw2i,-tw1i) 603 } 604 else 605 for (size_t k=0; k<l1; ++k) 606 { 607 { 608 POCKETFFT_PREP5(0) 609 POCKETFFT_PARTSTEP5a(1,4,tw1r,tw2r,+tw1i,+tw2i) 610 POCKETFFT_PARTSTEP5a(2,3,tw2r,tw1r,+tw2i,-tw1i) 611 } 612 for (size_t i=1; i<ido; ++i) 613 { 614 POCKETFFT_PREP5(i) 615 POCKETFFT_PARTSTEP5b(1,4,tw1r,tw2r,+tw1i,+tw2i) 616 POCKETFFT_PARTSTEP5b(2,3,tw2r,tw1r,+tw2i,-tw1i) 617 } 618 } 619 620 #undef POCKETFFT_PARTSTEP5b 621 #undef POCKETFFT_PARTSTEP5a 622 #undef POCKETFFT_PREP5 623 624 return ch; 625 } 626 627 public: 628 cfftp5(size_t l1_, size_t ido_, const Troots<Tfs> &roots) 629 : l1(l1_), ido(ido_), wa((ip-1)*(ido-1)) 630 { 631 size_t N=ip*l1*ido; 632 auto rfct = roots->size()/N; 633 MR_assert(roots->size()==N*rfct, "mismatch"); 634 for (size_t i=1; i<ido; ++i) 635 for (size_t j=1; j<ip; ++j) 636 wa[(j-1)+(i-1)*(ip-1)] = (*roots)[rfct*j*l1*i]; 637 } 638 639 virtual size_t bufsize() const { return 0; } 640 virtual bool needs_copy() const { return true; } 641 642 POCKETFFT_EXEC_DISPATCH 643 }; 644 645 template <typename Tfs> class cfftp7: public cfftpass<Tfs> 646 { 647 private: 648 using typename cfftpass<Tfs>::Tcs; 649 650 size_t l1, ido; 651 static constexpr size_t ip=7; 652 quick_array<Tcs> wa; 653 654 auto WA(size_t x, size_t i) const 655 { return wa[x+(i-1)*(ip-1)]; } 656 657 template<bool fwd, typename Tcd> Tcd *exec_ 658 (const Tcd * DUCC0_RESTRICT cc, Tcd * DUCC0_RESTRICT ch, Tcd * /*buf*/, 659 size_t /*nthreads*/) const 660 { 661 constexpr Tfs tw1r= Tfs(0.6234898018587335305250048840042398L), 662 tw1i= (fwd ? -1 : 1) * Tfs(0.7818314824680298087084445266740578L), 663 tw2r= Tfs(-0.2225209339563144042889025644967948L), 664 tw2i= (fwd ? -1 : 1) * Tfs(0.9749279121818236070181316829939312L), 665 tw3r= Tfs(-0.9009688679024191262361023195074451L), 666 tw3i= (fwd ? -1 : 1) * Tfs(0.433883739117558120475768332848359L); 667 668 auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tcd& 669 { return ch[a+ido*(b+l1*c)]; }; 670 auto CC = [cc,this](size_t a, size_t b, size_t c) -> const Tcd& 671 { return cc[a+ido*(b+ip*c)]; }; 672 673 #define POCKETFFT_PREP7(idx) \ 674 Tcd t1 = CC(idx,0,k), t2, t3, t4, t5, t6, t7; \ 675 PM (t2,t7,CC(idx,1,k),CC(idx,6,k)); \ 676 PM (t3,t6,CC(idx,2,k),CC(idx,5,k)); \ 677 PM (t4,t5,CC(idx,3,k),CC(idx,4,k)); \ 678 CH(idx,k,0).r=t1.r+t2.r+t3.r+t4.r; \ 679 CH(idx,k,0).i=t1.i+t2.i+t3.i+t4.i; 680 681 #define POCKETFFT_PARTSTEP7a0(u1,u2,x1,x2,x3,y1,y2,y3,out1,out2) \ 682 { \ 683 Tcd ca,cb; \ 684 ca.r=t1.r+x1*t2.r+x2*t3.r+x3*t4.r; \ 685 ca.i=t1.i+x1*t2.i+x2*t3.i+x3*t4.i; \ 686 cb.i=y1*t7.r y2*t6.r y3*t5.r; \ 687 cb.r=-(y1*t7.i y2*t6.i y3*t5.i); \ 688 PM(out1,out2,ca,cb); \ 689 } 690 #define POCKETFFT_PARTSTEP7a(u1,u2,x1,x2,x3,y1,y2,y3) \ 691 POCKETFFT_PARTSTEP7a0(u1,u2,x1,x2,x3,y1,y2,y3,CH(0,k,u1),CH(0,k,u2)) 692 #define POCKETFFT_PARTSTEP7(u1,u2,x1,x2,x3,y1,y2,y3) \ 693 { \ 694 Tcd da,db; \ 695 POCKETFFT_PARTSTEP7a0(u1,u2,x1,x2,x3,y1,y2,y3,da,db) \ 696 special_mul<fwd>(da,WA(u1-1,i),CH(i,k,u1)); \ 697 special_mul<fwd>(db,WA(u2-1,i),CH(i,k,u2)); \ 698 } 699 700 if (ido==1) 701 for (size_t k=0; k<l1; ++k) 702 { 703 POCKETFFT_PREP7(0) 704 POCKETFFT_PARTSTEP7a(1,6,tw1r,tw2r,tw3r,+tw1i,+tw2i,+tw3i) 705 POCKETFFT_PARTSTEP7a(2,5,tw2r,tw3r,tw1r,+tw2i,-tw3i,-tw1i) 706 POCKETFFT_PARTSTEP7a(3,4,tw3r,tw1r,tw2r,+tw3i,-tw1i,+tw2i) 707 } 708 else 709 for (size_t k=0; k<l1; ++k) 710 { 711 { 712 POCKETFFT_PREP7(0) 713 POCKETFFT_PARTSTEP7a(1,6,tw1r,tw2r,tw3r,+tw1i,+tw2i,+tw3i) 714 POCKETFFT_PARTSTEP7a(2,5,tw2r,tw3r,tw1r,+tw2i,-tw3i,-tw1i) 715 POCKETFFT_PARTSTEP7a(3,4,tw3r,tw1r,tw2r,+tw3i,-tw1i,+tw2i) 716 } 717 for (size_t i=1; i<ido; ++i) 718 { 719 POCKETFFT_PREP7(i) 720 POCKETFFT_PARTSTEP7(1,6,tw1r,tw2r,tw3r,+tw1i,+tw2i,+tw3i) 721 POCKETFFT_PARTSTEP7(2,5,tw2r,tw3r,tw1r,+tw2i,-tw3i,-tw1i) 722 POCKETFFT_PARTSTEP7(3,4,tw3r,tw1r,tw2r,+tw3i,-tw1i,+tw2i) 723 } 724 } 725 726 #undef POCKETFFT_PARTSTEP7 727 #undef POCKETFFT_PARTSTEP7a0 728 #undef POCKETFFT_PARTSTEP7a 729 #undef POCKETFFT_PREP7 730 731 return ch; 732 } 733 734 public: 735 cfftp7(size_t l1_, size_t ido_, const Troots<Tfs> &roots) 736 : l1(l1_), ido(ido_), wa((ip-1)*(ido-1)) 737 { 738 size_t N=ip*l1*ido; 739 auto rfct = roots->size()/N; 740 MR_assert(roots->size()==N*rfct, "mismatch"); 741 for (size_t i=1; i<ido; ++i) 742 for (size_t j=1; j<ip; ++j) 743 wa[(j-1)+(i-1)*(ip-1)] = (*roots)[rfct*j*l1*i]; 744 } 745 746 virtual size_t bufsize() const { return 0; } 747 virtual bool needs_copy() const { return true; } 748 749 POCKETFFT_EXEC_DISPATCH 750 }; 751 752 template <typename Tfs> class cfftp11: public cfftpass<Tfs> 753 { 754 private: 755 using typename cfftpass<Tfs>::Tcs; 756 757 size_t l1, ido; 758 static constexpr size_t ip=11; 759 quick_array<Tcs> wa; 760 761 auto WA(size_t x, size_t i) const 762 { return wa[x+(i-1)*(ip-1)]; } 763 764 template<bool fwd, typename Tcd> [[gnu::hot]] Tcd *exec_ 765 (const Tcd * DUCC0_RESTRICT cc, Tcd * DUCC0_RESTRICT ch, Tcd * /*buf*/, 766 size_t /*nthreads*/) const 767 { 768 constexpr Tfs tw1r= Tfs(0.8412535328311811688618116489193677L), 769 tw1i= (fwd ? -1 : 1) * Tfs(0.5406408174555975821076359543186917L), 770 tw2r= Tfs(0.4154150130018864255292741492296232L), 771 tw2i= (fwd ? -1 : 1) * Tfs(0.9096319953545183714117153830790285L), 772 tw3r= Tfs(-0.1423148382732851404437926686163697L), 773 tw3i= (fwd ? -1 : 1) * Tfs(0.9898214418809327323760920377767188L), 774 tw4r= Tfs(-0.6548607339452850640569250724662936L), 775 tw4i= (fwd ? -1 : 1) * Tfs(0.7557495743542582837740358439723444L), 776 tw5r= Tfs(-0.9594929736144973898903680570663277L), 777 tw5i= (fwd ? -1 : 1) * Tfs(0.2817325568414296977114179153466169L); 778 779 auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tcd& 780 { return ch[a+ido*(b+l1*c)]; }; 781 auto CC = [cc,this](size_t a, size_t b, size_t c) -> const Tcd& 782 { return cc[a+ido*(b+ip*c)]; }; 783 784 #define POCKETFFT_PREP11(idx) \ 785 Tcd t1 = CC(idx,0,k), t2, t3, t4, t5, t6, t7, t8, t9, t10, t11; \ 786 PM (t2,t11,CC(idx,1,k),CC(idx,10,k)); \ 787 PM (t3,t10,CC(idx,2,k),CC(idx, 9,k)); \ 788 PM (t4,t9 ,CC(idx,3,k),CC(idx, 8,k)); \ 789 PM (t5,t8 ,CC(idx,4,k),CC(idx, 7,k)); \ 790 PM (t6,t7 ,CC(idx,5,k),CC(idx, 6,k)); \ 791 CH(idx,k,0).r=t1.r+t2.r+t3.r+t4.r+t5.r+t6.r; \ 792 CH(idx,k,0).i=t1.i+t2.i+t3.i+t4.i+t5.i+t6.i; 793 794 #define POCKETFFT_PARTSTEP11a0(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5,out1,out2) \ 795 { \ 796 Tcd ca = t1 + t2*x1 + t3*x2 + t4*x3 + t5*x4 +t6*x5, \ 797 cb; \ 798 cb.i=y1*t11.r y2*t10.r y3*t9.r y4*t8.r y5*t7.r; \ 799 cb.r=-(y1*t11.i y2*t10.i y3*t9.i y4*t8.i y5*t7.i ); \ 800 PM(out1,out2,ca,cb); \ 801 } 802 #define POCKETFFT_PARTSTEP11a(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5) \ 803 POCKETFFT_PARTSTEP11a0(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5,CH(0,k,u1),CH(0,k,u2)) 804 #define POCKETFFT_PARTSTEP11(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5) \ 805 { \ 806 Tcd da,db; \ 807 POCKETFFT_PARTSTEP11a0(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5,da,db) \ 808 special_mul<fwd>(da,WA(u1-1,i),CH(i,k,u1)); \ 809 special_mul<fwd>(db,WA(u2-1,i),CH(i,k,u2)); \ 810 } 811 812 if (ido==1) 813 for (size_t k=0; k<l1; ++k) 814 { 815 POCKETFFT_PREP11(0) 816 POCKETFFT_PARTSTEP11a(1,10,tw1r,tw2r,tw3r,tw4r,tw5r,+tw1i,+tw2i,+tw3i,+tw4i,+tw5i) 817 POCKETFFT_PARTSTEP11a(2, 9,tw2r,tw4r,tw5r,tw3r,tw1r,+tw2i,+tw4i,-tw5i,-tw3i,-tw1i) 818 POCKETFFT_PARTSTEP11a(3, 8,tw3r,tw5r,tw2r,tw1r,tw4r,+tw3i,-tw5i,-tw2i,+tw1i,+tw4i) 819 POCKETFFT_PARTSTEP11a(4, 7,tw4r,tw3r,tw1r,tw5r,tw2r,+tw4i,-tw3i,+tw1i,+tw5i,-tw2i) 820 POCKETFFT_PARTSTEP11a(5, 6,tw5r,tw1r,tw4r,tw2r,tw3r,+tw5i,-tw1i,+tw4i,-tw2i,+tw3i) 821 } 822 else 823 for (size_t k=0; k<l1; ++k) 824 { 825 { 826 POCKETFFT_PREP11(0) 827 POCKETFFT_PARTSTEP11a(1,10,tw1r,tw2r,tw3r,tw4r,tw5r,+tw1i,+tw2i,+tw3i,+tw4i,+tw5i) 828 POCKETFFT_PARTSTEP11a(2, 9,tw2r,tw4r,tw5r,tw3r,tw1r,+tw2i,+tw4i,-tw5i,-tw3i,-tw1i) 829 POCKETFFT_PARTSTEP11a(3, 8,tw3r,tw5r,tw2r,tw1r,tw4r,+tw3i,-tw5i,-tw2i,+tw1i,+tw4i) 830 POCKETFFT_PARTSTEP11a(4, 7,tw4r,tw3r,tw1r,tw5r,tw2r,+tw4i,-tw3i,+tw1i,+tw5i,-tw2i) 831 POCKETFFT_PARTSTEP11a(5, 6,tw5r,tw1r,tw4r,tw2r,tw3r,+tw5i,-tw1i,+tw4i,-tw2i,+tw3i) 832 } 833 for (size_t i=1; i<ido; ++i) 834 { 835 POCKETFFT_PREP11(i) 836 POCKETFFT_PARTSTEP11(1,10,tw1r,tw2r,tw3r,tw4r,tw5r,+tw1i,+tw2i,+tw3i,+tw4i,+tw5i) 837 POCKETFFT_PARTSTEP11(2, 9,tw2r,tw4r,tw5r,tw3r,tw1r,+tw2i,+tw4i,-tw5i,-tw3i,-tw1i) 838 POCKETFFT_PARTSTEP11(3, 8,tw3r,tw5r,tw2r,tw1r,tw4r,+tw3i,-tw5i,-tw2i,+tw1i,+tw4i) 839 POCKETFFT_PARTSTEP11(4, 7,tw4r,tw3r,tw1r,tw5r,tw2r,+tw4i,-tw3i,+tw1i,+tw5i,-tw2i) 840 POCKETFFT_PARTSTEP11(5, 6,tw5r,tw1r,tw4r,tw2r,tw3r,+tw5i,-tw1i,+tw4i,-tw2i,+tw3i) 841 } 842 } 843 844 #undef POCKETFFT_PARTSTEP11 845 #undef POCKETFFT_PARTSTEP11a0 846 #undef POCKETFFT_PARTSTEP11a 847 #undef POCKETFFT_PREP11 848 return ch; 849 } 850 851 public: 852 cfftp11(size_t l1_, size_t ido_, const Troots<Tfs> &roots) 853 : l1(l1_), ido(ido_), wa((ip-1)*(ido-1)) 854 { 855 size_t N=ip*l1*ido; 856 auto rfct = roots->size()/N; 857 MR_assert(roots->size()==N*rfct, "mismatch"); 858 for (size_t i=1; i<ido; ++i) 859 for (size_t j=1; j<ip; ++j) 860 wa[(j-1)+(i-1)*(ip-1)] = (*roots)[rfct*j*l1*i]; 861 } 862 863 virtual size_t bufsize() const { return 0; } 864 virtual bool needs_copy() const { return true; } 865 866 POCKETFFT_EXEC_DISPATCH 867 }; 868 869 template <typename Tfs> class cfftpg: public cfftpass<Tfs> 870 { 871 private: 872 using typename cfftpass<Tfs>::Tcs; 873 874 size_t l1, ido; 875 size_t ip; 876 quick_array<Tcs> wa; 877 quick_array<Tcs> csarr; 878 879 auto WA(size_t x, size_t i) const 880 { return wa[i-1+x*(ido-1)]; } 881 882 template<bool fwd, typename Tcd> Tcd *exec_ 883 (Tcd * DUCC0_RESTRICT cc, Tcd * DUCC0_RESTRICT ch, Tcd * /*buf*/, size_t /*nthreads*/) const 884 { 885 size_t ipph = (ip+1)/2; 886 size_t idl1 = ido*l1; 887 888 auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tcd& 889 { return ch[a+ido*(b+l1*c)]; }; 890 auto CC = [cc,this](size_t a, size_t b, size_t c) -> const Tcd& 891 { return cc[a+ido*(b+ip*c)]; }; 892 auto CX = [cc,this](size_t a, size_t b, size_t c) -> Tcd& 893 { return cc[a+ido*(b+l1*c)]; }; 894 auto CX2 = [cc, idl1](size_t a, size_t b) -> Tcd& 895 { return cc[a+idl1*b]; }; 896 auto CH2 = [ch, idl1](size_t a, size_t b) -> const Tcd& 897 { return ch[a+idl1*b]; }; 898 899 for (size_t k=0; k<l1; ++k) 900 for (size_t i=0; i<ido; ++i) 901 CH(i,k,0) = CC(i,0,k); 902 for (size_t j=1, jc=ip-1; j<ipph; ++j, --jc) 903 for (size_t k=0; k<l1; ++k) 904 for (size_t i=0; i<ido; ++i) 905 PM(CH(i,k,j),CH(i,k,jc),CC(i,j,k),CC(i,jc,k)); 906 for (size_t k=0; k<l1; ++k) 907 for (size_t i=0; i<ido; ++i) 908 { 909 Tcd tmp = CH(i,k,0); 910 for (size_t j=1; j<ipph; ++j) 911 tmp+=CH(i,k,j); 912 CX(i,k,0) = tmp; 913 } 914 for (size_t l=1, lc=ip-1; l<ipph; ++l, --lc) 915 { 916 // j=0,1,2 917 { 918 auto wal = fwd ? csarr[ l].conj() : csarr[ l]; 919 auto wal2 = fwd ? csarr[2*l].conj() : csarr[2*l]; 920 for (size_t ik=0; ik<idl1; ++ik) 921 { 922 CX2(ik,l ).r = CH2(ik,0).r+wal.r*CH2(ik,1).r+wal2.r*CH2(ik,2).r; 923 CX2(ik,l ).i = CH2(ik,0).i+wal.r*CH2(ik,1).i+wal2.r*CH2(ik,2).i; 924 CX2(ik,lc).r =-wal.i*CH2(ik,ip-1).i-wal2.i*CH2(ik,ip-2).i; 925 CX2(ik,lc).i = wal.i*CH2(ik,ip-1).r+wal2.i*CH2(ik,ip-2).r; 926 } 927 } 928 929 size_t iwal=2*l; 930 size_t j=3, jc=ip-3; 931 for (; j<ipph-1; j+=2, jc-=2) 932 { 933 iwal+=l; if (iwal>ip) iwal-=ip; 934 Tcs xwal=fwd ? csarr[iwal].conj() : csarr[iwal]; 935 iwal+=l; if (iwal>ip) iwal-=ip; 936 Tcs xwal2=fwd ? csarr[iwal].conj() : csarr[iwal]; 937 for (size_t ik=0; ik<idl1; ++ik) 938 { 939 CX2(ik,l).r += CH2(ik,j).r*xwal.r+CH2(ik,j+1).r*xwal2.r; 940 CX2(ik,l).i += CH2(ik,j).i*xwal.r+CH2(ik,j+1).i*xwal2.r; 941 CX2(ik,lc).r -= CH2(ik,jc).i*xwal.i+CH2(ik,jc-1).i*xwal2.i; 942 CX2(ik,lc).i += CH2(ik,jc).r*xwal.i+CH2(ik,jc-1).r*xwal2.i; 943 } 944 } 945 for (; j<ipph; ++j, --jc) 946 { 947 iwal+=l; if (iwal>ip) iwal-=ip; 948 Tcs xwal=fwd ? csarr[iwal].conj() : csarr[iwal]; 949 for (size_t ik=0; ik<idl1; ++ik) 950 { 951 CX2(ik,l).r += CH2(ik,j).r*xwal.r; 952 CX2(ik,l).i += CH2(ik,j).i*xwal.r; 953 CX2(ik,lc).r -= CH2(ik,jc).i*xwal.i; 954 CX2(ik,lc).i += CH2(ik,jc).r*xwal.i; 955 } 956 } 957 } 958 959 // shuffling and twiddling 960 if (ido==1) 961 for (size_t j=1, jc=ip-1; j<ipph; ++j, --jc) 962 for (size_t ik=0; ik<idl1; ++ik) 963 { 964 Tcd t1=CX2(ik,j), t2=CX2(ik,jc); 965 PM(CX2(ik,j),CX2(ik,jc),t1,t2); 966 } 967 else 968 { 969 for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc) 970 for (size_t k=0; k<l1; ++k) 971 { 972 Tcd t1=CX(0,k,j), t2=CX(0,k,jc); 973 PM(CX(0,k,j),CX(0,k,jc),t1,t2); 974 for (size_t i=1; i<ido; ++i) 975 { 976 Tcd x1, x2; 977 PM(x1,x2,CX(i,k,j),CX(i,k,jc)); 978 size_t idij=(j-1)*(ido-1)+i-1; 979 special_mul<fwd>(x1,wa[idij],CX(i,k,j)); 980 idij=(jc-1)*(ido-1)+i-1; 981 special_mul<fwd>(x2,wa[idij],CX(i,k,jc)); 982 } 983 } 984 } 985 return cc; 986 } 987 988 public: 989 cfftpg(size_t l1_, size_t ido_, size_t ip_, const Troots<Tfs> &roots) 990 : l1(l1_), ido(ido_), ip(ip_), wa((ip-1)*(ido-1)), csarr(ip) 991 { 992 MR_assert((ip&1)&&(ip>=5), "need an odd number >=5"); 993 size_t N=ip*l1*ido; 994 auto rfct = roots->size()/N; 995 MR_assert(roots->size()==N*rfct, "mismatch"); 996 for (size_t j=1; j<ip; ++j) 997 for (size_t i=1; i<ido; ++i) 998 wa[(j-1)*(ido-1)+i-1] = (*roots)[rfct*j*l1*i]; 999 for (size_t i=0; i<ip; ++i) 1000 csarr[i] = (*roots)[rfct*ido*l1*i]; 1001 } 1002 1003 virtual size_t bufsize() const { return 0; } 1004 virtual bool needs_copy() const { return true; } 1005 1006 POCKETFFT_EXEC_DISPATCH 1007 }; 1008 1009 template <typename Tfs> class cfftpblue: public cfftpass<Tfs> 1010 { 1011 private: 1012 using typename cfftpass<Tfs>::Tcs; 1013 1014 const size_t l1, ido, ip; 1015 const size_t ip2; 1016 const Tcpass<Tfs> subplan; 1017 quick_array<Tcs> wa, bk, bkf; 1018 size_t bufsz; 1019 bool need_cpy; 1020 1021 auto WA(size_t x, size_t i) const 1022 { return wa[i-1+x*(ido-1)]; } 1023 1024 template<bool fwd, typename Tcd> Tcd *exec_ 1025 (Tcd * DUCC0_RESTRICT cc, Tcd * DUCC0_RESTRICT ch, 1026 Tcd * DUCC0_RESTRICT buf, size_t nthreads) const 1027 { 1028 static const auto ti=tidx<Tcd *>(); 1029 Tcd *akf = &buf[0]; 1030 Tcd *akf2 = &buf[ip2]; 1031 Tcd *subbuf = &buf[2*ip2]; 1032 1033 auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tcd& 1034 { return ch[a+ido*(b+l1*c)]; }; 1035 auto CC = [cc,this](size_t a, size_t b, size_t c) -> Tcd& 1036 { return cc[a+ido*(b+ip*c)]; }; 1037 1038 //FIXME: parallelize here? 1039 for (size_t k=0; k<l1; ++k) 1040 for (size_t i=0; i<ido; ++i) 1041 { 1042 /* initialize a_k and FFT it */ 1043 for (size_t m=0; m<ip; ++m) 1044 special_mul<fwd>(CC(i,m,k),bk[m],akf[m]); 1045 auto zero = akf[0]*Tfs(0); 1046 for (size_t m=ip; m<ip2; ++m) 1047 akf[m]=zero; 1048 1049 auto res = static_cast<Tcd *>(subplan->exec(ti,akf,akf2, 1050 subbuf, true, nthreads)); 1051 1052 /* do the convolution */ 1053 res[0] = res[0].template special_mul<!fwd>(bkf[0]); 1054 for (size_t m=1; m<(ip2+1)/2; ++m) 1055 { 1056 res[m] = res[m].template special_mul<!fwd>(bkf[m]); 1057 res[ip2-m] = res[ip2-m].template special_mul<!fwd>(bkf[m]); 1058 } 1059 if ((ip2&1)==0) 1060 res[ip2/2] = res[ip2/2].template special_mul<!fwd>(bkf[ip2/2]); 1061 1062 /* inverse FFT */ 1063 res = static_cast<Tcd *>(subplan->exec(ti, res, 1064 (res==akf) ? akf2 : akf, subbuf, false, nthreads)); 1065 1066 /* multiply by b_k and write to output buffer */ 1067 if (l1>1) 1068 { 1069 if (i==0) 1070 for (size_t m=0; m<ip; ++m) 1071 CH(0,k,m) = res[m].template special_mul<fwd>(bk[m]); 1072 else 1073 { 1074 CH(i,k,0) = res[0].template special_mul<fwd>(bk[0]); 1075 for (size_t m=1; m<ip; ++m) 1076 CH(i,k,m) = res[m].template special_mul<fwd>(bk[m]*WA(m-1,i)); 1077 } 1078 } 1079 else 1080 { 1081 if (i==0) 1082 for (size_t m=0; m<ip; ++m) 1083 CC(0,m,0) = res[m].template special_mul<fwd>(bk[m]); 1084 else 1085 { 1086 CC(i,0,0) = res[0].template special_mul<fwd>(bk[0]); 1087 for (size_t m=1; m<ip; ++m) 1088 CC(i,m,0) = res[m].template special_mul<fwd>(bk[m]*WA(m-1,i)); 1089 } 1090 } 1091 } 1092 1093 return (l1>1) ? ch : cc; 1094 } 1095 1096 public: 1097 cfftpblue(size_t l1_, size_t ido_, size_t ip_, const Troots<Tfs> &roots, 1098 bool vectorize=false) 1099 : l1(l1_), ido(ido_), ip(ip_), ip2(util1d::good_size_cmplx(ip*2-1)), 1100 subplan(cfftpass<Tfs>::make_pass(ip2, vectorize)), wa((ip-1)*(ido-1)), 1101 bk(ip), bkf(ip2/2+1) 1102 { 1103 size_t N=ip*l1*ido; 1104 auto rfct = roots->size()/N; 1105 MR_assert(roots->size()==N*rfct, "mismatch"); 1106 for (size_t j=1; j<ip; ++j) 1107 for (size_t i=1; i<ido; ++i) 1108 wa[(j-1)*(ido-1)+i-1] = (*roots)[rfct*j*l1*i]; 1109 1110 /* initialize b_k */ 1111 bk[0].Set(1, 0); 1112 size_t coeff=0; 1113 auto roots2 = ((roots->size()/(2*ip))*2*ip==roots->size()) ? 1114 roots : make_shared<const UnityRoots<Tfs,Tcs>>(2*ip); 1115 size_t rfct2 = roots2->size()/(2*ip); 1116 for (size_t m=1; m<ip; ++m) 1117 { 1118 coeff+=2*m-1; 1119 if (coeff>=2*ip) coeff-=2*ip; 1120 bk[m] = (*roots2)[coeff*rfct2]; 1121 } 1122 1123 /* initialize the zero-padded, Fourier transformed b_k. Add normalisation. */ 1124 quick_array<Tcs> tbkf(ip2), tbkf2(ip2); 1125 Tfs xn2 = Tfs(1)/Tfs(ip2); 1126 tbkf[0] = bk[0]*xn2; 1127 for (size_t m=1; m<ip; ++m) 1128 tbkf[m] = tbkf[ip2-m] = bk[m]*xn2; 1129 for (size_t m=ip;m<=(ip2-ip);++m) 1130 tbkf[m].Set(0.,0.); 1131 quick_array<Tcs> buf(subplan->bufsize()); 1132 static const auto tics=tidx<Tcs *>(); 1133 auto res = static_cast<Tcs *>(subplan->exec(tics, tbkf.data(), 1134 tbkf2.data(), buf.data(), true)); 1135 for (size_t i=0; i<ip2/2+1; ++i) 1136 bkf[i] = res[i]; 1137 1138 need_cpy = l1>1; 1139 bufsz = ip2*(1+subplan->needs_copy()) + subplan->bufsize(); 1140 } 1141 1142 virtual size_t bufsize() const { return bufsz; } 1143 virtual bool needs_copy() const { return need_cpy; } 1144 1145 POCKETFFT_EXEC_DISPATCH 1146 }; 1147 1148 template <typename Tfs> class cfft_multipass: public cfftpass<Tfs> 1149 { 1150 private: 1151 using typename cfftpass<Tfs>::Tcs; 1152 static constexpr size_t bunchsize=8; 1153 1154 const size_t l1, ido; 1155 size_t ip; 1156 vector<Tcpass<Tfs>> passes; 1157 size_t bufsz; 1158 bool need_cpy; 1159 size_t rfct; 1160 Troots<Tfs> myroots; 1161 1162 // FIXME split into sub-functions. This is too long! 1163 template<bool fwd, typename T> Cmplx<T> *exec_(Cmplx<T> *cc, Cmplx<T> *ch, 1164 Cmplx<T> *buf, size_t nthreads) const 1165 { 1166 using Tc = Cmplx<T>; 1167 if ((l1==1) && (ido==1)) // no chance at vectorizing 1168 { 1169 static const auto tic=tidx<Tc *>(); 1170 Tc *p1=cc, *p2=ch; 1171 for(const auto &pass: passes) 1172 { 1173 auto res = static_cast<Tc *>(pass->exec(tic, p1, p2, buf, 1174 fwd, nthreads)); 1175 if (res==p2) swap (p1,p2); 1176 } 1177 return p1; 1178 } 1179 else 1180 { 1181 if constexpr(is_same<T,Tfs>::value && fft1d_simd_exists<Tfs>) // we can vectorize! 1182 { 1183 using Tfv = fft1d_simd<Tfs>; 1184 using Tcv = Cmplx<Tfv>; 1185 constexpr size_t vlen = Tfv::size(); 1186 size_t nvtrans = (l1*ido + vlen-1)/vlen; 1187 static const auto ticv = tidx<Tcv *>(); 1188 1189 if (ido==1) 1190 { 1191 auto CH = [ch,this](size_t b, size_t c) -> Tc& 1192 { return ch[b+l1*c]; }; 1193 auto CC = [cc,this](size_t b, size_t c) -> Tc& 1194 { return cc[b+ip*c]; }; 1195 1196 execStatic(nvtrans, nthreads, 0, [&](auto &sched) 1197 { 1198 quick_array<Tcv> tbuf(2*ip+bufsize()); 1199 auto cc2 = &tbuf[0]; 1200 auto ch2 = &tbuf[ip]; 1201 auto buf2 = &tbuf[2*ip]; 1202 1203 while (auto rng=sched.getNext()) 1204 for(auto itrans=rng.lo; itrans<rng.hi; ++itrans) 1205 { 1206 for (size_t n=0; n<vlen; ++n) 1207 for (size_t m=0; m<ip; ++m) 1208 { 1209 size_t k = min(l1-1, itrans*vlen+n); 1210 cc2[m].r[n] = CC(m,k).r; 1211 cc2[m].i[n] = CC(m,k).i; 1212 } 1213 1214 Tcv *p1=cc2, *p2=ch2; 1215 for(const auto &pass: passes) 1216 { 1217 auto res = static_cast<Tcv *>(pass->exec(ticv, 1218 p1, p2, buf2, fwd)); 1219 if (res==p2) swap (p1,p2); 1220 } 1221 1222 for (size_t m=0; m<ip; ++m) 1223 for (size_t n=0; n<vlen; ++n) 1224 { 1225 auto k = min(l1-1, itrans*vlen+n); 1226 CH(k,m) = { p1[m].r[n], p1[m].i[n] }; 1227 } 1228 } 1229 }); 1230 return ch; 1231 } 1232 1233 if (l1==1) 1234 { 1235 auto CC = [cc,this](size_t a, size_t b) -> Tc& 1236 { return cc[a+ido*b]; }; 1237 1238 execStatic(nvtrans, nthreads, 0, [&](auto &sched) 1239 { 1240 quick_array<Tcv> tbuf(2*ip+bufsize()); 1241 auto cc2 = &tbuf[0]; 1242 auto ch2 = &tbuf[ip]; 1243 auto buf2 = &tbuf[2*ip]; 1244 1245 while (auto rng=sched.getNext()) 1246 for(auto itrans=rng.lo; itrans<rng.hi; ++itrans) 1247 { 1248 for (size_t m=0; m<ip; ++m) 1249 for (size_t n=0; n<vlen; ++n) 1250 { 1251 size_t i = min(ido-1, itrans*vlen+n); 1252 cc2[m].r[n] = CC(i,m).r; 1253 cc2[m].i[n] = CC(i,m).i; 1254 } 1255 1256 Tcv *p1=cc2, *p2=ch2; 1257 for(const auto &pass: passes) 1258 { 1259 auto res = static_cast<Tcv *>(pass->exec(ticv, 1260 p1, p2, buf2, fwd)); 1261 if (res==p2) swap (p1,p2); 1262 } 1263 1264 for (size_t m=0; m<ip; ++m) 1265 for (size_t n=0; n<vlen; ++n) 1266 { 1267 auto i = itrans*vlen+n; 1268 if (i >= ido) break; 1269 if (i==0) 1270 CC(0,m) = { p1[m].r[n], p1[m].i[n] }; 1271 else 1272 { 1273 if (m==0) 1274 CC(i,0) = { p1[0].r[n], p1[0].i[n] } ; 1275 else 1276 CC(i,m) = Tcs(p1[m].r[n],p1[m].i[n]).template special_mul<fwd>((*myroots)[rfct*m*i]); 1277 } 1278 } 1279 } 1280 }); 1281 return cc; 1282 } 1283 1284 MR_fail("must not get here"); 1285 #if 0 1286 //FIXME this code path is currently unused 1287 quick_array<Tcv> tbuf(2*ip+bufsize()); 1288 auto cc2 = &tbuf[0]; 1289 auto ch2 = &tbuf[ip]; 1290 auto buf2 = &tbuf[2*ip]; 1291 1292 auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tc& 1293 { return ch[a+ido*(b+l1*c)]; }; 1294 auto CC = [cc,this](size_t a, size_t b, size_t c) -> Tc& 1295 { return cc[a+ido*(b+ip*c)]; }; 1296 1297 //FIXME parallelize? 1298 for (size_t itrans=0; itrans<nvtrans; ++itrans) 1299 { 1300 array<size_t, vlen> ix, kx; 1301 size_t ixcur = (itrans*vlen)%ido; 1302 size_t kxcur = (itrans*vlen)/ido; 1303 for (size_t n=0; n<vlen; ++n) 1304 { 1305 ix[n] = ixcur; 1306 kx[n] = min(l1-1,kxcur); 1307 if (++ixcur==ido) 1308 { 1309 ixcur=0; 1310 ++kxcur; 1311 } 1312 } 1313 1314 for (size_t m=0; m<ip; ++m) 1315 for (size_t n=0; n<vlen; ++n) 1316 { 1317 cc2[m].r[n] = CC(ix[n],m,kx[n]).r; 1318 cc2[m].i[n] = CC(ix[n],m,kx[n]).i; 1319 } 1320 1321 Tcv *p1=cc2, *p2=ch2; 1322 for(const auto &pass: passes) 1323 { 1324 auto res = static_cast<Tcv *>(pass->exec(ticv, 1325 p1, p2, buf2, fwd)); 1326 if (res==p2) swap (p1,p2); 1327 } 1328 1329 for (size_t m=0; m<ip; ++m) 1330 for (size_t n=0; n<vlen; ++n) 1331 { 1332 auto i = ix[n]; 1333 auto k = kx[n]; 1334 if (itrans*vlen+n >= l1*ido) break; 1335 if (i==0) 1336 CH(0,k,m) = { p1[m].r[n], p1[m].i[n] }; 1337 else 1338 { 1339 if (m==0) 1340 CH(i,k,0) = { p1[0].r[n], p1[0].i[n] } ; 1341 else 1342 CH(i,k,m) = Tcs(p1[m].r[n],p1[m].i[n]).template special_mul<fwd>((*myroots)[rfct*l1*m*i]); 1343 } 1344 } 1345 } 1346 return ch; 1347 #endif 1348 } 1349 else 1350 { 1351 static const auto tic = tidx<Cmplx<T> *>(); 1352 if (ido==1) 1353 { 1354 // parallelize here! 1355 for (size_t n=0; n<l1; ++n) 1356 { 1357 Cmplx<T> *p1=&cc[n*ip], *p2=ch; 1358 Cmplx<T> *res = nullptr; 1359 for(const auto &pass: passes) 1360 { 1361 res = static_cast<Cmplx<T> *>(pass->exec(tic, 1362 p1, p2, buf, fwd)); 1363 if (res==p2) swap (p1,p2); 1364 } 1365 if (res != &cc[n*ip]) 1366 copy(res, res+ip, cc+n*ip); 1367 } 1368 // transpose 1369 size_t nbunch = (l1*ido + bunchsize-1)/bunchsize; 1370 // parallelize here! 1371 for (size_t ibunch=0; ibunch<nbunch; ++ibunch) 1372 { 1373 size_t ntrans = min(bunchsize, l1-ibunch*bunchsize); 1374 for (size_t m=0; m<ip; ++m) 1375 for (size_t n=0; n<ntrans; ++n) 1376 { 1377 size_t itrans = ibunch*bunchsize + n; 1378 ch[itrans+m*l1] = cc[m+itrans*ip]; 1379 } 1380 } 1381 return ch; 1382 } 1383 if (l1==1) 1384 { 1385 auto cc2 = &buf[0]; 1386 auto ch2 = &buf[bunchsize*ip]; 1387 auto buf2 = &buf[(bunchsize+1)*ip]; 1388 size_t nbunch = (ido + bunchsize-1)/bunchsize; 1389 1390 auto CC = [cc,this](size_t a, size_t b) -> Tc& 1391 { return cc[a+ido*b]; }; 1392 1393 // parallelize here! 1394 for (size_t ibunch=0; ibunch<nbunch; ++ibunch) 1395 { 1396 size_t ntrans = min(bunchsize, ido-ibunch*bunchsize); 1397 1398 for (size_t m=0; m<ip; ++m) 1399 for (size_t n=0; n<ntrans; ++n) 1400 cc2[m+n*ip] = CC(n+ibunch*bunchsize,m); 1401 1402 for (size_t n=0; n<ntrans; ++n) 1403 { 1404 auto i = n+ibunch*bunchsize; 1405 Cmplx<T> *p1=&cc2[n*ip], *p2=ch2; 1406 Cmplx<T> *res = nullptr; 1407 for(const auto &pass: passes) 1408 { 1409 res = static_cast<Cmplx<T> *>(pass->exec(tic, 1410 p1, p2, buf2, fwd)); 1411 if (res==p2) swap (p1,p2); 1412 } 1413 if (res==&cc2[n*ip]) // no copying necessary 1414 { 1415 if (i!=0) 1416 { 1417 for (size_t m=1; m<ip; ++m) 1418 cc2[n*ip+m] = cc2[n*ip+m].template special_mul<fwd>((*myroots)[rfct*m*i]); 1419 } 1420 } 1421 else 1422 { 1423 if (i==0) 1424 for (size_t m=0; m<ip; ++m) 1425 cc2[n*ip+m] = res[m]; 1426 else 1427 { 1428 cc2[n*ip] = res[0]; 1429 for (size_t m=1; m<ip; ++m) 1430 cc2[n*ip+m] = res[m].template special_mul<fwd>((*myroots)[rfct*m*i]); 1431 } 1432 } 1433 } 1434 for (size_t m=0; m<ip; ++m) 1435 for (size_t n=0; n<ntrans; ++n) 1436 CC(n+ibunch*bunchsize, m) = cc2[m+n*ip]; 1437 } 1438 return cc; 1439 } 1440 1441 MR_fail("must not get here"); 1442 #if 0 1443 //FIXME this code path is currently unused 1444 auto cc2 = &buf[0]; 1445 auto ch2 = &buf[bunchsize*ip]; 1446 auto buf2 = &buf[(bunchsize+1)*ip]; 1447 size_t nbunch = (l1*ido + bunchsize-1)/bunchsize; 1448 1449 auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tc& 1450 { return ch[a+ido*(b+l1*c)]; }; 1451 auto CC = [cc,this](size_t a, size_t b, size_t c) -> Tc& 1452 { return cc[a+ido*(b+ip*c)]; }; 1453 1454 // parallelize here! 1455 for (size_t ibunch=0; ibunch<nbunch; ++ibunch) 1456 { 1457 size_t ntrans = min(bunchsize, l1*ido-ibunch*bunchsize); 1458 array<size_t, bunchsize> ix, kx; 1459 size_t ixcur = (ibunch*bunchsize)%ido; 1460 size_t kxcur = (ibunch*bunchsize)/ido; 1461 for (size_t n=0; n<bunchsize; ++n) 1462 { 1463 ix[n] = ixcur; 1464 kx[n] = min(l1-1,kxcur); 1465 if (++ixcur==ido) 1466 { 1467 ixcur=0; 1468 ++kxcur; 1469 } 1470 } 1471 for (size_t m=0; m<ip; ++m) 1472 for (size_t n=0; n<ntrans; ++n) 1473 cc2[m+n*ip] = CC(ix[n],m,kx[n]); 1474 1475 for (size_t n=0; n<ntrans; ++n) 1476 { 1477 auto i = ix[n]; 1478 Cmplx<T> *p1=&cc2[n*ip], *p2=ch2; 1479 Cmplx<T> *res = nullptr; 1480 for(const auto &pass: passes) 1481 { 1482 res = static_cast<Cmplx<T> *>(pass->exec(tic, 1483 p1, p2, buf2, fwd)); 1484 if (res==p2) swap (p1,p2); 1485 } 1486 if (res==&cc2[n*ip]) // no copying necessary 1487 { 1488 if (i!=0) 1489 { 1490 for (size_t m=1; m<ip; ++m) 1491 cc2[n*ip+m] = cc2[n*ip+m].template special_mul<fwd>((*myroots)[rfct*l1*m*i]); 1492 } 1493 } 1494 else 1495 { 1496 if (i==0) 1497 for (size_t m=0; m<ip; ++m) 1498 cc2[n*ip+m] = res[m]; 1499 else 1500 { 1501 cc2[n*ip] = res[0]; 1502 for (size_t m=1; m<ip; ++m) 1503 cc2[n*ip+m] = res[m].template special_mul<fwd>((*myroots)[rfct*l1*m*i]); 1504 } 1505 } 1506 } 1507 for (size_t m=0; m<ip; ++m) 1508 for (size_t n=0; n<ntrans; ++n) 1509 CH(ix[n], kx[n], m) = cc2[m+n*ip]; 1510 } 1511 return ch; 1512 #endif 1513 } 1514 } 1515 } 1516 1517 public: 1518 cfft_multipass(size_t l1_, size_t ido_, size_t ip_, 1519 const Troots<Tfs> &roots, bool vectorize=false) 1520 : l1(l1_), ido(ido_), ip(ip_), bufsz(0), need_cpy(false), 1521 myroots(roots) 1522 { 1523 size_t N=ip*l1*ido; 1524 rfct = roots->size()/N; 1525 MR_assert(roots->size()==N*rfct, "mismatch"); 1526 1527 // FIXME TBD 1528 // do we need the vectorize flag at all? 1529 size_t lim = vectorize ? 10000 : 10000; 1530 if (ip<=lim) 1531 { 1532 auto factors = cfftpass<Tfs>::factorize(ip); 1533 size_t l1l=1; 1534 for (auto fct: factors) 1535 { 1536 passes.push_back(cfftpass<Tfs>::make_pass(l1l, ip/(fct*l1l), fct, roots, false)); 1537 l1l*=fct; 1538 } 1539 } 1540 else 1541 { 1542 vector<size_t> packets(2,1); 1543 auto factors = util1d::prime_factors(ip); 1544 sort(factors.begin(), factors.end(), std::greater<size_t>()); 1545 for (auto fct: factors) 1546 (packets[0]>packets[1]) ? packets[1]*=fct : packets[0]*=fct; 1547 size_t l1l=1; 1548 for (auto pkt: packets) 1549 { 1550 passes.push_back(cfftpass<Tfs>::make_pass(l1l, ip/(pkt*l1l), pkt, roots, false)); 1551 l1l*=pkt; 1552 } 1553 } 1554 for (const auto &pass: passes) 1555 { 1556 bufsz = max(bufsz, pass->bufsize()); 1557 need_cpy |= pass->needs_copy(); 1558 } 1559 if ((l1!=1)||(ido!=1)) 1560 { 1561 need_cpy=true; 1562 bufsz += (bunchsize+1)*ip; 1563 } 1564 } 1565 1566 virtual size_t bufsize() const { return bufsz; } 1567 virtual bool needs_copy() const { return need_cpy; } 1568 1569 POCKETFFT_EXEC_DISPATCH 1570 }; 1571 1572 #undef POCKETFFT_EXEC_DISPATCH 1573 1574 #if 0 // leaving in for potential future use; but doesn't seem beneficial 1575 template <size_t vlen, typename Tfs> class cfftp_vecpass: public cfftpass<Tfs> 1576 { 1577 private: 1578 static_assert(simd_exists<Tfs, vlen>, "bad vlen"); 1579 using typename cfftpass<Tfs>::Tcs; 1580 using Tfv=typename simd_select<Tfs, vlen>::type; 1581 using Tcv=Cmplx<Tfv>; 1582 1583 size_t ip; 1584 Tcpass<Tfs> spass; 1585 Tcpass<Tfs> vpass; 1586 size_t bufsz; 1587 1588 template<bool fwd> Tcs *exec_ (Tcs *cc, 1589 Tcs * /*ch*/, Tcs * /*buf*/, size_t nthreads) const 1590 { 1591 quick_array<Tcv> buf(2*ip+bufsz); 1592 auto * cc2 = buf.data(); 1593 auto * ch2 = buf.data()+ip; 1594 auto * buf2 = buf.data()+2*ip; 1595 static const auto tics = tidx<Tcs *>(); 1596 // run scalar pass 1597 auto res = static_cast<Tcs *>(spass->exec(tics, cc, 1598 reinterpret_cast<Tcs *>(ch2), reinterpret_cast<Tcs *>(buf2), 1599 fwd, nthreads)); 1600 // arrange input in SIMD-friendly way 1601 // FIXME: swap loops? 1602 for (size_t i=0; i<ip/vlen; ++i) 1603 for (size_t j=0; j<vlen; ++j) 1604 { 1605 size_t idx = j*(ip/vlen) + i; 1606 cc2[i].r[j] = res[idx].r; 1607 cc2[i].i[j] = res[idx].i; 1608 } 1609 // run vector pass 1610 static const auto ticv = tidx<Tcv *>(); 1611 auto res2 = static_cast<Tcv *>(vpass->exec(ticv, 1612 cc2, ch2, buf2, fwd, nthreads)); 1613 // de-SIMDify 1614 for (size_t i=0; i<ip/vlen; ++i) 1615 for (size_t j=0; j<vlen; ++j) 1616 cc[i*vlen+j] = Tcs(res2[i].r[j], res2[i].i[j]); 1617 1618 return cc; 1619 } 1620 1621 public: 1622 cfftp_vecpass(size_t ip_, const Troots<Tfs> &roots) 1623 : ip(ip_), spass(cfftpass<Tfs>::make_pass(1, ip/vlen, vlen, roots)), 1624 vpass(cfftpass<Tfs>::make_pass(1, 1, ip/vlen, roots)), bufsz(0) 1625 { 1626 MR_assert((ip/vlen)*vlen==ip, "cannot vectorize this size"); 1627 bufsz=2*ip+max(vpass->bufsize(),(spass->bufsize()+vlen-1)/vlen); 1628 } 1629 virtual size_t bufsize() const { return 0; } 1630 virtual bool needs_copy() const { return false; } 1631 virtual void *exec(const type_index &ti, void *in, void *copy, void *buf, 1632 bool fwd, size_t nthreads=1) const 1633 { 1634 static const auto tics = tidx<Tcs *>(); 1635 MR_assert(ti==tics, "bad input type"); 1636 auto in1 = static_cast<Tcs *>(in); 1637 auto copy1 = static_cast<Tcs *>(copy); 1638 auto buf1 = static_cast<Tcs *>(buf); 1639 return fwd ? exec_<true>(in1, copy1, buf1, nthreads) 1640 : exec_<false>(in1, copy1, buf1, nthreads); 1641 } 1642 }; 1643 #endif 1644 1645 template<typename Tfs> Tcpass<Tfs> cfftpass<Tfs>::make_pass(size_t l1, 1646 size_t ido, size_t ip, const Troots<Tfs> &roots, bool vectorize) 1647 { 1648 MR_assert(ip>=1, "no zero-sized FFTs"); 1649 #if 0 1650 if (vectorize && (ip>300) && (ip<32768) && (l1==1) && (ido==1)) 1651 { 1652 constexpr auto vlen = native_simd<Tfs>::size(); 1653 if constexpr(vlen>1) 1654 if ((ip&(vlen-1))==0) 1655 return make_shared<cfftp_vecpass<vlen,Tfs>>(ip, roots); 1656 } 1657 #endif 1658 1659 if (ip==1) return make_shared<cfftp1<Tfs>>(); 1660 auto factors=cfftpass<Tfs>::factorize(ip); 1661 if (factors.size()==1) 1662 { 1663 switch(ip) 1664 { 1665 case 2: 1666 return make_shared<cfftp2<Tfs>>(l1, ido, roots); 1667 case 3: 1668 return make_shared<cfftp3<Tfs>>(l1, ido, roots); 1669 case 4: 1670 return make_shared<cfftp4<Tfs>>(l1, ido, roots); 1671 case 5: 1672 return make_shared<cfftp5<Tfs>>(l1, ido, roots); 1673 case 7: 1674 return make_shared<cfftp7<Tfs>>(l1, ido, roots); 1675 case 11: 1676 return make_shared<cfftp11<Tfs>>(l1, ido, roots); 1677 default: 1678 if (ip<110) 1679 return make_shared<cfftpg<Tfs>>(l1, ido, ip, roots); 1680 else 1681 return make_shared<cfftpblue<Tfs>>(l1, ido, ip, roots, vectorize); 1682 } 1683 } 1684 else // more than one factor, need a multipass 1685 return make_shared<cfft_multipass<Tfs>>(l1, ido, ip, roots, vectorize); 1686 } 1687 1688 template<typename Tfs> class pocketfft_c 1689 { 1690 private: 1691 size_t N; 1692 size_t critbuf; 1693 Tcpass<Tfs> plan; 1694 1695 public: 1696 pocketfft_c(size_t n, bool vectorize=false) 1697 : N(n), critbuf(((N&1023)==0) ? 16 : 0), 1698 plan(cfftpass<Tfs>::make_pass(n,vectorize)) {} 1699 size_t length() const { return N; } 1700 size_t bufsize() const { return N*plan->needs_copy()+2*critbuf+plan->bufsize(); } 1701 template<typename Tfd> DUCC0_NOINLINE Cmplx<Tfd> *exec(Cmplx<Tfd> *in, Cmplx<Tfd> *buf, 1702 Tfs fct, bool fwd, size_t nthreads=1) const 1703 { 1704 static const auto tic = tidx<Cmplx<Tfd> *>(); 1705 auto res = static_cast<Cmplx<Tfd> *>(plan->exec(tic, 1706 in, buf+critbuf+plan->bufsize(), buf+critbuf, fwd, nthreads)); 1707 if (fct!=Tfs(1)) 1708 for (size_t i=0; i<N; ++i) res[i]*=fct; 1709 return res; 1710 } 1711 template<typename Tfd> DUCC0_NOINLINE void exec_copyback(Cmplx<Tfd> *in, Cmplx<Tfd> *buf, 1712 Tfs fct, bool fwd, size_t nthreads=1) const 1713 { 1714 static const auto tic = tidx<Cmplx<Tfd> *>(); 1715 auto res = static_cast<Cmplx<Tfd> *>(plan->exec(tic, 1716 in, buf, buf+N*plan->needs_copy(), fwd, nthreads)); 1717 if (res==in) 1718 { 1719 if (fct!=Tfs(1)) 1720 for (size_t i=0; i<N; ++i) in[i]*=fct; 1721 } 1722 else 1723 { 1724 if (fct!=Tfs(1)) 1725 for (size_t i=0; i<N; ++i) in[i]=res[i]*fct; 1726 else 1727 copy_n(res, N, in); 1728 } 1729 } 1730 template<typename Tfd> DUCC0_NOINLINE void exec(Cmplx<Tfd> *in, Tfs fct, bool fwd, size_t nthreads=1) const 1731 { 1732 quick_array<Cmplx<Tfd>> buf(N*plan->needs_copy()+plan->bufsize()); 1733 exec_copyback(in, buf.data(), fct, fwd, nthreads); 1734 } 1735 }; 1736 1737 template <typename Tfs> class rfftpass 1738 { 1739 public: 1740 virtual ~rfftpass(){} 1741 1742 // number of Tfd values required as scratch space during "exec" 1743 // will be provided in "buf" 1744 virtual size_t bufsize() const = 0; 1745 virtual bool needs_copy() const = 0; 1746 virtual void *exec(const type_index &ti, void *in, void *copy, void *buf, 1747 bool fwd, size_t nthreads=1) const = 0; 1748 1749 static vector<size_t> factorize(size_t N) 1750 { 1751 MR_assert(N>0, "need a positive number"); 1752 vector<size_t> factors; 1753 while ((N&3)==0) 1754 { factors.push_back(4); N>>=2; } 1755 if ((N&1)==0) 1756 { 1757 N>>=1; 1758 // factor 2 should be at the front of the factor list 1759 factors.push_back(2); 1760 swap(factors[0], factors.back()); 1761 } 1762 for (size_t divisor=3; divisor*divisor<=N; divisor+=2) 1763 while ((N%divisor)==0) 1764 { 1765 factors.push_back(divisor); 1766 N/=divisor; 1767 } 1768 if (N>1) factors.push_back(N); 1769 return factors; 1770 } 1771 1772 static shared_ptr<rfftpass> make_pass(size_t l1, size_t ido, size_t ip, 1773 const Troots<Tfs> &roots, bool vectorize=false); 1774 static shared_ptr<rfftpass> make_pass(size_t ip, bool vectorize=false) 1775 { 1776 return make_pass(1,1,ip,make_shared<UnityRoots<Tfs,Cmplx<Tfs>>>(ip), 1777 vectorize); 1778 } 1779 }; 1780 1781 #define POCKETFFT_EXEC_DISPATCH \ 1782 virtual void *exec(const type_index &ti, void *in, void *copy, void *buf, \ 1783 bool fwd, size_t nthreads) const \ 1784 { \ 1785 static const auto tifs=tidx<Tfs *>(); \ 1786 if (ti==tifs) \ 1787 { \ 1788 auto in1 = static_cast<Tfs *>(in); \ 1789 auto copy1 = static_cast<Tfs *>(copy); \ 1790 auto buf1 = static_cast<Tfs *>(buf); \ 1791 return fwd ? exec_<true>(in1, copy1, buf1, nthreads) \ 1792 : exec_<false>(in1, copy1, buf1, nthreads); \ 1793 } \ 1794 if constexpr (fft1d_simdlen<Tfs> > 1) \ 1795 if constexpr (simd_exists<Tfs, fft1d_simdlen<Tfs>>) \ 1796 { \ 1797 using Tfv = typename simd_select<Tfs, fft1d_simdlen<Tfs>>::type; \ 1798 static const auto tifv=tidx<Tfv *>(); \ 1799 if (ti==tifv) \ 1800 { \ 1801 auto in1 = static_cast<Tfv *>(in); \ 1802 auto copy1 = static_cast<Tfv *>(copy); \ 1803 auto buf1 = static_cast<Tfv *>(buf); \ 1804 return fwd ? exec_<true>(in1, copy1, buf1, nthreads) \ 1805 : exec_<false>(in1, copy1, buf1, nthreads); \ 1806 } \ 1807 } \ 1808 if constexpr (fft1d_simdlen<Tfs> > 2) \ 1809 if constexpr (simd_exists<Tfs, fft1d_simdlen<Tfs>/2>) \ 1810 { \ 1811 using Tfv = typename simd_select<Tfs, fft1d_simdlen<Tfs>/2>::type; \ 1812 static const auto tifv=tidx<Tfv *>(); \ 1813 if (ti==tifv) \ 1814 { \ 1815 auto in1 = static_cast<Tfv *>(in); \ 1816 auto copy1 = static_cast<Tfv *>(copy); \ 1817 auto buf1 = static_cast<Tfv *>(buf); \ 1818 return fwd ? exec_<true>(in1, copy1, buf1, nthreads) \ 1819 : exec_<false>(in1, copy1, buf1, nthreads); \ 1820 } \ 1821 } \ 1822 if constexpr (fft1d_simdlen<Tfs> > 4) \ 1823 if constexpr (simd_exists<Tfs, fft1d_simdlen<Tfs>/4>) \ 1824 { \ 1825 using Tfv = typename simd_select<Tfs, fft1d_simdlen<Tfs>/4>::type; \ 1826 static const auto tifv=tidx<Tfv *>(); \ 1827 if (ti==tifv) \ 1828 { \ 1829 auto in1 = static_cast<Tfv *>(in); \ 1830 auto copy1 = static_cast<Tfv *>(copy); \ 1831 auto buf1 = static_cast<Tfv *>(buf); \ 1832 return fwd ? exec_<true>(in1, copy1, buf1, nthreads) \ 1833 : exec_<false>(in1, copy1, buf1, nthreads); \ 1834 } \ 1835 } \ 1836 if constexpr (fft1d_simdlen<Tfs> > 8) \ 1837 if constexpr (simd_exists<Tfs, fft1d_simdlen<Tfs>/8>) \ 1838 { \ 1839 using Tfv = typename simd_select<Tfs, fft1d_simdlen<Tfs>/8>::type; \ 1840 static const auto tifv=tidx<Tfv *>(); \ 1841 if (ti==tifv) \ 1842 { \ 1843 auto in1 = static_cast<Tfv *>(in); \ 1844 auto copy1 = static_cast<Tfv *>(copy); \ 1845 auto buf1 = static_cast<Tfv *>(buf); \ 1846 return fwd ? exec_<true>(in1, copy1, buf1, nthreads) \ 1847 : exec_<false>(in1, copy1, buf1, nthreads); \ 1848 } \ 1849 } \ 1850 MR_fail("impossible vector length requested"); \ 1851 } 1852 1853 template<typename T> using Trpass = shared_ptr<rfftpass<T>>; 1854 1855 /* (a+ib) = conj(c+id) * (e+if) */ 1856 template<typename T1, typename T2, typename T3> inline void MULPM 1857 (T1 &a, T1 &b, T2 c, T2 d, T3 e, T3 f) 1858 { a=c*e+d*f; b=c*f-d*e; } 1859 1860 template <typename Tfs> class rfftp1: public rfftpass<Tfs> 1861 { 1862 public: 1863 rfftp1() {} 1864 virtual size_t bufsize() const { return 0; } 1865 virtual bool needs_copy() const { return false; } 1866 1867 virtual void *exec(const type_index & /*ti*/, void * in, void * /*copy*/, 1868 void * /*buf*/, bool /*fwd*/, size_t /*nthreads*/) const 1869 { return in; } 1870 }; 1871 1872 template <typename Tfs> class rfftp2: public rfftpass<Tfs> 1873 { 1874 private: 1875 size_t l1, ido; 1876 static constexpr size_t ip=2; 1877 quick_array<Tfs> wa; 1878 1879 auto WA(size_t x, size_t i) const 1880 { return wa[i+x*(ido-1)]; } 1881 1882 template<bool fwd, typename Tfd> Tfd *exec_ (Tfd * DUCC0_RESTRICT cc, 1883 Tfd * DUCC0_RESTRICT ch, Tfd * /*buf*/, size_t /*nthreads*/) const 1884 { 1885 if constexpr(fwd) 1886 { 1887 auto CC = [cc,this](size_t a, size_t b, size_t c) -> const Tfd& 1888 { return cc[a+ido*(b+l1*c)]; }; 1889 auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tfd& 1890 { return ch[a+ido*(b+ip*c)]; }; 1891 for (size_t k=0; k<l1; k++) 1892 PM (CH(0,0,k),CH(ido-1,1,k),CC(0,k,0),CC(0,k,1)); 1893 if ((ido&1)==0) 1894 for (size_t k=0; k<l1; k++) 1895 { 1896 CH( 0,1,k) = -CC(ido-1,k,1); 1897 CH(ido-1,0,k) = CC(ido-1,k,0); 1898 } 1899 if (ido<=2) return ch; 1900 for (size_t k=0; k<l1; k++) 1901 for (size_t i=2; i<ido; i+=2) 1902 { 1903 size_t ic=ido-i; 1904 Tfd tr2, ti2; 1905 MULPM (tr2,ti2,WA(0,i-2),WA(0,i-1),CC(i-1,k,1),CC(i,k,1)); 1906 PM (CH(i-1,0,k),CH(ic-1,1,k),CC(i-1,k,0),tr2); 1907 PM (CH(i ,0,k),CH(ic ,1,k),ti2,CC(i ,k,0)); 1908 } 1909 } 1910 else 1911 { 1912 auto CC = [cc,this](size_t a, size_t b, size_t c) -> const Tfd& 1913 { return cc[a+ido*(b+ip*c)]; }; 1914 auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tfd& 1915 { return ch[a+ido*(b+l1*c)]; }; 1916 1917 for (size_t k=0; k<l1; k++) 1918 PM (CH(0,k,0),CH(0,k,1),CC(0,0,k),CC(ido-1,1,k)); 1919 if ((ido&1)==0) 1920 for (size_t k=0; k<l1; k++) 1921 { 1922 CH(ido-1,k,0) = Tfs( 2)*CC(ido-1,0,k); 1923 CH(ido-1,k,1) = Tfs(-2)*CC(0 ,1,k); 1924 } 1925 if (ido<=2) return ch; 1926 for (size_t k=0; k<l1;++k) 1927 for (size_t i=2; i<ido; i+=2) 1928 { 1929 size_t ic=ido-i; 1930 Tfd ti2, tr2; 1931 PM (CH(i-1,k,0),tr2,CC(i-1,0,k),CC(ic-1,1,k)); 1932 PM (ti2,CH(i ,k,0),CC(i ,0,k),CC(ic ,1,k)); 1933 MULPM (CH(i,k,1),CH(i-1,k,1),WA(0,i-2),WA(0,i-1),ti2,tr2); 1934 } 1935 } 1936 return ch; 1937 } 1938 1939 public: 1940 rfftp2(size_t l1_, size_t ido_, const Troots<Tfs> &roots) 1941 : l1(l1_), ido(ido_), wa((ip-1)*(ido-1)) 1942 { 1943 size_t N=ip*l1*ido; 1944 size_t rfct = roots->size()/N; 1945 MR_assert(roots->size()==N*rfct, "mismatch"); 1946 for (size_t j=1; j<ip; ++j) 1947 for (size_t i=1; i<=(ido-1)/2; ++i) 1948 { 1949 auto val = (*roots)[rfct*j*l1*i]; 1950 wa[(j-1)*(ido-1)+2*i-2] = val.r; 1951 wa[(j-1)*(ido-1)+2*i-1] = val.i; 1952 } 1953 } 1954 1955 virtual size_t bufsize() const { return 0; } 1956 virtual bool needs_copy() const { return true; } 1957 1958 POCKETFFT_EXEC_DISPATCH 1959 }; 1960 // a2=a+b; b2=i*(b-a); 1961 #define POCKETFFT_REARRANGE(rx, ix, ry, iy) \ 1962 {\ 1963 auto t1=rx+ry, t2=ry-rx, t3=ix+iy, t4=ix-iy; \ 1964 rx=t1; ix=t3; ry=t4; iy=t2; \ 1965 } 1966 1967 template <typename Tfs> class rfftp3: public rfftpass<Tfs> 1968 { 1969 private: 1970 size_t l1, ido; 1971 static constexpr size_t ip=3; 1972 quick_array<Tfs> wa; 1973 1974 auto WA(size_t x, size_t i) const 1975 { return wa[i+x*(ido-1)]; } 1976 1977 template<bool fwd, typename Tfd> Tfd *exec_ (Tfd * DUCC0_RESTRICT cc, 1978 Tfd * DUCC0_RESTRICT ch, Tfd * /*buf*/, size_t /*nthreads*/) const 1979 { 1980 constexpr Tfs taur=Tfs(-0.5), 1981 taui=Tfs(0.8660254037844386467637231707529362L); 1982 if constexpr(fwd) 1983 { 1984 auto CC = [cc,this](size_t a, size_t b, size_t c) -> const Tfd& 1985 { return cc[a+ido*(b+l1*c)]; }; 1986 auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tfd& 1987 { return ch[a+ido*(b+ip*c)]; }; 1988 for (size_t k=0; k<l1; k++) 1989 { 1990 Tfd cr2=CC(0,k,1)+CC(0,k,2); 1991 CH(0,0,k) = CC(0,k,0)+cr2; 1992 CH(0,2,k) = taui*(CC(0,k,2)-CC(0,k,1)); 1993 CH(ido-1,1,k) = CC(0,k,0)+taur*cr2; 1994 } 1995 if (ido==1) return ch; 1996 for (size_t k=0; k<l1; k++) 1997 for (size_t i=2; i<ido; i+=2) 1998 { 1999 size_t ic=ido-i; 2000 Tfd di2, di3, dr2, dr3; 2001 MULPM (dr2,di2,WA(0,i-2),WA(0,i-1),CC(i-1,k,1),CC(i,k,1)); // d2=conj(WA0)*CC1 2002 MULPM (dr3,di3,WA(1,i-2),WA(1,i-1),CC(i-1,k,2),CC(i,k,2)); // d3=conj(WA1)*CC2 2003 POCKETFFT_REARRANGE(dr2, di2, dr3, di3); 2004 CH(i-1,0,k) = CC(i-1,k,0)+dr2; // c add 2005 CH(i ,0,k) = CC(i ,k,0)+di2; 2006 Tfd tr2 = CC(i-1,k,0)+taur*dr2; // c add 2007 Tfd ti2 = CC(i ,k,0)+taur*di2; 2008 Tfd tr3 = taui*dr3; // t3 = taui*i*(d3-d2)? 2009 Tfd ti3 = taui*di3; 2010 PM(CH(i-1,2,k),CH(ic-1,1,k),tr2,tr3); // PM(i) = t2+t3 2011 PM(CH(i ,2,k),CH(ic ,1,k),ti3,ti2); // PM(ic) = conj(t2-t3) 2012 } 2013 } 2014 else 2015 { 2016 auto CC = [cc,this](size_t a, size_t b, size_t c) -> const Tfd& 2017 { return cc[a+ido*(b+ip*c)]; }; 2018 auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tfd& 2019 { return ch[a+ido*(b+l1*c)]; }; 2020 2021 for (size_t k=0; k<l1; k++) 2022 { 2023 Tfd tr2=Tfs(2)*CC(ido-1,1,k); 2024 Tfd cr2=CC(0,0,k)+taur*tr2; 2025 CH(0,k,0)=CC(0,0,k)+tr2; 2026 Tfd ci3=Tfs(2)*taui*CC(0,2,k); 2027 PM (CH(0,k,2),CH(0,k,1),cr2,ci3); 2028 } 2029 if (ido==1) return ch; 2030 for (size_t k=0; k<l1; k++) 2031 for (size_t i=2, ic=ido-2; i<ido; i+=2, ic-=2) 2032 { 2033 Tfd tr2=CC(i-1,2,k)+CC(ic-1,1,k); // t2=CC(I) + conj(CC(ic)) 2034 Tfd ti2=CC(i ,2,k)-CC(ic ,1,k); 2035 Tfd cr2=CC(i-1,0,k)+taur*tr2; // c2=CC +taur*t2 2036 Tfd ci2=CC(i ,0,k)+taur*ti2; 2037 CH(i-1,k,0)=CC(i-1,0,k)+tr2; // CH=CC+t2 2038 CH(i ,k,0)=CC(i ,0,k)+ti2; 2039 Tfd cr3=taui*(CC(i-1,2,k)-CC(ic-1,1,k));// c3=taui*(CC(i)-conj(CC(ic))) 2040 Tfd ci3=taui*(CC(i ,2,k)+CC(ic ,1,k)); 2041 Tfd di2, di3, dr2, dr3; 2042 PM(dr3,dr2,cr2,ci3); // d2= (cr2-ci3, ci2+cr3) = c2+i*c3 2043 PM(di2,di3,ci2,cr3); // d3= (cr2+ci3, ci2-cr3) = c2-i*c3 2044 MULPM(CH(i,k,1),CH(i-1,k,1),WA(0,i-2),WA(0,i-1),di2,dr2); // ch = WA*d2 2045 MULPM(CH(i,k,2),CH(i-1,k,2),WA(1,i-2),WA(1,i-1),di3,dr3); 2046 } 2047 } 2048 return ch; 2049 } 2050 2051 public: 2052 rfftp3(size_t l1_, size_t ido_, const Troots<Tfs> &roots) 2053 : l1(l1_), ido(ido_), wa((ip-1)*(ido-1)) 2054 { 2055 MR_assert(ido&1, "ido must be odd"); 2056 size_t N=ip*l1*ido; 2057 size_t rfct = roots->size()/N; 2058 MR_assert(roots->size()==N*rfct, "mismatch"); 2059 for (size_t j=1; j<ip; ++j) 2060 for (size_t i=1; i<=(ido-1)/2; ++i) 2061 { 2062 auto val = (*roots)[rfct*j*l1*i]; 2063 wa[(j-1)*(ido-1)+2*i-2] = val.r; 2064 wa[(j-1)*(ido-1)+2*i-1] = val.i; 2065 } 2066 } 2067 2068 virtual size_t bufsize() const { return 0; } 2069 virtual bool needs_copy() const { return true; } 2070 2071 POCKETFFT_EXEC_DISPATCH 2072 }; 2073 2074 template <typename Tfs> class rfftp4: public rfftpass<Tfs> 2075 { 2076 private: 2077 size_t l1, ido; 2078 static constexpr size_t ip=4; 2079 quick_array<Tfs> wa; 2080 2081 auto WA(size_t x, size_t i) const 2082 { return wa[i+x*(ido-1)]; } 2083 2084 template<bool fwd, typename Tfd> Tfd *exec_ (Tfd * DUCC0_RESTRICT cc, 2085 Tfd * DUCC0_RESTRICT ch, Tfd * /*buf*/, size_t /*nthreads*/) const 2086 { 2087 if constexpr(fwd) 2088 { 2089 constexpr Tfs hsqt2=Tfs(0.707106781186547524400844362104849L); 2090 auto CC = [cc,this](size_t a, size_t b, size_t c) -> const Tfd& 2091 { return cc[a+ido*(b+l1*c)]; }; 2092 auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tfd& 2093 { return ch[a+ido*(b+ip*c)]; }; 2094 2095 for (size_t k=0; k<l1; k++) 2096 { 2097 Tfd tr1,tr2; 2098 PM (tr1,CH(0,2,k),CC(0,k,3),CC(0,k,1)); 2099 PM (tr2,CH(ido-1,1,k),CC(0,k,0),CC(0,k,2)); 2100 PM (CH(0,0,k),CH(ido-1,3,k),tr2,tr1); 2101 } 2102 if ((ido&1)==0) 2103 for (size_t k=0; k<l1; k++) 2104 { 2105 Tfd ti1=-hsqt2*(CC(ido-1,k,1)+CC(ido-1,k,3)); 2106 Tfd tr1= hsqt2*(CC(ido-1,k,1)-CC(ido-1,k,3)); 2107 PM (CH(ido-1,0,k),CH(ido-1,2,k),CC(ido-1,k,0),tr1); 2108 PM (CH( 0,3,k),CH( 0,1,k),ti1,CC(ido-1,k,2)); 2109 } 2110 if (ido<=2) return ch; 2111 for (size_t k=0; k<l1; k++) 2112 for (size_t i=2; i<ido; i+=2) 2113 { 2114 size_t ic=ido-i; 2115 Tfd ci2, ci3, ci4, cr2, cr3, cr4, ti1, ti2, ti3, ti4, tr1, tr2, tr3, tr4; 2116 MULPM(cr2,ci2,WA(0,i-2),WA(0,i-1),CC(i-1,k,1),CC(i,k,1)); 2117 MULPM(cr3,ci3,WA(1,i-2),WA(1,i-1),CC(i-1,k,2),CC(i,k,2)); 2118 MULPM(cr4,ci4,WA(2,i-2),WA(2,i-1),CC(i-1,k,3),CC(i,k,3)); 2119 PM(tr1,tr4,cr4,cr2); 2120 PM(ti1,ti4,ci2,ci4); 2121 PM(tr2,tr3,CC(i-1,k,0),cr3); 2122 PM(ti2,ti3,CC(i ,k,0),ci3); 2123 PM(CH(i-1,0,k),CH(ic-1,3,k),tr2,tr1); 2124 PM(CH(i ,0,k),CH(ic ,3,k),ti1,ti2); 2125 PM(CH(i-1,2,k),CH(ic-1,1,k),tr3,ti4); 2126 PM(CH(i ,2,k),CH(ic ,1,k),tr4,ti3); 2127 } 2128 } 2129 else 2130 { 2131 constexpr Tfs sqrt2=Tfs(1.414213562373095048801688724209698L); 2132 auto CC = [cc,this](size_t a, size_t b, size_t c) -> const Tfd& 2133 { return cc[a+ido*(b+ip*c)]; }; 2134 auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tfd& 2135 { return ch[a+ido*(b+l1*c)]; }; 2136 2137 for (size_t k=0; k<l1; k++) 2138 { 2139 Tfd tr1, tr2; 2140 PM (tr2,tr1,CC(0,0,k),CC(ido-1,3,k)); 2141 Tfd tr3=Tfs(2)*CC(ido-1,1,k); 2142 Tfd tr4=Tfs(2)*CC(0,2,k); 2143 PM (CH(0,k,0),CH(0,k,2),tr2,tr3); 2144 PM (CH(0,k,3),CH(0,k,1),tr1,tr4); 2145 } 2146 if ((ido&1)==0) 2147 for (size_t k=0; k<l1; k++) 2148 { 2149 Tfd tr1,tr2,ti1,ti2; 2150 PM (ti1,ti2,CC(0 ,3,k),CC(0 ,1,k)); 2151 PM (tr2,tr1,CC(ido-1,0,k),CC(ido-1,2,k)); 2152 CH(ido-1,k,0)=tr2+tr2; 2153 CH(ido-1,k,1)=sqrt2*(tr1-ti1); 2154 CH(ido-1,k,2)=ti2+ti2; 2155 CH(ido-1,k,3)=-sqrt2*(tr1+ti1); 2156 } 2157 if (ido<=2) return ch; 2158 for (size_t k=0; k<l1;++k) 2159 for (size_t i=2; i<ido; i+=2) 2160 { 2161 Tfd ci2, ci3, ci4, cr2, cr3, cr4, ti1, ti2, ti3, ti4, tr1, tr2, tr3, tr4; 2162 size_t ic=ido-i; 2163 PM (tr2,tr1,CC(i-1,0,k),CC(ic-1,3,k)); 2164 PM (ti1,ti2,CC(i ,0,k),CC(ic ,3,k)); 2165 PM (tr4,ti3,CC(i ,2,k),CC(ic ,1,k)); 2166 PM (tr3,ti4,CC(i-1,2,k),CC(ic-1,1,k)); 2167 PM (CH(i-1,k,0),cr3,tr2,tr3); 2168 PM (CH(i ,k,0),ci3,ti2,ti3); 2169 PM (cr4,cr2,tr1,tr4); 2170 PM (ci2,ci4,ti1,ti4); 2171 MULPM (CH(i,k,1),CH(i-1,k,1),WA(0,i-2),WA(0,i-1),ci2,cr2); 2172 MULPM (CH(i,k,2),CH(i-1,k,2),WA(1,i-2),WA(1,i-1),ci3,cr3); 2173 MULPM (CH(i,k,3),CH(i-1,k,3),WA(2,i-2),WA(2,i-1),ci4,cr4); 2174 } 2175 } 2176 return ch; 2177 } 2178 2179 public: 2180 rfftp4(size_t l1_, size_t ido_, const Troots<Tfs> &roots) 2181 : l1(l1_), ido(ido_), wa((ip-1)*(ido-1)) 2182 { 2183 size_t N=ip*l1*ido; 2184 size_t rfct = roots->size()/N; 2185 MR_assert(roots->size()==N*rfct, "mismatch"); 2186 for (size_t j=1; j<ip; ++j) 2187 for (size_t i=1; i<=(ido-1)/2; ++i) 2188 { 2189 auto val = (*roots)[rfct*j*l1*i]; 2190 wa[(j-1)*(ido-1)+2*i-2] = val.r; 2191 wa[(j-1)*(ido-1)+2*i-1] = val.i; 2192 } 2193 } 2194 2195 virtual size_t bufsize() const { return 0; } 2196 virtual bool needs_copy() const { return true; } 2197 2198 POCKETFFT_EXEC_DISPATCH 2199 }; 2200 2201 template <typename Tfs> class rfftp5: public rfftpass<Tfs> 2202 { 2203 private: 2204 size_t l1, ido; 2205 static constexpr size_t ip=5; 2206 quick_array<Tfs> wa; 2207 2208 auto WA(size_t x, size_t i) const 2209 { return wa[i+x*(ido-1)]; } 2210 2211 template<bool fwd, typename Tfd> Tfd *exec_ (Tfd * DUCC0_RESTRICT cc, 2212 Tfd * DUCC0_RESTRICT ch, Tfd * /*buf*/, size_t /*nthreads*/) const 2213 { 2214 constexpr Tfs tr11= Tfs(0.3090169943749474241022934171828191L), 2215 ti11= Tfs(0.9510565162951535721164393333793821L), 2216 tr12= Tfs(-0.8090169943749474241022934171828191L), 2217 ti12= Tfs(0.5877852522924731291687059546390728L); 2218 2219 if constexpr(fwd) 2220 { 2221 auto CC = [cc,this](size_t a, size_t b, size_t c) -> const Tfd& 2222 { return cc[a+ido*(b+l1*c)]; }; 2223 auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tfd& 2224 { return ch[a+ido*(b+ip*c)]; }; 2225 2226 for (size_t k=0; k<l1; k++) 2227 { 2228 Tfd cr2, cr3, ci4, ci5; 2229 PM (cr2,ci5,CC(0,k,4),CC(0,k,1)); 2230 PM (cr3,ci4,CC(0,k,3),CC(0,k,2)); 2231 CH(0,0,k)=CC(0,k,0)+cr2+cr3; 2232 CH(ido-1,1,k)=CC(0,k,0)+tr11*cr2+tr12*cr3; 2233 CH(0,2,k)=ti11*ci5+ti12*ci4; 2234 CH(ido-1,3,k)=CC(0,k,0)+tr12*cr2+tr11*cr3; 2235 CH(0,4,k)=ti12*ci5-ti11*ci4; 2236 } 2237 if (ido==1) return ch; 2238 for (size_t k=0; k<l1;++k) 2239 for (size_t i=2, ic=ido-2; i<ido; i+=2, ic-=2) 2240 { 2241 Tfd di2, di3, di4, di5, dr2, dr3, dr4, dr5; 2242 MULPM (dr2,di2,WA(0,i-2),WA(0,i-1),CC(i-1,k,1),CC(i,k,1)); 2243 MULPM (dr3,di3,WA(1,i-2),WA(1,i-1),CC(i-1,k,2),CC(i,k,2)); 2244 MULPM (dr4,di4,WA(2,i-2),WA(2,i-1),CC(i-1,k,3),CC(i,k,3)); 2245 MULPM (dr5,di5,WA(3,i-2),WA(3,i-1),CC(i-1,k,4),CC(i,k,4)); 2246 POCKETFFT_REARRANGE(dr2, di2, dr5, di5); 2247 POCKETFFT_REARRANGE(dr3, di3, dr4, di4); 2248 CH(i-1,0,k)=CC(i-1,k,0)+dr2+dr3; 2249 CH(i ,0,k)=CC(i ,k,0)+di2+di3; 2250 Tfd tr2=CC(i-1,k,0)+tr11*dr2+tr12*dr3; 2251 Tfd ti2=CC(i ,k,0)+tr11*di2+tr12*di3; 2252 Tfd tr3=CC(i-1,k,0)+tr12*dr2+tr11*dr3; 2253 Tfd ti3=CC(i ,k,0)+tr12*di2+tr11*di3; 2254 Tfd tr5 = ti11*dr5 + ti12*dr4; 2255 Tfd ti5 = ti11*di5 + ti12*di4; 2256 Tfd tr4 = ti12*dr5 - ti11*dr4; 2257 Tfd ti4 = ti12*di5 - ti11*di4; 2258 PM(CH(i-1,2,k),CH(ic-1,1,k),tr2,tr5); 2259 PM(CH(i ,2,k),CH(ic ,1,k),ti5,ti2); 2260 PM(CH(i-1,4,k),CH(ic-1,3,k),tr3,tr4); 2261 PM(CH(i ,4,k),CH(ic ,3,k),ti4,ti3); 2262 } 2263 } 2264 else 2265 { 2266 auto CC = [cc,this](size_t a, size_t b, size_t c) -> const Tfd& 2267 { return cc[a+ido*(b+ip*c)]; }; 2268 auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tfd& 2269 { return ch[a+ido*(b+l1*c)]; }; 2270 2271 for (size_t k=0; k<l1; k++) 2272 { 2273 Tfd ti5=CC(0,2,k)+CC(0,2,k); 2274 Tfd ti4=CC(0,4,k)+CC(0,4,k); 2275 Tfd tr2=CC(ido-1,1,k)+CC(ido-1,1,k); 2276 Tfd tr3=CC(ido-1,3,k)+CC(ido-1,3,k); 2277 CH(0,k,0)=CC(0,0,k)+tr2+tr3; 2278 Tfd cr2=CC(0,0,k)+tr11*tr2+tr12*tr3; 2279 Tfd cr3=CC(0,0,k)+tr12*tr2+tr11*tr3; 2280 Tfd ci4, ci5; 2281 MULPM(ci5,ci4,ti5,ti4,ti11,ti12); 2282 PM(CH(0,k,4),CH(0,k,1),cr2,ci5); 2283 PM(CH(0,k,3),CH(0,k,2),cr3,ci4); 2284 } 2285 if (ido==1) return ch; 2286 for (size_t k=0; k<l1;++k) 2287 for (size_t i=2, ic=ido-2; i<ido; i+=2, ic-=2) 2288 { 2289 Tfd tr2, tr3, tr4, tr5, ti2, ti3, ti4, ti5; 2290 PM(tr2,tr5,CC(i-1,2,k),CC(ic-1,1,k)); 2291 PM(ti5,ti2,CC(i ,2,k),CC(ic ,1,k)); 2292 PM(tr3,tr4,CC(i-1,4,k),CC(ic-1,3,k)); 2293 PM(ti4,ti3,CC(i ,4,k),CC(ic ,3,k)); 2294 CH(i-1,k,0)=CC(i-1,0,k)+tr2+tr3; 2295 CH(i ,k,0)=CC(i ,0,k)+ti2+ti3; 2296 Tfd cr2=CC(i-1,0,k)+tr11*tr2+tr12*tr3; 2297 Tfd ci2=CC(i ,0,k)+tr11*ti2+tr12*ti3; 2298 Tfd cr3=CC(i-1,0,k)+tr12*tr2+tr11*tr3; 2299 Tfd ci3=CC(i ,0,k)+tr12*ti2+tr11*ti3; 2300 Tfd ci4, ci5, cr5, cr4; 2301 MULPM(cr5,cr4,tr5,tr4,ti11,ti12); 2302 MULPM(ci5,ci4,ti5,ti4,ti11,ti12); 2303 Tfd dr2, dr3, dr4, dr5, di2, di3, di4, di5; 2304 PM(dr4,dr3,cr3,ci4); 2305 PM(di3,di4,ci3,cr4); 2306 PM(dr5,dr2,cr2,ci5); 2307 PM(di2,di5,ci2,cr5); 2308 MULPM(CH(i,k,1),CH(i-1,k,1),WA(0,i-2),WA(0,i-1),di2,dr2); 2309 MULPM(CH(i,k,2),CH(i-1,k,2),WA(1,i-2),WA(1,i-1),di3,dr3); 2310 MULPM(CH(i,k,3),CH(i-1,k,3),WA(2,i-2),WA(2,i-1),di4,dr4); 2311 MULPM(CH(i,k,4),CH(i-1,k,4),WA(3,i-2),WA(3,i-1),di5,dr5); 2312 } 2313 } 2314 return ch; 2315 } 2316 2317 public: 2318 rfftp5(size_t l1_, size_t ido_, const Troots<Tfs> &roots) 2319 : l1(l1_), ido(ido_), wa((ip-1)*(ido-1)) 2320 { 2321 MR_assert(ido&1, "ido must be odd"); 2322 size_t N=ip*l1*ido; 2323 size_t rfct = roots->size()/N; 2324 MR_assert(roots->size()==N*rfct, "mismatch"); 2325 for (size_t j=1; j<ip; ++j) 2326 for (size_t i=1; i<=(ido-1)/2; ++i) 2327 { 2328 auto val = (*roots)[rfct*j*l1*i]; 2329 wa[(j-1)*(ido-1)+2*i-2] = val.r; 2330 wa[(j-1)*(ido-1)+2*i-1] = val.i; 2331 } 2332 } 2333 2334 virtual size_t bufsize() const { return 0; } 2335 virtual bool needs_copy() const { return true; } 2336 2337 POCKETFFT_EXEC_DISPATCH 2338 }; 2339 2340 template <typename Tfs> class rfftpg: public rfftpass<Tfs> 2341 { 2342 private: 2343 size_t l1, ido; 2344 size_t ip; 2345 quick_array<Tfs> wa, csarr; 2346 2347 template<bool fwd, typename Tfd> Tfd *exec_ (Tfd * DUCC0_RESTRICT cc, 2348 Tfd * DUCC0_RESTRICT ch, Tfd * /*buf*/, size_t /*nthreads*/) const 2349 { 2350 if constexpr(fwd) 2351 { 2352 size_t ipph=(ip+1)/2; 2353 size_t idl1 = ido*l1; 2354 2355 auto CC = [cc,this](size_t a, size_t b, size_t c) -> Tfd& 2356 { return cc[a+ido*(b+ip*c)]; }; 2357 auto CH = [ch,this](size_t a, size_t b, size_t c) -> const Tfd& 2358 { return ch[a+ido*(b+l1*c)]; }; 2359 auto C1 = [cc,this] (size_t a, size_t b, size_t c) -> Tfd& 2360 { return cc[a+ido*(b+l1*c)]; }; 2361 auto C2 = [cc,idl1] (size_t a, size_t b) -> Tfd& 2362 { return cc[a+idl1*b]; }; 2363 auto CH2 = [ch,idl1] (size_t a, size_t b) -> Tfd& 2364 { return ch[a+idl1*b]; }; 2365 2366 if (ido>1) 2367 { 2368 for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc) // 114 2369 { 2370 size_t is=(j-1)*(ido-1), 2371 is2=(jc-1)*(ido-1); 2372 for (size_t k=0; k<l1; ++k) // 113 2373 { 2374 size_t idij=is; 2375 size_t idij2=is2; 2376 for (size_t i=1; i<=ido-2; i+=2) // 112 2377 { 2378 Tfd t1=C1(i,k,j ), t2=C1(i+1,k,j ), 2379 t3=C1(i,k,jc), t4=C1(i+1,k,jc); 2380 Tfd x1=wa[idij]*t1 + wa[idij+1]*t2, 2381 x2=wa[idij]*t2 - wa[idij+1]*t1, 2382 x3=wa[idij2]*t3 + wa[idij2+1]*t4, 2383 x4=wa[idij2]*t4 - wa[idij2+1]*t3; 2384 PM(C1(i,k,j),C1(i+1,k,jc),x3,x1); 2385 PM(C1(i+1,k,j),C1(i,k,jc),x2,x4); 2386 idij+=2; 2387 idij2+=2; 2388 } 2389 } 2390 } 2391 } 2392 2393 for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc) // 123 2394 for (size_t k=0; k<l1; ++k) // 122 2395 MPINPLACE(C1(0,k,jc), C1(0,k,j)); 2396 2397 //everything in C 2398 //memset(ch,0,ip*l1*ido*sizeof(double)); 2399 2400 for (size_t l=1,lc=ip-1; l<ipph; ++l,--lc) // 127 2401 { 2402 for (size_t ik=0; ik<idl1; ++ik) // 124 2403 { 2404 CH2(ik,l ) = C2(ik,0)+csarr[2*l]*C2(ik,1)+csarr[4*l]*C2(ik,2); 2405 CH2(ik,lc) = csarr[2*l+1]*C2(ik,ip-1)+csarr[4*l+1]*C2(ik,ip-2); 2406 } 2407 size_t iang = 2*l; 2408 size_t j=3, jc=ip-3; 2409 for (; j<ipph-3; j+=4,jc-=4) // 126 2410 { 2411 iang+=l; if (iang>=ip) iang-=ip; 2412 Tfs ar1=csarr[2*iang], ai1=csarr[2*iang+1]; 2413 iang+=l; if (iang>=ip) iang-=ip; 2414 Tfs ar2=csarr[2*iang], ai2=csarr[2*iang+1]; 2415 iang+=l; if (iang>=ip) iang-=ip; 2416 Tfs ar3=csarr[2*iang], ai3=csarr[2*iang+1]; 2417 iang+=l; if (iang>=ip) iang-=ip; 2418 Tfs ar4=csarr[2*iang], ai4=csarr[2*iang+1]; 2419 for (size_t ik=0; ik<idl1; ++ik) // 125 2420 { 2421 CH2(ik,l ) += ar1*C2(ik,j )+ar2*C2(ik,j +1) 2422 +ar3*C2(ik,j +2)+ar4*C2(ik,j +3); 2423 CH2(ik,lc) += ai1*C2(ik,jc)+ai2*C2(ik,jc-1) 2424 +ai3*C2(ik,jc-2)+ai4*C2(ik,jc-3); 2425 } 2426 } 2427 for (; j<ipph-1; j+=2,jc-=2) // 126 2428 { 2429 iang+=l; if (iang>=ip) iang-=ip; 2430 Tfs ar1=csarr[2*iang], ai1=csarr[2*iang+1]; 2431 iang+=l; if (iang>=ip) iang-=ip; 2432 Tfs ar2=csarr[2*iang], ai2=csarr[2*iang+1]; 2433 for (size_t ik=0; ik<idl1; ++ik) // 125 2434 { 2435 CH2(ik,l ) += ar1*C2(ik,j )+ar2*C2(ik,j +1); 2436 CH2(ik,lc) += ai1*C2(ik,jc)+ai2*C2(ik,jc-1); 2437 } 2438 } 2439 for (; j<ipph; ++j,--jc) // 126 2440 { 2441 iang+=l; if (iang>=ip) iang-=ip; 2442 Tfs ar=csarr[2*iang], ai=csarr[2*iang+1]; 2443 for (size_t ik=0; ik<idl1; ++ik) // 125 2444 { 2445 CH2(ik,l ) += ar*C2(ik,j ); 2446 CH2(ik,lc) += ai*C2(ik,jc); 2447 } 2448 } 2449 } 2450 for (size_t ik=0; ik<idl1; ++ik) // 101 2451 CH2(ik,0) = C2(ik,0); 2452 for (size_t j=1; j<ipph; ++j) // 129 2453 for (size_t ik=0; ik<idl1; ++ik) // 128 2454 CH2(ik,0) += C2(ik,j); 2455 2456 // everything in CH at this point! 2457 //memset(cc,0,ip*l1*ido*sizeof(double)); 2458 2459 for (size_t k=0; k<l1; ++k) // 131 2460 for (size_t i=0; i<ido; ++i) // 130 2461 CC(i,0,k) = CH(i,k,0); 2462 2463 for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc) // 137 2464 { 2465 size_t j2=2*j-1; 2466 for (size_t k=0; k<l1; ++k) // 136 2467 { 2468 CC(ido-1,j2,k) = CH(0,k,j); 2469 CC(0,j2+1,k) = CH(0,k,jc); 2470 } 2471 } 2472 2473 if (ido==1) return cc; 2474 2475 for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc) // 140 2476 { 2477 size_t j2=2*j-1; 2478 for(size_t k=0; k<l1; ++k) // 139 2479 for(size_t i=1, ic=ido-i-2; i<=ido-2; i+=2, ic-=2) // 138 2480 { 2481 CC(i ,j2+1,k) = CH(i ,k,j )+CH(i ,k,jc); 2482 CC(ic ,j2 ,k) = CH(i ,k,j )-CH(i ,k,jc); 2483 CC(i+1 ,j2+1,k) = CH(i+1,k,j )+CH(i+1,k,jc); 2484 CC(ic+1,j2 ,k) = CH(i+1,k,jc)-CH(i+1,k,j ); 2485 } 2486 } 2487 return cc; 2488 } 2489 else 2490 { 2491 size_t ipph=(ip+1)/ 2; 2492 size_t idl1 = ido*l1; 2493 2494 auto CC = [cc,this](size_t a, size_t b, size_t c) -> const Tfd& 2495 { return cc[a+ido*(b+ip*c)]; }; 2496 auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tfd& 2497 { return ch[a+ido*(b+l1*c)]; }; 2498 auto C1 = [cc,this](size_t a, size_t b, size_t c) -> const Tfd& 2499 { return cc[a+ido*(b+l1*c)]; }; 2500 auto C2 = [cc,idl1](size_t a, size_t b) -> Tfd& 2501 { return cc[a+idl1*b]; }; 2502 auto CH2 = [ch,idl1](size_t a, size_t b) -> Tfd& 2503 { return ch[a+idl1*b]; }; 2504 2505 for (size_t k=0; k<l1; ++k) // 102 2506 for (size_t i=0; i<ido; ++i) // 101 2507 CH(i,k,0) = CC(i,0,k); 2508 for (size_t j=1, jc=ip-1; j<ipph; ++j, --jc) // 108 2509 { 2510 size_t j2=2*j-1; 2511 for (size_t k=0; k<l1; ++k) 2512 { 2513 CH(0,k,j ) = Tfs(2)*CC(ido-1,j2,k); 2514 CH(0,k,jc) = Tfs(2)*CC(0,j2+1,k); 2515 } 2516 } 2517 2518 if (ido!=1) 2519 { 2520 for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc) // 111 2521 { 2522 size_t j2=2*j-1; 2523 for (size_t k=0; k<l1; ++k) 2524 for (size_t i=1, ic=ido-i-2; i<=ido-2; i+=2, ic-=2) // 109 2525 { 2526 CH(i ,k,j ) = CC(i ,j2+1,k)+CC(ic ,j2,k); 2527 CH(i ,k,jc) = CC(i ,j2+1,k)-CC(ic ,j2,k); 2528 CH(i+1,k,j ) = CC(i+1,j2+1,k)-CC(ic+1,j2,k); 2529 CH(i+1,k,jc) = CC(i+1,j2+1,k)+CC(ic+1,j2,k); 2530 } 2531 } 2532 } 2533 for (size_t l=1,lc=ip-1; l<ipph; ++l,--lc) 2534 { 2535 for (size_t ik=0; ik<idl1; ++ik) 2536 { 2537 C2(ik,l ) = CH2(ik,0)+csarr[2*l]*CH2(ik,1)+csarr[4*l]*CH2(ik,2); 2538 C2(ik,lc) = csarr[2*l+1]*CH2(ik,ip-1)+csarr[4*l+1]*CH2(ik,ip-2); 2539 } 2540 size_t iang=2*l; 2541 size_t j=3,jc=ip-3; 2542 for(; j<ipph-3; j+=4,jc-=4) 2543 { 2544 iang+=l; if(iang>ip) iang-=ip; 2545 Tfs ar1=csarr[2*iang], ai1=csarr[2*iang+1]; 2546 iang+=l; if(iang>ip) iang-=ip; 2547 Tfs ar2=csarr[2*iang], ai2=csarr[2*iang+1]; 2548 iang+=l; if(iang>ip) iang-=ip; 2549 Tfs ar3=csarr[2*iang], ai3=csarr[2*iang+1]; 2550 iang+=l; if(iang>ip) iang-=ip; 2551 Tfs ar4=csarr[2*iang], ai4=csarr[2*iang+1]; 2552 for (size_t ik=0; ik<idl1; ++ik) 2553 { 2554 C2(ik,l ) += ar1*CH2(ik,j )+ar2*CH2(ik,j +1) 2555 +ar3*CH2(ik,j +2)+ar4*CH2(ik,j +3); 2556 C2(ik,lc) += ai1*CH2(ik,jc)+ai2*CH2(ik,jc-1) 2557 +ai3*CH2(ik,jc-2)+ai4*CH2(ik,jc-3); 2558 } 2559 } 2560 for(; j<ipph-1; j+=2,jc-=2) 2561 { 2562 iang+=l; if(iang>ip) iang-=ip; 2563 Tfs ar1=csarr[2*iang], ai1=csarr[2*iang+1]; 2564 iang+=l; if(iang>ip) iang-=ip; 2565 Tfs ar2=csarr[2*iang], ai2=csarr[2*iang+1]; 2566 for (size_t ik=0; ik<idl1; ++ik) 2567 { 2568 C2(ik,l ) += ar1*CH2(ik,j )+ar2*CH2(ik,j +1); 2569 C2(ik,lc) += ai1*CH2(ik,jc)+ai2*CH2(ik,jc-1); 2570 } 2571 } 2572 for(; j<ipph; ++j,--jc) 2573 { 2574 iang+=l; if(iang>ip) iang-=ip; 2575 Tfs war=csarr[2*iang], wai=csarr[2*iang+1]; 2576 for (size_t ik=0; ik<idl1; ++ik) 2577 { 2578 C2(ik,l ) += war*CH2(ik,j ); 2579 C2(ik,lc) += wai*CH2(ik,jc); 2580 } 2581 } 2582 } 2583 for (size_t j=1; j<ipph; ++j) 2584 for (size_t ik=0; ik<idl1; ++ik) 2585 CH2(ik,0) += CH2(ik,j); 2586 for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc) // 124 2587 for (size_t k=0; k<l1; ++k) 2588 PM(CH(0,k,jc),CH(0,k,j),C1(0,k,j),C1(0,k,jc)); 2589 2590 if (ido==1) return ch; 2591 2592 for (size_t j=1, jc=ip-1; j<ipph; ++j, --jc) // 127 2593 for (size_t k=0; k<l1; ++k) 2594 for (size_t i=1; i<=ido-2; i+=2) 2595 { 2596 CH(i ,k,j ) = C1(i ,k,j)-C1(i+1,k,jc); 2597 CH(i ,k,jc) = C1(i ,k,j)+C1(i+1,k,jc); 2598 CH(i+1,k,j ) = C1(i+1,k,j)+C1(i ,k,jc); 2599 CH(i+1,k,jc) = C1(i+1,k,j)-C1(i ,k,jc); 2600 } 2601 2602 // All in CH 2603 2604 for (size_t j=1; j<ip; ++j) 2605 { 2606 size_t is = (j-1)*(ido-1); 2607 for (size_t k=0; k<l1; ++k) 2608 { 2609 size_t idij = is; 2610 for (size_t i=1; i<=ido-2; i+=2) 2611 { 2612 Tfd t1=CH(i,k,j), t2=CH(i+1,k,j); 2613 CH(i ,k,j) = wa[idij]*t1-wa[idij+1]*t2; 2614 CH(i+1,k,j) = wa[idij]*t2+wa[idij+1]*t1; 2615 idij+=2; 2616 } 2617 } 2618 } 2619 return ch; 2620 } 2621 } 2622 2623 public: 2624 rfftpg(size_t l1_, size_t ido_, size_t ip_, const Troots<Tfs> &roots) 2625 : l1(l1_), ido(ido_), ip(ip_), wa((ip-1)*(ido-1)), csarr(2*ip) 2626 { 2627 MR_assert(ido&1, "ido must be odd"); 2628 size_t N=ip*l1*ido; 2629 size_t rfct = roots->size()/N; 2630 MR_assert(roots->size()==N*rfct, "mismatch"); 2631 for (size_t j=1; j<ip; ++j) 2632 for (size_t i=1; i<=(ido-1)/2; ++i) 2633 { 2634 auto val = (*roots)[rfct*j*l1*i]; 2635 wa[(j-1)*(ido-1)+2*i-2] = val.r; 2636 wa[(j-1)*(ido-1)+2*i-1] = val.i; 2637 } 2638 csarr[0] = Tfs(1); 2639 csarr[1] = Tfs(0); 2640 for (size_t i=2, ic=2*ip-2; i<=ic; i+=2, ic-=2) 2641 { 2642 auto val = (*roots)[i/2*rfct*(N/ip)]; 2643 csarr[i ] = val.r; 2644 csarr[i +1] = val.i; 2645 csarr[ic ] = val.r; 2646 csarr[ic+1] = -val.i; 2647 } 2648 } 2649 2650 virtual size_t bufsize() const { return 0; } 2651 virtual bool needs_copy() const { return true; } 2652 2653 POCKETFFT_EXEC_DISPATCH 2654 }; 2655 2656 template <typename Tfs> class rfftpblue: public rfftpass<Tfs> 2657 { 2658 private: 2659 const size_t l1, ido, ip; 2660 quick_array<Tfs> wa; 2661 const Tcpass<Tfs> cplan; 2662 size_t bufsz; 2663 bool need_cpy; 2664 2665 auto WA(size_t x, size_t i) const 2666 { return wa[i+x*(ido-1)]; } 2667 2668 template<bool fwd, typename Tfd> Tfd *exec_ 2669 (Tfd * DUCC0_RESTRICT cc, Tfd * DUCC0_RESTRICT ch, 2670 Tfd * DUCC0_RESTRICT buf_, size_t nthreads) const 2671 { 2672 using Tcd = Cmplx<Tfd>; 2673 auto buf = reinterpret_cast<Tcd *>(buf_); 2674 Tcd *cc2 = &buf[0]; 2675 Tcd *ch2 = &buf[ip]; 2676 Tcd *subbuf = &buf[2*ip]; 2677 static const auto ticd = tidx<Tcd *>(); 2678 2679 if constexpr(fwd) 2680 { 2681 auto CC = [cc,this](size_t a, size_t b, size_t c) -> const Tfd& 2682 { return cc[a+ido*(b+l1*c)]; }; 2683 auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tfd& 2684 { return ch[a+ido*(b+ip*c)]; }; 2685 2686 for (size_t k=0; k<l1; ++k) 2687 { 2688 // copy in 2689 for (size_t m=0; m<ip; ++m) 2690 cc2[m] = {CC(0,k,m),Tfd(0)}; 2691 auto res = static_cast<Tcd *>(cplan->exec(ticd, cc2, ch2, 2692 subbuf, fwd, nthreads)); 2693 // copy out 2694 CH(0,0,k) = res[0].r; 2695 for (size_t m=1; m<=ip/2; ++m) 2696 { 2697 CH(ido-1,2*m-1,k)=res[m].r; 2698 CH(0,2*m,k)=res[m].i; 2699 } 2700 } 2701 if (ido==1) return ch; 2702 size_t ipph = (ip+1)/2; 2703 for (size_t k=0; k<l1; ++k) 2704 for (size_t i=2, ic=ido-2; i<ido; i+=2, ic-=2) 2705 { 2706 // copy in 2707 cc2[0] = {CC(i-1,k,0),CC(i,k,0)}; 2708 for (size_t m=1; m<ipph; ++m) 2709 { 2710 MULPM (cc2[m].r,cc2[m].i,WA(m-1,i-2),WA(m-1,i-1),CC(i-1,k,m),CC(i,k,m)); 2711 MULPM (cc2[ip-m].r,cc2[ip-m].i,WA(ip-m-1,i-2),WA(ip-m-1,i-1),CC(i-1,k,ip-m),CC(i,k,ip-m)); 2712 } 2713 auto res = static_cast<Tcd *>(cplan->exec(ticd, cc2, ch2, 2714 subbuf, fwd, nthreads)); 2715 CH(i-1,0,k) = res[0].r; 2716 CH(i,0,k) = res[0].i; 2717 for (size_t m=1; m<ipph; ++m) 2718 { 2719 CH(i-1,2*m,k) = res[m].r; 2720 CH(ic-1,2*m-1,k) = res[ip-m].r; 2721 CH(i ,2*m,k) = res[m].i; 2722 CH(ic ,2*m-1,k) = -res[ip-m].i; 2723 } 2724 } 2725 } 2726 else 2727 { 2728 auto CC = [cc,this](size_t a, size_t b, size_t c) -> Tfd& 2729 { return cc[a+ido*(b+ip*c)]; }; 2730 auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tfd& 2731 { return ch[a+ido*(b+l1*c)]; }; 2732 2733 for (size_t k=0; k<l1; k++) 2734 { 2735 cc2[0] = {CC(0,0,k), Tfd(0)}; 2736 for (size_t m=1; m<=ip/2; ++m) 2737 { 2738 cc2[m] = {CC(ido-1,2*m-1,k),CC(0,2*m,k)}; 2739 cc2[ip-m] = {CC(ido-1,2*m-1,k),-CC(0,2*m,k)}; 2740 } 2741 auto res = static_cast<Tcd *>(cplan->exec(ticd, cc2, ch2, 2742 subbuf, fwd, nthreads)); 2743 for (size_t m=0; m<ip; ++m) 2744 CH(0,k,m) = res[m].r; 2745 } 2746 if (ido==1) return ch; 2747 for (size_t k=0; k<l1; ++k) 2748 for (size_t i=2, ic=ido-2; i<ido; i+=2, ic-=2) 2749 { 2750 // copy in 2751 cc2[0] = {CC(i-1,0,k),CC(i,0,k)}; 2752 for (size_t m=1; m<=ip/2; ++m) 2753 { 2754 cc2[m] = {CC(i-1,2*m,k),CC(i,2*m,k)}; 2755 cc2[ip-m] = {CC(ic-1,2*m-1,k),-CC(ic,2*m-1,k)}; 2756 } 2757 auto res = static_cast<Tcd *>(cplan->exec(ticd, cc2, ch2, 2758 subbuf, fwd, nthreads)); 2759 CH(i-1,k,0) = res[0].r; 2760 CH(i,k,0) = res[0].i; 2761 for (size_t m=1; m<ip; ++m) 2762 { 2763 MULPM(CH(i-1,k,m),CH(i,k,m),WA(m-1,i-2),-WA(m-1,i-1),res[m].r,res[m].i); 2764 MULPM(CH(i-1,k,ip-m),CH(i,k,ip-m),WA(ip-m-1,i-2),-WA(ip-m-1,i-1),res[ip-m].r,res[ip-m].i); 2765 } 2766 } 2767 } 2768 return ch; 2769 } 2770 2771 public: 2772 rfftpblue(size_t l1_, size_t ido_, size_t ip_, const Troots<Tfs> &roots, bool vectorize=false) 2773 : l1(l1_), ido(ido_), ip(ip_), wa((ip-1)*(ido-1)), 2774 cplan(cfftpass<Tfs>::make_pass(1,1,ip,roots,vectorize)) 2775 { 2776 MR_assert(ip&1, "Bluestein length must be odd"); 2777 MR_assert(ido&1, "ido must be odd"); 2778 size_t N=ip*l1*ido; 2779 auto rfct = roots->size()/N; 2780 MR_assert(roots->size()==N*rfct, "mismatch"); 2781 for (size_t j=1; j<ip; ++j) 2782 for (size_t i=1; i<=(ido-1)/2; ++i) 2783 { 2784 auto val = (*roots)[rfct*j*l1*i]; 2785 wa[(j-1)*(ido-1)+2*i-2] = val.r; 2786 wa[(j-1)*(ido-1)+2*i-1] = val.i; 2787 } 2788 } 2789 2790 virtual size_t bufsize() const { return 4*ip + 2*cplan->bufsize(); } 2791 virtual bool needs_copy() const { return true; } 2792 2793 POCKETFFT_EXEC_DISPATCH 2794 }; 2795 2796 template <typename Tfs> class rfft_multipass: public rfftpass<Tfs> 2797 { 2798 private: 2799 const size_t l1, ido; 2800 size_t ip; 2801 vector<Trpass<Tfs>> passes; 2802 size_t bufsz; 2803 bool need_cpy; 2804 quick_array<Tfs> wa; 2805 2806 auto WA(size_t x, size_t i) const 2807 { return wa[(i-1)*(ip-1)+x]; } 2808 2809 template<bool fwd, typename Tfd> Tfd *exec_(Tfd *cc, Tfd *ch, Tfd *buf, 2810 size_t nthreads) const 2811 { 2812 static const auto tifd = tidx<Tfd *>(); 2813 if ((l1==1) && (ido==1)) 2814 { 2815 Tfd *p1=cc, *p2=ch; 2816 if constexpr (fwd) 2817 for (auto it=passes.rbegin(); it!=passes.rend(); ++it) 2818 { 2819 auto res = static_cast<Tfd *>((*it)->exec(tifd, 2820 p1, p2, buf, fwd, nthreads)); 2821 if (res==p2) swap(p1,p2); 2822 } 2823 else 2824 for (const auto &pass: passes) 2825 { 2826 auto res = static_cast<Tfd *>(pass->exec(tifd, 2827 p1, p2, buf, fwd, nthreads)); 2828 if (res==p2) swap(p1,p2); 2829 } 2830 return p1; 2831 } 2832 else 2833 MR_fail("not yet supported"); 2834 } 2835 2836 public: 2837 rfft_multipass(size_t l1_, size_t ido_, size_t ip_, 2838 const Troots<Tfs> &roots, bool /*vectorize*/=false) 2839 : l1(l1_), ido(ido_), ip(ip_), bufsz(0), need_cpy(false), 2840 wa((ip-1)*(ido-1)) 2841 { 2842 size_t N=ip*l1*ido; 2843 auto rfct = roots->size()/N; 2844 MR_assert(roots->size()==N*rfct, "mismatch"); 2845 for (size_t j=1; j<ip; ++j) 2846 for (size_t i=1; i<=(ido-1)/2; ++i) 2847 { 2848 auto val = (*roots)[rfct*j*l1*i]; 2849 wa[(j-1)*(ido-1)+2*i-2] = val.r; 2850 wa[(j-1)*(ido-1)+2*i-1] = val.i; 2851 } 2852 2853 auto factors = rfftpass<Tfs>::factorize(ip); 2854 2855 size_t l1l=1; 2856 for (auto fct: factors) 2857 { 2858 passes.push_back(rfftpass<Tfs>::make_pass(l1l, ip/(fct*l1l), fct, roots)); 2859 l1l*=fct; 2860 } 2861 for (const auto &pass: passes) 2862 { 2863 bufsz = max(bufsz, pass->bufsize()); 2864 need_cpy |= pass->needs_copy(); 2865 } 2866 if ((l1!=1)||(ido!=1)) 2867 { 2868 need_cpy=true; 2869 bufsz += 2*ip; 2870 } 2871 } 2872 2873 virtual size_t bufsize() const { return bufsz; } 2874 virtual bool needs_copy() const { return need_cpy; } 2875 2876 POCKETFFT_EXEC_DISPATCH 2877 }; 2878 2879 template <typename Tfs> class rfftp_complexify: public rfftpass<Tfs> 2880 { 2881 private: 2882 size_t N; 2883 Troots<Tfs> roots; 2884 size_t rfct; 2885 Tcpass<Tfs> pass; 2886 size_t l1, ido; 2887 static constexpr size_t ip=2; 2888 2889 template<bool fwd, typename Tfd> Tfd *exec_ (Tfd * DUCC0_RESTRICT cc, 2890 Tfd * DUCC0_RESTRICT ch, Tfd * buf, size_t nthreads) const 2891 { 2892 using Tcd = Cmplx<Tfd>; 2893 auto ccc = reinterpret_cast<Tcd *>(cc); 2894 auto cch = reinterpret_cast<Tcd *>(ch); 2895 auto cbuf = reinterpret_cast<Tcd *>(buf); 2896 static const auto ticd = tidx<Tcd *>(); 2897 if constexpr(fwd) 2898 { 2899 auto res = static_cast<Tcd *>(pass->exec(ticd, 2900 ccc, cch, cbuf, true, nthreads)); 2901 auto rres = (res==ccc) ? ch : cc; 2902 rres[0] = res[0].r+res[0].i; 2903 //FIXME: parallelize? 2904 for (size_t i=1, xi=N/2-1; i<=xi; ++i, --xi) 2905 { 2906 auto xe = res[i]+res[xi].conj(); 2907 auto xo = Tcd(res[i].i+res[xi].i, res[xi].r-res[i].r) 2908 * (*roots)[rfct*i].conj(); 2909 rres[2*i-1] = Tfs(0.5)*(xe.r+xo.r); 2910 rres[2*i] = Tfs(0.5)*(xe.i+xo.i); 2911 rres[2*xi-1] = Tfs(0.5)*(xe.r-xo.r); 2912 rres[2*xi] = Tfs(0.5)*(xo.i-xe.i); 2913 } 2914 rres[N-1] = res[0].r-res[0].i; 2915 return rres; 2916 } 2917 else 2918 { 2919 cch[0] = Tcd(cc[0]+cc[N-1], cc[0]-cc[N-1]); 2920 //FIXME: parallelize? 2921 for (size_t i=1, xi=N/2-1; i<=xi; ++i, --xi) 2922 { 2923 Tcd t1 (cc[2*i-1], cc[2*i]); 2924 Tcd t2 (cc[2*xi-1], -cc[2*xi]); 2925 auto xe = t1+t2; 2926 auto xo = (t1-t2)*(*roots)[rfct*i]; 2927 cch[i] = (xe + Tcd(-xo.i, xo.r)); 2928 cch[xi] = (xe.conj() + Tcd(xo.i, xo.r)); 2929 } 2930 auto res = static_cast<Tcd *>(pass->exec(ticd, 2931 cch, ccc, cbuf, false, nthreads)); 2932 return (res==ccc) ? cc : ch; 2933 } 2934 } 2935 2936 public: 2937 rfftp_complexify(size_t N_, const Troots<Tfs> &roots_, bool vectorize=false) 2938 : N(N_), roots(roots_), pass(cfftpass<Tfs>::make_pass(N/2, vectorize)) 2939 { 2940 rfct = roots->size()/N; 2941 MR_assert(roots->size()==N*rfct, "mismatch"); 2942 MR_assert((N&1)==0, "N must be even"); 2943 } 2944 2945 virtual size_t bufsize() const { return 2*pass->bufsize(); } 2946 virtual bool needs_copy() const { return true; } 2947 2948 POCKETFFT_EXEC_DISPATCH 2949 }; 2950 #undef POCKETFFT_EXEC_DISPATCH 2951 2952 template<typename Tfs> Trpass<Tfs> rfftpass<Tfs>::make_pass(size_t l1, 2953 size_t ido, size_t ip, const Troots<Tfs> &roots, bool vectorize) 2954 { 2955 MR_assert(ip>=1, "no zero-sized FFTs"); 2956 if (ip==1) return make_shared<rfftp1<Tfs>>(); 2957 if ((ip>1000) && ((ip&1)==0)) // use complex transform 2958 return make_shared<rfftp_complexify<Tfs>>(ip, roots, vectorize); 2959 auto factors=rfftpass<Tfs>::factorize(ip); 2960 if (factors.size()==1) 2961 { 2962 switch(ip) 2963 { 2964 case 2: 2965 return make_shared<rfftp2<Tfs>>(l1, ido, roots); 2966 case 3: 2967 return make_shared<rfftp3<Tfs>>(l1, ido, roots); 2968 case 4: 2969 return make_shared<rfftp4<Tfs>>(l1, ido, roots); 2970 case 5: 2971 return make_shared<rfftp5<Tfs>>(l1, ido, roots); 2972 default: 2973 if (ip<135) 2974 return make_shared<rfftpg<Tfs>>(l1, ido, ip, roots); 2975 else 2976 return make_shared<rfftpblue<Tfs>>(l1, ido, ip, roots, vectorize); 2977 } 2978 } 2979 else // more than one factor, need a multipass 2980 return make_shared<rfft_multipass<Tfs>>(l1, ido, ip, roots, vectorize); 2981 } 2982 2983 template<typename Tfs> class pocketfft_r 2984 { 2985 private: 2986 size_t N; 2987 Trpass<Tfs> plan; 2988 2989 public: 2990 pocketfft_r(size_t n, bool vectorize=false) 2991 : N(n), plan(rfftpass<Tfs>::make_pass(n,vectorize)) {} 2992 size_t length() const { return N; } 2993 size_t bufsize() const { return N*plan->needs_copy()+plan->bufsize(); } 2994 template<typename Tfd> DUCC0_NOINLINE Tfd *exec(Tfd *in, Tfd *buf, Tfs fct, 2995 bool fwd, size_t nthreads=1) const 2996 { 2997 static const auto tifd = tidx<Tfd *>(); 2998 auto res = static_cast<Tfd *>(plan->exec(tifd, in, buf, 2999 buf+N*plan->needs_copy(), fwd, nthreads)); 3000 if (fct!=Tfs(1)) 3001 for (size_t i=0; i<N; ++i) res[i]*=fct; 3002 return res; 3003 } 3004 template<typename Tfd> DUCC0_NOINLINE void exec_copyback(Tfd *in, Tfd *buf, 3005 Tfs fct, bool fwd, size_t nthreads=1) const 3006 { 3007 static const auto tifd = tidx<Tfd *>(); 3008 auto res = static_cast<Tfd *>(plan->exec(tifd, in, buf, 3009 buf+N*plan->needs_copy(), fwd, nthreads)); 3010 if (res==in) 3011 { 3012 if (fct!=Tfs(1)) 3013 for (size_t i=0; i<N; ++i) in[i]*=fct; 3014 } 3015 else 3016 { 3017 if (fct!=Tfs(1)) 3018 for (size_t i=0; i<N; ++i) in[i]=res[i]*fct; 3019 else 3020 copy_n(res, N, in); 3021 } 3022 } 3023 template<typename Tfd> DUCC0_NOINLINE void exec(Tfd *in, Tfs fct, bool fwd, 3024 size_t nthreads=1) const 3025 { 3026 quick_array<Tfd> buf(N*plan->needs_copy()+plan->bufsize()); 3027 exec_copyback(in, buf.data(), fct, fwd, nthreads); 3028 } 3029 }; 3030 3031 template<typename Tfs> class pocketfft_hartley 3032 { 3033 private: 3034 size_t N; 3035 Trpass<Tfs> plan; 3036 3037 public: 3038 pocketfft_hartley(size_t n, bool vectorize=false) 3039 : N(n), plan(rfftpass<Tfs>::make_pass(n,vectorize)) {} 3040 size_t length() const { return N; } 3041 size_t bufsize() const { return N+plan->bufsize(); } 3042 template<typename Tfd> DUCC0_NOINLINE Tfd *exec(Tfd *in, Tfd *buf, Tfs fct, 3043 size_t nthreads=1) const 3044 { 3045 static const auto tifd = tidx<Tfd *>(); 3046 auto res = static_cast<Tfd *>(plan->exec(tifd, 3047 in, buf, buf+N, true, nthreads)); 3048 auto res2 = (res==buf) ? in : buf; 3049 res2[0] = fct*res[0]; 3050 size_t i=1, i1=1, i2=N-1; 3051 for (i=1; i<N-1; i+=2, ++i1, --i2) 3052 { 3053 #ifdef DUCC0_USE_PROPER_HARTLEY_CONVENTION 3054 res2[i1] = fct*(res[i]-res[i+1]); 3055 res2[i2] = fct*(res[i]+res[i+1]); 3056 #else 3057 res2[i1] = fct*(res[i]+res[i+1]); 3058 res2[i2] = fct*(res[i]-res[i+1]); 3059 #endif 3060 } 3061 if (i<N) 3062 res2[i1] = fct*res[i]; 3063 3064 return res2; 3065 } 3066 template<typename Tfd> DUCC0_NOINLINE void exec_copyback(Tfd *in, Tfd *buf, 3067 Tfs fct, size_t nthreads=1) const 3068 { 3069 auto res = exec(in, buf, fct, nthreads); 3070 if (res!=in) 3071 copy_n(res, N, in); 3072 } 3073 template<typename Tfd> DUCC0_NOINLINE void exec(Tfd *in, Tfs fct, 3074 size_t nthreads=1) const 3075 { 3076 quick_array<Tfd> buf(N+plan->bufsize()); 3077 exec_copyback(in, buf.data(), fct, nthreads); 3078 } 3079 }; 3080 3081 // R2R transforms using FFTW's halfcomplex format 3082 template<typename Tfs> class pocketfft_fftw 3083 { 3084 private: 3085 size_t N; 3086 Trpass<Tfs> plan; 3087 3088 public: 3089 pocketfft_fftw(size_t n, bool vectorize=false) 3090 : N(n), plan(rfftpass<Tfs>::make_pass(n,vectorize)) {} 3091 size_t length() const { return N; } 3092 size_t bufsize() const { return N+plan->bufsize(); } 3093 template<typename Tfd> DUCC0_NOINLINE Tfd *exec(Tfd *in, Tfd *buf, Tfs fct, 3094 bool fwd, size_t nthreads=1) const 3095 { 3096 static const auto tifd = tidx<Tfd *>(); 3097 auto res = in; 3098 auto res2 = buf; 3099 if (!fwd) // go to FFTPACK halfcomplex order 3100 { 3101 res2[0] = fct*res[0]; 3102 size_t i=1, i1=1, i2=N-1; 3103 for (i=1; i<N-1; i+=2, ++i1, --i2) 3104 { 3105 res2[i] = fct*res[i1]; 3106 res2[i+1] = fct*res[i2]; 3107 } 3108 if (i<N) 3109 res2[i] = fct*res[i1]; 3110 swap(res, res2); 3111 } 3112 res = static_cast<Tfd *>(plan->exec(tifd, 3113 res, res2, buf+N, fwd, nthreads)); 3114 if (!fwd) return res; 3115 3116 // go to FFTW halfcomplex order 3117 res2 = (res==buf) ? in : buf; 3118 res2[0] = fct*res[0]; 3119 size_t i=1, i1=1, i2=N-1; 3120 for (i=1; i<N-1; i+=2, ++i1, --i2) 3121 { 3122 res2[i1] = fct*res[i]; 3123 res2[i2] = fct*res[i+1]; 3124 } 3125 if (i<N) 3126 res2[i1] = fct*res[i]; 3127 3128 return res2; 3129 } 3130 template<typename Tfd> DUCC0_NOINLINE void exec_copyback(Tfd *in, Tfd *buf, 3131 Tfs fct, bool fwd, size_t nthreads=1) const 3132 { 3133 auto res = exec(in, buf, fct, fwd, nthreads); 3134 if (res!=in) 3135 copy_n(res, N, in); 3136 } 3137 template<typename Tfd> DUCC0_NOINLINE void exec(Tfd *in, Tfs fct, bool fwd, 3138 size_t nthreads=1) const 3139 { 3140 quick_array<Tfd> buf(N+plan->bufsize()); 3141 exec_copyback(in, buf.data(), fct, fwd, nthreads); 3142 } 3143 }; 3144 3145 } 3146 3147 using detail_fft::pocketfft_c; 3148 using detail_fft::pocketfft_r; 3149 using detail_fft::pocketfft_hartley; 3150 using detail_fft::pocketfft_fftw; 3151 inline size_t good_size_complex(size_t n) 3152 { return detail_fft::util1d::good_size_cmplx(n); } 3153 inline size_t good_size_real(size_t n) 3154 { return detail_fft::util1d::good_size_real(n); } 3155 3156 } 3157 3158 #endif 3159