1 /* 2 This file is part of pocketfft. 3 4 Copyright (C) 2010-2020 Max-Planck-Society 5 Copyright (C) 2019-2020 Peter Bell 6 7 For the odd-sized DCT-IV transforms: 8 Copyright (C) 2003, 2007-14 Matteo Frigo 9 Copyright (C) 2003, 2007-14 Massachusetts Institute of Technology 10 11 Authors: Martin Reinecke, Peter Bell 12 13 All rights reserved. 14 15 Redistribution and use in source and binary forms, with or without modification, 16 are permitted provided that the following conditions are met: 17 18 * Redistributions of source code must retain the above copyright notice, this 19 list of conditions and the following disclaimer. 20 * Redistributions in binary form must reproduce the above copyright notice, this 21 list of conditions and the following disclaimer in the documentation and/or 22 other materials provided with the distribution. 23 * Neither the name of the copyright holder nor the names of its contributors may 24 be used to endorse or promote products derived from this software without 25 specific prior written permission. 26 27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 28 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 29 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 30 DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR 31 ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 32 (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 33 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 34 ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 35 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 36 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 37 */ 38 39 #ifndef POCKETFFT_HDRONLY_H 40 #define POCKETFFT_HDRONLY_H 41 42 #ifndef __cplusplus 43 #error This file is C++ and requires a C++ compiler. 44 #endif 45 46 #if !(__cplusplus >= 201103L || _MSVC_LANG+0L >= 201103L) 47 #error This file requires at least C++11 support. 48 #endif 49 50 #ifndef POCKETFFT_CACHE_SIZE 51 #define POCKETFFT_CACHE_SIZE 16 52 #endif 53 54 #include <cmath> 55 #include <cstring> 56 #include <cstdlib> 57 #include <stdexcept> 58 #include <memory> 59 #include <vector> 60 #include <complex> 61 #if POCKETFFT_CACHE_SIZE!=0 62 #include <array> 63 #include <mutex> 64 #endif 65 66 #ifndef POCKETFFT_NO_MULTITHREADING 67 #include <mutex> 68 #include <condition_variable> 69 #include <thread> 70 #include <queue> 71 #include <atomic> 72 #include <functional> 73 #include <new> 74 75 #ifdef POCKETFFT_PTHREADS 76 # include <pthread.h> 77 #endif 78 #endif 79 80 #if defined(__GNUC__) 81 #define POCKETFFT_NOINLINE __attribute__((noinline)) 82 #define POCKETFFT_RESTRICT __restrict__ 83 #elif defined(_MSC_VER) 84 #define POCKETFFT_NOINLINE __declspec(noinline) 85 #define POCKETFFT_RESTRICT __restrict 86 #else 87 #define POCKETFFT_NOINLINE 88 #define POCKETFFT_RESTRICT 89 #endif 90 91 namespace pocketfft { 92 93 namespace detail { 94 using std::size_t; 95 using std::ptrdiff_t; 96 97 // Always use std:: for <cmath> functions 98 template <typename T> T cos(T) = delete; 99 template <typename T> T sin(T) = delete; 100 template <typename T> T sqrt(T) = delete; 101 102 using shape_t = std::vector<size_t>; 103 using stride_t = std::vector<ptrdiff_t>; 104 105 constexpr bool FORWARD = true, 106 BACKWARD = false; 107 108 // only enable vector support for gcc>=5.0 and clang>=5.0 109 #ifndef POCKETFFT_NO_VECTORS 110 #define POCKETFFT_NO_VECTORS 111 #if defined(__INTEL_COMPILER) 112 // do nothing. This is necessary because this compiler also sets __GNUC__. 113 #elif defined(__clang__) 114 // AppleClang has their own version numbering 115 #ifdef __apple_build_version__ 116 # if (__clang_major__ > 9) || (__clang_major__ == 9 && __clang_minor__ >= 1) 117 # undef POCKETFFT_NO_VECTORS 118 # endif 119 #elif __clang_major__ >= 5 120 # undef POCKETFFT_NO_VECTORS 121 #endif 122 #elif defined(__GNUC__) 123 #if __GNUC__>=5 124 #undef POCKETFFT_NO_VECTORS 125 #endif 126 #endif 127 #endif 128 129 template<typename T> struct VLEN { static constexpr size_t val=1; }; 130 131 #ifndef POCKETFFT_NO_VECTORS 132 #if (defined(__AVX512F__)) 133 template<> struct VLEN<float> { static constexpr size_t val=16; }; 134 template<> struct VLEN<double> { static constexpr size_t val=8; }; 135 #elif (defined(__AVX__)) 136 template<> struct VLEN<float> { static constexpr size_t val=8; }; 137 template<> struct VLEN<double> { static constexpr size_t val=4; }; 138 #elif (defined(__SSE2__)) 139 template<> struct VLEN<float> { static constexpr size_t val=4; }; 140 template<> struct VLEN<double> { static constexpr size_t val=2; }; 141 #elif (defined(__VSX__)) 142 template<> struct VLEN<float> { static constexpr size_t val=4; }; 143 template<> struct VLEN<double> { static constexpr size_t val=2; }; 144 #elif (defined(__ARM_NEON__) || defined(__ARM_NEON)) 145 template<> struct VLEN<float> { static constexpr size_t val=4; }; 146 template<> struct VLEN<double> { static constexpr size_t val=2; }; 147 #else 148 #define POCKETFFT_NO_VECTORS 149 #endif 150 #endif 151 152 #if __cplusplus >= 201703L 153 inline void *aligned_alloc(size_t align, size_t size) 154 { 155 void *ptr = ::aligned_alloc(align,size); 156 if (!ptr) throw std::bad_alloc(); 157 return ptr; 158 } 159 inline void aligned_dealloc(void *ptr) 160 { free(ptr); } 161 #else // portable emulation 162 inline void *aligned_alloc(size_t align, size_t size) 163 { 164 align = std::max(align, alignof(max_align_t)); 165 void *ptr = malloc(size+align); 166 if (!ptr) throw std::bad_alloc(); 167 void *res = reinterpret_cast<void *> 168 ((reinterpret_cast<uintptr_t>(ptr) & ~(uintptr_t(align-1))) + uintptr_t(align)); 169 (reinterpret_cast<void**>(res))[-1] = ptr; 170 return res; 171 } 172 inline void aligned_dealloc(void *ptr) 173 { if (ptr) free((reinterpret_cast<void**>(ptr))[-1]); } 174 #endif 175 176 template<typename T> class arr 177 { 178 private: 179 T *p; 180 size_t sz; 181 182 #if defined(POCKETFFT_NO_VECTORS) 183 static T *ralloc(size_t num) 184 { 185 if (num==0) return nullptr; 186 void *res = malloc(num*sizeof(T)); 187 if (!res) throw std::bad_alloc(); 188 return reinterpret_cast<T *>(res); 189 } 190 static void dealloc(T *ptr) 191 { free(ptr); } 192 #else 193 static T *ralloc(size_t num) 194 { 195 if (num==0) return nullptr; 196 void *ptr = aligned_alloc(64, num*sizeof(T)); 197 return static_cast<T*>(ptr); 198 } 199 static void dealloc(T *ptr) 200 { aligned_dealloc(ptr); } 201 #endif 202 203 public: 204 arr() : p(0), sz(0) {} 205 arr(size_t n) : p(ralloc(n)), sz(n) {} 206 arr(arr &&other) 207 : p(other.p), sz(other.sz) 208 { other.p=nullptr; other.sz=0; } 209 ~arr() { dealloc(p); } 210 211 void resize(size_t n) 212 { 213 if (n==sz) return; 214 dealloc(p); 215 p = ralloc(n); 216 sz = n; 217 } 218 219 T &operator[](size_t idx) { return p[idx]; } 220 const T &operator[](size_t idx) const { return p[idx]; } 221 222 T *data() { return p; } 223 const T *data() const { return p; } 224 225 size_t size() const { return sz; } 226 }; 227 228 template<typename T> struct cmplx { 229 T r, i; 230 cmplx() {} 231 cmplx(T r_, T i_) : r(r_), i(i_) {} 232 void Set(T r_, T i_) { r=r_; i=i_; } 233 void Set(T r_) { r=r_; i=T(0); } 234 cmplx &operator+= (const cmplx &other) 235 { r+=other.r; i+=other.i; return *this; } 236 template<typename T2>cmplx &operator*= (T2 other) 237 { r*=other; i*=other; return *this; } 238 template<typename T2>cmplx &operator*= (const cmplx<T2> &other) 239 { 240 T tmp = r*other.r - i*other.i; 241 i = r*other.i + i*other.r; 242 r = tmp; 243 return *this; 244 } 245 template<typename T2>cmplx &operator+= (const cmplx<T2> &other) 246 { r+=other.r; i+=other.i; return *this; } 247 template<typename T2>cmplx &operator-= (const cmplx<T2> &other) 248 { r-=other.r; i-=other.i; return *this; } 249 template<typename T2> auto operator* (const T2 &other) const 250 -> cmplx<decltype(r*other)> 251 { return {r*other, i*other}; } 252 template<typename T2> auto operator+ (const cmplx<T2> &other) const 253 -> cmplx<decltype(r+other.r)> 254 { return {r+other.r, i+other.i}; } 255 template<typename T2> auto operator- (const cmplx<T2> &other) const 256 -> cmplx<decltype(r+other.r)> 257 { return {r-other.r, i-other.i}; } 258 template<typename T2> auto operator* (const cmplx<T2> &other) const 259 -> cmplx<decltype(r+other.r)> 260 { return {r*other.r-i*other.i, r*other.i + i*other.r}; } 261 template<bool fwd, typename T2> auto special_mul (const cmplx<T2> &other) const 262 -> cmplx<decltype(r+other.r)> 263 { 264 using Tres = cmplx<decltype(r+other.r)>; 265 return fwd ? Tres(r*other.r+i*other.i, i*other.r-r*other.i) 266 : Tres(r*other.r-i*other.i, r*other.i+i*other.r); 267 } 268 }; 269 template<typename T> inline void PM(T &a, T &b, T c, T d) 270 { a=c+d; b=c-d; } 271 template<typename T> inline void PMINPLACE(T &a, T &b) 272 { T t = a; a+=b; b=t-b; } 273 template<typename T> inline void MPINPLACE(T &a, T &b) 274 { T t = a; a-=b; b=t+b; } 275 template<typename T> cmplx<T> conj(const cmplx<T> &a) 276 { return {a.r, -a.i}; } 277 template<bool fwd, typename T, typename T2> void special_mul (const cmplx<T> &v1, const cmplx<T2> &v2, cmplx<T> &res) 278 { 279 res = fwd ? cmplx<T>(v1.r*v2.r+v1.i*v2.i, v1.i*v2.r-v1.r*v2.i) 280 : cmplx<T>(v1.r*v2.r-v1.i*v2.i, v1.r*v2.i+v1.i*v2.r); 281 } 282 283 template<typename T> void ROT90(cmplx<T> &a) 284 { auto tmp_=a.r; a.r=-a.i; a.i=tmp_; } 285 template<bool fwd, typename T> void ROTX90(cmplx<T> &a) 286 { auto tmp_= fwd ? -a.r : a.r; a.r = fwd ? a.i : -a.i; a.i=tmp_; } 287 288 // 289 // twiddle factor section 290 // 291 template<typename T> class sincos_2pibyn 292 { 293 private: 294 using Thigh = typename std::conditional<(sizeof(T)>sizeof(double)), T, double>::type; 295 size_t N, mask, shift; 296 arr<cmplx<Thigh>> v1, v2; 297 298 static cmplx<Thigh> calc(size_t x, size_t n, Thigh ang) 299 { 300 x<<=3; 301 if (x<4*n) // first half 302 { 303 if (x<2*n) // first quadrant 304 { 305 if (x<n) return cmplx<Thigh>(std::cos(Thigh(x)*ang), std::sin(Thigh(x)*ang)); 306 return cmplx<Thigh>(std::sin(Thigh(2*n-x)*ang), std::cos(Thigh(2*n-x)*ang)); 307 } 308 else // second quadrant 309 { 310 x-=2*n; 311 if (x<n) return cmplx<Thigh>(-std::sin(Thigh(x)*ang), std::cos(Thigh(x)*ang)); 312 return cmplx<Thigh>(-std::cos(Thigh(2*n-x)*ang), std::sin(Thigh(2*n-x)*ang)); 313 } 314 } 315 else 316 { 317 x=8*n-x; 318 if (x<2*n) // third quadrant 319 { 320 if (x<n) return cmplx<Thigh>(std::cos(Thigh(x)*ang), -std::sin(Thigh(x)*ang)); 321 return cmplx<Thigh>(std::sin(Thigh(2*n-x)*ang), -std::cos(Thigh(2*n-x)*ang)); 322 } 323 else // fourth quadrant 324 { 325 x-=2*n; 326 if (x<n) return cmplx<Thigh>(-std::sin(Thigh(x)*ang), -std::cos(Thigh(x)*ang)); 327 return cmplx<Thigh>(-std::cos(Thigh(2*n-x)*ang), -std::sin(Thigh(2*n-x)*ang)); 328 } 329 } 330 } 331 332 public: 333 POCKETFFT_NOINLINE sincos_2pibyn(size_t n) 334 : N(n) 335 { 336 constexpr auto pi = 3.141592653589793238462643383279502884197L; 337 Thigh ang = Thigh(0.25L*pi/n); 338 size_t nval = (n+2)/2; 339 shift = 1; 340 while((size_t(1)<<shift)*(size_t(1)<<shift) < nval) ++shift; 341 mask = (size_t(1)<<shift)-1; 342 v1.resize(mask+1); 343 v1[0].Set(Thigh(1), Thigh(0)); 344 for (size_t i=1; i<v1.size(); ++i) 345 v1[i]=calc(i,n,ang); 346 v2.resize((nval+mask)/(mask+1)); 347 v2[0].Set(Thigh(1), Thigh(0)); 348 for (size_t i=1; i<v2.size(); ++i) 349 v2[i]=calc(i*(mask+1),n,ang); 350 } 351 352 cmplx<T> operator[](size_t idx) const 353 { 354 if (2*idx<=N) 355 { 356 auto x1=v1[idx&mask], x2=v2[idx>>shift]; 357 return cmplx<T>(T(x1.r*x2.r-x1.i*x2.i), T(x1.r*x2.i+x1.i*x2.r)); 358 } 359 idx = N-idx; 360 auto x1=v1[idx&mask], x2=v2[idx>>shift]; 361 return cmplx<T>(T(x1.r*x2.r-x1.i*x2.i), -T(x1.r*x2.i+x1.i*x2.r)); 362 } 363 }; 364 365 struct util // hack to avoid duplicate symbols 366 { 367 static POCKETFFT_NOINLINE size_t largest_prime_factor (size_t n) 368 { 369 size_t res=1; 370 while ((n&1)==0) 371 { res=2; n>>=1; } 372 for (size_t x=3; x*x<=n; x+=2) 373 while ((n%x)==0) 374 { res=x; n/=x; } 375 if (n>1) res=n; 376 return res; 377 } 378 379 static POCKETFFT_NOINLINE double cost_guess (size_t n) 380 { 381 constexpr double lfp=1.1; // penalty for non-hardcoded larger factors 382 size_t ni=n; 383 double result=0.; 384 while ((n&1)==0) 385 { result+=2; n>>=1; } 386 for (size_t x=3; x*x<=n; x+=2) 387 while ((n%x)==0) 388 { 389 result+= (x<=5) ? double(x) : lfp*double(x); // penalize larger prime factors 390 n/=x; 391 } 392 if (n>1) result+=(n<=5) ? double(n) : lfp*double(n); 393 return result*double(ni); 394 } 395 396 /* returns the smallest composite of 2, 3, 5, 7 and 11 which is >= n */ 397 static POCKETFFT_NOINLINE size_t good_size_cmplx(size_t n) 398 { 399 if (n<=12) return n; 400 401 size_t bestfac=2*n; 402 for (size_t f11=1; f11<bestfac; f11*=11) 403 for (size_t f117=f11; f117<bestfac; f117*=7) 404 for (size_t f1175=f117; f1175<bestfac; f1175*=5) 405 { 406 size_t x=f1175; 407 while (x<n) x*=2; 408 for (;;) 409 { 410 if (x<n) 411 x*=3; 412 else if (x>n) 413 { 414 if (x<bestfac) bestfac=x; 415 if (x&1) break; 416 x>>=1; 417 } 418 else 419 return n; 420 } 421 } 422 return bestfac; 423 } 424 425 /* returns the smallest composite of 2, 3, 5 which is >= n */ 426 static POCKETFFT_NOINLINE size_t good_size_real(size_t n) 427 { 428 if (n<=6) return n; 429 430 size_t bestfac=2*n; 431 for (size_t f5=1; f5<bestfac; f5*=5) 432 { 433 size_t x = f5; 434 while (x<n) x *= 2; 435 for (;;) 436 { 437 if (x<n) 438 x*=3; 439 else if (x>n) 440 { 441 if (x<bestfac) bestfac=x; 442 if (x&1) break; 443 x>>=1; 444 } 445 else 446 return n; 447 } 448 } 449 return bestfac; 450 } 451 452 static size_t prod(const shape_t &shape) 453 { 454 size_t res=1; 455 for (auto sz: shape) 456 res*=sz; 457 return res; 458 } 459 460 static POCKETFFT_NOINLINE void sanity_check(const shape_t &shape, 461 const stride_t &stride_in, const stride_t &stride_out, bool inplace) 462 { 463 auto ndim = shape.size(); 464 if (ndim<1) throw std::runtime_error("ndim must be >= 1"); 465 if ((stride_in.size()!=ndim) || (stride_out.size()!=ndim)) 466 throw std::runtime_error("stride dimension mismatch"); 467 if (inplace && (stride_in!=stride_out)) 468 throw std::runtime_error("stride mismatch"); 469 } 470 471 static POCKETFFT_NOINLINE void sanity_check(const shape_t &shape, 472 const stride_t &stride_in, const stride_t &stride_out, bool inplace, 473 const shape_t &axes) 474 { 475 sanity_check(shape, stride_in, stride_out, inplace); 476 auto ndim = shape.size(); 477 shape_t tmp(ndim,0); 478 for (auto ax : axes) 479 { 480 if (ax>=ndim) throw std::invalid_argument("bad axis number"); 481 if (++tmp[ax]>1) throw std::invalid_argument("axis specified repeatedly"); 482 } 483 } 484 485 static POCKETFFT_NOINLINE void sanity_check(const shape_t &shape, 486 const stride_t &stride_in, const stride_t &stride_out, bool inplace, 487 size_t axis) 488 { 489 sanity_check(shape, stride_in, stride_out, inplace); 490 if (axis>=shape.size()) throw std::invalid_argument("bad axis number"); 491 } 492 493 #ifdef POCKETFFT_NO_MULTITHREADING 494 static size_t thread_count (size_t /*nthreads*/, const shape_t &/*shape*/, 495 size_t /*axis*/, size_t /*vlen*/) 496 { return 1; } 497 #else 498 static size_t thread_count (size_t nthreads, const shape_t &shape, 499 size_t axis, size_t vlen) 500 { 501 if (nthreads==1) return 1; 502 size_t size = prod(shape); 503 size_t parallel = size / (shape[axis] * vlen); 504 if (shape[axis] < 1000) 505 parallel /= 4; 506 size_t max_threads = nthreads == 0 ? 507 std::thread::hardware_concurrency() : nthreads; 508 return std::max(size_t(1), std::min(parallel, max_threads)); 509 } 510 #endif 511 }; 512 513 namespace threading { 514 515 #ifdef POCKETFFT_NO_MULTITHREADING 516 517 constexpr inline size_t thread_id() { return 0; } 518 constexpr inline size_t num_threads() { return 1; } 519 520 template <typename Func> 521 void thread_map(size_t /* nthreads */, Func f) 522 { f(); } 523 524 #else 525 526 inline size_t &thread_id() 527 { 528 static thread_local size_t thread_id_=0; 529 return thread_id_; 530 } 531 inline size_t &num_threads() 532 { 533 static thread_local size_t num_threads_=1; 534 return num_threads_; 535 } 536 static const size_t max_threads = std::max(1u, std::thread::hardware_concurrency()); 537 538 class latch 539 { 540 std::atomic<size_t> num_left_; 541 std::mutex mut_; 542 std::condition_variable completed_; 543 using lock_t = std::unique_lock<std::mutex>; 544 545 public: 546 latch(size_t n): num_left_(n) {} 547 548 void count_down() 549 { 550 lock_t lock(mut_); 551 if (--num_left_) 552 return; 553 completed_.notify_all(); 554 } 555 556 void wait() 557 { 558 lock_t lock(mut_); 559 completed_.wait(lock, [this]{ return is_ready(); }); 560 } 561 bool is_ready() { return num_left_ == 0; } 562 }; 563 564 template <typename T> class concurrent_queue 565 { 566 std::queue<T> q_; 567 std::mutex mut_; 568 std::atomic<size_t> size_; 569 using lock_t = std::lock_guard<std::mutex>; 570 571 public: 572 573 void push(T val) 574 { 575 lock_t lock(mut_); 576 ++size_; 577 q_.push(std::move(val)); 578 } 579 580 bool try_pop(T &val) 581 { 582 if (size_ == 0) return false; 583 lock_t lock(mut_); 584 // Queue might have been emptied while we acquired the lock 585 if (q_.empty()) return false; 586 587 val = std::move(q_.front()); 588 --size_; 589 q_.pop(); 590 return true; 591 } 592 593 bool empty() const { return size_==0; } 594 }; 595 596 // C++ allocator with support for over-aligned types 597 template <typename T> struct aligned_allocator 598 { 599 using value_type = T; 600 template <class U> 601 aligned_allocator(const aligned_allocator<U>&) {} 602 aligned_allocator() = default; 603 604 T *allocate(size_t n) 605 { 606 void* mem = aligned_alloc(alignof(T), n*sizeof(T)); 607 return static_cast<T*>(mem); 608 } 609 610 void deallocate(T *p, size_t /*n*/) 611 { aligned_dealloc(p); } 612 }; 613 614 class thread_pool 615 { 616 // A reasonable guess, probably close enough for most hardware 617 static constexpr size_t cache_line_size = 64; 618 struct alignas(cache_line_size) worker 619 { 620 std::thread thread; 621 std::condition_variable work_ready; 622 std::mutex mut; 623 std::atomic_flag busy_flag = ATOMIC_FLAG_INIT; 624 std::function<void()> work; 625 626 void worker_main( 627 std::atomic<bool> &shutdown_flag, 628 std::atomic<size_t> &unscheduled_tasks, 629 concurrent_queue<std::function<void()>> &overflow_work) 630 { 631 using lock_t = std::unique_lock<std::mutex>; 632 bool expect_work = true; 633 while (!shutdown_flag || expect_work) 634 { 635 std::function<void()> local_work; 636 if (expect_work || unscheduled_tasks == 0) 637 { 638 lock_t lock(mut); 639 // Wait until there is work to be executed 640 work_ready.wait(lock, [&]{ return (work || shutdown_flag); }); 641 local_work.swap(work); 642 expect_work = false; 643 } 644 645 bool marked_busy = false; 646 if (local_work) 647 { 648 marked_busy = true; 649 local_work(); 650 } 651 652 if (!overflow_work.empty()) 653 { 654 if (!marked_busy && busy_flag.test_and_set()) 655 { 656 expect_work = true; 657 continue; 658 } 659 marked_busy = true; 660 661 while (overflow_work.try_pop(local_work)) 662 { 663 --unscheduled_tasks; 664 local_work(); 665 } 666 } 667 668 if (marked_busy) busy_flag.clear(); 669 } 670 } 671 }; 672 673 concurrent_queue<std::function<void()>> overflow_work_; 674 std::mutex mut_; 675 std::vector<worker, aligned_allocator<worker>> workers_; 676 std::atomic<bool> shutdown_; 677 std::atomic<size_t> unscheduled_tasks_; 678 using lock_t = std::lock_guard<std::mutex>; 679 680 void create_threads() 681 { 682 lock_t lock(mut_); 683 size_t nthreads=workers_.size(); 684 for (size_t i=0; i<nthreads; ++i) 685 { 686 try 687 { 688 auto *worker = &workers_[i]; 689 worker->busy_flag.clear(); 690 worker->work = nullptr; 691 worker->thread = std::thread([worker, this] 692 { 693 worker->worker_main(shutdown_, unscheduled_tasks_, overflow_work_); 694 }); 695 } 696 catch (...) 697 { 698 shutdown_locked(); 699 throw; 700 } 701 } 702 } 703 704 void shutdown_locked() 705 { 706 shutdown_ = true; 707 for (auto &worker : workers_) 708 worker.work_ready.notify_all(); 709 710 for (auto &worker : workers_) 711 if (worker.thread.joinable()) 712 worker.thread.join(); 713 } 714 715 public: 716 explicit thread_pool(size_t nthreads): 717 workers_(nthreads) 718 { create_threads(); } 719 720 thread_pool(): thread_pool(max_threads) {} 721 722 ~thread_pool() { shutdown(); } 723 724 void submit(std::function<void()> work) 725 { 726 lock_t lock(mut_); 727 if (shutdown_) 728 throw std::runtime_error("Work item submitted after shutdown"); 729 730 ++unscheduled_tasks_; 731 732 // First check for any idle workers and wake those 733 for (auto &worker : workers_) 734 if (!worker.busy_flag.test_and_set()) 735 { 736 --unscheduled_tasks_; 737 { 738 lock_t lock(worker.mut); 739 worker.work = std::move(work); 740 } 741 worker.work_ready.notify_one(); 742 return; 743 } 744 745 // If no workers were idle, push onto the overflow queue for later 746 overflow_work_.push(std::move(work)); 747 } 748 749 void shutdown() 750 { 751 lock_t lock(mut_); 752 shutdown_locked(); 753 } 754 755 void restart() 756 { 757 shutdown_ = false; 758 create_threads(); 759 } 760 }; 761 762 inline thread_pool & get_pool() 763 { 764 static thread_pool pool; 765 #ifdef POCKETFFT_PTHREADS 766 static std::once_flag f; 767 std::call_once(f, 768 []{ 769 pthread_atfork( 770 +[]{ get_pool().shutdown(); }, // prepare 771 +[]{ get_pool().restart(); }, // parent 772 +[]{ get_pool().restart(); } // child 773 ); 774 }); 775 #endif 776 777 return pool; 778 } 779 780 /** Map a function f over nthreads */ 781 template <typename Func> 782 void thread_map(size_t nthreads, Func f) 783 { 784 if (nthreads == 0) 785 nthreads = max_threads; 786 787 if (nthreads == 1) 788 { f(); return; } 789 790 auto & pool = get_pool(); 791 latch counter(nthreads); 792 std::exception_ptr ex; 793 std::mutex ex_mut; 794 for (size_t i=0; i<nthreads; ++i) 795 { 796 pool.submit( 797 [&f, &counter, &ex, &ex_mut, i, nthreads] { 798 thread_id() = i; 799 num_threads() = nthreads; 800 try { f(); } 801 catch (...) 802 { 803 std::lock_guard<std::mutex> lock(ex_mut); 804 ex = std::current_exception(); 805 } 806 counter.count_down(); 807 }); 808 } 809 counter.wait(); 810 if (ex) 811 std::rethrow_exception(ex); 812 } 813 814 #endif 815 816 } 817 818 // 819 // complex FFTPACK transforms 820 // 821 822 template<typename T0> class cfftp 823 { 824 private: 825 struct fctdata 826 { 827 size_t fct; 828 cmplx<T0> *tw, *tws; 829 }; 830 831 size_t length; 832 arr<cmplx<T0>> mem; 833 std::vector<fctdata> fact; 834 835 void add_factor(size_t factor) 836 { fact.push_back({factor, nullptr, nullptr}); } 837 838 template<bool fwd, typename T> void pass2 (size_t ido, size_t l1, 839 const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch, 840 const cmplx<T0> * POCKETFFT_RESTRICT wa) const 841 { 842 auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T& 843 { return ch[a+ido*(b+l1*c)]; }; 844 auto CC = [cc,ido](size_t a, size_t b, size_t c) -> const T& 845 { return cc[a+ido*(b+2*c)]; }; 846 auto WA = [wa, ido](size_t x, size_t i) 847 { return wa[i-1+x*(ido-1)]; }; 848 849 if (ido==1) 850 for (size_t k=0; k<l1; ++k) 851 { 852 CH(0,k,0) = CC(0,0,k)+CC(0,1,k); 853 CH(0,k,1) = CC(0,0,k)-CC(0,1,k); 854 } 855 else 856 for (size_t k=0; k<l1; ++k) 857 { 858 CH(0,k,0) = CC(0,0,k)+CC(0,1,k); 859 CH(0,k,1) = CC(0,0,k)-CC(0,1,k); 860 for (size_t i=1; i<ido; ++i) 861 { 862 CH(i,k,0) = CC(i,0,k)+CC(i,1,k); 863 special_mul<fwd>(CC(i,0,k)-CC(i,1,k),WA(0,i),CH(i,k,1)); 864 } 865 } 866 } 867 868 #define POCKETFFT_PREP3(idx) \ 869 T t0 = CC(idx,0,k), t1, t2; \ 870 PM (t1,t2,CC(idx,1,k),CC(idx,2,k)); \ 871 CH(idx,k,0)=t0+t1; 872 #define POCKETFFT_PARTSTEP3a(u1,u2,twr,twi) \ 873 { \ 874 T ca=t0+t1*twr; \ 875 T cb{-t2.i*twi, t2.r*twi}; \ 876 PM(CH(0,k,u1),CH(0,k,u2),ca,cb) ;\ 877 } 878 #define POCKETFFT_PARTSTEP3b(u1,u2,twr,twi) \ 879 { \ 880 T ca=t0+t1*twr; \ 881 T cb{-t2.i*twi, t2.r*twi}; \ 882 special_mul<fwd>(ca+cb,WA(u1-1,i),CH(i,k,u1)); \ 883 special_mul<fwd>(ca-cb,WA(u2-1,i),CH(i,k,u2)); \ 884 } 885 template<bool fwd, typename T> void pass3 (size_t ido, size_t l1, 886 const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch, 887 const cmplx<T0> * POCKETFFT_RESTRICT wa) const 888 { 889 constexpr T0 tw1r=-0.5, 890 tw1i= (fwd ? -1: 1) * T0(0.8660254037844386467637231707529362L); 891 892 auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T& 893 { return ch[a+ido*(b+l1*c)]; }; 894 auto CC = [cc,ido](size_t a, size_t b, size_t c) -> const T& 895 { return cc[a+ido*(b+3*c)]; }; 896 auto WA = [wa, ido](size_t x, size_t i) 897 { return wa[i-1+x*(ido-1)]; }; 898 899 if (ido==1) 900 for (size_t k=0; k<l1; ++k) 901 { 902 POCKETFFT_PREP3(0) 903 POCKETFFT_PARTSTEP3a(1,2,tw1r,tw1i) 904 } 905 else 906 for (size_t k=0; k<l1; ++k) 907 { 908 { 909 POCKETFFT_PREP3(0) 910 POCKETFFT_PARTSTEP3a(1,2,tw1r,tw1i) 911 } 912 for (size_t i=1; i<ido; ++i) 913 { 914 POCKETFFT_PREP3(i) 915 POCKETFFT_PARTSTEP3b(1,2,tw1r,tw1i) 916 } 917 } 918 } 919 920 #undef POCKETFFT_PARTSTEP3b 921 #undef POCKETFFT_PARTSTEP3a 922 #undef POCKETFFT_PREP3 923 924 template<bool fwd, typename T> void pass4 (size_t ido, size_t l1, 925 const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch, 926 const cmplx<T0> * POCKETFFT_RESTRICT wa) const 927 { 928 auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T& 929 { return ch[a+ido*(b+l1*c)]; }; 930 auto CC = [cc,ido](size_t a, size_t b, size_t c) -> const T& 931 { return cc[a+ido*(b+4*c)]; }; 932 auto WA = [wa, ido](size_t x, size_t i) 933 { return wa[i-1+x*(ido-1)]; }; 934 935 if (ido==1) 936 for (size_t k=0; k<l1; ++k) 937 { 938 T t1, t2, t3, t4; 939 PM(t2,t1,CC(0,0,k),CC(0,2,k)); 940 PM(t3,t4,CC(0,1,k),CC(0,3,k)); 941 ROTX90<fwd>(t4); 942 PM(CH(0,k,0),CH(0,k,2),t2,t3); 943 PM(CH(0,k,1),CH(0,k,3),t1,t4); 944 } 945 else 946 for (size_t k=0; k<l1; ++k) 947 { 948 { 949 T t1, t2, t3, t4; 950 PM(t2,t1,CC(0,0,k),CC(0,2,k)); 951 PM(t3,t4,CC(0,1,k),CC(0,3,k)); 952 ROTX90<fwd>(t4); 953 PM(CH(0,k,0),CH(0,k,2),t2,t3); 954 PM(CH(0,k,1),CH(0,k,3),t1,t4); 955 } 956 for (size_t i=1; i<ido; ++i) 957 { 958 T t1, t2, t3, t4; 959 T cc0=CC(i,0,k), cc1=CC(i,1,k),cc2=CC(i,2,k),cc3=CC(i,3,k); 960 PM(t2,t1,cc0,cc2); 961 PM(t3,t4,cc1,cc3); 962 ROTX90<fwd>(t4); 963 CH(i,k,0) = t2+t3; 964 special_mul<fwd>(t1+t4,WA(0,i),CH(i,k,1)); 965 special_mul<fwd>(t2-t3,WA(1,i),CH(i,k,2)); 966 special_mul<fwd>(t1-t4,WA(2,i),CH(i,k,3)); 967 } 968 } 969 } 970 971 #define POCKETFFT_PREP5(idx) \ 972 T t0 = CC(idx,0,k), t1, t2, t3, t4; \ 973 PM (t1,t4,CC(idx,1,k),CC(idx,4,k)); \ 974 PM (t2,t3,CC(idx,2,k),CC(idx,3,k)); \ 975 CH(idx,k,0).r=t0.r+t1.r+t2.r; \ 976 CH(idx,k,0).i=t0.i+t1.i+t2.i; 977 978 #define POCKETFFT_PARTSTEP5a(u1,u2,twar,twbr,twai,twbi) \ 979 { \ 980 T ca,cb; \ 981 ca.r=t0.r+twar*t1.r+twbr*t2.r; \ 982 ca.i=t0.i+twar*t1.i+twbr*t2.i; \ 983 cb.i=twai*t4.r twbi*t3.r; \ 984 cb.r=-(twai*t4.i twbi*t3.i); \ 985 PM(CH(0,k,u1),CH(0,k,u2),ca,cb); \ 986 } 987 988 #define POCKETFFT_PARTSTEP5b(u1,u2,twar,twbr,twai,twbi) \ 989 { \ 990 T ca,cb,da,db; \ 991 ca.r=t0.r+twar*t1.r+twbr*t2.r; \ 992 ca.i=t0.i+twar*t1.i+twbr*t2.i; \ 993 cb.i=twai*t4.r twbi*t3.r; \ 994 cb.r=-(twai*t4.i twbi*t3.i); \ 995 special_mul<fwd>(ca+cb,WA(u1-1,i),CH(i,k,u1)); \ 996 special_mul<fwd>(ca-cb,WA(u2-1,i),CH(i,k,u2)); \ 997 } 998 template<bool fwd, typename T> void pass5 (size_t ido, size_t l1, 999 const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch, 1000 const cmplx<T0> * POCKETFFT_RESTRICT wa) const 1001 { 1002 constexpr T0 tw1r= T0(0.3090169943749474241022934171828191L), 1003 tw1i= (fwd ? -1: 1) * T0(0.9510565162951535721164393333793821L), 1004 tw2r= T0(-0.8090169943749474241022934171828191L), 1005 tw2i= (fwd ? -1: 1) * T0(0.5877852522924731291687059546390728L); 1006 1007 auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T& 1008 { return ch[a+ido*(b+l1*c)]; }; 1009 auto CC = [cc,ido](size_t a, size_t b, size_t c) -> const T& 1010 { return cc[a+ido*(b+5*c)]; }; 1011 auto WA = [wa, ido](size_t x, size_t i) 1012 { return wa[i-1+x*(ido-1)]; }; 1013 1014 if (ido==1) 1015 for (size_t k=0; k<l1; ++k) 1016 { 1017 POCKETFFT_PREP5(0) 1018 POCKETFFT_PARTSTEP5a(1,4,tw1r,tw2r,+tw1i,+tw2i) 1019 POCKETFFT_PARTSTEP5a(2,3,tw2r,tw1r,+tw2i,-tw1i) 1020 } 1021 else 1022 for (size_t k=0; k<l1; ++k) 1023 { 1024 { 1025 POCKETFFT_PREP5(0) 1026 POCKETFFT_PARTSTEP5a(1,4,tw1r,tw2r,+tw1i,+tw2i) 1027 POCKETFFT_PARTSTEP5a(2,3,tw2r,tw1r,+tw2i,-tw1i) 1028 } 1029 for (size_t i=1; i<ido; ++i) 1030 { 1031 POCKETFFT_PREP5(i) 1032 POCKETFFT_PARTSTEP5b(1,4,tw1r,tw2r,+tw1i,+tw2i) 1033 POCKETFFT_PARTSTEP5b(2,3,tw2r,tw1r,+tw2i,-tw1i) 1034 } 1035 } 1036 } 1037 1038 #undef POCKETFFT_PARTSTEP5b 1039 #undef POCKETFFT_PARTSTEP5a 1040 #undef POCKETFFT_PREP5 1041 1042 #define POCKETFFT_PREP7(idx) \ 1043 T t1 = CC(idx,0,k), t2, t3, t4, t5, t6, t7; \ 1044 PM (t2,t7,CC(idx,1,k),CC(idx,6,k)); \ 1045 PM (t3,t6,CC(idx,2,k),CC(idx,5,k)); \ 1046 PM (t4,t5,CC(idx,3,k),CC(idx,4,k)); \ 1047 CH(idx,k,0).r=t1.r+t2.r+t3.r+t4.r; \ 1048 CH(idx,k,0).i=t1.i+t2.i+t3.i+t4.i; 1049 1050 #define POCKETFFT_PARTSTEP7a0(u1,u2,x1,x2,x3,y1,y2,y3,out1,out2) \ 1051 { \ 1052 T ca,cb; \ 1053 ca.r=t1.r+x1*t2.r+x2*t3.r+x3*t4.r; \ 1054 ca.i=t1.i+x1*t2.i+x2*t3.i+x3*t4.i; \ 1055 cb.i=y1*t7.r y2*t6.r y3*t5.r; \ 1056 cb.r=-(y1*t7.i y2*t6.i y3*t5.i); \ 1057 PM(out1,out2,ca,cb); \ 1058 } 1059 #define POCKETFFT_PARTSTEP7a(u1,u2,x1,x2,x3,y1,y2,y3) \ 1060 POCKETFFT_PARTSTEP7a0(u1,u2,x1,x2,x3,y1,y2,y3,CH(0,k,u1),CH(0,k,u2)) 1061 #define POCKETFFT_PARTSTEP7(u1,u2,x1,x2,x3,y1,y2,y3) \ 1062 { \ 1063 T da,db; \ 1064 POCKETFFT_PARTSTEP7a0(u1,u2,x1,x2,x3,y1,y2,y3,da,db) \ 1065 special_mul<fwd>(da,WA(u1-1,i),CH(i,k,u1)); \ 1066 special_mul<fwd>(db,WA(u2-1,i),CH(i,k,u2)); \ 1067 } 1068 1069 template<bool fwd, typename T> void pass7(size_t ido, size_t l1, 1070 const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch, 1071 const cmplx<T0> * POCKETFFT_RESTRICT wa) const 1072 { 1073 constexpr T0 tw1r= T0(0.6234898018587335305250048840042398L), 1074 tw1i= (fwd ? -1 : 1) * T0(0.7818314824680298087084445266740578L), 1075 tw2r= T0(-0.2225209339563144042889025644967948L), 1076 tw2i= (fwd ? -1 : 1) * T0(0.9749279121818236070181316829939312L), 1077 tw3r= T0(-0.9009688679024191262361023195074451L), 1078 tw3i= (fwd ? -1 : 1) * T0(0.433883739117558120475768332848359L); 1079 1080 auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T& 1081 { return ch[a+ido*(b+l1*c)]; }; 1082 auto CC = [cc,ido](size_t a, size_t b, size_t c) -> const T& 1083 { return cc[a+ido*(b+7*c)]; }; 1084 auto WA = [wa, ido](size_t x, size_t i) 1085 { return wa[i-1+x*(ido-1)]; }; 1086 1087 if (ido==1) 1088 for (size_t k=0; k<l1; ++k) 1089 { 1090 POCKETFFT_PREP7(0) 1091 POCKETFFT_PARTSTEP7a(1,6,tw1r,tw2r,tw3r,+tw1i,+tw2i,+tw3i) 1092 POCKETFFT_PARTSTEP7a(2,5,tw2r,tw3r,tw1r,+tw2i,-tw3i,-tw1i) 1093 POCKETFFT_PARTSTEP7a(3,4,tw3r,tw1r,tw2r,+tw3i,-tw1i,+tw2i) 1094 } 1095 else 1096 for (size_t k=0; k<l1; ++k) 1097 { 1098 { 1099 POCKETFFT_PREP7(0) 1100 POCKETFFT_PARTSTEP7a(1,6,tw1r,tw2r,tw3r,+tw1i,+tw2i,+tw3i) 1101 POCKETFFT_PARTSTEP7a(2,5,tw2r,tw3r,tw1r,+tw2i,-tw3i,-tw1i) 1102 POCKETFFT_PARTSTEP7a(3,4,tw3r,tw1r,tw2r,+tw3i,-tw1i,+tw2i) 1103 } 1104 for (size_t i=1; i<ido; ++i) 1105 { 1106 POCKETFFT_PREP7(i) 1107 POCKETFFT_PARTSTEP7(1,6,tw1r,tw2r,tw3r,+tw1i,+tw2i,+tw3i) 1108 POCKETFFT_PARTSTEP7(2,5,tw2r,tw3r,tw1r,+tw2i,-tw3i,-tw1i) 1109 POCKETFFT_PARTSTEP7(3,4,tw3r,tw1r,tw2r,+tw3i,-tw1i,+tw2i) 1110 } 1111 } 1112 } 1113 1114 #undef POCKETFFT_PARTSTEP7 1115 #undef POCKETFFT_PARTSTEP7a0 1116 #undef POCKETFFT_PARTSTEP7a 1117 #undef POCKETFFT_PREP7 1118 1119 template <bool fwd, typename T> void ROTX45(T &a) const 1120 { 1121 constexpr T0 hsqt2=T0(0.707106781186547524400844362104849L); 1122 if (fwd) 1123 { auto tmp_=a.r; a.r=hsqt2*(a.r+a.i); a.i=hsqt2*(a.i-tmp_); } 1124 else 1125 { auto tmp_=a.r; a.r=hsqt2*(a.r-a.i); a.i=hsqt2*(a.i+tmp_); } 1126 } 1127 template <bool fwd, typename T> void ROTX135(T &a) const 1128 { 1129 constexpr T0 hsqt2=T0(0.707106781186547524400844362104849L); 1130 if (fwd) 1131 { auto tmp_=a.r; a.r=hsqt2*(a.i-a.r); a.i=hsqt2*(-tmp_-a.i); } 1132 else 1133 { auto tmp_=a.r; a.r=hsqt2*(-a.r-a.i); a.i=hsqt2*(tmp_-a.i); } 1134 } 1135 1136 template<bool fwd, typename T> void pass8 (size_t ido, size_t l1, 1137 const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch, 1138 const cmplx<T0> * POCKETFFT_RESTRICT wa) const 1139 { 1140 auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T& 1141 { return ch[a+ido*(b+l1*c)]; }; 1142 auto CC = [cc,ido](size_t a, size_t b, size_t c) -> const T& 1143 { return cc[a+ido*(b+8*c)]; }; 1144 auto WA = [wa, ido](size_t x, size_t i) 1145 { return wa[i-1+x*(ido-1)]; }; 1146 1147 if (ido==1) 1148 for (size_t k=0; k<l1; ++k) 1149 { 1150 T a0, a1, a2, a3, a4, a5, a6, a7; 1151 PM(a1,a5,CC(0,1,k),CC(0,5,k)); 1152 PM(a3,a7,CC(0,3,k),CC(0,7,k)); 1153 PMINPLACE(a1,a3); 1154 ROTX90<fwd>(a3); 1155 1156 ROTX90<fwd>(a7); 1157 PMINPLACE(a5,a7); 1158 ROTX45<fwd>(a5); 1159 ROTX135<fwd>(a7); 1160 1161 PM(a0,a4,CC(0,0,k),CC(0,4,k)); 1162 PM(a2,a6,CC(0,2,k),CC(0,6,k)); 1163 PM(CH(0,k,0),CH(0,k,4),a0+a2,a1); 1164 PM(CH(0,k,2),CH(0,k,6),a0-a2,a3); 1165 ROTX90<fwd>(a6); 1166 PM(CH(0,k,1),CH(0,k,5),a4+a6,a5); 1167 PM(CH(0,k,3),CH(0,k,7),a4-a6,a7); 1168 } 1169 else 1170 for (size_t k=0; k<l1; ++k) 1171 { 1172 { 1173 T a0, a1, a2, a3, a4, a5, a6, a7; 1174 PM(a1,a5,CC(0,1,k),CC(0,5,k)); 1175 PM(a3,a7,CC(0,3,k),CC(0,7,k)); 1176 PMINPLACE(a1,a3); 1177 ROTX90<fwd>(a3); 1178 1179 ROTX90<fwd>(a7); 1180 PMINPLACE(a5,a7); 1181 ROTX45<fwd>(a5); 1182 ROTX135<fwd>(a7); 1183 1184 PM(a0,a4,CC(0,0,k),CC(0,4,k)); 1185 PM(a2,a6,CC(0,2,k),CC(0,6,k)); 1186 PM(CH(0,k,0),CH(0,k,4),a0+a2,a1); 1187 PM(CH(0,k,2),CH(0,k,6),a0-a2,a3); 1188 ROTX90<fwd>(a6); 1189 PM(CH(0,k,1),CH(0,k,5),a4+a6,a5); 1190 PM(CH(0,k,3),CH(0,k,7),a4-a6,a7); 1191 } 1192 for (size_t i=1; i<ido; ++i) 1193 { 1194 T a0, a1, a2, a3, a4, a5, a6, a7; 1195 PM(a1,a5,CC(i,1,k),CC(i,5,k)); 1196 PM(a3,a7,CC(i,3,k),CC(i,7,k)); 1197 ROTX90<fwd>(a7); 1198 PMINPLACE(a1,a3); 1199 ROTX90<fwd>(a3); 1200 PMINPLACE(a5,a7); 1201 ROTX45<fwd>(a5); 1202 ROTX135<fwd>(a7); 1203 PM(a0,a4,CC(i,0,k),CC(i,4,k)); 1204 PM(a2,a6,CC(i,2,k),CC(i,6,k)); 1205 PMINPLACE(a0,a2); 1206 CH(i,k,0) = a0+a1; 1207 special_mul<fwd>(a0-a1,WA(3,i),CH(i,k,4)); 1208 special_mul<fwd>(a2+a3,WA(1,i),CH(i,k,2)); 1209 special_mul<fwd>(a2-a3,WA(5,i),CH(i,k,6)); 1210 ROTX90<fwd>(a6); 1211 PMINPLACE(a4,a6); 1212 special_mul<fwd>(a4+a5,WA(0,i),CH(i,k,1)); 1213 special_mul<fwd>(a4-a5,WA(4,i),CH(i,k,5)); 1214 special_mul<fwd>(a6+a7,WA(2,i),CH(i,k,3)); 1215 special_mul<fwd>(a6-a7,WA(6,i),CH(i,k,7)); 1216 } 1217 } 1218 } 1219 1220 1221 #define POCKETFFT_PREP11(idx) \ 1222 T t1 = CC(idx,0,k), t2, t3, t4, t5, t6, t7, t8, t9, t10, t11; \ 1223 PM (t2,t11,CC(idx,1,k),CC(idx,10,k)); \ 1224 PM (t3,t10,CC(idx,2,k),CC(idx, 9,k)); \ 1225 PM (t4,t9 ,CC(idx,3,k),CC(idx, 8,k)); \ 1226 PM (t5,t8 ,CC(idx,4,k),CC(idx, 7,k)); \ 1227 PM (t6,t7 ,CC(idx,5,k),CC(idx, 6,k)); \ 1228 CH(idx,k,0).r=t1.r+t2.r+t3.r+t4.r+t5.r+t6.r; \ 1229 CH(idx,k,0).i=t1.i+t2.i+t3.i+t4.i+t5.i+t6.i; 1230 1231 #define POCKETFFT_PARTSTEP11a0(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5,out1,out2) \ 1232 { \ 1233 T ca = t1 + t2*x1 + t3*x2 + t4*x3 + t5*x4 +t6*x5, \ 1234 cb; \ 1235 cb.i=y1*t11.r y2*t10.r y3*t9.r y4*t8.r y5*t7.r; \ 1236 cb.r=-(y1*t11.i y2*t10.i y3*t9.i y4*t8.i y5*t7.i ); \ 1237 PM(out1,out2,ca,cb); \ 1238 } 1239 #define POCKETFFT_PARTSTEP11a(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5) \ 1240 POCKETFFT_PARTSTEP11a0(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5,CH(0,k,u1),CH(0,k,u2)) 1241 #define POCKETFFT_PARTSTEP11(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5) \ 1242 { \ 1243 T da,db; \ 1244 POCKETFFT_PARTSTEP11a0(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5,da,db) \ 1245 special_mul<fwd>(da,WA(u1-1,i),CH(i,k,u1)); \ 1246 special_mul<fwd>(db,WA(u2-1,i),CH(i,k,u2)); \ 1247 } 1248 1249 template<bool fwd, typename T> void pass11 (size_t ido, size_t l1, 1250 const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch, 1251 const cmplx<T0> * POCKETFFT_RESTRICT wa) const 1252 { 1253 constexpr T0 tw1r= T0(0.8412535328311811688618116489193677L), 1254 tw1i= (fwd ? -1 : 1) * T0(0.5406408174555975821076359543186917L), 1255 tw2r= T0(0.4154150130018864255292741492296232L), 1256 tw2i= (fwd ? -1 : 1) * T0(0.9096319953545183714117153830790285L), 1257 tw3r= T0(-0.1423148382732851404437926686163697L), 1258 tw3i= (fwd ? -1 : 1) * T0(0.9898214418809327323760920377767188L), 1259 tw4r= T0(-0.6548607339452850640569250724662936L), 1260 tw4i= (fwd ? -1 : 1) * T0(0.7557495743542582837740358439723444L), 1261 tw5r= T0(-0.9594929736144973898903680570663277L), 1262 tw5i= (fwd ? -1 : 1) * T0(0.2817325568414296977114179153466169L); 1263 1264 auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T& 1265 { return ch[a+ido*(b+l1*c)]; }; 1266 auto CC = [cc,ido](size_t a, size_t b, size_t c) -> const T& 1267 { return cc[a+ido*(b+11*c)]; }; 1268 auto WA = [wa, ido](size_t x, size_t i) 1269 { return wa[i-1+x*(ido-1)]; }; 1270 1271 if (ido==1) 1272 for (size_t k=0; k<l1; ++k) 1273 { 1274 POCKETFFT_PREP11(0) 1275 POCKETFFT_PARTSTEP11a(1,10,tw1r,tw2r,tw3r,tw4r,tw5r,+tw1i,+tw2i,+tw3i,+tw4i,+tw5i) 1276 POCKETFFT_PARTSTEP11a(2, 9,tw2r,tw4r,tw5r,tw3r,tw1r,+tw2i,+tw4i,-tw5i,-tw3i,-tw1i) 1277 POCKETFFT_PARTSTEP11a(3, 8,tw3r,tw5r,tw2r,tw1r,tw4r,+tw3i,-tw5i,-tw2i,+tw1i,+tw4i) 1278 POCKETFFT_PARTSTEP11a(4, 7,tw4r,tw3r,tw1r,tw5r,tw2r,+tw4i,-tw3i,+tw1i,+tw5i,-tw2i) 1279 POCKETFFT_PARTSTEP11a(5, 6,tw5r,tw1r,tw4r,tw2r,tw3r,+tw5i,-tw1i,+tw4i,-tw2i,+tw3i) 1280 } 1281 else 1282 for (size_t k=0; k<l1; ++k) 1283 { 1284 { 1285 POCKETFFT_PREP11(0) 1286 POCKETFFT_PARTSTEP11a(1,10,tw1r,tw2r,tw3r,tw4r,tw5r,+tw1i,+tw2i,+tw3i,+tw4i,+tw5i) 1287 POCKETFFT_PARTSTEP11a(2, 9,tw2r,tw4r,tw5r,tw3r,tw1r,+tw2i,+tw4i,-tw5i,-tw3i,-tw1i) 1288 POCKETFFT_PARTSTEP11a(3, 8,tw3r,tw5r,tw2r,tw1r,tw4r,+tw3i,-tw5i,-tw2i,+tw1i,+tw4i) 1289 POCKETFFT_PARTSTEP11a(4, 7,tw4r,tw3r,tw1r,tw5r,tw2r,+tw4i,-tw3i,+tw1i,+tw5i,-tw2i) 1290 POCKETFFT_PARTSTEP11a(5, 6,tw5r,tw1r,tw4r,tw2r,tw3r,+tw5i,-tw1i,+tw4i,-tw2i,+tw3i) 1291 } 1292 for (size_t i=1; i<ido; ++i) 1293 { 1294 POCKETFFT_PREP11(i) 1295 POCKETFFT_PARTSTEP11(1,10,tw1r,tw2r,tw3r,tw4r,tw5r,+tw1i,+tw2i,+tw3i,+tw4i,+tw5i) 1296 POCKETFFT_PARTSTEP11(2, 9,tw2r,tw4r,tw5r,tw3r,tw1r,+tw2i,+tw4i,-tw5i,-tw3i,-tw1i) 1297 POCKETFFT_PARTSTEP11(3, 8,tw3r,tw5r,tw2r,tw1r,tw4r,+tw3i,-tw5i,-tw2i,+tw1i,+tw4i) 1298 POCKETFFT_PARTSTEP11(4, 7,tw4r,tw3r,tw1r,tw5r,tw2r,+tw4i,-tw3i,+tw1i,+tw5i,-tw2i) 1299 POCKETFFT_PARTSTEP11(5, 6,tw5r,tw1r,tw4r,tw2r,tw3r,+tw5i,-tw1i,+tw4i,-tw2i,+tw3i) 1300 } 1301 } 1302 } 1303 1304 #undef PARTSTEP11 1305 #undef PARTSTEP11a0 1306 #undef PARTSTEP11a 1307 #undef POCKETFFT_PREP11 1308 1309 template<bool fwd, typename T> void passg (size_t ido, size_t ip, 1310 size_t l1, T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch, 1311 const cmplx<T0> * POCKETFFT_RESTRICT wa, 1312 const cmplx<T0> * POCKETFFT_RESTRICT csarr) const 1313 { 1314 const size_t cdim=ip; 1315 size_t ipph = (ip+1)/2; 1316 size_t idl1 = ido*l1; 1317 1318 auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T& 1319 { return ch[a+ido*(b+l1*c)]; }; 1320 auto CC = [cc,ido,cdim](size_t a, size_t b, size_t c) -> const T& 1321 { return cc[a+ido*(b+cdim*c)]; }; 1322 auto CX = [cc, ido, l1](size_t a, size_t b, size_t c) -> T& 1323 { return cc[a+ido*(b+l1*c)]; }; 1324 auto CX2 = [cc, idl1](size_t a, size_t b) -> T& 1325 { return cc[a+idl1*b]; }; 1326 auto CH2 = [ch, idl1](size_t a, size_t b) -> const T& 1327 { return ch[a+idl1*b]; }; 1328 1329 arr<cmplx<T0>> wal(ip); 1330 wal[0] = cmplx<T0>(1., 0.); 1331 for (size_t i=1; i<ip; ++i) 1332 wal[i]=cmplx<T0>(csarr[i].r,fwd ? -csarr[i].i : csarr[i].i); 1333 1334 for (size_t k=0; k<l1; ++k) 1335 for (size_t i=0; i<ido; ++i) 1336 CH(i,k,0) = CC(i,0,k); 1337 for (size_t j=1, jc=ip-1; j<ipph; ++j, --jc) 1338 for (size_t k=0; k<l1; ++k) 1339 for (size_t i=0; i<ido; ++i) 1340 PM(CH(i,k,j),CH(i,k,jc),CC(i,j,k),CC(i,jc,k)); 1341 for (size_t k=0; k<l1; ++k) 1342 for (size_t i=0; i<ido; ++i) 1343 { 1344 T tmp = CH(i,k,0); 1345 for (size_t j=1; j<ipph; ++j) 1346 tmp+=CH(i,k,j); 1347 CX(i,k,0) = tmp; 1348 } 1349 for (size_t l=1, lc=ip-1; l<ipph; ++l, --lc) 1350 { 1351 // j=0 1352 for (size_t ik=0; ik<idl1; ++ik) 1353 { 1354 CX2(ik,l).r = CH2(ik,0).r+wal[l].r*CH2(ik,1).r+wal[2*l].r*CH2(ik,2).r; 1355 CX2(ik,l).i = CH2(ik,0).i+wal[l].r*CH2(ik,1).i+wal[2*l].r*CH2(ik,2).i; 1356 CX2(ik,lc).r=-wal[l].i*CH2(ik,ip-1).i-wal[2*l].i*CH2(ik,ip-2).i; 1357 CX2(ik,lc).i=wal[l].i*CH2(ik,ip-1).r+wal[2*l].i*CH2(ik,ip-2).r; 1358 } 1359 1360 size_t iwal=2*l; 1361 size_t j=3, jc=ip-3; 1362 for (; j<ipph-1; j+=2, jc-=2) 1363 { 1364 iwal+=l; if (iwal>ip) iwal-=ip; 1365 cmplx<T0> xwal=wal[iwal]; 1366 iwal+=l; if (iwal>ip) iwal-=ip; 1367 cmplx<T0> xwal2=wal[iwal]; 1368 for (size_t ik=0; ik<idl1; ++ik) 1369 { 1370 CX2(ik,l).r += CH2(ik,j).r*xwal.r+CH2(ik,j+1).r*xwal2.r; 1371 CX2(ik,l).i += CH2(ik,j).i*xwal.r+CH2(ik,j+1).i*xwal2.r; 1372 CX2(ik,lc).r -= CH2(ik,jc).i*xwal.i+CH2(ik,jc-1).i*xwal2.i; 1373 CX2(ik,lc).i += CH2(ik,jc).r*xwal.i+CH2(ik,jc-1).r*xwal2.i; 1374 } 1375 } 1376 for (; j<ipph; ++j, --jc) 1377 { 1378 iwal+=l; if (iwal>ip) iwal-=ip; 1379 cmplx<T0> xwal=wal[iwal]; 1380 for (size_t ik=0; ik<idl1; ++ik) 1381 { 1382 CX2(ik,l).r += CH2(ik,j).r*xwal.r; 1383 CX2(ik,l).i += CH2(ik,j).i*xwal.r; 1384 CX2(ik,lc).r -= CH2(ik,jc).i*xwal.i; 1385 CX2(ik,lc).i += CH2(ik,jc).r*xwal.i; 1386 } 1387 } 1388 } 1389 1390 // shuffling and twiddling 1391 if (ido==1) 1392 for (size_t j=1, jc=ip-1; j<ipph; ++j, --jc) 1393 for (size_t ik=0; ik<idl1; ++ik) 1394 { 1395 T t1=CX2(ik,j), t2=CX2(ik,jc); 1396 PM(CX2(ik,j),CX2(ik,jc),t1,t2); 1397 } 1398 else 1399 { 1400 for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc) 1401 for (size_t k=0; k<l1; ++k) 1402 { 1403 T t1=CX(0,k,j), t2=CX(0,k,jc); 1404 PM(CX(0,k,j),CX(0,k,jc),t1,t2); 1405 for (size_t i=1; i<ido; ++i) 1406 { 1407 T x1, x2; 1408 PM(x1,x2,CX(i,k,j),CX(i,k,jc)); 1409 size_t idij=(j-1)*(ido-1)+i-1; 1410 special_mul<fwd>(x1,wa[idij],CX(i,k,j)); 1411 idij=(jc-1)*(ido-1)+i-1; 1412 special_mul<fwd>(x2,wa[idij],CX(i,k,jc)); 1413 } 1414 } 1415 } 1416 } 1417 1418 template<bool fwd, typename T> void pass_all(T c[], T0 fct) const 1419 { 1420 if (length==1) { c[0]*=fct; return; } 1421 size_t l1=1; 1422 arr<T> ch(length); 1423 T *p1=c, *p2=ch.data(); 1424 1425 for(size_t k1=0; k1<fact.size(); k1++) 1426 { 1427 size_t ip=fact[k1].fct; 1428 size_t l2=ip*l1; 1429 size_t ido = length/l2; 1430 if (ip==4) 1431 pass4<fwd> (ido, l1, p1, p2, fact[k1].tw); 1432 else if(ip==8) 1433 pass8<fwd>(ido, l1, p1, p2, fact[k1].tw); 1434 else if(ip==2) 1435 pass2<fwd>(ido, l1, p1, p2, fact[k1].tw); 1436 else if(ip==3) 1437 pass3<fwd> (ido, l1, p1, p2, fact[k1].tw); 1438 else if(ip==5) 1439 pass5<fwd> (ido, l1, p1, p2, fact[k1].tw); 1440 else if(ip==7) 1441 pass7<fwd> (ido, l1, p1, p2, fact[k1].tw); 1442 else if(ip==11) 1443 pass11<fwd> (ido, l1, p1, p2, fact[k1].tw); 1444 else 1445 { 1446 passg<fwd>(ido, ip, l1, p1, p2, fact[k1].tw, fact[k1].tws); 1447 std::swap(p1,p2); 1448 } 1449 std::swap(p1,p2); 1450 l1=l2; 1451 } 1452 if (p1!=c) 1453 { 1454 if (fct!=1.) 1455 for (size_t i=0; i<length; ++i) 1456 c[i] = ch[i]*fct; 1457 else 1458 memcpy (c,p1,length*sizeof(T)); 1459 } 1460 else 1461 if (fct!=1.) 1462 for (size_t i=0; i<length; ++i) 1463 c[i] *= fct; 1464 } 1465 1466 public: 1467 template<typename T> void exec(T c[], T0 fct, bool fwd) const 1468 { fwd ? pass_all<true>(c, fct) : pass_all<false>(c, fct); } 1469 1470 private: 1471 POCKETFFT_NOINLINE void factorize() 1472 { 1473 size_t len=length; 1474 while ((len&7)==0) 1475 { add_factor(8); len>>=3; } 1476 while ((len&3)==0) 1477 { add_factor(4); len>>=2; } 1478 if ((len&1)==0) 1479 { 1480 len>>=1; 1481 // factor 2 should be at the front of the factor list 1482 add_factor(2); 1483 std::swap(fact[0].fct, fact.back().fct); 1484 } 1485 for (size_t divisor=3; divisor*divisor<=len; divisor+=2) 1486 while ((len%divisor)==0) 1487 { 1488 add_factor(divisor); 1489 len/=divisor; 1490 } 1491 if (len>1) add_factor(len); 1492 } 1493 1494 size_t twsize() const 1495 { 1496 size_t twsize=0, l1=1; 1497 for (size_t k=0; k<fact.size(); ++k) 1498 { 1499 size_t ip=fact[k].fct, ido= length/(l1*ip); 1500 twsize+=(ip-1)*(ido-1); 1501 if (ip>11) 1502 twsize+=ip; 1503 l1*=ip; 1504 } 1505 return twsize; 1506 } 1507 1508 void comp_twiddle() 1509 { 1510 sincos_2pibyn<T0> twiddle(length); 1511 size_t l1=1; 1512 size_t memofs=0; 1513 for (size_t k=0; k<fact.size(); ++k) 1514 { 1515 size_t ip=fact[k].fct, ido=length/(l1*ip); 1516 fact[k].tw=mem.data()+memofs; 1517 memofs+=(ip-1)*(ido-1); 1518 for (size_t j=1; j<ip; ++j) 1519 for (size_t i=1; i<ido; ++i) 1520 fact[k].tw[(j-1)*(ido-1)+i-1] = twiddle[j*l1*i]; 1521 if (ip>11) 1522 { 1523 fact[k].tws=mem.data()+memofs; 1524 memofs+=ip; 1525 for (size_t j=0; j<ip; ++j) 1526 fact[k].tws[j] = twiddle[j*l1*ido]; 1527 } 1528 l1*=ip; 1529 } 1530 } 1531 1532 public: 1533 POCKETFFT_NOINLINE cfftp(size_t length_) 1534 : length(length_) 1535 { 1536 if (length==0) throw std::runtime_error("zero-length FFT requested"); 1537 if (length==1) return; 1538 factorize(); 1539 mem.resize(twsize()); 1540 comp_twiddle(); 1541 } 1542 }; 1543 1544 // 1545 // real-valued FFTPACK transforms 1546 // 1547 1548 template<typename T0> class rfftp 1549 { 1550 private: 1551 struct fctdata 1552 { 1553 size_t fct; 1554 T0 *tw, *tws; 1555 }; 1556 1557 size_t length; 1558 arr<T0> mem; 1559 std::vector<fctdata> fact; 1560 1561 void add_factor(size_t factor) 1562 { fact.push_back({factor, nullptr, nullptr}); } 1563 1564 /* (a+ib) = conj(c+id) * (e+if) */ 1565 template<typename T1, typename T2, typename T3> inline void MULPM 1566 (T1 &a, T1 &b, T2 c, T2 d, T3 e, T3 f) const 1567 { a=c*e+d*f; b=c*f-d*e; } 1568 1569 template<typename T> void radf2 (size_t ido, size_t l1, 1570 const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch, 1571 const T0 * POCKETFFT_RESTRICT wa) const 1572 { 1573 auto WA = [wa,ido](size_t x, size_t i) { return wa[i+x*(ido-1)]; }; 1574 auto CC = [cc,ido,l1](size_t a, size_t b, size_t c) -> const T& 1575 { return cc[a+ido*(b+l1*c)]; }; 1576 auto CH = [ch,ido](size_t a, size_t b, size_t c) -> T& 1577 { return ch[a+ido*(b+2*c)]; }; 1578 1579 for (size_t k=0; k<l1; k++) 1580 PM (CH(0,0,k),CH(ido-1,1,k),CC(0,k,0),CC(0,k,1)); 1581 if ((ido&1)==0) 1582 for (size_t k=0; k<l1; k++) 1583 { 1584 CH( 0,1,k) = -CC(ido-1,k,1); 1585 CH(ido-1,0,k) = CC(ido-1,k,0); 1586 } 1587 if (ido<=2) return; 1588 for (size_t k=0; k<l1; k++) 1589 for (size_t i=2; i<ido; i+=2) 1590 { 1591 size_t ic=ido-i; 1592 T tr2, ti2; 1593 MULPM (tr2,ti2,WA(0,i-2),WA(0,i-1),CC(i-1,k,1),CC(i,k,1)); 1594 PM (CH(i-1,0,k),CH(ic-1,1,k),CC(i-1,k,0),tr2); 1595 PM (CH(i ,0,k),CH(ic ,1,k),ti2,CC(i ,k,0)); 1596 } 1597 } 1598 1599 // a2=a+b; b2=i*(b-a); 1600 #define POCKETFFT_REARRANGE(rx, ix, ry, iy) \ 1601 {\ 1602 auto t1=rx+ry, t2=ry-rx, t3=ix+iy, t4=ix-iy; \ 1603 rx=t1; ix=t3; ry=t4; iy=t2; \ 1604 } 1605 1606 template<typename T> void radf3(size_t ido, size_t l1, 1607 const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch, 1608 const T0 * POCKETFFT_RESTRICT wa) const 1609 { 1610 constexpr T0 taur=-0.5, taui=T0(0.8660254037844386467637231707529362L); 1611 1612 auto WA = [wa,ido](size_t x, size_t i) { return wa[i+x*(ido-1)]; }; 1613 auto CC = [cc,ido,l1](size_t a, size_t b, size_t c) -> const T& 1614 { return cc[a+ido*(b+l1*c)]; }; 1615 auto CH = [ch,ido](size_t a, size_t b, size_t c) -> T& 1616 { return ch[a+ido*(b+3*c)]; }; 1617 1618 for (size_t k=0; k<l1; k++) 1619 { 1620 T cr2=CC(0,k,1)+CC(0,k,2); 1621 CH(0,0,k) = CC(0,k,0)+cr2; 1622 CH(0,2,k) = taui*(CC(0,k,2)-CC(0,k,1)); 1623 CH(ido-1,1,k) = CC(0,k,0)+taur*cr2; 1624 } 1625 if (ido==1) return; 1626 for (size_t k=0; k<l1; k++) 1627 for (size_t i=2; i<ido; i+=2) 1628 { 1629 size_t ic=ido-i; 1630 T di2, di3, dr2, dr3; 1631 MULPM (dr2,di2,WA(0,i-2),WA(0,i-1),CC(i-1,k,1),CC(i,k,1)); // d2=conj(WA0)*CC1 1632 MULPM (dr3,di3,WA(1,i-2),WA(1,i-1),CC(i-1,k,2),CC(i,k,2)); // d3=conj(WA1)*CC2 1633 POCKETFFT_REARRANGE(dr2, di2, dr3, di3); 1634 CH(i-1,0,k) = CC(i-1,k,0)+dr2; // c add 1635 CH(i ,0,k) = CC(i ,k,0)+di2; 1636 T tr2 = CC(i-1,k,0)+taur*dr2; // c add 1637 T ti2 = CC(i ,k,0)+taur*di2; 1638 T tr3 = taui*dr3; // t3 = taui*i*(d3-d2)? 1639 T ti3 = taui*di3; 1640 PM(CH(i-1,2,k),CH(ic-1,1,k),tr2,tr3); // PM(i) = t2+t3 1641 PM(CH(i ,2,k),CH(ic ,1,k),ti3,ti2); // PM(ic) = conj(t2-t3) 1642 } 1643 } 1644 1645 template<typename T> void radf4(size_t ido, size_t l1, 1646 const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch, 1647 const T0 * POCKETFFT_RESTRICT wa) const 1648 { 1649 constexpr T0 hsqt2=T0(0.707106781186547524400844362104849L); 1650 1651 auto WA = [wa,ido](size_t x, size_t i) { return wa[i+x*(ido-1)]; }; 1652 auto CC = [cc,ido,l1](size_t a, size_t b, size_t c) -> const T& 1653 { return cc[a+ido*(b+l1*c)]; }; 1654 auto CH = [ch,ido](size_t a, size_t b, size_t c) -> T& 1655 { return ch[a+ido*(b+4*c)]; }; 1656 1657 for (size_t k=0; k<l1; k++) 1658 { 1659 T tr1,tr2; 1660 PM (tr1,CH(0,2,k),CC(0,k,3),CC(0,k,1)); 1661 PM (tr2,CH(ido-1,1,k),CC(0,k,0),CC(0,k,2)); 1662 PM (CH(0,0,k),CH(ido-1,3,k),tr2,tr1); 1663 } 1664 if ((ido&1)==0) 1665 for (size_t k=0; k<l1; k++) 1666 { 1667 T ti1=-hsqt2*(CC(ido-1,k,1)+CC(ido-1,k,3)); 1668 T tr1= hsqt2*(CC(ido-1,k,1)-CC(ido-1,k,3)); 1669 PM (CH(ido-1,0,k),CH(ido-1,2,k),CC(ido-1,k,0),tr1); 1670 PM (CH( 0,3,k),CH( 0,1,k),ti1,CC(ido-1,k,2)); 1671 } 1672 if (ido<=2) return; 1673 for (size_t k=0; k<l1; k++) 1674 for (size_t i=2; i<ido; i+=2) 1675 { 1676 size_t ic=ido-i; 1677 T ci2, ci3, ci4, cr2, cr3, cr4, ti1, ti2, ti3, ti4, tr1, tr2, tr3, tr4; 1678 MULPM(cr2,ci2,WA(0,i-2),WA(0,i-1),CC(i-1,k,1),CC(i,k,1)); 1679 MULPM(cr3,ci3,WA(1,i-2),WA(1,i-1),CC(i-1,k,2),CC(i,k,2)); 1680 MULPM(cr4,ci4,WA(2,i-2),WA(2,i-1),CC(i-1,k,3),CC(i,k,3)); 1681 PM(tr1,tr4,cr4,cr2); 1682 PM(ti1,ti4,ci2,ci4); 1683 PM(tr2,tr3,CC(i-1,k,0),cr3); 1684 PM(ti2,ti3,CC(i ,k,0),ci3); 1685 PM(CH(i-1,0,k),CH(ic-1,3,k),tr2,tr1); 1686 PM(CH(i ,0,k),CH(ic ,3,k),ti1,ti2); 1687 PM(CH(i-1,2,k),CH(ic-1,1,k),tr3,ti4); 1688 PM(CH(i ,2,k),CH(ic ,1,k),tr4,ti3); 1689 } 1690 } 1691 1692 template<typename T> void radf5(size_t ido, size_t l1, 1693 const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch, 1694 const T0 * POCKETFFT_RESTRICT wa) const 1695 { 1696 constexpr T0 tr11= T0(0.3090169943749474241022934171828191L), 1697 ti11= T0(0.9510565162951535721164393333793821L), 1698 tr12= T0(-0.8090169943749474241022934171828191L), 1699 ti12= T0(0.5877852522924731291687059546390728L); 1700 1701 auto WA = [wa,ido](size_t x, size_t i) { return wa[i+x*(ido-1)]; }; 1702 auto CC = [cc,ido,l1](size_t a, size_t b, size_t c) -> const T& 1703 { return cc[a+ido*(b+l1*c)]; }; 1704 auto CH = [ch,ido](size_t a, size_t b, size_t c) -> T& 1705 { return ch[a+ido*(b+5*c)]; }; 1706 1707 for (size_t k=0; k<l1; k++) 1708 { 1709 T cr2, cr3, ci4, ci5; 1710 PM (cr2,ci5,CC(0,k,4),CC(0,k,1)); 1711 PM (cr3,ci4,CC(0,k,3),CC(0,k,2)); 1712 CH(0,0,k)=CC(0,k,0)+cr2+cr3; 1713 CH(ido-1,1,k)=CC(0,k,0)+tr11*cr2+tr12*cr3; 1714 CH(0,2,k)=ti11*ci5+ti12*ci4; 1715 CH(ido-1,3,k)=CC(0,k,0)+tr12*cr2+tr11*cr3; 1716 CH(0,4,k)=ti12*ci5-ti11*ci4; 1717 } 1718 if (ido==1) return; 1719 for (size_t k=0; k<l1;++k) 1720 for (size_t i=2, ic=ido-2; i<ido; i+=2, ic-=2) 1721 { 1722 T di2, di3, di4, di5, dr2, dr3, dr4, dr5; 1723 MULPM (dr2,di2,WA(0,i-2),WA(0,i-1),CC(i-1,k,1),CC(i,k,1)); 1724 MULPM (dr3,di3,WA(1,i-2),WA(1,i-1),CC(i-1,k,2),CC(i,k,2)); 1725 MULPM (dr4,di4,WA(2,i-2),WA(2,i-1),CC(i-1,k,3),CC(i,k,3)); 1726 MULPM (dr5,di5,WA(3,i-2),WA(3,i-1),CC(i-1,k,4),CC(i,k,4)); 1727 POCKETFFT_REARRANGE(dr2, di2, dr5, di5); 1728 POCKETFFT_REARRANGE(dr3, di3, dr4, di4); 1729 CH(i-1,0,k)=CC(i-1,k,0)+dr2+dr3; 1730 CH(i ,0,k)=CC(i ,k,0)+di2+di3; 1731 T tr2=CC(i-1,k,0)+tr11*dr2+tr12*dr3; 1732 T ti2=CC(i ,k,0)+tr11*di2+tr12*di3; 1733 T tr3=CC(i-1,k,0)+tr12*dr2+tr11*dr3; 1734 T ti3=CC(i ,k,0)+tr12*di2+tr11*di3; 1735 T tr5 = ti11*dr5 + ti12*dr4; 1736 T ti5 = ti11*di5 + ti12*di4; 1737 T tr4 = ti12*dr5 - ti11*dr4; 1738 T ti4 = ti12*di5 - ti11*di4; 1739 PM(CH(i-1,2,k),CH(ic-1,1,k),tr2,tr5); 1740 PM(CH(i ,2,k),CH(ic ,1,k),ti5,ti2); 1741 PM(CH(i-1,4,k),CH(ic-1,3,k),tr3,tr4); 1742 PM(CH(i ,4,k),CH(ic ,3,k),ti4,ti3); 1743 } 1744 } 1745 1746 #undef POCKETFFT_REARRANGE 1747 1748 template<typename T> void radfg(size_t ido, size_t ip, size_t l1, 1749 T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch, 1750 const T0 * POCKETFFT_RESTRICT wa, const T0 * POCKETFFT_RESTRICT csarr) const 1751 { 1752 const size_t cdim=ip; 1753 size_t ipph=(ip+1)/2; 1754 size_t idl1 = ido*l1; 1755 1756 auto CC = [cc,ido,cdim](size_t a, size_t b, size_t c) -> T& 1757 { return cc[a+ido*(b+cdim*c)]; }; 1758 auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> const T& 1759 { return ch[a+ido*(b+l1*c)]; }; 1760 auto C1 = [cc,ido,l1] (size_t a, size_t b, size_t c) -> T& 1761 { return cc[a+ido*(b+l1*c)]; }; 1762 auto C2 = [cc,idl1] (size_t a, size_t b) -> T& 1763 { return cc[a+idl1*b]; }; 1764 auto CH2 = [ch,idl1] (size_t a, size_t b) -> T& 1765 { return ch[a+idl1*b]; }; 1766 1767 if (ido>1) 1768 { 1769 for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc) // 114 1770 { 1771 size_t is=(j-1)*(ido-1), 1772 is2=(jc-1)*(ido-1); 1773 for (size_t k=0; k<l1; ++k) // 113 1774 { 1775 size_t idij=is; 1776 size_t idij2=is2; 1777 for (size_t i=1; i<=ido-2; i+=2) // 112 1778 { 1779 T t1=C1(i,k,j ), t2=C1(i+1,k,j ), 1780 t3=C1(i,k,jc), t4=C1(i+1,k,jc); 1781 T x1=wa[idij]*t1 + wa[idij+1]*t2, 1782 x2=wa[idij]*t2 - wa[idij+1]*t1, 1783 x3=wa[idij2]*t3 + wa[idij2+1]*t4, 1784 x4=wa[idij2]*t4 - wa[idij2+1]*t3; 1785 PM(C1(i,k,j),C1(i+1,k,jc),x3,x1); 1786 PM(C1(i+1,k,j),C1(i,k,jc),x2,x4); 1787 idij+=2; 1788 idij2+=2; 1789 } 1790 } 1791 } 1792 } 1793 1794 for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc) // 123 1795 for (size_t k=0; k<l1; ++k) // 122 1796 MPINPLACE(C1(0,k,jc), C1(0,k,j)); 1797 1798 //everything in C 1799 //memset(ch,0,ip*l1*ido*sizeof(double)); 1800 1801 for (size_t l=1,lc=ip-1; l<ipph; ++l,--lc) // 127 1802 { 1803 for (size_t ik=0; ik<idl1; ++ik) // 124 1804 { 1805 CH2(ik,l ) = C2(ik,0)+csarr[2*l]*C2(ik,1)+csarr[4*l]*C2(ik,2); 1806 CH2(ik,lc) = csarr[2*l+1]*C2(ik,ip-1)+csarr[4*l+1]*C2(ik,ip-2); 1807 } 1808 size_t iang = 2*l; 1809 size_t j=3, jc=ip-3; 1810 for (; j<ipph-3; j+=4,jc-=4) // 126 1811 { 1812 iang+=l; if (iang>=ip) iang-=ip; 1813 T0 ar1=csarr[2*iang], ai1=csarr[2*iang+1]; 1814 iang+=l; if (iang>=ip) iang-=ip; 1815 T0 ar2=csarr[2*iang], ai2=csarr[2*iang+1]; 1816 iang+=l; if (iang>=ip) iang-=ip; 1817 T0 ar3=csarr[2*iang], ai3=csarr[2*iang+1]; 1818 iang+=l; if (iang>=ip) iang-=ip; 1819 T0 ar4=csarr[2*iang], ai4=csarr[2*iang+1]; 1820 for (size_t ik=0; ik<idl1; ++ik) // 125 1821 { 1822 CH2(ik,l ) += ar1*C2(ik,j )+ar2*C2(ik,j +1) 1823 +ar3*C2(ik,j +2)+ar4*C2(ik,j +3); 1824 CH2(ik,lc) += ai1*C2(ik,jc)+ai2*C2(ik,jc-1) 1825 +ai3*C2(ik,jc-2)+ai4*C2(ik,jc-3); 1826 } 1827 } 1828 for (; j<ipph-1; j+=2,jc-=2) // 126 1829 { 1830 iang+=l; if (iang>=ip) iang-=ip; 1831 T0 ar1=csarr[2*iang], ai1=csarr[2*iang+1]; 1832 iang+=l; if (iang>=ip) iang-=ip; 1833 T0 ar2=csarr[2*iang], ai2=csarr[2*iang+1]; 1834 for (size_t ik=0; ik<idl1; ++ik) // 125 1835 { 1836 CH2(ik,l ) += ar1*C2(ik,j )+ar2*C2(ik,j +1); 1837 CH2(ik,lc) += ai1*C2(ik,jc)+ai2*C2(ik,jc-1); 1838 } 1839 } 1840 for (; j<ipph; ++j,--jc) // 126 1841 { 1842 iang+=l; if (iang>=ip) iang-=ip; 1843 T0 ar=csarr[2*iang], ai=csarr[2*iang+1]; 1844 for (size_t ik=0; ik<idl1; ++ik) // 125 1845 { 1846 CH2(ik,l ) += ar*C2(ik,j ); 1847 CH2(ik,lc) += ai*C2(ik,jc); 1848 } 1849 } 1850 } 1851 for (size_t ik=0; ik<idl1; ++ik) // 101 1852 CH2(ik,0) = C2(ik,0); 1853 for (size_t j=1; j<ipph; ++j) // 129 1854 for (size_t ik=0; ik<idl1; ++ik) // 128 1855 CH2(ik,0) += C2(ik,j); 1856 1857 // everything in CH at this point! 1858 //memset(cc,0,ip*l1*ido*sizeof(double)); 1859 1860 for (size_t k=0; k<l1; ++k) // 131 1861 for (size_t i=0; i<ido; ++i) // 130 1862 CC(i,0,k) = CH(i,k,0); 1863 1864 for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc) // 137 1865 { 1866 size_t j2=2*j-1; 1867 for (size_t k=0; k<l1; ++k) // 136 1868 { 1869 CC(ido-1,j2,k) = CH(0,k,j); 1870 CC(0,j2+1,k) = CH(0,k,jc); 1871 } 1872 } 1873 1874 if (ido==1) return; 1875 1876 for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc) // 140 1877 { 1878 size_t j2=2*j-1; 1879 for(size_t k=0; k<l1; ++k) // 139 1880 for(size_t i=1, ic=ido-i-2; i<=ido-2; i+=2, ic-=2) // 138 1881 { 1882 CC(i ,j2+1,k) = CH(i ,k,j )+CH(i ,k,jc); 1883 CC(ic ,j2 ,k) = CH(i ,k,j )-CH(i ,k,jc); 1884 CC(i+1 ,j2+1,k) = CH(i+1,k,j )+CH(i+1,k,jc); 1885 CC(ic+1,j2 ,k) = CH(i+1,k,jc)-CH(i+1,k,j ); 1886 } 1887 } 1888 } 1889 1890 template<typename T> void radb2(size_t ido, size_t l1, 1891 const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch, 1892 const T0 * POCKETFFT_RESTRICT wa) const 1893 { 1894 auto WA = [wa,ido](size_t x, size_t i) { return wa[i+x*(ido-1)]; }; 1895 auto CC = [cc,ido](size_t a, size_t b, size_t c) -> const T& 1896 { return cc[a+ido*(b+2*c)]; }; 1897 auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T& 1898 { return ch[a+ido*(b+l1*c)]; }; 1899 1900 for (size_t k=0; k<l1; k++) 1901 PM (CH(0,k,0),CH(0,k,1),CC(0,0,k),CC(ido-1,1,k)); 1902 if ((ido&1)==0) 1903 for (size_t k=0; k<l1; k++) 1904 { 1905 CH(ido-1,k,0) = 2*CC(ido-1,0,k); 1906 CH(ido-1,k,1) =-2*CC(0 ,1,k); 1907 } 1908 if (ido<=2) return; 1909 for (size_t k=0; k<l1;++k) 1910 for (size_t i=2; i<ido; i+=2) 1911 { 1912 size_t ic=ido-i; 1913 T ti2, tr2; 1914 PM (CH(i-1,k,0),tr2,CC(i-1,0,k),CC(ic-1,1,k)); 1915 PM (ti2,CH(i ,k,0),CC(i ,0,k),CC(ic ,1,k)); 1916 MULPM (CH(i,k,1),CH(i-1,k,1),WA(0,i-2),WA(0,i-1),ti2,tr2); 1917 } 1918 } 1919 1920 template<typename T> void radb3(size_t ido, size_t l1, 1921 const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch, 1922 const T0 * POCKETFFT_RESTRICT wa) const 1923 { 1924 constexpr T0 taur=-0.5, taui=T0(0.8660254037844386467637231707529362L); 1925 1926 auto WA = [wa,ido](size_t x, size_t i) { return wa[i+x*(ido-1)]; }; 1927 auto CC = [cc,ido](size_t a, size_t b, size_t c) -> const T& 1928 { return cc[a+ido*(b+3*c)]; }; 1929 auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T& 1930 { return ch[a+ido*(b+l1*c)]; }; 1931 1932 for (size_t k=0; k<l1; k++) 1933 { 1934 T tr2=2*CC(ido-1,1,k); 1935 T cr2=CC(0,0,k)+taur*tr2; 1936 CH(0,k,0)=CC(0,0,k)+tr2; 1937 T ci3=2*taui*CC(0,2,k); 1938 PM (CH(0,k,2),CH(0,k,1),cr2,ci3); 1939 } 1940 if (ido==1) return; 1941 for (size_t k=0; k<l1; k++) 1942 for (size_t i=2, ic=ido-2; i<ido; i+=2, ic-=2) 1943 { 1944 T tr2=CC(i-1,2,k)+CC(ic-1,1,k); // t2=CC(I) + conj(CC(ic)) 1945 T ti2=CC(i ,2,k)-CC(ic ,1,k); 1946 T cr2=CC(i-1,0,k)+taur*tr2; // c2=CC +taur*t2 1947 T ci2=CC(i ,0,k)+taur*ti2; 1948 CH(i-1,k,0)=CC(i-1,0,k)+tr2; // CH=CC+t2 1949 CH(i ,k,0)=CC(i ,0,k)+ti2; 1950 T cr3=taui*(CC(i-1,2,k)-CC(ic-1,1,k));// c3=taui*(CC(i)-conj(CC(ic))) 1951 T ci3=taui*(CC(i ,2,k)+CC(ic ,1,k)); 1952 T di2, di3, dr2, dr3; 1953 PM(dr3,dr2,cr2,ci3); // d2= (cr2-ci3, ci2+cr3) = c2+i*c3 1954 PM(di2,di3,ci2,cr3); // d3= (cr2+ci3, ci2-cr3) = c2-i*c3 1955 MULPM(CH(i,k,1),CH(i-1,k,1),WA(0,i-2),WA(0,i-1),di2,dr2); // ch = WA*d2 1956 MULPM(CH(i,k,2),CH(i-1,k,2),WA(1,i-2),WA(1,i-1),di3,dr3); 1957 } 1958 } 1959 1960 template<typename T> void radb4(size_t ido, size_t l1, 1961 const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch, 1962 const T0 * POCKETFFT_RESTRICT wa) const 1963 { 1964 constexpr T0 sqrt2=T0(1.414213562373095048801688724209698L); 1965 1966 auto WA = [wa,ido](size_t x, size_t i) { return wa[i+x*(ido-1)]; }; 1967 auto CC = [cc,ido](size_t a, size_t b, size_t c) -> const T& 1968 { return cc[a+ido*(b+4*c)]; }; 1969 auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T& 1970 { return ch[a+ido*(b+l1*c)]; }; 1971 1972 for (size_t k=0; k<l1; k++) 1973 { 1974 T tr1, tr2; 1975 PM (tr2,tr1,CC(0,0,k),CC(ido-1,3,k)); 1976 T tr3=2*CC(ido-1,1,k); 1977 T tr4=2*CC(0,2,k); 1978 PM (CH(0,k,0),CH(0,k,2),tr2,tr3); 1979 PM (CH(0,k,3),CH(0,k,1),tr1,tr4); 1980 } 1981 if ((ido&1)==0) 1982 for (size_t k=0; k<l1; k++) 1983 { 1984 T tr1,tr2,ti1,ti2; 1985 PM (ti1,ti2,CC(0 ,3,k),CC(0 ,1,k)); 1986 PM (tr2,tr1,CC(ido-1,0,k),CC(ido-1,2,k)); 1987 CH(ido-1,k,0)=tr2+tr2; 1988 CH(ido-1,k,1)=sqrt2*(tr1-ti1); 1989 CH(ido-1,k,2)=ti2+ti2; 1990 CH(ido-1,k,3)=-sqrt2*(tr1+ti1); 1991 } 1992 if (ido<=2) return; 1993 for (size_t k=0; k<l1;++k) 1994 for (size_t i=2; i<ido; i+=2) 1995 { 1996 T ci2, ci3, ci4, cr2, cr3, cr4, ti1, ti2, ti3, ti4, tr1, tr2, tr3, tr4; 1997 size_t ic=ido-i; 1998 PM (tr2,tr1,CC(i-1,0,k),CC(ic-1,3,k)); 1999 PM (ti1,ti2,CC(i ,0,k),CC(ic ,3,k)); 2000 PM (tr4,ti3,CC(i ,2,k),CC(ic ,1,k)); 2001 PM (tr3,ti4,CC(i-1,2,k),CC(ic-1,1,k)); 2002 PM (CH(i-1,k,0),cr3,tr2,tr3); 2003 PM (CH(i ,k,0),ci3,ti2,ti3); 2004 PM (cr4,cr2,tr1,tr4); 2005 PM (ci2,ci4,ti1,ti4); 2006 MULPM (CH(i,k,1),CH(i-1,k,1),WA(0,i-2),WA(0,i-1),ci2,cr2); 2007 MULPM (CH(i,k,2),CH(i-1,k,2),WA(1,i-2),WA(1,i-1),ci3,cr3); 2008 MULPM (CH(i,k,3),CH(i-1,k,3),WA(2,i-2),WA(2,i-1),ci4,cr4); 2009 } 2010 } 2011 2012 template<typename T> void radb5(size_t ido, size_t l1, 2013 const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch, 2014 const T0 * POCKETFFT_RESTRICT wa) const 2015 { 2016 constexpr T0 tr11= T0(0.3090169943749474241022934171828191L), 2017 ti11= T0(0.9510565162951535721164393333793821L), 2018 tr12= T0(-0.8090169943749474241022934171828191L), 2019 ti12= T0(0.5877852522924731291687059546390728L); 2020 2021 auto WA = [wa,ido](size_t x, size_t i) { return wa[i+x*(ido-1)]; }; 2022 auto CC = [cc,ido](size_t a, size_t b, size_t c) -> const T& 2023 { return cc[a+ido*(b+5*c)]; }; 2024 auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T& 2025 { return ch[a+ido*(b+l1*c)]; }; 2026 2027 for (size_t k=0; k<l1; k++) 2028 { 2029 T ti5=CC(0,2,k)+CC(0,2,k); 2030 T ti4=CC(0,4,k)+CC(0,4,k); 2031 T tr2=CC(ido-1,1,k)+CC(ido-1,1,k); 2032 T tr3=CC(ido-1,3,k)+CC(ido-1,3,k); 2033 CH(0,k,0)=CC(0,0,k)+tr2+tr3; 2034 T cr2=CC(0,0,k)+tr11*tr2+tr12*tr3; 2035 T cr3=CC(0,0,k)+tr12*tr2+tr11*tr3; 2036 T ci4, ci5; 2037 MULPM(ci5,ci4,ti5,ti4,ti11,ti12); 2038 PM(CH(0,k,4),CH(0,k,1),cr2,ci5); 2039 PM(CH(0,k,3),CH(0,k,2),cr3,ci4); 2040 } 2041 if (ido==1) return; 2042 for (size_t k=0; k<l1;++k) 2043 for (size_t i=2, ic=ido-2; i<ido; i+=2, ic-=2) 2044 { 2045 T tr2, tr3, tr4, tr5, ti2, ti3, ti4, ti5; 2046 PM(tr2,tr5,CC(i-1,2,k),CC(ic-1,1,k)); 2047 PM(ti5,ti2,CC(i ,2,k),CC(ic ,1,k)); 2048 PM(tr3,tr4,CC(i-1,4,k),CC(ic-1,3,k)); 2049 PM(ti4,ti3,CC(i ,4,k),CC(ic ,3,k)); 2050 CH(i-1,k,0)=CC(i-1,0,k)+tr2+tr3; 2051 CH(i ,k,0)=CC(i ,0,k)+ti2+ti3; 2052 T cr2=CC(i-1,0,k)+tr11*tr2+tr12*tr3; 2053 T ci2=CC(i ,0,k)+tr11*ti2+tr12*ti3; 2054 T cr3=CC(i-1,0,k)+tr12*tr2+tr11*tr3; 2055 T ci3=CC(i ,0,k)+tr12*ti2+tr11*ti3; 2056 T ci4, ci5, cr5, cr4; 2057 MULPM(cr5,cr4,tr5,tr4,ti11,ti12); 2058 MULPM(ci5,ci4,ti5,ti4,ti11,ti12); 2059 T dr2, dr3, dr4, dr5, di2, di3, di4, di5; 2060 PM(dr4,dr3,cr3,ci4); 2061 PM(di3,di4,ci3,cr4); 2062 PM(dr5,dr2,cr2,ci5); 2063 PM(di2,di5,ci2,cr5); 2064 MULPM(CH(i,k,1),CH(i-1,k,1),WA(0,i-2),WA(0,i-1),di2,dr2); 2065 MULPM(CH(i,k,2),CH(i-1,k,2),WA(1,i-2),WA(1,i-1),di3,dr3); 2066 MULPM(CH(i,k,3),CH(i-1,k,3),WA(2,i-2),WA(2,i-1),di4,dr4); 2067 MULPM(CH(i,k,4),CH(i-1,k,4),WA(3,i-2),WA(3,i-1),di5,dr5); 2068 } 2069 } 2070 2071 template<typename T> void radbg(size_t ido, size_t ip, size_t l1, 2072 T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch, 2073 const T0 * POCKETFFT_RESTRICT wa, const T0 * POCKETFFT_RESTRICT csarr) const 2074 { 2075 const size_t cdim=ip; 2076 size_t ipph=(ip+1)/ 2; 2077 size_t idl1 = ido*l1; 2078 2079 auto CC = [cc,ido,cdim](size_t a, size_t b, size_t c) -> const T& 2080 { return cc[a+ido*(b+cdim*c)]; }; 2081 auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T& 2082 { return ch[a+ido*(b+l1*c)]; }; 2083 auto C1 = [cc,ido,l1](size_t a, size_t b, size_t c) -> const T& 2084 { return cc[a+ido*(b+l1*c)]; }; 2085 auto C2 = [cc,idl1](size_t a, size_t b) -> T& 2086 { return cc[a+idl1*b]; }; 2087 auto CH2 = [ch,idl1](size_t a, size_t b) -> T& 2088 { return ch[a+idl1*b]; }; 2089 2090 for (size_t k=0; k<l1; ++k) // 102 2091 for (size_t i=0; i<ido; ++i) // 101 2092 CH(i,k,0) = CC(i,0,k); 2093 for (size_t j=1, jc=ip-1; j<ipph; ++j, --jc) // 108 2094 { 2095 size_t j2=2*j-1; 2096 for (size_t k=0; k<l1; ++k) 2097 { 2098 CH(0,k,j ) = 2*CC(ido-1,j2,k); 2099 CH(0,k,jc) = 2*CC(0,j2+1,k); 2100 } 2101 } 2102 2103 if (ido!=1) 2104 { 2105 for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc) // 111 2106 { 2107 size_t j2=2*j-1; 2108 for (size_t k=0; k<l1; ++k) 2109 for (size_t i=1, ic=ido-i-2; i<=ido-2; i+=2, ic-=2) // 109 2110 { 2111 CH(i ,k,j ) = CC(i ,j2+1,k)+CC(ic ,j2,k); 2112 CH(i ,k,jc) = CC(i ,j2+1,k)-CC(ic ,j2,k); 2113 CH(i+1,k,j ) = CC(i+1,j2+1,k)-CC(ic+1,j2,k); 2114 CH(i+1,k,jc) = CC(i+1,j2+1,k)+CC(ic+1,j2,k); 2115 } 2116 } 2117 } 2118 for (size_t l=1,lc=ip-1; l<ipph; ++l,--lc) 2119 { 2120 for (size_t ik=0; ik<idl1; ++ik) 2121 { 2122 C2(ik,l ) = CH2(ik,0)+csarr[2*l]*CH2(ik,1)+csarr[4*l]*CH2(ik,2); 2123 C2(ik,lc) = csarr[2*l+1]*CH2(ik,ip-1)+csarr[4*l+1]*CH2(ik,ip-2); 2124 } 2125 size_t iang=2*l; 2126 size_t j=3,jc=ip-3; 2127 for(; j<ipph-3; j+=4,jc-=4) 2128 { 2129 iang+=l; if(iang>ip) iang-=ip; 2130 T0 ar1=csarr[2*iang], ai1=csarr[2*iang+1]; 2131 iang+=l; if(iang>ip) iang-=ip; 2132 T0 ar2=csarr[2*iang], ai2=csarr[2*iang+1]; 2133 iang+=l; if(iang>ip) iang-=ip; 2134 T0 ar3=csarr[2*iang], ai3=csarr[2*iang+1]; 2135 iang+=l; if(iang>ip) iang-=ip; 2136 T0 ar4=csarr[2*iang], ai4=csarr[2*iang+1]; 2137 for (size_t ik=0; ik<idl1; ++ik) 2138 { 2139 C2(ik,l ) += ar1*CH2(ik,j )+ar2*CH2(ik,j +1) 2140 +ar3*CH2(ik,j +2)+ar4*CH2(ik,j +3); 2141 C2(ik,lc) += ai1*CH2(ik,jc)+ai2*CH2(ik,jc-1) 2142 +ai3*CH2(ik,jc-2)+ai4*CH2(ik,jc-3); 2143 } 2144 } 2145 for(; j<ipph-1; j+=2,jc-=2) 2146 { 2147 iang+=l; if(iang>ip) iang-=ip; 2148 T0 ar1=csarr[2*iang], ai1=csarr[2*iang+1]; 2149 iang+=l; if(iang>ip) iang-=ip; 2150 T0 ar2=csarr[2*iang], ai2=csarr[2*iang+1]; 2151 for (size_t ik=0; ik<idl1; ++ik) 2152 { 2153 C2(ik,l ) += ar1*CH2(ik,j )+ar2*CH2(ik,j +1); 2154 C2(ik,lc) += ai1*CH2(ik,jc)+ai2*CH2(ik,jc-1); 2155 } 2156 } 2157 for(; j<ipph; ++j,--jc) 2158 { 2159 iang+=l; if(iang>ip) iang-=ip; 2160 T0 war=csarr[2*iang], wai=csarr[2*iang+1]; 2161 for (size_t ik=0; ik<idl1; ++ik) 2162 { 2163 C2(ik,l ) += war*CH2(ik,j ); 2164 C2(ik,lc) += wai*CH2(ik,jc); 2165 } 2166 } 2167 } 2168 for (size_t j=1; j<ipph; ++j) 2169 for (size_t ik=0; ik<idl1; ++ik) 2170 CH2(ik,0) += CH2(ik,j); 2171 for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc) // 124 2172 for (size_t k=0; k<l1; ++k) 2173 PM(CH(0,k,jc),CH(0,k,j),C1(0,k,j),C1(0,k,jc)); 2174 2175 if (ido==1) return; 2176 2177 for (size_t j=1, jc=ip-1; j<ipph; ++j, --jc) // 127 2178 for (size_t k=0; k<l1; ++k) 2179 for (size_t i=1; i<=ido-2; i+=2) 2180 { 2181 CH(i ,k,j ) = C1(i ,k,j)-C1(i+1,k,jc); 2182 CH(i ,k,jc) = C1(i ,k,j)+C1(i+1,k,jc); 2183 CH(i+1,k,j ) = C1(i+1,k,j)+C1(i ,k,jc); 2184 CH(i+1,k,jc) = C1(i+1,k,j)-C1(i ,k,jc); 2185 } 2186 2187 // All in CH 2188 2189 for (size_t j=1; j<ip; ++j) 2190 { 2191 size_t is = (j-1)*(ido-1); 2192 for (size_t k=0; k<l1; ++k) 2193 { 2194 size_t idij = is; 2195 for (size_t i=1; i<=ido-2; i+=2) 2196 { 2197 T t1=CH(i,k,j), t2=CH(i+1,k,j); 2198 CH(i ,k,j) = wa[idij]*t1-wa[idij+1]*t2; 2199 CH(i+1,k,j) = wa[idij]*t2+wa[idij+1]*t1; 2200 idij+=2; 2201 } 2202 } 2203 } 2204 } 2205 2206 template<typename T> void copy_and_norm(T *c, T *p1, size_t n, T0 fct) const 2207 { 2208 if (p1!=c) 2209 { 2210 if (fct!=1.) 2211 for (size_t i=0; i<n; ++i) 2212 c[i] = fct*p1[i]; 2213 else 2214 memcpy (c,p1,n*sizeof(T)); 2215 } 2216 else 2217 if (fct!=1.) 2218 for (size_t i=0; i<n; ++i) 2219 c[i] *= fct; 2220 } 2221 2222 public: 2223 template<typename T> void exec(T c[], T0 fct, bool r2hc) const 2224 { 2225 if (length==1) { c[0]*=fct; return; } 2226 size_t n=length, nf=fact.size(); 2227 arr<T> ch(n); 2228 T *p1=c, *p2=ch.data(); 2229 2230 if (r2hc) 2231 for(size_t k1=0, l1=n; k1<nf;++k1) 2232 { 2233 size_t k=nf-k1-1; 2234 size_t ip=fact[k].fct; 2235 size_t ido=n / l1; 2236 l1 /= ip; 2237 if(ip==4) 2238 radf4(ido, l1, p1, p2, fact[k].tw); 2239 else if(ip==2) 2240 radf2(ido, l1, p1, p2, fact[k].tw); 2241 else if(ip==3) 2242 radf3(ido, l1, p1, p2, fact[k].tw); 2243 else if(ip==5) 2244 radf5(ido, l1, p1, p2, fact[k].tw); 2245 else 2246 { radfg(ido, ip, l1, p1, p2, fact[k].tw, fact[k].tws); std::swap (p1,p2); } 2247 std::swap (p1,p2); 2248 } 2249 else 2250 for(size_t k=0, l1=1; k<nf; k++) 2251 { 2252 size_t ip = fact[k].fct, 2253 ido= n/(ip*l1); 2254 if(ip==4) 2255 radb4(ido, l1, p1, p2, fact[k].tw); 2256 else if(ip==2) 2257 radb2(ido, l1, p1, p2, fact[k].tw); 2258 else if(ip==3) 2259 radb3(ido, l1, p1, p2, fact[k].tw); 2260 else if(ip==5) 2261 radb5(ido, l1, p1, p2, fact[k].tw); 2262 else 2263 radbg(ido, ip, l1, p1, p2, fact[k].tw, fact[k].tws); 2264 std::swap (p1,p2); 2265 l1*=ip; 2266 } 2267 2268 copy_and_norm(c,p1,n,fct); 2269 } 2270 2271 private: 2272 void factorize() 2273 { 2274 size_t len=length; 2275 while ((len%4)==0) 2276 { add_factor(4); len>>=2; } 2277 if ((len%2)==0) 2278 { 2279 len>>=1; 2280 // factor 2 should be at the front of the factor list 2281 add_factor(2); 2282 std::swap(fact[0].fct, fact.back().fct); 2283 } 2284 for (size_t divisor=3; divisor*divisor<=len; divisor+=2) 2285 while ((len%divisor)==0) 2286 { 2287 add_factor(divisor); 2288 len/=divisor; 2289 } 2290 if (len>1) add_factor(len); 2291 } 2292 2293 size_t twsize() const 2294 { 2295 size_t twsz=0, l1=1; 2296 for (size_t k=0; k<fact.size(); ++k) 2297 { 2298 size_t ip=fact[k].fct, ido=length/(l1*ip); 2299 twsz+=(ip-1)*(ido-1); 2300 if (ip>5) twsz+=2*ip; 2301 l1*=ip; 2302 } 2303 return twsz; 2304 } 2305 2306 void comp_twiddle() 2307 { 2308 sincos_2pibyn<T0> twid(length); 2309 size_t l1=1; 2310 T0 *ptr=mem.data(); 2311 for (size_t k=0; k<fact.size(); ++k) 2312 { 2313 size_t ip=fact[k].fct, ido=length/(l1*ip); 2314 if (k<fact.size()-1) // last factor doesn't need twiddles 2315 { 2316 fact[k].tw=ptr; ptr+=(ip-1)*(ido-1); 2317 for (size_t j=1; j<ip; ++j) 2318 for (size_t i=1; i<=(ido-1)/2; ++i) 2319 { 2320 fact[k].tw[(j-1)*(ido-1)+2*i-2] = twid[j*l1*i].r; 2321 fact[k].tw[(j-1)*(ido-1)+2*i-1] = twid[j*l1*i].i; 2322 } 2323 } 2324 if (ip>5) // special factors required by *g functions 2325 { 2326 fact[k].tws=ptr; ptr+=2*ip; 2327 fact[k].tws[0] = 1.; 2328 fact[k].tws[1] = 0.; 2329 for (size_t i=2, ic=2*ip-2; i<=ic; i+=2, ic-=2) 2330 { 2331 fact[k].tws[i ] = twid[i/2*(length/ip)].r; 2332 fact[k].tws[i+1] = twid[i/2*(length/ip)].i; 2333 fact[k].tws[ic] = twid[i/2*(length/ip)].r; 2334 fact[k].tws[ic+1] = -twid[i/2*(length/ip)].i; 2335 } 2336 } 2337 l1*=ip; 2338 } 2339 } 2340 2341 public: 2342 POCKETFFT_NOINLINE rfftp(size_t length_) 2343 : length(length_) 2344 { 2345 if (length==0) throw std::runtime_error("zero-length FFT requested"); 2346 if (length==1) return; 2347 factorize(); 2348 mem.resize(twsize()); 2349 comp_twiddle(); 2350 } 2351 }; 2352 2353 // 2354 // complex Bluestein transforms 2355 // 2356 2357 template<typename T0> class fftblue 2358 { 2359 private: 2360 size_t n, n2; 2361 cfftp<T0> plan; 2362 arr<cmplx<T0>> mem; 2363 cmplx<T0> *bk, *bkf; 2364 2365 template<bool fwd, typename T> void fft(cmplx<T> c[], T0 fct) const 2366 { 2367 arr<cmplx<T>> akf(n2); 2368 2369 /* initialize a_k and FFT it */ 2370 for (size_t m=0; m<n; ++m) 2371 special_mul<fwd>(c[m],bk[m],akf[m]); 2372 auto zero = akf[0]*T0(0); 2373 for (size_t m=n; m<n2; ++m) 2374 akf[m]=zero; 2375 2376 plan.exec (akf.data(),1.,true); 2377 2378 /* do the convolution */ 2379 akf[0] = akf[0].template special_mul<!fwd>(bkf[0]); 2380 for (size_t m=1; m<(n2+1)/2; ++m) 2381 { 2382 akf[m] = akf[m].template special_mul<!fwd>(bkf[m]); 2383 akf[n2-m] = akf[n2-m].template special_mul<!fwd>(bkf[m]); 2384 } 2385 if ((n2&1)==0) 2386 akf[n2/2] = akf[n2/2].template special_mul<!fwd>(bkf[n2/2]); 2387 2388 /* inverse FFT */ 2389 plan.exec (akf.data(),1.,false); 2390 2391 /* multiply by b_k */ 2392 for (size_t m=0; m<n; ++m) 2393 c[m] = akf[m].template special_mul<fwd>(bk[m])*fct; 2394 } 2395 2396 public: 2397 POCKETFFT_NOINLINE fftblue(size_t length) 2398 : n(length), n2(util::good_size_cmplx(n*2-1)), plan(n2), mem(n+n2/2+1), 2399 bk(mem.data()), bkf(mem.data()+n) 2400 { 2401 /* initialize b_k */ 2402 sincos_2pibyn<T0> tmp(2*n); 2403 bk[0].Set(1, 0); 2404 2405 size_t coeff=0; 2406 for (size_t m=1; m<n; ++m) 2407 { 2408 coeff+=2*m-1; 2409 if (coeff>=2*n) coeff-=2*n; 2410 bk[m] = tmp[coeff]; 2411 } 2412 2413 /* initialize the zero-padded, Fourier transformed b_k. Add normalisation. */ 2414 arr<cmplx<T0>> tbkf(n2); 2415 T0 xn2 = T0(1)/T0(n2); 2416 tbkf[0] = bk[0]*xn2; 2417 for (size_t m=1; m<n; ++m) 2418 tbkf[m] = tbkf[n2-m] = bk[m]*xn2; 2419 for (size_t m=n;m<=(n2-n);++m) 2420 tbkf[m].Set(0.,0.); 2421 plan.exec(tbkf.data(),1.,true); 2422 for (size_t i=0; i<n2/2+1; ++i) 2423 bkf[i] = tbkf[i]; 2424 } 2425 2426 template<typename T> void exec(cmplx<T> c[], T0 fct, bool fwd) const 2427 { fwd ? fft<true>(c,fct) : fft<false>(c,fct); } 2428 2429 template<typename T> void exec_r(T c[], T0 fct, bool fwd) 2430 { 2431 arr<cmplx<T>> tmp(n); 2432 if (fwd) 2433 { 2434 auto zero = T0(0)*c[0]; 2435 for (size_t m=0; m<n; ++m) 2436 tmp[m].Set(c[m], zero); 2437 fft<true>(tmp.data(),fct); 2438 c[0] = tmp[0].r; 2439 memcpy (c+1, tmp.data()+1, (n-1)*sizeof(T)); 2440 } 2441 else 2442 { 2443 tmp[0].Set(c[0],c[0]*0); 2444 memcpy (reinterpret_cast<void *>(tmp.data()+1), 2445 reinterpret_cast<void *>(c+1), (n-1)*sizeof(T)); 2446 if ((n&1)==0) tmp[n/2].i=T0(0)*c[0]; 2447 for (size_t m=1; 2*m<n; ++m) 2448 tmp[n-m].Set(tmp[m].r, -tmp[m].i); 2449 fft<false>(tmp.data(),fct); 2450 for (size_t m=0; m<n; ++m) 2451 c[m] = tmp[m].r; 2452 } 2453 } 2454 }; 2455 2456 // 2457 // flexible (FFTPACK/Bluestein) complex 1D transform 2458 // 2459 2460 template<typename T0> class pocketfft_c 2461 { 2462 private: 2463 std::unique_ptr<cfftp<T0>> packplan; 2464 std::unique_ptr<fftblue<T0>> blueplan; 2465 size_t len; 2466 2467 public: 2468 POCKETFFT_NOINLINE pocketfft_c(size_t length) 2469 : len(length) 2470 { 2471 if (length==0) throw std::runtime_error("zero-length FFT requested"); 2472 size_t tmp = (length<50) ? 0 : util::largest_prime_factor(length); 2473 if (tmp*tmp <= length) 2474 { 2475 packplan=std::unique_ptr<cfftp<T0>>(new cfftp<T0>(length)); 2476 return; 2477 } 2478 double comp1 = util::cost_guess(length); 2479 double comp2 = 2*util::cost_guess(util::good_size_cmplx(2*length-1)); 2480 comp2*=1.5; /* fudge factor that appears to give good overall performance */ 2481 if (comp2<comp1) // use Bluestein 2482 blueplan=std::unique_ptr<fftblue<T0>>(new fftblue<T0>(length)); 2483 else 2484 packplan=std::unique_ptr<cfftp<T0>>(new cfftp<T0>(length)); 2485 } 2486 2487 template<typename T> POCKETFFT_NOINLINE void exec(cmplx<T> c[], T0 fct, bool fwd) const 2488 { packplan ? packplan->exec(c,fct,fwd) : blueplan->exec(c,fct,fwd); } 2489 2490 size_t length() const { return len; } 2491 }; 2492 2493 // 2494 // flexible (FFTPACK/Bluestein) real-valued 1D transform 2495 // 2496 2497 template<typename T0> class pocketfft_r 2498 { 2499 private: 2500 std::unique_ptr<rfftp<T0>> packplan; 2501 std::unique_ptr<fftblue<T0>> blueplan; 2502 size_t len; 2503 2504 public: 2505 POCKETFFT_NOINLINE pocketfft_r(size_t length) 2506 : len(length) 2507 { 2508 if (length==0) throw std::runtime_error("zero-length FFT requested"); 2509 size_t tmp = (length<50) ? 0 : util::largest_prime_factor(length); 2510 if (tmp*tmp <= length) 2511 { 2512 packplan=std::unique_ptr<rfftp<T0>>(new rfftp<T0>(length)); 2513 return; 2514 } 2515 double comp1 = 0.5*util::cost_guess(length); 2516 double comp2 = 2*util::cost_guess(util::good_size_cmplx(2*length-1)); 2517 comp2*=1.5; /* fudge factor that appears to give good overall performance */ 2518 if (comp2<comp1) // use Bluestein 2519 blueplan=std::unique_ptr<fftblue<T0>>(new fftblue<T0>(length)); 2520 else 2521 packplan=std::unique_ptr<rfftp<T0>>(new rfftp<T0>(length)); 2522 } 2523 2524 template<typename T> POCKETFFT_NOINLINE void exec(T c[], T0 fct, bool fwd) const 2525 { packplan ? packplan->exec(c,fct,fwd) : blueplan->exec_r(c,fct,fwd); } 2526 2527 size_t length() const { return len; } 2528 }; 2529 2530 2531 // 2532 // sine/cosine transforms 2533 // 2534 2535 template<typename T0> class T_dct1 2536 { 2537 private: 2538 pocketfft_r<T0> fftplan; 2539 2540 public: 2541 POCKETFFT_NOINLINE T_dct1(size_t length) 2542 : fftplan(2*(length-1)) {} 2543 2544 template<typename T> POCKETFFT_NOINLINE void exec(T c[], T0 fct, bool ortho, 2545 int /*type*/, bool /*cosine*/) const 2546 { 2547 constexpr T0 sqrt2=T0(1.414213562373095048801688724209698L); 2548 size_t N=fftplan.length(), n=N/2+1; 2549 if (ortho) 2550 { c[0]*=sqrt2; c[n-1]*=sqrt2; } 2551 arr<T> tmp(N); 2552 tmp[0] = c[0]; 2553 for (size_t i=1; i<n; ++i) 2554 tmp[i] = tmp[N-i] = c[i]; 2555 fftplan.exec(tmp.data(), fct, true); 2556 c[0] = tmp[0]; 2557 for (size_t i=1; i<n; ++i) 2558 c[i] = tmp[2*i-1]; 2559 if (ortho) 2560 { c[0]*=sqrt2*T0(0.5); c[n-1]*=sqrt2*T0(0.5); } 2561 } 2562 2563 size_t length() const { return fftplan.length()/2+1; } 2564 }; 2565 2566 template<typename T0> class T_dst1 2567 { 2568 private: 2569 pocketfft_r<T0> fftplan; 2570 2571 public: 2572 POCKETFFT_NOINLINE T_dst1(size_t length) 2573 : fftplan(2*(length+1)) {} 2574 2575 template<typename T> POCKETFFT_NOINLINE void exec(T c[], T0 fct, 2576 bool /*ortho*/, int /*type*/, bool /*cosine*/) const 2577 { 2578 size_t N=fftplan.length(), n=N/2-1; 2579 arr<T> tmp(N); 2580 tmp[0] = tmp[n+1] = c[0]*0; 2581 for (size_t i=0; i<n; ++i) 2582 { tmp[i+1]=c[i]; tmp[N-1-i]=-c[i]; } 2583 fftplan.exec(tmp.data(), fct, true); 2584 for (size_t i=0; i<n; ++i) 2585 c[i] = -tmp[2*i+2]; 2586 } 2587 2588 size_t length() const { return fftplan.length()/2-1; } 2589 }; 2590 2591 template<typename T0> class T_dcst23 2592 { 2593 private: 2594 pocketfft_r<T0> fftplan; 2595 std::vector<T0> twiddle; 2596 2597 public: 2598 POCKETFFT_NOINLINE T_dcst23(size_t length) 2599 : fftplan(length), twiddle(length) 2600 { 2601 sincos_2pibyn<T0> tw(4*length); 2602 for (size_t i=0; i<length; ++i) 2603 twiddle[i] = tw[i+1].r; 2604 } 2605 2606 template<typename T> POCKETFFT_NOINLINE void exec(T c[], T0 fct, bool ortho, 2607 int type, bool cosine) const 2608 { 2609 constexpr T0 sqrt2=T0(1.414213562373095048801688724209698L); 2610 size_t N=length(); 2611 size_t NS2 = (N+1)/2; 2612 if (type==2) 2613 { 2614 if (!cosine) 2615 for (size_t k=1; k<N; k+=2) 2616 c[k] = -c[k]; 2617 c[0] *= 2; 2618 if ((N&1)==0) c[N-1]*=2; 2619 for (size_t k=1; k<N-1; k+=2) 2620 MPINPLACE(c[k+1], c[k]); 2621 fftplan.exec(c, fct, false); 2622 for (size_t k=1, kc=N-1; k<NS2; ++k, --kc) 2623 { 2624 T t1 = twiddle[k-1]*c[kc]+twiddle[kc-1]*c[k]; 2625 T t2 = twiddle[k-1]*c[k]-twiddle[kc-1]*c[kc]; 2626 c[k] = T0(0.5)*(t1+t2); c[kc]=T0(0.5)*(t1-t2); 2627 } 2628 if ((N&1)==0) 2629 c[NS2] *= twiddle[NS2-1]; 2630 if (!cosine) 2631 for (size_t k=0, kc=N-1; k<kc; ++k, --kc) 2632 std::swap(c[k], c[kc]); 2633 if (ortho) c[0]*=sqrt2*T0(0.5); 2634 } 2635 else 2636 { 2637 if (ortho) c[0]*=sqrt2; 2638 if (!cosine) 2639 for (size_t k=0, kc=N-1; k<NS2; ++k, --kc) 2640 std::swap(c[k], c[kc]); 2641 for (size_t k=1, kc=N-1; k<NS2; ++k, --kc) 2642 { 2643 T t1=c[k]+c[kc], t2=c[k]-c[kc]; 2644 c[k] = twiddle[k-1]*t2+twiddle[kc-1]*t1; 2645 c[kc]= twiddle[k-1]*t1-twiddle[kc-1]*t2; 2646 } 2647 if ((N&1)==0) 2648 c[NS2] *= 2*twiddle[NS2-1]; 2649 fftplan.exec(c, fct, true); 2650 for (size_t k=1; k<N-1; k+=2) 2651 MPINPLACE(c[k], c[k+1]); 2652 if (!cosine) 2653 for (size_t k=1; k<N; k+=2) 2654 c[k] = -c[k]; 2655 } 2656 } 2657 2658 size_t length() const { return fftplan.length(); } 2659 }; 2660 2661 template<typename T0> class T_dcst4 2662 { 2663 private: 2664 size_t N; 2665 std::unique_ptr<pocketfft_c<T0>> fft; 2666 std::unique_ptr<pocketfft_r<T0>> rfft; 2667 arr<cmplx<T0>> C2; 2668 2669 public: 2670 POCKETFFT_NOINLINE T_dcst4(size_t length) 2671 : N(length), 2672 fft((N&1) ? nullptr : new pocketfft_c<T0>(N/2)), 2673 rfft((N&1)? new pocketfft_r<T0>(N) : nullptr), 2674 C2((N&1) ? 0 : N/2) 2675 { 2676 if ((N&1)==0) 2677 { 2678 sincos_2pibyn<T0> tw(16*N); 2679 for (size_t i=0; i<N/2; ++i) 2680 C2[i] = conj(tw[8*i+1]); 2681 } 2682 } 2683 2684 template<typename T> POCKETFFT_NOINLINE void exec(T c[], T0 fct, 2685 bool /*ortho*/, int /*type*/, bool cosine) const 2686 { 2687 size_t n2 = N/2; 2688 if (!cosine) 2689 for (size_t k=0, kc=N-1; k<n2; ++k, --kc) 2690 std::swap(c[k], c[kc]); 2691 if (N&1) 2692 { 2693 // The following code is derived from the FFTW3 function apply_re11() 2694 // and is released under the 3-clause BSD license with friendly 2695 // permission of Matteo Frigo and Steven G. Johnson. 2696 2697 arr<T> y(N); 2698 { 2699 size_t i=0, m=n2; 2700 for (; m<N; ++i, m+=4) 2701 y[i] = c[m]; 2702 for (; m<2*N; ++i, m+=4) 2703 y[i] = -c[2*N-m-1]; 2704 for (; m<3*N; ++i, m+=4) 2705 y[i] = -c[m-2*N]; 2706 for (; m<4*N; ++i, m+=4) 2707 y[i] = c[4*N-m-1]; 2708 for (; i<N; ++i, m+=4) 2709 y[i] = c[m-4*N]; 2710 } 2711 rfft->exec(y.data(), fct, true); 2712 { 2713 auto SGN = [](size_t i) 2714 { 2715 constexpr T0 sqrt2=T0(1.414213562373095048801688724209698L); 2716 return (i&2) ? -sqrt2 : sqrt2; 2717 }; 2718 c[n2] = y[0]*SGN(n2+1); 2719 size_t i=0, i1=1, k=1; 2720 for (; k<n2; ++i, ++i1, k+=2) 2721 { 2722 c[i ] = y[2*k-1]*SGN(i1) + y[2*k ]*SGN(i); 2723 c[N -i1] = y[2*k-1]*SGN(N -i) - y[2*k ]*SGN(N -i1); 2724 c[n2-i1] = y[2*k+1]*SGN(n2-i) - y[2*k+2]*SGN(n2-i1); 2725 c[n2+i1] = y[2*k+1]*SGN(n2+i+2) + y[2*k+2]*SGN(n2+i1); 2726 } 2727 if (k == n2) 2728 { 2729 c[i ] = y[2*k-1]*SGN(i+1) + y[2*k]*SGN(i); 2730 c[N-i1] = y[2*k-1]*SGN(i+2) + y[2*k]*SGN(i1); 2731 } 2732 } 2733 2734 // FFTW-derived code ends here 2735 } 2736 else 2737 { 2738 // even length algorithm from 2739 // https://www.appletonaudio.com/blog/2013/derivation-of-fast-dct-4-algorithm-based-on-dft/ 2740 arr<cmplx<T>> y(n2); 2741 for(size_t i=0; i<n2; ++i) 2742 { 2743 y[i].Set(c[2*i],c[N-1-2*i]); 2744 y[i] *= C2[i]; 2745 } 2746 fft->exec(y.data(), fct, true); 2747 for(size_t i=0, ic=n2-1; i<n2; ++i, --ic) 2748 { 2749 c[2*i ] = 2*(y[i ].r*C2[i ].r-y[i ].i*C2[i ].i); 2750 c[2*i+1] = -2*(y[ic].i*C2[ic].r+y[ic].r*C2[ic].i); 2751 } 2752 } 2753 if (!cosine) 2754 for (size_t k=1; k<N; k+=2) 2755 c[k] = -c[k]; 2756 } 2757 2758 size_t length() const { return N; } 2759 }; 2760 2761 2762 // 2763 // multi-D infrastructure 2764 // 2765 2766 template<typename T> std::shared_ptr<T> get_plan(size_t length) 2767 { 2768 #if POCKETFFT_CACHE_SIZE==0 2769 return std::make_shared<T>(length); 2770 #else 2771 constexpr size_t nmax=POCKETFFT_CACHE_SIZE; 2772 static std::array<std::shared_ptr<T>, nmax> cache; 2773 static std::array<size_t, nmax> last_access{{0}}; 2774 static size_t access_counter = 0; 2775 static std::mutex mut; 2776 2777 auto find_in_cache = [&]() -> std::shared_ptr<T> 2778 { 2779 for (size_t i=0; i<nmax; ++i) 2780 if (cache[i] && (cache[i]->length()==length)) 2781 { 2782 // no need to update if this is already the most recent entry 2783 if (last_access[i]!=access_counter) 2784 { 2785 last_access[i] = ++access_counter; 2786 // Guard against overflow 2787 if (access_counter == 0) 2788 last_access.fill(0); 2789 } 2790 return cache[i]; 2791 } 2792 2793 return nullptr; 2794 }; 2795 2796 { 2797 std::lock_guard<std::mutex> lock(mut); 2798 auto p = find_in_cache(); 2799 if (p) return p; 2800 } 2801 auto plan = std::make_shared<T>(length); 2802 { 2803 std::lock_guard<std::mutex> lock(mut); 2804 auto p = find_in_cache(); 2805 if (p) return p; 2806 2807 size_t lru = 0; 2808 for (size_t i=1; i<nmax; ++i) 2809 if (last_access[i] < last_access[lru]) 2810 lru = i; 2811 2812 cache[lru] = plan; 2813 last_access[lru] = ++access_counter; 2814 } 2815 return plan; 2816 #endif 2817 } 2818 2819 class arr_info 2820 { 2821 protected: 2822 shape_t shp; 2823 stride_t str; 2824 2825 public: 2826 arr_info(const shape_t &shape_, const stride_t &stride_) 2827 : shp(shape_), str(stride_) {} 2828 size_t ndim() const { return shp.size(); } 2829 size_t size() const { return util::prod(shp); } 2830 const shape_t &shape() const { return shp; } 2831 size_t shape(size_t i) const { return shp[i]; } 2832 const stride_t &stride() const { return str; } 2833 const ptrdiff_t &stride(size_t i) const { return str[i]; } 2834 }; 2835 2836 template<typename T> class cndarr: public arr_info 2837 { 2838 protected: 2839 const char *d; 2840 2841 public: 2842 cndarr(const void *data_, const shape_t &shape_, const stride_t &stride_) 2843 : arr_info(shape_, stride_), 2844 d(reinterpret_cast<const char *>(data_)) {} 2845 const T &operator[](ptrdiff_t ofs) const 2846 { return *reinterpret_cast<const T *>(d+ofs); } 2847 }; 2848 2849 template<typename T> class ndarr: public cndarr<T> 2850 { 2851 public: 2852 ndarr(void *data_, const shape_t &shape_, const stride_t &stride_) 2853 : cndarr<T>::cndarr(const_cast<const void *>(data_), shape_, stride_) 2854 {} 2855 T &operator[](ptrdiff_t ofs) 2856 { return *reinterpret_cast<T *>(const_cast<char *>(cndarr<T>::d+ofs)); } 2857 }; 2858 2859 template<size_t N> class multi_iter 2860 { 2861 private: 2862 shape_t pos; 2863 const arr_info &iarr, &oarr; 2864 ptrdiff_t p_ii, p_i[N], str_i, p_oi, p_o[N], str_o; 2865 size_t idim, rem; 2866 2867 void advance_i() 2868 { 2869 for (int i_=int(pos.size())-1; i_>=0; --i_) 2870 { 2871 auto i = size_t(i_); 2872 if (i==idim) continue; 2873 p_ii += iarr.stride(i); 2874 p_oi += oarr.stride(i); 2875 if (++pos[i] < iarr.shape(i)) 2876 return; 2877 pos[i] = 0; 2878 p_ii -= ptrdiff_t(iarr.shape(i))*iarr.stride(i); 2879 p_oi -= ptrdiff_t(oarr.shape(i))*oarr.stride(i); 2880 } 2881 } 2882 2883 public: 2884 multi_iter(const arr_info &iarr_, const arr_info &oarr_, size_t idim_) 2885 : pos(iarr_.ndim(), 0), iarr(iarr_), oarr(oarr_), p_ii(0), 2886 str_i(iarr.stride(idim_)), p_oi(0), str_o(oarr.stride(idim_)), 2887 idim(idim_), rem(iarr.size()/iarr.shape(idim)) 2888 { 2889 auto nshares = threading::num_threads(); 2890 if (nshares==1) return; 2891 if (nshares==0) throw std::runtime_error("can't run with zero threads"); 2892 auto myshare = threading::thread_id(); 2893 if (myshare>=nshares) throw std::runtime_error("impossible share requested"); 2894 size_t nbase = rem/nshares; 2895 size_t additional = rem%nshares; 2896 size_t lo = myshare*nbase + ((myshare<additional) ? myshare : additional); 2897 size_t hi = lo+nbase+(myshare<additional); 2898 size_t todo = hi-lo; 2899 2900 size_t chunk = rem; 2901 for (size_t i=0; i<pos.size(); ++i) 2902 { 2903 if (i==idim) continue; 2904 chunk /= iarr.shape(i); 2905 size_t n_advance = lo/chunk; 2906 pos[i] += n_advance; 2907 p_ii += ptrdiff_t(n_advance)*iarr.stride(i); 2908 p_oi += ptrdiff_t(n_advance)*oarr.stride(i); 2909 lo -= n_advance*chunk; 2910 } 2911 rem = todo; 2912 } 2913 void advance(size_t n) 2914 { 2915 if (rem<n) throw std::runtime_error("underrun"); 2916 for (size_t i=0; i<n; ++i) 2917 { 2918 p_i[i] = p_ii; 2919 p_o[i] = p_oi; 2920 advance_i(); 2921 } 2922 rem -= n; 2923 } 2924 ptrdiff_t iofs(size_t i) const { return p_i[0] + ptrdiff_t(i)*str_i; } 2925 ptrdiff_t iofs(size_t j, size_t i) const { return p_i[j] + ptrdiff_t(i)*str_i; } 2926 ptrdiff_t oofs(size_t i) const { return p_o[0] + ptrdiff_t(i)*str_o; } 2927 ptrdiff_t oofs(size_t j, size_t i) const { return p_o[j] + ptrdiff_t(i)*str_o; } 2928 size_t length_in() const { return iarr.shape(idim); } 2929 size_t length_out() const { return oarr.shape(idim); } 2930 ptrdiff_t stride_in() const { return str_i; } 2931 ptrdiff_t stride_out() const { return str_o; } 2932 size_t remaining() const { return rem; } 2933 }; 2934 2935 class simple_iter 2936 { 2937 private: 2938 shape_t pos; 2939 const arr_info &arr; 2940 ptrdiff_t p; 2941 size_t rem; 2942 2943 public: 2944 simple_iter(const arr_info &arr_) 2945 : pos(arr_.ndim(), 0), arr(arr_), p(0), rem(arr_.size()) {} 2946 void advance() 2947 { 2948 --rem; 2949 for (int i_=int(pos.size())-1; i_>=0; --i_) 2950 { 2951 auto i = size_t(i_); 2952 p += arr.stride(i); 2953 if (++pos[i] < arr.shape(i)) 2954 return; 2955 pos[i] = 0; 2956 p -= ptrdiff_t(arr.shape(i))*arr.stride(i); 2957 } 2958 } 2959 ptrdiff_t ofs() const { return p; } 2960 size_t remaining() const { return rem; } 2961 }; 2962 2963 class rev_iter 2964 { 2965 private: 2966 shape_t pos; 2967 const arr_info &arr; 2968 std::vector<char> rev_axis; 2969 std::vector<char> rev_jump; 2970 size_t last_axis, last_size; 2971 shape_t shp; 2972 ptrdiff_t p, rp; 2973 size_t rem; 2974 2975 public: 2976 rev_iter(const arr_info &arr_, const shape_t &axes) 2977 : pos(arr_.ndim(), 0), arr(arr_), rev_axis(arr_.ndim(), 0), 2978 rev_jump(arr_.ndim(), 1), p(0), rp(0) 2979 { 2980 for (auto ax: axes) 2981 rev_axis[ax]=1; 2982 last_axis = axes.back(); 2983 last_size = arr.shape(last_axis)/2 + 1; 2984 shp = arr.shape(); 2985 shp[last_axis] = last_size; 2986 rem=1; 2987 for (auto i: shp) 2988 rem *= i; 2989 } 2990 void advance() 2991 { 2992 --rem; 2993 for (int i_=int(pos.size())-1; i_>=0; --i_) 2994 { 2995 auto i = size_t(i_); 2996 p += arr.stride(i); 2997 if (!rev_axis[i]) 2998 rp += arr.stride(i); 2999 else 3000 { 3001 rp -= arr.stride(i); 3002 if (rev_jump[i]) 3003 { 3004 rp += ptrdiff_t(arr.shape(i))*arr.stride(i); 3005 rev_jump[i] = 0; 3006 } 3007 } 3008 if (++pos[i] < shp[i]) 3009 return; 3010 pos[i] = 0; 3011 p -= ptrdiff_t(shp[i])*arr.stride(i); 3012 if (rev_axis[i]) 3013 { 3014 rp -= ptrdiff_t(arr.shape(i)-shp[i])*arr.stride(i); 3015 rev_jump[i] = 1; 3016 } 3017 else 3018 rp -= ptrdiff_t(shp[i])*arr.stride(i); 3019 } 3020 } 3021 ptrdiff_t ofs() const { return p; } 3022 ptrdiff_t rev_ofs() const { return rp; } 3023 size_t remaining() const { return rem; } 3024 }; 3025 3026 template<typename T> struct VTYPE {}; 3027 template <typename T> using vtype_t = typename VTYPE<T>::type; 3028 3029 #ifndef POCKETFFT_NO_VECTORS 3030 template<> struct VTYPE<float> 3031 { 3032 using type = float __attribute__ ((vector_size (VLEN<float>::val*sizeof(float)))); 3033 }; 3034 template<> struct VTYPE<double> 3035 { 3036 using type = double __attribute__ ((vector_size (VLEN<double>::val*sizeof(double)))); 3037 }; 3038 template<> struct VTYPE<long double> 3039 { 3040 using type = long double __attribute__ ((vector_size (VLEN<long double>::val*sizeof(long double)))); 3041 }; 3042 #endif 3043 3044 template<typename T> arr<char> alloc_tmp(const shape_t &shape, 3045 size_t axsize, size_t elemsize) 3046 { 3047 auto othersize = util::prod(shape)/axsize; 3048 auto tmpsize = axsize*((othersize>=VLEN<T>::val) ? VLEN<T>::val : 1); 3049 return arr<char>(tmpsize*elemsize); 3050 } 3051 template<typename T> arr<char> alloc_tmp(const shape_t &shape, 3052 const shape_t &axes, size_t elemsize) 3053 { 3054 size_t fullsize=util::prod(shape); 3055 size_t tmpsize=0; 3056 for (size_t i=0; i<axes.size(); ++i) 3057 { 3058 auto axsize = shape[axes[i]]; 3059 auto othersize = fullsize/axsize; 3060 auto sz = axsize*((othersize>=VLEN<T>::val) ? VLEN<T>::val : 1); 3061 if (sz>tmpsize) tmpsize=sz; 3062 } 3063 return arr<char>(tmpsize*elemsize); 3064 } 3065 3066 template <typename T, size_t vlen> void copy_input(const multi_iter<vlen> &it, 3067 const cndarr<cmplx<T>> &src, cmplx<vtype_t<T>> *POCKETFFT_RESTRICT dst) 3068 { 3069 for (size_t i=0; i<it.length_in(); ++i) 3070 for (size_t j=0; j<vlen; ++j) 3071 { 3072 dst[i].r[j] = src[it.iofs(j,i)].r; 3073 dst[i].i[j] = src[it.iofs(j,i)].i; 3074 } 3075 } 3076 3077 template <typename T, size_t vlen> void copy_input(const multi_iter<vlen> &it, 3078 const cndarr<T> &src, vtype_t<T> *POCKETFFT_RESTRICT dst) 3079 { 3080 for (size_t i=0; i<it.length_in(); ++i) 3081 for (size_t j=0; j<vlen; ++j) 3082 dst[i][j] = src[it.iofs(j,i)]; 3083 } 3084 3085 template <typename T, size_t vlen> void copy_input(const multi_iter<vlen> &it, 3086 const cndarr<T> &src, T *POCKETFFT_RESTRICT dst) 3087 { 3088 if (dst == &src[it.iofs(0)]) return; // in-place 3089 for (size_t i=0; i<it.length_in(); ++i) 3090 dst[i] = src[it.iofs(i)]; 3091 } 3092 3093 template<typename T, size_t vlen> void copy_output(const multi_iter<vlen> &it, 3094 const cmplx<vtype_t<T>> *POCKETFFT_RESTRICT src, ndarr<cmplx<T>> &dst) 3095 { 3096 for (size_t i=0; i<it.length_out(); ++i) 3097 for (size_t j=0; j<vlen; ++j) 3098 dst[it.oofs(j,i)].Set(src[i].r[j],src[i].i[j]); 3099 } 3100 3101 template<typename T, size_t vlen> void copy_output(const multi_iter<vlen> &it, 3102 const vtype_t<T> *POCKETFFT_RESTRICT src, ndarr<T> &dst) 3103 { 3104 for (size_t i=0; i<it.length_out(); ++i) 3105 for (size_t j=0; j<vlen; ++j) 3106 dst[it.oofs(j,i)] = src[i][j]; 3107 } 3108 3109 template<typename T, size_t vlen> void copy_output(const multi_iter<vlen> &it, 3110 const T *POCKETFFT_RESTRICT src, ndarr<T> &dst) 3111 { 3112 if (src == &dst[it.oofs(0)]) return; // in-place 3113 for (size_t i=0; i<it.length_out(); ++i) 3114 dst[it.oofs(i)] = src[i]; 3115 } 3116 3117 template <typename T> struct add_vec { using type = vtype_t<T>; }; 3118 template <typename T> struct add_vec<cmplx<T>> 3119 { using type = cmplx<vtype_t<T>>; }; 3120 template <typename T> using add_vec_t = typename add_vec<T>::type; 3121 3122 template<typename Tplan, typename T, typename T0, typename Exec> 3123 POCKETFFT_NOINLINE void general_nd(const cndarr<T> &in, ndarr<T> &out, 3124 const shape_t &axes, T0 fct, size_t nthreads, const Exec & exec, 3125 const bool allow_inplace=true) 3126 { 3127 std::shared_ptr<Tplan> plan; 3128 3129 for (size_t iax=0; iax<axes.size(); ++iax) 3130 { 3131 size_t len=in.shape(axes[iax]); 3132 if ((!plan) || (len!=plan->length())) 3133 plan = get_plan<Tplan>(len); 3134 3135 threading::thread_map( 3136 util::thread_count(nthreads, in.shape(), axes[iax], VLEN<T>::val), 3137 [&] { 3138 constexpr auto vlen = VLEN<T0>::val; 3139 auto storage = alloc_tmp<T0>(in.shape(), len, sizeof(T)); 3140 const auto &tin(iax==0? in : out); 3141 multi_iter<vlen> it(tin, out, axes[iax]); 3142 #ifndef POCKETFFT_NO_VECTORS 3143 if (vlen>1) 3144 while (it.remaining()>=vlen) 3145 { 3146 it.advance(vlen); 3147 auto tdatav = reinterpret_cast<add_vec_t<T> *>(storage.data()); 3148 exec(it, tin, out, tdatav, *plan, fct); 3149 } 3150 #endif 3151 while (it.remaining()>0) 3152 { 3153 it.advance(1); 3154 auto buf = allow_inplace && it.stride_out() == sizeof(T) ? 3155 &out[it.oofs(0)] : reinterpret_cast<T *>(storage.data()); 3156 exec(it, tin, out, buf, *plan, fct); 3157 } 3158 }); // end of parallel region 3159 fct = T0(1); // factor has been applied, use 1 for remaining axes 3160 } 3161 } 3162 3163 struct ExecC2C 3164 { 3165 bool forward; 3166 3167 template <typename T0, typename T, size_t vlen> void operator () ( 3168 const multi_iter<vlen> &it, const cndarr<cmplx<T0>> &in, 3169 ndarr<cmplx<T0>> &out, T * buf, const pocketfft_c<T0> &plan, T0 fct) const 3170 { 3171 copy_input(it, in, buf); 3172 plan.exec(buf, fct, forward); 3173 copy_output(it, buf, out); 3174 } 3175 }; 3176 3177 template <typename T, size_t vlen> void copy_hartley(const multi_iter<vlen> &it, 3178 const vtype_t<T> *POCKETFFT_RESTRICT src, ndarr<T> &dst) 3179 { 3180 for (size_t j=0; j<vlen; ++j) 3181 dst[it.oofs(j,0)] = src[0][j]; 3182 size_t i=1, i1=1, i2=it.length_out()-1; 3183 for (i=1; i<it.length_out()-1; i+=2, ++i1, --i2) 3184 for (size_t j=0; j<vlen; ++j) 3185 { 3186 dst[it.oofs(j,i1)] = src[i][j]+src[i+1][j]; 3187 dst[it.oofs(j,i2)] = src[i][j]-src[i+1][j]; 3188 } 3189 if (i<it.length_out()) 3190 for (size_t j=0; j<vlen; ++j) 3191 dst[it.oofs(j,i1)] = src[i][j]; 3192 } 3193 3194 template <typename T, size_t vlen> void copy_hartley(const multi_iter<vlen> &it, 3195 const T *POCKETFFT_RESTRICT src, ndarr<T> &dst) 3196 { 3197 dst[it.oofs(0)] = src[0]; 3198 size_t i=1, i1=1, i2=it.length_out()-1; 3199 for (i=1; i<it.length_out()-1; i+=2, ++i1, --i2) 3200 { 3201 dst[it.oofs(i1)] = src[i]+src[i+1]; 3202 dst[it.oofs(i2)] = src[i]-src[i+1]; 3203 } 3204 if (i<it.length_out()) 3205 dst[it.oofs(i1)] = src[i]; 3206 } 3207 3208 struct ExecHartley 3209 { 3210 template <typename T0, typename T, size_t vlen> void operator () ( 3211 const multi_iter<vlen> &it, const cndarr<T0> &in, ndarr<T0> &out, 3212 T * buf, const pocketfft_r<T0> &plan, T0 fct) const 3213 { 3214 copy_input(it, in, buf); 3215 plan.exec(buf, fct, true); 3216 copy_hartley(it, buf, out); 3217 } 3218 }; 3219 3220 struct ExecDcst 3221 { 3222 bool ortho; 3223 int type; 3224 bool cosine; 3225 3226 template <typename T0, typename T, typename Tplan, size_t vlen> 3227 void operator () (const multi_iter<vlen> &it, const cndarr<T0> &in, 3228 ndarr<T0> &out, T * buf, const Tplan &plan, T0 fct) const 3229 { 3230 copy_input(it, in, buf); 3231 plan.exec(buf, fct, ortho, type, cosine); 3232 copy_output(it, buf, out); 3233 } 3234 }; 3235 3236 template<typename T> POCKETFFT_NOINLINE void general_r2c( 3237 const cndarr<T> &in, ndarr<cmplx<T>> &out, size_t axis, bool forward, T fct, 3238 size_t nthreads) 3239 { 3240 auto plan = get_plan<pocketfft_r<T>>(in.shape(axis)); 3241 size_t len=in.shape(axis); 3242 threading::thread_map( 3243 util::thread_count(nthreads, in.shape(), axis, VLEN<T>::val), 3244 [&] { 3245 constexpr auto vlen = VLEN<T>::val; 3246 auto storage = alloc_tmp<T>(in.shape(), len, sizeof(T)); 3247 multi_iter<vlen> it(in, out, axis); 3248 #ifndef POCKETFFT_NO_VECTORS 3249 if (vlen>1) 3250 while (it.remaining()>=vlen) 3251 { 3252 it.advance(vlen); 3253 auto tdatav = reinterpret_cast<vtype_t<T> *>(storage.data()); 3254 copy_input(it, in, tdatav); 3255 plan->exec(tdatav, fct, true); 3256 for (size_t j=0; j<vlen; ++j) 3257 out[it.oofs(j,0)].Set(tdatav[0][j]); 3258 size_t i=1, ii=1; 3259 if (forward) 3260 for (; i<len-1; i+=2, ++ii) 3261 for (size_t j=0; j<vlen; ++j) 3262 out[it.oofs(j,ii)].Set(tdatav[i][j], tdatav[i+1][j]); 3263 else 3264 for (; i<len-1; i+=2, ++ii) 3265 for (size_t j=0; j<vlen; ++j) 3266 out[it.oofs(j,ii)].Set(tdatav[i][j], -tdatav[i+1][j]); 3267 if (i<len) 3268 for (size_t j=0; j<vlen; ++j) 3269 out[it.oofs(j,ii)].Set(tdatav[i][j]); 3270 } 3271 #endif 3272 while (it.remaining()>0) 3273 { 3274 it.advance(1); 3275 auto tdata = reinterpret_cast<T *>(storage.data()); 3276 copy_input(it, in, tdata); 3277 plan->exec(tdata, fct, true); 3278 out[it.oofs(0)].Set(tdata[0]); 3279 size_t i=1, ii=1; 3280 if (forward) 3281 for (; i<len-1; i+=2, ++ii) 3282 out[it.oofs(ii)].Set(tdata[i], tdata[i+1]); 3283 else 3284 for (; i<len-1; i+=2, ++ii) 3285 out[it.oofs(ii)].Set(tdata[i], -tdata[i+1]); 3286 if (i<len) 3287 out[it.oofs(ii)].Set(tdata[i]); 3288 } 3289 }); // end of parallel region 3290 } 3291 template<typename T> POCKETFFT_NOINLINE void general_c2r( 3292 const cndarr<cmplx<T>> &in, ndarr<T> &out, size_t axis, bool forward, T fct, 3293 size_t nthreads) 3294 { 3295 auto plan = get_plan<pocketfft_r<T>>(out.shape(axis)); 3296 size_t len=out.shape(axis); 3297 threading::thread_map( 3298 util::thread_count(nthreads, in.shape(), axis, VLEN<T>::val), 3299 [&] { 3300 constexpr auto vlen = VLEN<T>::val; 3301 auto storage = alloc_tmp<T>(out.shape(), len, sizeof(T)); 3302 multi_iter<vlen> it(in, out, axis); 3303 #ifndef POCKETFFT_NO_VECTORS 3304 if (vlen>1) 3305 while (it.remaining()>=vlen) 3306 { 3307 it.advance(vlen); 3308 auto tdatav = reinterpret_cast<vtype_t<T> *>(storage.data()); 3309 for (size_t j=0; j<vlen; ++j) 3310 tdatav[0][j]=in[it.iofs(j,0)].r; 3311 { 3312 size_t i=1, ii=1; 3313 if (forward) 3314 for (; i<len-1; i+=2, ++ii) 3315 for (size_t j=0; j<vlen; ++j) 3316 { 3317 tdatav[i ][j] = in[it.iofs(j,ii)].r; 3318 tdatav[i+1][j] = -in[it.iofs(j,ii)].i; 3319 } 3320 else 3321 for (; i<len-1; i+=2, ++ii) 3322 for (size_t j=0; j<vlen; ++j) 3323 { 3324 tdatav[i ][j] = in[it.iofs(j,ii)].r; 3325 tdatav[i+1][j] = in[it.iofs(j,ii)].i; 3326 } 3327 if (i<len) 3328 for (size_t j=0; j<vlen; ++j) 3329 tdatav[i][j] = in[it.iofs(j,ii)].r; 3330 } 3331 plan->exec(tdatav, fct, false); 3332 copy_output(it, tdatav, out); 3333 } 3334 #endif 3335 while (it.remaining()>0) 3336 { 3337 it.advance(1); 3338 auto tdata = reinterpret_cast<T *>(storage.data()); 3339 tdata[0]=in[it.iofs(0)].r; 3340 { 3341 size_t i=1, ii=1; 3342 if (forward) 3343 for (; i<len-1; i+=2, ++ii) 3344 { 3345 tdata[i ] = in[it.iofs(ii)].r; 3346 tdata[i+1] = -in[it.iofs(ii)].i; 3347 } 3348 else 3349 for (; i<len-1; i+=2, ++ii) 3350 { 3351 tdata[i ] = in[it.iofs(ii)].r; 3352 tdata[i+1] = in[it.iofs(ii)].i; 3353 } 3354 if (i<len) 3355 tdata[i] = in[it.iofs(ii)].r; 3356 } 3357 plan->exec(tdata, fct, false); 3358 copy_output(it, tdata, out); 3359 } 3360 }); // end of parallel region 3361 } 3362 3363 struct ExecR2R 3364 { 3365 bool r2c, forward; 3366 3367 template <typename T0, typename T, size_t vlen> void operator () ( 3368 const multi_iter<vlen> &it, const cndarr<T0> &in, ndarr<T0> &out, T * buf, 3369 const pocketfft_r<T0> &plan, T0 fct) const 3370 { 3371 copy_input(it, in, buf); 3372 if ((!r2c) && forward) 3373 for (size_t i=2; i<it.length_out(); i+=2) 3374 buf[i] = -buf[i]; 3375 plan.exec(buf, fct, forward); 3376 if (r2c && (!forward)) 3377 for (size_t i=2; i<it.length_out(); i+=2) 3378 buf[i] = -buf[i]; 3379 copy_output(it, buf, out); 3380 } 3381 }; 3382 3383 template<typename T> void c2c(const shape_t &shape, const stride_t &stride_in, 3384 const stride_t &stride_out, const shape_t &axes, bool forward, 3385 const std::complex<T> *data_in, std::complex<T> *data_out, T fct, 3386 size_t nthreads=1) 3387 { 3388 if (util::prod(shape)==0) return; 3389 util::sanity_check(shape, stride_in, stride_out, data_in==data_out, axes); 3390 cndarr<cmplx<T>> ain(data_in, shape, stride_in); 3391 ndarr<cmplx<T>> aout(data_out, shape, stride_out); 3392 general_nd<pocketfft_c<T>>(ain, aout, axes, fct, nthreads, ExecC2C{forward}); 3393 } 3394 3395 template<typename T> void dct(const shape_t &shape, 3396 const stride_t &stride_in, const stride_t &stride_out, const shape_t &axes, 3397 int type, const T *data_in, T *data_out, T fct, bool ortho, size_t nthreads=1) 3398 { 3399 if ((type<1) || (type>4)) throw std::invalid_argument("invalid DCT type"); 3400 if (util::prod(shape)==0) return; 3401 util::sanity_check(shape, stride_in, stride_out, data_in==data_out, axes); 3402 cndarr<T> ain(data_in, shape, stride_in); 3403 ndarr<T> aout(data_out, shape, stride_out); 3404 const ExecDcst exec{ortho, type, true}; 3405 if (type==1) 3406 general_nd<T_dct1<T>>(ain, aout, axes, fct, nthreads, exec); 3407 else if (type==4) 3408 general_nd<T_dcst4<T>>(ain, aout, axes, fct, nthreads, exec); 3409 else 3410 general_nd<T_dcst23<T>>(ain, aout, axes, fct, nthreads, exec); 3411 } 3412 3413 template<typename T> void dst(const shape_t &shape, 3414 const stride_t &stride_in, const stride_t &stride_out, const shape_t &axes, 3415 int type, const T *data_in, T *data_out, T fct, bool ortho, size_t nthreads=1) 3416 { 3417 if ((type<1) || (type>4)) throw std::invalid_argument("invalid DST type"); 3418 if (util::prod(shape)==0) return; 3419 util::sanity_check(shape, stride_in, stride_out, data_in==data_out, axes); 3420 cndarr<T> ain(data_in, shape, stride_in); 3421 ndarr<T> aout(data_out, shape, stride_out); 3422 const ExecDcst exec{ortho, type, false}; 3423 if (type==1) 3424 general_nd<T_dst1<T>>(ain, aout, axes, fct, nthreads, exec); 3425 else if (type==4) 3426 general_nd<T_dcst4<T>>(ain, aout, axes, fct, nthreads, exec); 3427 else 3428 general_nd<T_dcst23<T>>(ain, aout, axes, fct, nthreads, exec); 3429 } 3430 3431 template<typename T> void r2c(const shape_t &shape_in, 3432 const stride_t &stride_in, const stride_t &stride_out, size_t axis, 3433 bool forward, const T *data_in, std::complex<T> *data_out, T fct, 3434 size_t nthreads=1) 3435 { 3436 if (util::prod(shape_in)==0) return; 3437 util::sanity_check(shape_in, stride_in, stride_out, false, axis); 3438 cndarr<T> ain(data_in, shape_in, stride_in); 3439 shape_t shape_out(shape_in); 3440 shape_out[axis] = shape_in[axis]/2 + 1; 3441 ndarr<cmplx<T>> aout(data_out, shape_out, stride_out); 3442 general_r2c(ain, aout, axis, forward, fct, nthreads); 3443 } 3444 3445 template<typename T> void r2c(const shape_t &shape_in, 3446 const stride_t &stride_in, const stride_t &stride_out, const shape_t &axes, 3447 bool forward, const T *data_in, std::complex<T> *data_out, T fct, 3448 size_t nthreads=1) 3449 { 3450 if (util::prod(shape_in)==0) return; 3451 util::sanity_check(shape_in, stride_in, stride_out, false, axes); 3452 r2c(shape_in, stride_in, stride_out, axes.back(), forward, data_in, data_out, 3453 fct, nthreads); 3454 if (axes.size()==1) return; 3455 3456 shape_t shape_out(shape_in); 3457 shape_out[axes.back()] = shape_in[axes.back()]/2 + 1; 3458 auto newaxes = shape_t{axes.begin(), --axes.end()}; 3459 c2c(shape_out, stride_out, stride_out, newaxes, forward, data_out, data_out, 3460 T(1), nthreads); 3461 } 3462 3463 template<typename T> void c2r(const shape_t &shape_out, 3464 const stride_t &stride_in, const stride_t &stride_out, size_t axis, 3465 bool forward, const std::complex<T> *data_in, T *data_out, T fct, 3466 size_t nthreads=1) 3467 { 3468 if (util::prod(shape_out)==0) return; 3469 util::sanity_check(shape_out, stride_in, stride_out, false, axis); 3470 shape_t shape_in(shape_out); 3471 shape_in[axis] = shape_out[axis]/2 + 1; 3472 cndarr<cmplx<T>> ain(data_in, shape_in, stride_in); 3473 ndarr<T> aout(data_out, shape_out, stride_out); 3474 general_c2r(ain, aout, axis, forward, fct, nthreads); 3475 } 3476 3477 template<typename T> void c2r(const shape_t &shape_out, 3478 const stride_t &stride_in, const stride_t &stride_out, const shape_t &axes, 3479 bool forward, const std::complex<T> *data_in, T *data_out, T fct, 3480 size_t nthreads=1) 3481 { 3482 if (util::prod(shape_out)==0) return; 3483 if (axes.size()==1) 3484 return c2r(shape_out, stride_in, stride_out, axes[0], forward, 3485 data_in, data_out, fct, nthreads); 3486 util::sanity_check(shape_out, stride_in, stride_out, false, axes); 3487 auto shape_in = shape_out; 3488 shape_in[axes.back()] = shape_out[axes.back()]/2 + 1; 3489 auto nval = util::prod(shape_in); 3490 stride_t stride_inter(shape_in.size()); 3491 stride_inter.back() = sizeof(cmplx<T>); 3492 for (int i=int(shape_in.size())-2; i>=0; --i) 3493 stride_inter[size_t(i)] = 3494 stride_inter[size_t(i+1)]*ptrdiff_t(shape_in[size_t(i+1)]); 3495 arr<std::complex<T>> tmp(nval); 3496 auto newaxes = shape_t{axes.begin(), --axes.end()}; 3497 c2c(shape_in, stride_in, stride_inter, newaxes, forward, data_in, tmp.data(), 3498 T(1), nthreads); 3499 c2r(shape_out, stride_inter, stride_out, axes.back(), forward, 3500 tmp.data(), data_out, fct, nthreads); 3501 } 3502 3503 template<typename T> void r2r_fftpack(const shape_t &shape, 3504 const stride_t &stride_in, const stride_t &stride_out, const shape_t &axes, 3505 bool real2hermitian, bool forward, const T *data_in, T *data_out, T fct, 3506 size_t nthreads=1) 3507 { 3508 if (util::prod(shape)==0) return; 3509 util::sanity_check(shape, stride_in, stride_out, data_in==data_out, axes); 3510 cndarr<T> ain(data_in, shape, stride_in); 3511 ndarr<T> aout(data_out, shape, stride_out); 3512 general_nd<pocketfft_r<T>>(ain, aout, axes, fct, nthreads, 3513 ExecR2R{real2hermitian, forward}); 3514 } 3515 3516 template<typename T> void r2r_separable_hartley(const shape_t &shape, 3517 const stride_t &stride_in, const stride_t &stride_out, const shape_t &axes, 3518 const T *data_in, T *data_out, T fct, size_t nthreads=1) 3519 { 3520 if (util::prod(shape)==0) return; 3521 util::sanity_check(shape, stride_in, stride_out, data_in==data_out, axes); 3522 cndarr<T> ain(data_in, shape, stride_in); 3523 ndarr<T> aout(data_out, shape, stride_out); 3524 general_nd<pocketfft_r<T>>(ain, aout, axes, fct, nthreads, ExecHartley{}, 3525 false); 3526 } 3527 3528 template<typename T> void r2r_genuine_hartley(const shape_t &shape, 3529 const stride_t &stride_in, const stride_t &stride_out, const shape_t &axes, 3530 const T *data_in, T *data_out, T fct, size_t nthreads=1) 3531 { 3532 if (util::prod(shape)==0) return; 3533 if (axes.size()==1) 3534 return r2r_separable_hartley(shape, stride_in, stride_out, axes, data_in, 3535 data_out, fct, nthreads); 3536 util::sanity_check(shape, stride_in, stride_out, data_in==data_out, axes); 3537 shape_t tshp(shape); 3538 tshp[axes.back()] = tshp[axes.back()]/2+1; 3539 arr<std::complex<T>> tdata(util::prod(tshp)); 3540 stride_t tstride(shape.size()); 3541 tstride.back()=sizeof(std::complex<T>); 3542 for (size_t i=tstride.size()-1; i>0; --i) 3543 tstride[i-1]=tstride[i]*ptrdiff_t(tshp[i]); 3544 r2c(shape, stride_in, tstride, axes, true, data_in, tdata.data(), fct, nthreads); 3545 cndarr<cmplx<T>> atmp(tdata.data(), tshp, tstride); 3546 ndarr<T> aout(data_out, shape, stride_out); 3547 simple_iter iin(atmp); 3548 rev_iter iout(aout, axes); 3549 while(iin.remaining()>0) 3550 { 3551 auto v = atmp[iin.ofs()]; 3552 aout[iout.ofs()] = v.r+v.i; 3553 aout[iout.rev_ofs()] = v.r-v.i; 3554 iin.advance(); iout.advance(); 3555 } 3556 } 3557 3558 } // namespace detail 3559 3560 using detail::FORWARD; 3561 using detail::BACKWARD; 3562 using detail::shape_t; 3563 using detail::stride_t; 3564 using detail::c2c; 3565 using detail::c2r; 3566 using detail::r2c; 3567 using detail::r2r_fftpack; 3568 using detail::r2r_separable_hartley; 3569 using detail::r2r_genuine_hartley; 3570 using detail::dct; 3571 using detail::dst; 3572 3573 } // namespace pocketfft 3574 3575 #undef POCKETFFT_NOINLINE 3576 #undef POCKETFFT_RESTRICT 3577 3578 #endif // POCKETFFT_HDRONLY_H 3579