1 /* 2 * This code is free software; you can redistribute it and/or modify 3 * it under the terms of the GNU General Public License as published by 4 * the Free Software Foundation; either version 2 of the License, or 5 * (at your option) any later version. 6 * 7 * This code is distributed in the hope that it will be useful, 8 * but WITHOUT ANY WARRANTY; without even the implied warranty of 9 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 10 * GNU General Public License for more details. 11 * 12 * You should have received a copy of the GNU General Public License 13 * along with this code; if not, write to the Free Software 14 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 15 */ 16 17 /* 18 * Copyright (C) 2020-2021 Max-Planck-Society 19 * Author: Martin Reinecke 20 */ 21 22 #ifndef DUCC0_TOTALCONVOLVE_H 23 #define DUCC0_TOTALCONVOLVE_H 24 25 #include <cstdint> 26 #include <algorithm> 27 #include <cstddef> 28 #include <functional> 29 #include <memory> 30 #include <type_traits> 31 #include <vector> 32 #include <complex> 33 #include <cmath> 34 #include <mutex> 35 #include "ducc0/infra/error_handling.h" 36 #include "ducc0/infra/threading.h" 37 #include "ducc0/math/constants.h" 38 #include "ducc0/math/gridding_kernel.h" 39 #include "ducc0/infra/mav.h" 40 #include "ducc0/infra/simd.h" 41 #include "ducc0/infra/aligned_array.h" 42 #include "ducc0/infra/useful_macros.h" 43 #include "ducc0/infra/bucket_sort.h" 44 #include "ducc0/sht/sht.h" 45 #include "ducc0/sht/alm.h" 46 #include "ducc0/fft/fft1d.h" 47 #include "ducc0/fft/fft.h" 48 #include "ducc0/math/math_utils.h" 49 50 namespace ducc0 { 51 52 namespace detail_totalconvolve { 53 54 using namespace std; 55 56 template<typename T> class ConvolverPlan 57 { 58 protected: 59 constexpr static auto vlen = min<size_t>(8, native_simd<T>::size()); 60 using Tsimd = typename simd_select<T, vlen>::type; 61 62 size_t nthreads; 63 size_t lmax, kmax; 64 // _s: small grid 65 // _b: oversampled grid 66 // no suffix: grid with borders 67 size_t nphi_s, ntheta_s, npsi_s, nphi_b, ntheta_b, npsi_b; 68 double dphi, dtheta, dpsi, xdphi, xdtheta, xdpsi; 69 70 shared_ptr<HornerKernel> kernel; 71 size_t nbphi, nbtheta; 72 size_t nphi, ntheta; 73 double phi0, theta0; 74 getKernel(size_t axlen,size_t axlen2)75 cmav<T,1> getKernel(size_t axlen, size_t axlen2) const 76 { 77 auto axlen_big = max(axlen, axlen2); 78 auto axlen_small = min(axlen, axlen2); 79 auto fct = kernel->corfunc(axlen_small/2+1, 1./axlen_big, nthreads); 80 vmav<T,1> k2({axlen}); 81 mav_apply([](T &v){v=T(0);}, 1, k2); 82 { 83 k2(0) = T(fct[0])/axlen_small; 84 size_t i=1; 85 for (; 2*i<axlen_small; ++i) 86 k2(2*i-1) = T(fct[i])/axlen_small; 87 if (2*i==axlen_small) 88 k2(2*i-1) = T(0.5)*T(fct[i])/axlen_small; 89 } 90 pocketfft_r<T> plan(axlen); 91 plan.exec(k2.data(), T(1), false, nthreads); 92 return k2; 93 } 94 correct(vmav<T,2> & arr,int spin)95 void correct(vmav<T,2> &arr, int spin) const 96 { 97 T sfct = (spin&1) ? -1 : 1; 98 vmav<T,2> tmp({nphi_b,nphi_s}); 99 // copy and extend to second half 100 for (size_t j=0; j<nphi_s; ++j) 101 tmp(0,j) = arr(0,j); 102 for (size_t i=1, i2=nphi_s-1; i+1<ntheta_s; ++i,--i2) 103 for (size_t j=0,j2=nphi_s/2; j<nphi_s; ++j,++j2) 104 { 105 if (j2>=nphi_s) j2-=nphi_s; 106 tmp(i,j2) = arr(i,j2); 107 tmp(i2,j) = sfct*tmp(i,j2); 108 } 109 for (size_t j=0; j<nphi_s; ++j) 110 tmp(ntheta_s-1,j) = arr(ntheta_s-1,j); 111 auto fct = kernel->corfunc(nphi_s/2+1, 1./nphi_b, nthreads); 112 vector<T> k2(fct.size()); 113 for (size_t i=0; i<fct.size(); ++i) k2[i] = T(fct[i]/nphi_s); 114 vfmav<T> ftmp(tmp); 115 cfmav<T> ftmp0(subarray<2>(tmp, {{0, nphi_s}, {0, nphi_s}})); 116 auto kern = getKernel(nphi_s, nphi_b); 117 convolve_axis(ftmp0, ftmp, 0, kern, nthreads); 118 cfmav<T> ftmp2(subarray<2>(tmp, {{0, ntheta_b}, {0, nphi_s}})); 119 vfmav<T> farr(arr); 120 convolve_axis(ftmp2, farr, 1, kern, nthreads); 121 } decorrect(vmav<T,2> & arr,int spin)122 void decorrect(vmav<T,2> &arr, int spin) const 123 { 124 T sfct = (spin&1) ? -1 : 1; 125 vmav<T,2> tmp({nphi_b,nphi_s}); 126 auto fct = kernel->corfunc(nphi_s/2+1, 1./nphi_b, nthreads); 127 vector<T> k2(fct.size()); 128 for (size_t i=0; i<fct.size(); ++i) k2[i] = T(fct[i]/nphi_s); 129 cfmav<T> farr(arr); 130 vfmav<T> ftmp2(subarray<2>(tmp, {{0, ntheta_b}, {0, nphi_s}})); 131 auto kern = getKernel(nphi_b, nphi_s); 132 convolve_axis(farr, ftmp2, 1, kern, nthreads); 133 // extend to second half 134 for (size_t i=1, i2=nphi_b-1; i+1<ntheta_b; ++i,--i2) 135 for (size_t j=0,j2=nphi_s/2; j<nphi_s; ++j,++j2) 136 { 137 if (j2>=nphi_s) j2-=nphi_s; 138 tmp(i2,j) = sfct*tmp(i,j2); 139 } 140 cfmav<T> ftmp(tmp); 141 vfmav<T> ftmp0(subarray<2>(tmp, {{0, nphi_s}, {0, nphi_s}})); 142 convolve_axis(ftmp, ftmp0, 0, kern, nthreads); 143 for (size_t j=0; j<nphi_s; ++j) 144 arr(0,j) = T(0.5)*tmp(0,j); 145 for (size_t i=1; i+1<ntheta_s; ++i) 146 for (size_t j=0; j<nphi_s; ++j) 147 arr(i,j) = tmp(i,j); 148 for (size_t j=0; j<nphi_s; ++j) 149 arr(ntheta_s-1,j) = T(0.5)*tmp(ntheta_s-1,j); 150 } 151 getIdx(const cmav<T,1> & theta,const cmav<T,1> & phi,const cmav<T,1> & psi,size_t patch_ntheta,size_t patch_nphi,size_t itheta0,size_t iphi0,size_t supp)152 quick_array<uint32_t> getIdx(const cmav<T,1> &theta, const cmav<T,1> &phi, const cmav<T,1> &psi, 153 size_t patch_ntheta, size_t patch_nphi, size_t itheta0, size_t iphi0, size_t supp) const 154 { 155 size_t nptg = theta.shape(0); 156 constexpr size_t cellsize=8; 157 size_t nct = patch_ntheta/cellsize+1, 158 ncp = patch_nphi/cellsize+1, 159 ncpsi = npsi_b/cellsize+1; 160 double theta0 = (int(itheta0)-int(nbtheta))*dtheta, 161 phi0 = (int(iphi0)-int(nbphi))*dphi; 162 double theta_lo=theta0, theta_hi=theta_lo+(patch_ntheta+1)*dtheta; 163 double phi_lo=phi0, phi_hi=phi_lo+(patch_nphi+1)*dphi; 164 MR_assert(uint64_t(nct)*uint64_t(ncp)*uint64_t(ncpsi)<(uint64_t(1)<<32), 165 "key space too large"); 166 167 quick_array<uint32_t> key(nptg); 168 execParallel(nptg, nthreads, [&](size_t lo, size_t hi) 169 { 170 for (size_t i=lo; i<hi; ++i) 171 { 172 MR_assert((theta(i)>=theta_lo) && (theta(i)<=theta_hi), "theta out of range: ", theta(i)); 173 MR_assert((phi(i)>=phi_lo) && (phi(i)<=phi_hi), "phi out of range: ", phi(i)); 174 auto ftheta = (theta(i)-theta0)*xdtheta-supp*0.5; 175 auto itheta = size_t(ftheta+1); 176 auto fphi = (phi(i)-phi0)*xdphi-supp*0.5; 177 auto iphi = size_t(fphi+1); 178 auto fpsi = psi(i)*xdpsi; 179 fpsi = fmodulo(fpsi, double(npsi_b)); 180 size_t ipsi = size_t(fpsi); 181 ipsi /= cellsize; 182 itheta /= cellsize; 183 iphi /= cellsize; 184 MR_assert(itheta<nct, "bad itheta"); 185 MR_assert(iphi<ncp, "bad iphi"); 186 key[i] = (itheta*ncp+iphi)*ncpsi+ipsi; 187 } 188 }); 189 quick_array<uint32_t> res(key.size()); 190 bucket_sort(&key[0], &res[0], key.size(), ncp*nct*ncpsi, nthreads); 191 return res; 192 } 193 194 template<size_t supp> class WeightHelper 195 { 196 public: 197 static constexpr size_t vlen = Tsimd::size(); 198 static constexpr size_t nvec = (supp+vlen-1)/vlen; 199 const ConvolverPlan &plan; 200 union kbuf { 201 T scalar[3*nvec*vlen]; 202 Tsimd simd[3*nvec]; 203 #if defined(_MSC_VER) kbuf()204 kbuf() {} 205 #endif 206 }; 207 kbuf buf; 208 209 private: 210 TemplateKernel<supp, Tsimd> tkrn; 211 double mytheta0, myphi0; 212 213 public: WeightHelper(const ConvolverPlan & plan_,const mav_info<3> & info,size_t itheta0,size_t iphi0)214 WeightHelper(const ConvolverPlan &plan_, const mav_info<3> &info, size_t itheta0, size_t iphi0) 215 : plan(plan_), 216 tkrn(*plan.kernel), 217 mytheta0(plan.theta0+itheta0*plan.dtheta), 218 myphi0(plan.phi0+iphi0*plan.dphi), 219 wpsi(&buf.scalar[0]), 220 wtheta(&buf.scalar[nvec*vlen]), 221 wphi(&buf.simd[2*nvec]), 222 jumptheta(info.stride(1)) 223 { 224 MR_assert(info.stride(2)==1, "last axis of cube must be contiguous"); 225 } prep(double theta,double phi,double psi)226 void prep(double theta, double phi, double psi) 227 { 228 auto ftheta = (theta-mytheta0)*plan.xdtheta-supp*0.5; 229 itheta = size_t(ftheta+1); 230 ftheta = -1+(itheta-ftheta)*2; 231 auto fphi = (phi-myphi0)*plan.xdphi-supp*0.5; 232 iphi = size_t(fphi+1); 233 fphi = -1+(iphi-fphi)*2; 234 auto fpsi = psi*plan.xdpsi-supp*0.5; 235 fpsi = fmodulo(fpsi, double(plan.npsi_b)); 236 ipsi = size_t(fpsi+1); 237 fpsi = -1+(ipsi-fpsi)*2; 238 if (ipsi>=plan.npsi_b) ipsi-=plan.npsi_b; 239 tkrn.eval3(T(fpsi), T(ftheta), T(fphi), &buf.simd[0]); 240 } 241 size_t itheta, iphi, ipsi; 242 const T * DUCC0_RESTRICT wpsi; 243 const T * DUCC0_RESTRICT wtheta; 244 const Tsimd * DUCC0_RESTRICT wphi; 245 ptrdiff_t jumptheta; 246 }; 247 248 // prefetching distance 249 static constexpr size_t pfdist=2; 250 interpolx(size_t supp_,const cmav<T,3> & cube,size_t itheta0,size_t iphi0,const cmav<T,1> & theta,const cmav<T,1> & phi,const cmav<T,1> & psi,vmav<T,1> & signal)251 template<size_t supp> void interpolx(size_t supp_, const cmav<T,3> &cube, 252 size_t itheta0, size_t iphi0, const cmav<T,1> &theta, const cmav<T,1> &phi, 253 const cmav<T,1> &psi, vmav<T,1> &signal) const 254 { 255 if constexpr (supp>=8) 256 if (supp_<=supp/2) return interpolx<supp/2>(supp_, cube, itheta0, iphi0, theta, phi, psi, signal); 257 if constexpr (supp>4) 258 if (supp_<supp) return interpolx<supp-1>(supp_, cube, itheta0, iphi0, theta, phi, psi, signal); 259 MR_assert(supp_==supp, "requested support ou of range"); 260 261 MR_assert(cube.stride(2)==1, "last axis of cube must be contiguous"); 262 MR_assert(phi.shape(0)==theta.shape(0), "array shape mismatch"); 263 MR_assert(psi.shape(0)==theta.shape(0), "array shape mismatch"); 264 MR_assert(signal.shape(0)==theta.shape(0), "array shape mismatch"); 265 static constexpr size_t vlen = Tsimd::size(); 266 static constexpr size_t nvec = (supp+vlen-1)/vlen; 267 MR_assert(cube.shape(0)==npsi_b, "bad psi dimension"); 268 auto idx = getIdx(theta, phi, psi, cube.shape(1), cube.shape(2), itheta0, iphi0, supp); 269 270 execStatic(idx.size(), nthreads, 0, [&](Scheduler &sched) 271 { 272 WeightHelper<supp> hlp(*this, cube, itheta0, iphi0); 273 while (auto rng=sched.getNext()) for(auto ind=rng.lo; ind<rng.hi; ++ind) 274 { 275 if (ind+pfdist<rng.hi) 276 { 277 size_t i=idx[ind+pfdist]; 278 DUCC0_PREFETCH_R(&theta(i)); 279 DUCC0_PREFETCH_R(&phi(i)) 280 DUCC0_PREFETCH_R(&psi(i)); 281 DUCC0_PREFETCH_R(&signal(i)); 282 DUCC0_PREFETCH_W(&signal(i)); 283 } 284 size_t i=idx[ind]; 285 hlp.prep(theta(i), phi(i), psi(i)); 286 auto ipsi = hlp.ipsi; 287 const T * DUCC0_RESTRICT ptr = &cube(ipsi,hlp.itheta,hlp.iphi); 288 Tsimd res=0; 289 if constexpr(nvec==1) 290 { 291 for (size_t ipsic=0; ipsic<supp; ++ipsic) 292 { 293 const T * DUCC0_RESTRICT ptr2 = ptr; 294 Tsimd tres=0; 295 for (size_t itheta=0; itheta<supp; ++itheta, ptr2+=hlp.jumptheta) 296 tres += hlp.wtheta[itheta]*Tsimd(ptr2, element_aligned_tag()); 297 res += tres*hlp.wpsi[ipsic]; 298 if (++ipsi>=npsi_b) ipsi=0; 299 ptr = &cube(ipsi,hlp.itheta,hlp.iphi); 300 } 301 res *= hlp.wphi[0]; 302 } 303 else 304 { 305 for (size_t ipsic=0; ipsic<supp; ++ipsic) 306 { 307 const T * DUCC0_RESTRICT ptr2 = ptr; 308 Tsimd tres=0; 309 for (size_t itheta=0; itheta<supp; ++itheta, ptr2+=hlp.jumptheta) 310 for (size_t iphi=0; iphi<nvec; ++iphi) 311 tres += hlp.wtheta[itheta]*hlp.wphi[iphi]*Tsimd(ptr2+iphi*vlen,element_aligned_tag()); 312 res += tres*hlp.wpsi[ipsic]; 313 if (++ipsi>=npsi_b) ipsi=0; 314 ptr = &cube(ipsi,hlp.itheta,hlp.iphi); 315 } 316 } 317 signal(i) = reduce(res, std::plus<>()); 318 } 319 }); 320 } deinterpolx(size_t supp_,vmav<T,3> & cube,size_t itheta0,size_t iphi0,const cmav<T,1> & theta,const cmav<T,1> & phi,const cmav<T,1> & psi,const cmav<T,1> & signal)321 template<size_t supp> void deinterpolx(size_t supp_, vmav<T,3> &cube, 322 size_t itheta0, size_t iphi0, const cmav<T,1> &theta, const cmav<T,1> &phi, 323 const cmav<T,1> &psi, const cmav<T,1> &signal) const 324 { 325 if constexpr (supp>=8) 326 if (supp_<=supp/2) return deinterpolx<supp/2>(supp_, cube, itheta0, iphi0, theta, phi, psi, signal); 327 if constexpr (supp>4) 328 if (supp_<supp) return deinterpolx<supp-1>(supp_, cube, itheta0, iphi0, theta, phi, psi, signal); 329 MR_assert(supp_==supp, "requested support ou of range"); 330 331 MR_assert(cube.stride(2)==1, "last axis of cube must be contiguous"); 332 MR_assert(phi.shape(0)==theta.shape(0), "array shape mismatch"); 333 MR_assert(psi.shape(0)==theta.shape(0), "array shape mismatch"); 334 MR_assert(signal.shape(0)==theta.shape(0), "array shape mismatch"); 335 static constexpr size_t vlen = Tsimd::size(); 336 static constexpr size_t nvec = (supp+vlen-1)/vlen; 337 MR_assert(cube.shape(0)==npsi_b, "bad psi dimension"); 338 auto idx = getIdx(theta, phi, psi, cube.shape(1), cube.shape(2), itheta0, iphi0, supp); 339 340 constexpr size_t cellsize=16; 341 size_t nct = cube.shape(1)/cellsize+10, 342 ncp = cube.shape(2)/cellsize+10; 343 vmav<std::mutex,2> locks({nct,ncp}); 344 345 execStatic(idx.size(), nthreads, 0, [&](Scheduler &sched) 346 { 347 size_t b_theta=~(size_t(0)), b_phi=~(size_t(0)); 348 WeightHelper<supp> hlp(*this, cube, itheta0, iphi0); 349 while (auto rng=sched.getNext()) for(auto ind=rng.lo; ind<rng.hi; ++ind) 350 { 351 if (ind+pfdist<rng.hi) 352 { 353 size_t i=idx[ind+pfdist]; 354 DUCC0_PREFETCH_R(&theta(i)); 355 DUCC0_PREFETCH_R(&phi(i)) 356 DUCC0_PREFETCH_R(&psi(i)); 357 DUCC0_PREFETCH_R(&signal(i)); 358 } 359 size_t i=idx[ind]; 360 hlp.prep(theta(i), phi(i), psi(i)); 361 auto ipsi = hlp.ipsi; 362 T * DUCC0_RESTRICT ptr = &cube(ipsi,hlp.itheta,hlp.iphi); 363 364 size_t b_theta_new = hlp.itheta/cellsize, 365 b_phi_new = hlp.iphi/cellsize; 366 if ((b_theta_new!=b_theta) || (b_phi_new!=b_phi)) 367 { 368 if (b_theta<locks.shape(0)) // unlock 369 { 370 locks(b_theta,b_phi).unlock(); 371 locks(b_theta,b_phi+1).unlock(); 372 locks(b_theta+1,b_phi).unlock(); 373 locks(b_theta+1,b_phi+1).unlock(); 374 } 375 b_theta = b_theta_new; 376 b_phi = b_phi_new; 377 locks(b_theta,b_phi).lock(); 378 locks(b_theta,b_phi+1).lock(); 379 locks(b_theta+1,b_phi).lock(); 380 locks(b_theta+1,b_phi+1).lock(); 381 } 382 383 { 384 Tsimd tmp=signal(i); 385 if constexpr (nvec==1) 386 { 387 tmp *= hlp.wphi[0]; 388 for (size_t ipsic=0; ipsic<supp; ++ipsic) 389 { 390 auto ttmp=tmp*hlp.wpsi[ipsic]; 391 T * DUCC0_RESTRICT ptr2 = ptr; 392 for (size_t itheta=0; itheta<supp; ++itheta, ptr2+=hlp.jumptheta) 393 { 394 Tsimd var=Tsimd(ptr2,element_aligned_tag()); 395 var += ttmp*hlp.wtheta[itheta]; 396 var.copy_to(ptr2,element_aligned_tag()); 397 } 398 if (++ipsi>=npsi_b) ipsi=0; 399 ptr = &cube(ipsi,hlp.itheta,hlp.iphi); 400 } 401 } 402 else 403 { 404 for (size_t ipsic=0; ipsic<supp; ++ipsic) 405 { 406 auto ttmp=tmp*hlp.wpsi[ipsic]; 407 T * DUCC0_RESTRICT ptr2 = ptr; 408 for (size_t itheta=0; itheta<supp; ++itheta) 409 { 410 auto tttmp=ttmp*hlp.wtheta[itheta]; 411 for (size_t iphi=0; iphi<nvec; ++iphi) 412 { 413 Tsimd var=Tsimd(ptr2+iphi*vlen, element_aligned_tag()); 414 var += tttmp*hlp.wphi[iphi]; 415 var.copy_to(ptr2+iphi*vlen, element_aligned_tag()); 416 } 417 ptr2 += hlp.jumptheta; 418 } 419 if (++ipsi>=npsi_b) ipsi=0; 420 ptr = &cube(ipsi,hlp.itheta,hlp.iphi); 421 } 422 } 423 } 424 } 425 if (b_theta<locks.shape(0)) // unlock 426 { 427 locks(b_theta,b_phi).unlock(); 428 locks(b_theta,b_phi+1).unlock(); 429 locks(b_theta+1,b_phi).unlock(); 430 locks(b_theta+1,b_phi+1).unlock(); 431 } 432 }); 433 } realsigma()434 double realsigma() const 435 { 436 return min(double(npsi_b)/(2*kmax+1), 437 min(double(nphi_b)/(2*lmax+1), double(ntheta_b)/(lmax+1))); 438 } 439 440 public: ConvolverPlan(size_t lmax_,size_t kmax_,double sigma,double epsilon,size_t nthreads_)441 ConvolverPlan(size_t lmax_, size_t kmax_, double sigma, double epsilon, 442 size_t nthreads_) 443 : nthreads((nthreads_==0) ? get_default_nthreads() : nthreads_), 444 lmax(lmax_), 445 kmax(kmax_), 446 nphi_s(2*good_size_real(lmax+1)), 447 ntheta_s(nphi_s/2+1), 448 npsi_s(kmax*2+1), 449 nphi_b(std::max<size_t>(20,2*good_size_real(size_t((2*lmax+1)*sigma/2.)))), 450 ntheta_b(nphi_b/2+1), 451 npsi_b(size_t(npsi_s*sigma+0.99999)), 452 dphi(2*pi/nphi_b), 453 dtheta(pi/(ntheta_b-1)), 454 dpsi(2*pi/npsi_b), 455 xdphi(1./dphi), 456 xdtheta(1./dtheta), 457 xdpsi(1./dpsi), 458 kernel(selectKernel<T>(realsigma(), epsilon/3.)), 459 nbphi((kernel->support()+1)/2), 460 nbtheta((kernel->support()+1)/2), 461 nphi(nphi_b+2*nbphi+vlen), 462 ntheta(ntheta_b+2*nbtheta), 463 phi0(nbphi*(-dphi)), 464 theta0(nbtheta*(-dtheta)) 465 { 466 auto supp = kernel->support(); 467 MR_assert((supp<=ntheta) && (supp<=nphi_b), "kernel support too large!"); 468 } 469 Lmax()470 size_t Lmax() const { return lmax; } Kmax()471 size_t Kmax() const { return kmax; } Ntheta()472 size_t Ntheta() const { return ntheta; } Nphi()473 size_t Nphi() const { return nphi; } Npsi()474 size_t Npsi() const { return npsi_b; } 475 getPatchInfo(double theta_lo,double theta_hi,double phi_lo,double phi_hi)476 vector<size_t> getPatchInfo(double theta_lo, double theta_hi, double phi_lo, double phi_hi) const 477 { 478 vector<size_t> res(4); 479 auto tmp = (theta_lo-theta0)*xdtheta-nbtheta; 480 res[0] = min(size_t(max(0., tmp)), ntheta); 481 tmp = (theta_hi-theta0)*xdtheta+nbtheta+1.; 482 res[1] = min(size_t(max(0., tmp)), ntheta); 483 tmp = (phi_lo-phi0)*xdphi-nbphi; 484 res[2] = min(size_t(max(0., tmp)), nphi); 485 tmp = (phi_hi-phi0)*xdphi+nbphi+1.+vlen; 486 res[3] = min(size_t(max(0., tmp)), nphi); 487 return res; 488 } 489 getPlane(const cmav<complex<T>,2> & vslm,const cmav<complex<T>,2> & vblm,size_t mbeam,vmav<T,3> & planes)490 void getPlane(const cmav<complex<T>,2> &vslm, const cmav<complex<T>,2> &vblm, 491 size_t mbeam, vmav<T,3> &planes) const 492 { 493 size_t nplanes=1+(mbeam>0); 494 auto ncomp = vslm.shape(0); 495 MR_assert(ncomp>0, "need at least one component"); 496 MR_assert(vblm.shape(0)==ncomp, "inconsistent slm and blm vectors"); 497 Alm_Base islm(lmax, lmax), iblm(lmax, kmax); 498 MR_assert(islm.Num_Alms()==vslm.shape(1), "bad array dimension"); 499 MR_assert(iblm.Num_Alms()==vblm.shape(1), "bad array dimension"); 500 MR_assert(planes.conformable({nplanes, Ntheta(), Nphi()}), "bad planes shape"); 501 MR_assert(mbeam <= kmax, "mbeam too high"); 502 503 vector<T> lnorm(lmax+1); 504 for (size_t i=0; i<=lmax; ++i) 505 lnorm[i]=T(std::sqrt(4*pi/(2*i+1.))); 506 507 Alm_Base base(lmax, lmax); 508 vmav<complex<T>,2> aarr({nplanes,base.Num_Alms()}); 509 for (size_t m=0; m<=lmax; ++m) 510 for (size_t l=m; l<=lmax; ++l) 511 { 512 aarr(0, base.index(l,m))=0.; 513 if (mbeam>0) 514 aarr(1, base.index(l,m))=0.; 515 if (l>=mbeam) 516 { 517 auto norm = (mbeam>0) ? -lnorm[l] : lnorm[l]; 518 for (size_t i=0; i<ncomp; ++i) 519 { 520 auto tmp = vblm(i,iblm.index(l,mbeam))*norm; 521 aarr(0,base.index(l,m)) += vslm(i,islm.index(l,m))*tmp.real(); 522 if (mbeam>0) 523 aarr(1,base.index(l,m)) += vslm(i,islm.index(l,m))*tmp.imag(); 524 } 525 } 526 } 527 auto subplanes=subarray<3>(planes,{{0, nplanes}, {nbtheta, nbtheta+ntheta_s}, {nbphi, nbphi+nphi_s}}); 528 synthesis_2d(aarr, subplanes, mbeam, lmax, lmax, "CC", nthreads); 529 for (size_t iplane=0; iplane<nplanes; ++iplane) 530 { 531 auto m = subarray<2>(planes, {{iplane},{nbtheta, nbtheta+ntheta_b}, {nbphi, nbphi+nphi_b}}); 532 correct(m,mbeam); 533 } 534 535 // fill border regions 536 T fct = (mbeam&1) ? -1 : 1; 537 for (size_t iplane=0; iplane<nplanes; ++iplane) 538 { 539 for (size_t i=0; i<nbtheta; ++i) 540 for (size_t j=0, j2=nphi_b/2; j<nphi_b; ++j,++j2) 541 { 542 if (j2>=nphi_b) j2-=nphi_b; 543 planes(iplane,nbtheta-1-i,j2+nbphi) = fct*planes(iplane,nbtheta+1+i,j+nbphi); 544 planes(iplane,nbtheta+ntheta_b+i,j2+nbphi) = fct*planes(iplane,nbtheta+ntheta_b-2-i,j+nbphi); 545 } 546 for (size_t i=0; i<ntheta; ++i) 547 { 548 for (size_t j=0; j<nbphi; ++j) 549 { 550 planes(iplane,i,j) = planes(iplane,i,j+nphi_b); 551 planes(iplane,i,j+nphi_b+nbphi) = planes(iplane,i,j+nbphi); 552 } 553 // SIMD buffer 554 for (size_t j=0; j<vlen; ++j) 555 planes(iplane, i, nphi-vlen+j) = T(0); 556 } 557 } 558 } 559 getPlane(const cmav<complex<T>,1> & slm,const cmav<complex<T>,1> & blm,size_t mbeam,vmav<T,3> & planes)560 void getPlane(const cmav<complex<T>,1> &slm, const cmav<complex<T>,1> &blm, 561 size_t mbeam, vmav<T,3> &planes) const 562 { 563 cmav<complex<T>,2> vslm(&slm(0), {1,slm.shape(0)}, {0,slm.stride(0)}); 564 cmav<complex<T>,2> vblm(&blm(0), {1,blm.shape(0)}, {0,blm.stride(0)}); 565 getPlane(vslm, vblm, mbeam, planes); 566 } 567 interpol(const cmav<T,3> & cube,size_t itheta0,size_t iphi0,const cmav<T,1> & theta,const cmav<T,1> & phi,const cmav<T,1> & psi,vmav<T,1> & signal)568 void interpol(const cmav<T,3> &cube, size_t itheta0, 569 size_t iphi0, const cmav<T,1> &theta, const cmav<T,1> &phi, 570 const cmav<T,1> &psi, vmav<T,1> &signal) const 571 { 572 constexpr size_t maxsupp = is_same<T, double>::value ? 16 : 8; 573 interpolx<maxsupp>(kernel->support(), cube, itheta0, iphi0, theta, phi, psi, signal); 574 } 575 deinterpol(vmav<T,3> & cube,size_t itheta0,size_t iphi0,const cmav<T,1> & theta,const cmav<T,1> & phi,const cmav<T,1> & psi,const cmav<T,1> & signal)576 void deinterpol(vmav<T,3> &cube, size_t itheta0, 577 size_t iphi0, const cmav<T,1> &theta, const cmav<T,1> &phi, 578 const cmav<T,1> &psi, const cmav<T,1> &signal) const 579 { 580 constexpr size_t maxsupp = is_same<T, double>::value ? 16 : 8; 581 deinterpolx<maxsupp>(kernel->support(), cube, itheta0, iphi0, theta, phi, psi, signal); 582 } 583 updateSlm(vmav<complex<T>,2> & vslm,const cmav<complex<T>,2> & vblm,size_t mbeam,vmav<T,3> & planes)584 void updateSlm(vmav<complex<T>,2> &vslm, const cmav<complex<T>,2> &vblm, 585 size_t mbeam, vmav<T,3> &planes) const 586 { 587 size_t nplanes=1+(mbeam>0); 588 auto ncomp = vslm.shape(0); 589 MR_assert(ncomp>0, "need at least one component"); 590 MR_assert(vblm.shape(0)==ncomp, "inconsistent slm and blm vectors"); 591 Alm_Base islm(lmax, lmax), iblm(lmax, kmax); 592 MR_assert(islm.Num_Alms()==vslm.shape(1), "bad array dimension"); 593 MR_assert(iblm.Num_Alms()==vblm.shape(1), "bad array dimension"); 594 MR_assert(planes.conformable({nplanes, Ntheta(), Nphi()}), "bad planes shape"); 595 MR_assert(mbeam <= kmax, "mbeam too high"); 596 597 // move stuff from border regions onto the main grid 598 for (size_t iplane=0; iplane<nplanes; ++iplane) 599 { 600 for (size_t i=0; i<ntheta; ++i) 601 for (size_t j=0; j<nbphi; ++j) 602 { 603 planes(iplane,i,j+nphi_b) += planes(iplane,i,j); 604 planes(iplane,i,j+nbphi) += planes(iplane,i,j+nphi_b+nbphi); 605 } 606 607 for (size_t i=0; i<nbtheta; ++i) 608 for (size_t j=0, j2=nphi_b/2; j<nphi_b; ++j,++j2) 609 { 610 T fct = (mbeam&1) ? -1 : 1; 611 if (j2>=nphi_b) j2-=nphi_b; 612 planes(iplane,nbtheta+1+i,j+nbphi) += fct*planes(iplane,nbtheta-1-i,j2+nbphi); 613 planes(iplane,nbtheta+ntheta_b-2-i, j+nbphi) += fct*planes(iplane,nbtheta+ntheta_b+i,j2+nbphi); 614 } 615 616 // special treatment for poles 617 for (size_t j=0,j2=nphi_b/2; j<nphi_b/2; ++j,++j2) 618 { 619 T fct = (mbeam&1) ? -1 : 1; 620 if (j2>=nphi_b) j2-=nphi_b; 621 T tval = planes(iplane,nbtheta,j+nbphi) + fct*planes(iplane,nbtheta,j2+nbphi); 622 planes(iplane,nbtheta,j+nbphi) = tval; 623 planes(iplane,nbtheta,j2+nbphi) = fct*tval; 624 tval = planes(iplane,nbtheta+ntheta_b-1,j+nbphi) + fct*planes(iplane,nbtheta+ntheta_b-1,j2+nbphi); 625 planes(iplane,nbtheta+ntheta_b-1,j+nbphi) = tval; 626 planes(iplane,nbtheta+ntheta_b-1,j2+nbphi) = fct*tval; 627 } 628 } 629 630 vector<T>lnorm(lmax+1); 631 for (size_t i=0; i<=lmax; ++i) 632 lnorm[i]=T(std::sqrt(4*pi/(2*i+1.))); 633 634 Alm_Base base(lmax,lmax); 635 636 for (size_t iplane=0; iplane<nplanes; ++iplane) 637 { 638 auto m = subarray<2>(planes, {{iplane}, {nbtheta, nbtheta+ntheta_b}, {nbphi,nbphi+nphi_b}}); 639 decorrect(m,mbeam); 640 } 641 auto subplanes=subarray<3>(planes, {{0, nplanes}, {nbtheta, nbtheta+ntheta_s}, {nbphi,nbphi+nphi_s}}); 642 643 vmav<complex<T>,2> aarr({nplanes, base.Num_Alms()}); 644 adjoint_synthesis_2d(aarr, subplanes, mbeam, lmax, lmax, "CC", nthreads); 645 for (size_t m=0; m<=lmax; ++m) 646 for (size_t l=m; l<=lmax; ++l) 647 if (l>=mbeam) 648 for (size_t i=0; i<ncomp; ++i) 649 { 650 auto tmp = vblm(i,iblm.index(l,mbeam))*lnorm[l] * T((mbeam==0) ? 1 : (-2)); 651 vslm(i,islm.index(l,m)) += aarr(0,base.index(l,m))*tmp.real(); 652 if (mbeam>0) 653 vslm(i,islm.index(l,m)) += aarr(1,base.index(l,m))*tmp.imag(); 654 } 655 } 656 updateSlm(vmav<complex<T>,1> & slm,const cmav<complex<T>,1> & blm,size_t mbeam,vmav<T,3> & planes)657 void updateSlm(vmav<complex<T>,1> &slm, const cmav<complex<T>,1> &blm, 658 size_t mbeam, vmav<T,3> &planes) const 659 { 660 vmav<complex<T>,2> vslm(&slm(0), {1,slm.shape(0)}, {0,slm.stride(0)}); 661 cmav<complex<T>,2> vblm(&blm(0), {1,blm.shape(0)}, {0,blm.stride(0)}); 662 updateSlm(vslm, vblm, mbeam, planes); 663 } 664 prepPsi(vmav<T,3> & subcube)665 void prepPsi(vmav<T,3> &subcube) const 666 { 667 MR_assert(subcube.shape(0)==npsi_b, "bad psi dimension"); 668 auto newpart = subarray<3>(subcube, {{npsi_s, MAXIDX}, {}, {}}); 669 mav_apply([](T &v){v=T(0);}, nthreads, newpart); 670 auto fct = kernel->corfunc(npsi_s/2+1, 1./npsi_b, nthreads); 671 for (size_t k=0; k<npsi_s; ++k) 672 { 673 auto factor = T(fct[(k+1)/2]); 674 for (size_t i=0; i<subcube.shape(1); ++i) 675 for (size_t j=0; j<subcube.shape(2); ++j) 676 subcube(k,i,j) *= factor; 677 } 678 vfmav<T> fsubcube(subcube); 679 r2r_fftpack(fsubcube, fsubcube, {0}, false, true, T(1), nthreads); 680 } 681 deprepPsi(vmav<T,3> & subcube)682 void deprepPsi(vmav<T,3> &subcube) const 683 { 684 MR_assert(subcube.shape(0)==npsi_b, "bad psi dimension"); 685 vfmav<T> fsubcube(subcube); 686 r2r_fftpack(fsubcube, fsubcube, {0}, true, false, T(1), nthreads); 687 auto fct = kernel->corfunc(npsi_s/2+1, 1./npsi_b, nthreads); 688 for (size_t k=0; k<npsi_s; ++k) 689 { 690 auto factor = T(fct[(k+1)/2]); 691 for (size_t i=0; i<subcube.shape(1); ++i) 692 for (size_t j=0; j<subcube.shape(2); ++j) 693 subcube(k,i,j) *= factor; 694 } 695 } 696 }; 697 698 } 699 700 using detail_totalconvolve::ConvolverPlan; 701 702 703 } 704 705 #endif 706