1 /*
2  *  This code is free software; you can redistribute it and/or modify
3  *  it under the terms of the GNU General Public License as published by
4  *  the Free Software Foundation; either version 2 of the License, or
5  *  (at your option) any later version.
6  *
7  *  This code is distributed in the hope that it will be useful,
8  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
9  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10  *  GNU General Public License for more details.
11  *
12  *  You should have received a copy of the GNU General Public License
13  *  along with this code; if not, write to the Free Software
14  *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
15  */
16 
17 /*
18  *  Copyright (C) 2020-2021 Max-Planck-Society
19  *  Author: Martin Reinecke
20  */
21 
22 #ifndef DUCC0_TOTALCONVOLVE_H
23 #define DUCC0_TOTALCONVOLVE_H
24 
25 #include <cstdint>
26 #include <algorithm>
27 #include <cstddef>
28 #include <functional>
29 #include <memory>
30 #include <type_traits>
31 #include <vector>
32 #include <complex>
33 #include <cmath>
34 #include <mutex>
35 #include "ducc0/infra/error_handling.h"
36 #include "ducc0/infra/threading.h"
37 #include "ducc0/math/constants.h"
38 #include "ducc0/math/gridding_kernel.h"
39 #include "ducc0/infra/mav.h"
40 #include "ducc0/infra/simd.h"
41 #include "ducc0/infra/aligned_array.h"
42 #include "ducc0/infra/useful_macros.h"
43 #include "ducc0/infra/bucket_sort.h"
44 #include "ducc0/sht/sht.h"
45 #include "ducc0/sht/alm.h"
46 #include "ducc0/fft/fft1d.h"
47 #include "ducc0/fft/fft.h"
48 #include "ducc0/math/math_utils.h"
49 
50 namespace ducc0 {
51 
52 namespace detail_totalconvolve {
53 
54 using namespace std;
55 
56 template<typename T> class ConvolverPlan
57   {
58   protected:
59     constexpr static auto vlen = min<size_t>(8, native_simd<T>::size());
60     using Tsimd = typename simd_select<T, vlen>::type;
61 
62     size_t nthreads;
63     size_t lmax, kmax;
64     // _s: small grid
65     // _b: oversampled grid
66     // no suffix: grid with borders
67     size_t nphi_s, ntheta_s, npsi_s, nphi_b, ntheta_b, npsi_b;
68     double dphi, dtheta, dpsi, xdphi, xdtheta, xdpsi;
69 
70     shared_ptr<HornerKernel> kernel;
71     size_t nbphi, nbtheta;
72     size_t nphi, ntheta;
73     double phi0, theta0;
74 
getKernel(size_t axlen,size_t axlen2)75     cmav<T,1> getKernel(size_t axlen, size_t axlen2) const
76       {
77       auto axlen_big = max(axlen, axlen2);
78       auto axlen_small = min(axlen, axlen2);
79       auto fct = kernel->corfunc(axlen_small/2+1, 1./axlen_big, nthreads);
80       vmav<T,1> k2({axlen});
81       mav_apply([](T &v){v=T(0);}, 1, k2);
82       {
83       k2(0) = T(fct[0])/axlen_small;
84       size_t i=1;
85       for (; 2*i<axlen_small; ++i)
86         k2(2*i-1) = T(fct[i])/axlen_small;
87       if (2*i==axlen_small)
88         k2(2*i-1) = T(0.5)*T(fct[i])/axlen_small;
89       }
90       pocketfft_r<T> plan(axlen);
91       plan.exec(k2.data(), T(1), false, nthreads);
92       return k2;
93       }
94 
correct(vmav<T,2> & arr,int spin)95     void correct(vmav<T,2> &arr, int spin) const
96       {
97       T sfct = (spin&1) ? -1 : 1;
98       vmav<T,2> tmp({nphi_b,nphi_s});
99       // copy and extend to second half
100       for (size_t j=0; j<nphi_s; ++j)
101         tmp(0,j) = arr(0,j);
102       for (size_t i=1, i2=nphi_s-1; i+1<ntheta_s; ++i,--i2)
103         for (size_t j=0,j2=nphi_s/2; j<nphi_s; ++j,++j2)
104           {
105           if (j2>=nphi_s) j2-=nphi_s;
106           tmp(i,j2) = arr(i,j2);
107           tmp(i2,j) = sfct*tmp(i,j2);
108           }
109       for (size_t j=0; j<nphi_s; ++j)
110         tmp(ntheta_s-1,j) = arr(ntheta_s-1,j);
111       auto fct = kernel->corfunc(nphi_s/2+1, 1./nphi_b, nthreads);
112       vector<T> k2(fct.size());
113       for (size_t i=0; i<fct.size(); ++i) k2[i] = T(fct[i]/nphi_s);
114       vfmav<T> ftmp(tmp);
115       cfmav<T> ftmp0(subarray<2>(tmp, {{0, nphi_s}, {0, nphi_s}}));
116       auto kern = getKernel(nphi_s, nphi_b);
117       convolve_axis(ftmp0, ftmp, 0, kern, nthreads);
118       cfmav<T> ftmp2(subarray<2>(tmp, {{0, ntheta_b}, {0, nphi_s}}));
119       vfmav<T> farr(arr);
120       convolve_axis(ftmp2, farr, 1, kern, nthreads);
121       }
decorrect(vmav<T,2> & arr,int spin)122     void decorrect(vmav<T,2> &arr, int spin) const
123       {
124       T sfct = (spin&1) ? -1 : 1;
125       vmav<T,2> tmp({nphi_b,nphi_s});
126       auto fct = kernel->corfunc(nphi_s/2+1, 1./nphi_b, nthreads);
127       vector<T> k2(fct.size());
128       for (size_t i=0; i<fct.size(); ++i) k2[i] = T(fct[i]/nphi_s);
129       cfmav<T> farr(arr);
130       vfmav<T> ftmp2(subarray<2>(tmp, {{0, ntheta_b}, {0, nphi_s}}));
131       auto kern = getKernel(nphi_b, nphi_s);
132       convolve_axis(farr, ftmp2, 1, kern, nthreads);
133       // extend to second half
134       for (size_t i=1, i2=nphi_b-1; i+1<ntheta_b; ++i,--i2)
135         for (size_t j=0,j2=nphi_s/2; j<nphi_s; ++j,++j2)
136           {
137           if (j2>=nphi_s) j2-=nphi_s;
138           tmp(i2,j) = sfct*tmp(i,j2);
139           }
140       cfmav<T> ftmp(tmp);
141       vfmav<T> ftmp0(subarray<2>(tmp, {{0, nphi_s}, {0, nphi_s}}));
142       convolve_axis(ftmp, ftmp0, 0, kern, nthreads);
143       for (size_t j=0; j<nphi_s; ++j)
144         arr(0,j) = T(0.5)*tmp(0,j);
145       for (size_t i=1; i+1<ntheta_s; ++i)
146         for (size_t j=0; j<nphi_s; ++j)
147           arr(i,j) = tmp(i,j);
148       for (size_t j=0; j<nphi_s; ++j)
149         arr(ntheta_s-1,j) = T(0.5)*tmp(ntheta_s-1,j);
150       }
151 
getIdx(const cmav<T,1> & theta,const cmav<T,1> & phi,const cmav<T,1> & psi,size_t patch_ntheta,size_t patch_nphi,size_t itheta0,size_t iphi0,size_t supp)152     quick_array<uint32_t> getIdx(const cmav<T,1> &theta, const cmav<T,1> &phi, const cmav<T,1> &psi,
153       size_t patch_ntheta, size_t patch_nphi, size_t itheta0, size_t iphi0, size_t supp) const
154       {
155       size_t nptg = theta.shape(0);
156       constexpr size_t cellsize=8;
157       size_t nct = patch_ntheta/cellsize+1,
158              ncp = patch_nphi/cellsize+1,
159              ncpsi = npsi_b/cellsize+1;
160       double theta0 = (int(itheta0)-int(nbtheta))*dtheta,
161              phi0 = (int(iphi0)-int(nbphi))*dphi;
162       double theta_lo=theta0, theta_hi=theta_lo+(patch_ntheta+1)*dtheta;
163       double phi_lo=phi0, phi_hi=phi_lo+(patch_nphi+1)*dphi;
164       MR_assert(uint64_t(nct)*uint64_t(ncp)*uint64_t(ncpsi)<(uint64_t(1)<<32),
165         "key space too large");
166 
167       quick_array<uint32_t> key(nptg);
168       execParallel(nptg, nthreads, [&](size_t lo, size_t hi)
169         {
170         for (size_t i=lo; i<hi; ++i)
171           {
172           MR_assert((theta(i)>=theta_lo) && (theta(i)<=theta_hi), "theta out of range: ", theta(i));
173           MR_assert((phi(i)>=phi_lo) && (phi(i)<=phi_hi), "phi out of range: ", phi(i));
174           auto ftheta = (theta(i)-theta0)*xdtheta-supp*0.5;
175           auto itheta = size_t(ftheta+1);
176           auto fphi = (phi(i)-phi0)*xdphi-supp*0.5;
177           auto iphi = size_t(fphi+1);
178           auto fpsi = psi(i)*xdpsi;
179           fpsi = fmodulo(fpsi, double(npsi_b));
180           size_t ipsi = size_t(fpsi);
181           ipsi /= cellsize;
182           itheta /= cellsize;
183           iphi /= cellsize;
184           MR_assert(itheta<nct, "bad itheta");
185           MR_assert(iphi<ncp, "bad iphi");
186           key[i] = (itheta*ncp+iphi)*ncpsi+ipsi;
187           }
188         });
189       quick_array<uint32_t> res(key.size());
190       bucket_sort(&key[0], &res[0], key.size(), ncp*nct*ncpsi, nthreads);
191       return res;
192       }
193 
194     template<size_t supp> class WeightHelper
195       {
196       public:
197         static constexpr size_t vlen = Tsimd::size();
198         static constexpr size_t nvec = (supp+vlen-1)/vlen;
199         const ConvolverPlan &plan;
200         union kbuf {
201           T scalar[3*nvec*vlen];
202           Tsimd simd[3*nvec];
203 #if defined(_MSC_VER)
kbuf()204           kbuf() {}
205 #endif
206           };
207         kbuf buf;
208 
209       private:
210         TemplateKernel<supp, Tsimd> tkrn;
211         double mytheta0, myphi0;
212 
213       public:
WeightHelper(const ConvolverPlan & plan_,const mav_info<3> & info,size_t itheta0,size_t iphi0)214         WeightHelper(const ConvolverPlan &plan_, const mav_info<3> &info, size_t itheta0, size_t iphi0)
215           : plan(plan_),
216             tkrn(*plan.kernel),
217             mytheta0(plan.theta0+itheta0*plan.dtheta),
218             myphi0(plan.phi0+iphi0*plan.dphi),
219             wpsi(&buf.scalar[0]),
220             wtheta(&buf.scalar[nvec*vlen]),
221             wphi(&buf.simd[2*nvec]),
222             jumptheta(info.stride(1))
223           {
224           MR_assert(info.stride(2)==1, "last axis of cube must be contiguous");
225           }
prep(double theta,double phi,double psi)226         void prep(double theta, double phi, double psi)
227           {
228           auto ftheta = (theta-mytheta0)*plan.xdtheta-supp*0.5;
229           itheta = size_t(ftheta+1);
230           ftheta = -1+(itheta-ftheta)*2;
231           auto fphi = (phi-myphi0)*plan.xdphi-supp*0.5;
232           iphi = size_t(fphi+1);
233           fphi = -1+(iphi-fphi)*2;
234           auto fpsi = psi*plan.xdpsi-supp*0.5;
235           fpsi = fmodulo(fpsi, double(plan.npsi_b));
236           ipsi = size_t(fpsi+1);
237           fpsi = -1+(ipsi-fpsi)*2;
238           if (ipsi>=plan.npsi_b) ipsi-=plan.npsi_b;
239           tkrn.eval3(T(fpsi), T(ftheta), T(fphi), &buf.simd[0]);
240           }
241         size_t itheta, iphi, ipsi;
242         const T * DUCC0_RESTRICT wpsi;
243         const T * DUCC0_RESTRICT wtheta;
244         const Tsimd * DUCC0_RESTRICT wphi;
245         ptrdiff_t jumptheta;
246       };
247 
248     // prefetching distance
249     static constexpr size_t pfdist=2;
250 
interpolx(size_t supp_,const cmav<T,3> & cube,size_t itheta0,size_t iphi0,const cmav<T,1> & theta,const cmav<T,1> & phi,const cmav<T,1> & psi,vmav<T,1> & signal)251     template<size_t supp> void interpolx(size_t supp_, const cmav<T,3> &cube,
252       size_t itheta0, size_t iphi0, const cmav<T,1> &theta, const cmav<T,1> &phi,
253       const cmav<T,1> &psi, vmav<T,1> &signal) const
254       {
255       if constexpr (supp>=8)
256         if (supp_<=supp/2) return interpolx<supp/2>(supp_, cube, itheta0, iphi0, theta, phi, psi, signal);
257       if constexpr (supp>4)
258         if (supp_<supp) return interpolx<supp-1>(supp_, cube, itheta0, iphi0, theta, phi, psi, signal);
259       MR_assert(supp_==supp, "requested support ou of range");
260 
261       MR_assert(cube.stride(2)==1, "last axis of cube must be contiguous");
262       MR_assert(phi.shape(0)==theta.shape(0), "array shape mismatch");
263       MR_assert(psi.shape(0)==theta.shape(0), "array shape mismatch");
264       MR_assert(signal.shape(0)==theta.shape(0), "array shape mismatch");
265       static constexpr size_t vlen = Tsimd::size();
266       static constexpr size_t nvec = (supp+vlen-1)/vlen;
267       MR_assert(cube.shape(0)==npsi_b, "bad psi dimension");
268       auto idx = getIdx(theta, phi, psi, cube.shape(1), cube.shape(2), itheta0, iphi0, supp);
269 
270       execStatic(idx.size(), nthreads, 0, [&](Scheduler &sched)
271         {
272         WeightHelper<supp> hlp(*this, cube, itheta0, iphi0);
273         while (auto rng=sched.getNext()) for(auto ind=rng.lo; ind<rng.hi; ++ind)
274           {
275           if (ind+pfdist<rng.hi)
276             {
277             size_t i=idx[ind+pfdist];
278             DUCC0_PREFETCH_R(&theta(i));
279             DUCC0_PREFETCH_R(&phi(i))
280             DUCC0_PREFETCH_R(&psi(i));
281             DUCC0_PREFETCH_R(&signal(i));
282             DUCC0_PREFETCH_W(&signal(i));
283             }
284           size_t i=idx[ind];
285           hlp.prep(theta(i), phi(i), psi(i));
286           auto ipsi = hlp.ipsi;
287           const T * DUCC0_RESTRICT ptr = &cube(ipsi,hlp.itheta,hlp.iphi);
288           Tsimd res=0;
289           if constexpr(nvec==1)
290             {
291             for (size_t ipsic=0; ipsic<supp; ++ipsic)
292               {
293               const T * DUCC0_RESTRICT ptr2 = ptr;
294               Tsimd tres=0;
295               for (size_t itheta=0; itheta<supp; ++itheta, ptr2+=hlp.jumptheta)
296                 tres += hlp.wtheta[itheta]*Tsimd(ptr2, element_aligned_tag());
297               res += tres*hlp.wpsi[ipsic];
298               if (++ipsi>=npsi_b) ipsi=0;
299               ptr = &cube(ipsi,hlp.itheta,hlp.iphi);
300               }
301             res *= hlp.wphi[0];
302             }
303           else
304             {
305             for (size_t ipsic=0; ipsic<supp; ++ipsic)
306               {
307               const T * DUCC0_RESTRICT ptr2 = ptr;
308               Tsimd tres=0;
309               for (size_t itheta=0; itheta<supp; ++itheta, ptr2+=hlp.jumptheta)
310                 for (size_t iphi=0; iphi<nvec; ++iphi)
311                   tres += hlp.wtheta[itheta]*hlp.wphi[iphi]*Tsimd(ptr2+iphi*vlen,element_aligned_tag());
312               res += tres*hlp.wpsi[ipsic];
313               if (++ipsi>=npsi_b) ipsi=0;
314               ptr = &cube(ipsi,hlp.itheta,hlp.iphi);
315               }
316             }
317           signal(i) = reduce(res, std::plus<>());
318           }
319         });
320       }
deinterpolx(size_t supp_,vmav<T,3> & cube,size_t itheta0,size_t iphi0,const cmav<T,1> & theta,const cmav<T,1> & phi,const cmav<T,1> & psi,const cmav<T,1> & signal)321     template<size_t supp> void deinterpolx(size_t supp_, vmav<T,3> &cube,
322       size_t itheta0, size_t iphi0, const cmav<T,1> &theta, const cmav<T,1> &phi,
323       const cmav<T,1> &psi, const cmav<T,1> &signal) const
324       {
325       if constexpr (supp>=8)
326         if (supp_<=supp/2) return deinterpolx<supp/2>(supp_, cube, itheta0, iphi0, theta, phi, psi, signal);
327       if constexpr (supp>4)
328         if (supp_<supp) return deinterpolx<supp-1>(supp_, cube, itheta0, iphi0, theta, phi, psi, signal);
329       MR_assert(supp_==supp, "requested support ou of range");
330 
331       MR_assert(cube.stride(2)==1, "last axis of cube must be contiguous");
332       MR_assert(phi.shape(0)==theta.shape(0), "array shape mismatch");
333       MR_assert(psi.shape(0)==theta.shape(0), "array shape mismatch");
334       MR_assert(signal.shape(0)==theta.shape(0), "array shape mismatch");
335       static constexpr size_t vlen = Tsimd::size();
336       static constexpr size_t nvec = (supp+vlen-1)/vlen;
337       MR_assert(cube.shape(0)==npsi_b, "bad psi dimension");
338       auto idx = getIdx(theta, phi, psi, cube.shape(1), cube.shape(2), itheta0, iphi0, supp);
339 
340       constexpr size_t cellsize=16;
341       size_t nct = cube.shape(1)/cellsize+10,
342              ncp = cube.shape(2)/cellsize+10;
343       vmav<std::mutex,2> locks({nct,ncp});
344 
345       execStatic(idx.size(), nthreads, 0, [&](Scheduler &sched)
346         {
347         size_t b_theta=~(size_t(0)), b_phi=~(size_t(0));
348         WeightHelper<supp> hlp(*this, cube, itheta0, iphi0);
349         while (auto rng=sched.getNext()) for(auto ind=rng.lo; ind<rng.hi; ++ind)
350           {
351           if (ind+pfdist<rng.hi)
352             {
353             size_t i=idx[ind+pfdist];
354             DUCC0_PREFETCH_R(&theta(i));
355             DUCC0_PREFETCH_R(&phi(i))
356             DUCC0_PREFETCH_R(&psi(i));
357             DUCC0_PREFETCH_R(&signal(i));
358             }
359           size_t i=idx[ind];
360           hlp.prep(theta(i), phi(i), psi(i));
361           auto ipsi = hlp.ipsi;
362           T * DUCC0_RESTRICT ptr = &cube(ipsi,hlp.itheta,hlp.iphi);
363 
364           size_t b_theta_new = hlp.itheta/cellsize,
365                  b_phi_new = hlp.iphi/cellsize;
366           if ((b_theta_new!=b_theta) || (b_phi_new!=b_phi))
367             {
368             if (b_theta<locks.shape(0))  // unlock
369               {
370               locks(b_theta,b_phi).unlock();
371               locks(b_theta,b_phi+1).unlock();
372               locks(b_theta+1,b_phi).unlock();
373               locks(b_theta+1,b_phi+1).unlock();
374               }
375             b_theta = b_theta_new;
376             b_phi = b_phi_new;
377             locks(b_theta,b_phi).lock();
378             locks(b_theta,b_phi+1).lock();
379             locks(b_theta+1,b_phi).lock();
380             locks(b_theta+1,b_phi+1).lock();
381             }
382 
383             {
384             Tsimd tmp=signal(i);
385             if constexpr (nvec==1)
386               {
387               tmp *= hlp.wphi[0];
388               for (size_t ipsic=0; ipsic<supp; ++ipsic)
389                 {
390                 auto ttmp=tmp*hlp.wpsi[ipsic];
391                 T * DUCC0_RESTRICT ptr2 = ptr;
392                 for (size_t itheta=0; itheta<supp; ++itheta, ptr2+=hlp.jumptheta)
393                   {
394                   Tsimd var=Tsimd(ptr2,element_aligned_tag());
395                   var += ttmp*hlp.wtheta[itheta];
396                   var.copy_to(ptr2,element_aligned_tag());
397                   }
398                 if (++ipsi>=npsi_b) ipsi=0;
399                 ptr = &cube(ipsi,hlp.itheta,hlp.iphi);
400                 }
401               }
402             else
403               {
404               for (size_t ipsic=0; ipsic<supp; ++ipsic)
405                 {
406                 auto ttmp=tmp*hlp.wpsi[ipsic];
407                 T * DUCC0_RESTRICT ptr2 = ptr;
408                 for (size_t itheta=0; itheta<supp; ++itheta)
409                   {
410                   auto tttmp=ttmp*hlp.wtheta[itheta];
411                   for (size_t iphi=0; iphi<nvec; ++iphi)
412                     {
413                     Tsimd var=Tsimd(ptr2+iphi*vlen, element_aligned_tag());
414                     var += tttmp*hlp.wphi[iphi];
415                     var.copy_to(ptr2+iphi*vlen, element_aligned_tag());
416                     }
417                   ptr2 += hlp.jumptheta;
418                   }
419                 if (++ipsi>=npsi_b) ipsi=0;
420                 ptr = &cube(ipsi,hlp.itheta,hlp.iphi);
421                 }
422               }
423             }
424           }
425         if (b_theta<locks.shape(0))  // unlock
426           {
427           locks(b_theta,b_phi).unlock();
428           locks(b_theta,b_phi+1).unlock();
429           locks(b_theta+1,b_phi).unlock();
430           locks(b_theta+1,b_phi+1).unlock();
431           }
432         });
433       }
realsigma()434     double realsigma() const
435       {
436       return min(double(npsi_b)/(2*kmax+1),
437                  min(double(nphi_b)/(2*lmax+1), double(ntheta_b)/(lmax+1)));
438       }
439 
440   public:
ConvolverPlan(size_t lmax_,size_t kmax_,double sigma,double epsilon,size_t nthreads_)441     ConvolverPlan(size_t lmax_, size_t kmax_, double sigma, double epsilon,
442       size_t nthreads_)
443       : nthreads((nthreads_==0) ? get_default_nthreads() : nthreads_),
444         lmax(lmax_),
445         kmax(kmax_),
446         nphi_s(2*good_size_real(lmax+1)),
447         ntheta_s(nphi_s/2+1),
448         npsi_s(kmax*2+1),
449         nphi_b(std::max<size_t>(20,2*good_size_real(size_t((2*lmax+1)*sigma/2.)))),
450         ntheta_b(nphi_b/2+1),
451         npsi_b(size_t(npsi_s*sigma+0.99999)),
452         dphi(2*pi/nphi_b),
453         dtheta(pi/(ntheta_b-1)),
454         dpsi(2*pi/npsi_b),
455         xdphi(1./dphi),
456         xdtheta(1./dtheta),
457         xdpsi(1./dpsi),
458         kernel(selectKernel<T>(realsigma(), epsilon/3.)),
459         nbphi((kernel->support()+1)/2),
460         nbtheta((kernel->support()+1)/2),
461         nphi(nphi_b+2*nbphi+vlen),
462         ntheta(ntheta_b+2*nbtheta),
463         phi0(nbphi*(-dphi)),
464         theta0(nbtheta*(-dtheta))
465       {
466       auto supp = kernel->support();
467       MR_assert((supp<=ntheta) && (supp<=nphi_b), "kernel support too large!");
468       }
469 
Lmax()470     size_t Lmax() const { return lmax; }
Kmax()471     size_t Kmax() const { return kmax; }
Ntheta()472     size_t Ntheta() const { return ntheta; }
Nphi()473     size_t Nphi() const { return nphi; }
Npsi()474     size_t Npsi() const { return npsi_b; }
475 
getPatchInfo(double theta_lo,double theta_hi,double phi_lo,double phi_hi)476     vector<size_t> getPatchInfo(double theta_lo, double theta_hi, double phi_lo, double phi_hi) const
477       {
478       vector<size_t> res(4);
479       auto tmp = (theta_lo-theta0)*xdtheta-nbtheta;
480       res[0] = min(size_t(max(0., tmp)), ntheta);
481       tmp = (theta_hi-theta0)*xdtheta+nbtheta+1.;
482       res[1] = min(size_t(max(0., tmp)), ntheta);
483       tmp = (phi_lo-phi0)*xdphi-nbphi;
484       res[2] = min(size_t(max(0., tmp)), nphi);
485       tmp = (phi_hi-phi0)*xdphi+nbphi+1.+vlen;
486       res[3] = min(size_t(max(0., tmp)), nphi);
487       return res;
488       }
489 
getPlane(const cmav<complex<T>,2> & vslm,const cmav<complex<T>,2> & vblm,size_t mbeam,vmav<T,3> & planes)490     void getPlane(const cmav<complex<T>,2> &vslm, const cmav<complex<T>,2> &vblm,
491       size_t mbeam, vmav<T,3> &planes) const
492       {
493       size_t nplanes=1+(mbeam>0);
494       auto ncomp = vslm.shape(0);
495       MR_assert(ncomp>0, "need at least one component");
496       MR_assert(vblm.shape(0)==ncomp, "inconsistent slm and blm vectors");
497       Alm_Base islm(lmax, lmax), iblm(lmax, kmax);
498       MR_assert(islm.Num_Alms()==vslm.shape(1), "bad array dimension");
499       MR_assert(iblm.Num_Alms()==vblm.shape(1), "bad array dimension");
500       MR_assert(planes.conformable({nplanes, Ntheta(), Nphi()}), "bad planes shape");
501       MR_assert(mbeam <= kmax, "mbeam too high");
502 
503       vector<T> lnorm(lmax+1);
504       for (size_t i=0; i<=lmax; ++i)
505         lnorm[i]=T(std::sqrt(4*pi/(2*i+1.)));
506 
507       Alm_Base base(lmax, lmax);
508       vmav<complex<T>,2> aarr({nplanes,base.Num_Alms()});
509       for (size_t m=0; m<=lmax; ++m)
510         for (size_t l=m; l<=lmax; ++l)
511           {
512           aarr(0, base.index(l,m))=0.;
513           if (mbeam>0)
514             aarr(1, base.index(l,m))=0.;
515           if (l>=mbeam)
516             {
517             auto norm = (mbeam>0) ? -lnorm[l] : lnorm[l];
518             for (size_t i=0; i<ncomp; ++i)
519               {
520               auto tmp = vblm(i,iblm.index(l,mbeam))*norm;
521               aarr(0,base.index(l,m)) += vslm(i,islm.index(l,m))*tmp.real();
522               if (mbeam>0)
523                 aarr(1,base.index(l,m)) += vslm(i,islm.index(l,m))*tmp.imag();
524               }
525             }
526           }
527       auto subplanes=subarray<3>(planes,{{0, nplanes}, {nbtheta, nbtheta+ntheta_s}, {nbphi, nbphi+nphi_s}});
528       synthesis_2d(aarr, subplanes, mbeam, lmax, lmax, "CC", nthreads);
529       for (size_t iplane=0; iplane<nplanes; ++iplane)
530         {
531         auto m = subarray<2>(planes, {{iplane},{nbtheta, nbtheta+ntheta_b}, {nbphi, nbphi+nphi_b}});
532         correct(m,mbeam);
533         }
534 
535       // fill border regions
536       T fct = (mbeam&1) ? -1 : 1;
537       for (size_t iplane=0; iplane<nplanes; ++iplane)
538         {
539         for (size_t i=0; i<nbtheta; ++i)
540           for (size_t j=0, j2=nphi_b/2; j<nphi_b; ++j,++j2)
541             {
542             if (j2>=nphi_b) j2-=nphi_b;
543             planes(iplane,nbtheta-1-i,j2+nbphi) = fct*planes(iplane,nbtheta+1+i,j+nbphi);
544             planes(iplane,nbtheta+ntheta_b+i,j2+nbphi) = fct*planes(iplane,nbtheta+ntheta_b-2-i,j+nbphi);
545             }
546         for (size_t i=0; i<ntheta; ++i)
547           {
548           for (size_t j=0; j<nbphi; ++j)
549             {
550             planes(iplane,i,j) = planes(iplane,i,j+nphi_b);
551             planes(iplane,i,j+nphi_b+nbphi) = planes(iplane,i,j+nbphi);
552             }
553           // SIMD buffer
554           for (size_t j=0; j<vlen; ++j)
555             planes(iplane, i, nphi-vlen+j) = T(0);
556           }
557         }
558       }
559 
getPlane(const cmav<complex<T>,1> & slm,const cmav<complex<T>,1> & blm,size_t mbeam,vmav<T,3> & planes)560     void getPlane(const cmav<complex<T>,1> &slm, const cmav<complex<T>,1> &blm,
561       size_t mbeam, vmav<T,3> &planes) const
562       {
563       cmav<complex<T>,2> vslm(&slm(0), {1,slm.shape(0)}, {0,slm.stride(0)});
564       cmav<complex<T>,2> vblm(&blm(0), {1,blm.shape(0)}, {0,blm.stride(0)});
565       getPlane(vslm, vblm, mbeam, planes);
566       }
567 
interpol(const cmav<T,3> & cube,size_t itheta0,size_t iphi0,const cmav<T,1> & theta,const cmav<T,1> & phi,const cmav<T,1> & psi,vmav<T,1> & signal)568     void interpol(const cmav<T,3> &cube, size_t itheta0,
569       size_t iphi0, const cmav<T,1> &theta, const cmav<T,1> &phi,
570       const cmav<T,1> &psi, vmav<T,1> &signal) const
571       {
572       constexpr size_t maxsupp = is_same<T, double>::value ? 16 : 8;
573       interpolx<maxsupp>(kernel->support(), cube, itheta0, iphi0, theta, phi, psi, signal);
574       }
575 
deinterpol(vmav<T,3> & cube,size_t itheta0,size_t iphi0,const cmav<T,1> & theta,const cmav<T,1> & phi,const cmav<T,1> & psi,const cmav<T,1> & signal)576     void deinterpol(vmav<T,3> &cube, size_t itheta0,
577       size_t iphi0, const cmav<T,1> &theta, const cmav<T,1> &phi,
578       const cmav<T,1> &psi, const cmav<T,1> &signal) const
579       {
580       constexpr size_t maxsupp = is_same<T, double>::value ? 16 : 8;
581       deinterpolx<maxsupp>(kernel->support(), cube, itheta0, iphi0, theta, phi, psi, signal);
582       }
583 
updateSlm(vmav<complex<T>,2> & vslm,const cmav<complex<T>,2> & vblm,size_t mbeam,vmav<T,3> & planes)584     void updateSlm(vmav<complex<T>,2> &vslm, const cmav<complex<T>,2> &vblm,
585       size_t mbeam, vmav<T,3> &planes) const
586       {
587       size_t nplanes=1+(mbeam>0);
588       auto ncomp = vslm.shape(0);
589       MR_assert(ncomp>0, "need at least one component");
590       MR_assert(vblm.shape(0)==ncomp, "inconsistent slm and blm vectors");
591       Alm_Base islm(lmax, lmax), iblm(lmax, kmax);
592       MR_assert(islm.Num_Alms()==vslm.shape(1), "bad array dimension");
593       MR_assert(iblm.Num_Alms()==vblm.shape(1), "bad array dimension");
594       MR_assert(planes.conformable({nplanes, Ntheta(), Nphi()}), "bad planes shape");
595       MR_assert(mbeam <= kmax, "mbeam too high");
596 
597       // move stuff from border regions onto the main grid
598       for (size_t iplane=0; iplane<nplanes; ++iplane)
599         {
600         for (size_t i=0; i<ntheta; ++i)
601           for (size_t j=0; j<nbphi; ++j)
602             {
603             planes(iplane,i,j+nphi_b) += planes(iplane,i,j);
604             planes(iplane,i,j+nbphi) += planes(iplane,i,j+nphi_b+nbphi);
605             }
606 
607         for (size_t i=0; i<nbtheta; ++i)
608           for (size_t j=0, j2=nphi_b/2; j<nphi_b; ++j,++j2)
609             {
610             T fct = (mbeam&1) ? -1 : 1;
611             if (j2>=nphi_b) j2-=nphi_b;
612             planes(iplane,nbtheta+1+i,j+nbphi) += fct*planes(iplane,nbtheta-1-i,j2+nbphi);
613             planes(iplane,nbtheta+ntheta_b-2-i, j+nbphi) += fct*planes(iplane,nbtheta+ntheta_b+i,j2+nbphi);
614             }
615 
616         // special treatment for poles
617         for (size_t j=0,j2=nphi_b/2; j<nphi_b/2; ++j,++j2)
618           {
619           T fct = (mbeam&1) ? -1 : 1;
620           if (j2>=nphi_b) j2-=nphi_b;
621           T tval = planes(iplane,nbtheta,j+nbphi) + fct*planes(iplane,nbtheta,j2+nbphi);
622           planes(iplane,nbtheta,j+nbphi) = tval;
623           planes(iplane,nbtheta,j2+nbphi) = fct*tval;
624           tval = planes(iplane,nbtheta+ntheta_b-1,j+nbphi) + fct*planes(iplane,nbtheta+ntheta_b-1,j2+nbphi);
625           planes(iplane,nbtheta+ntheta_b-1,j+nbphi) = tval;
626           planes(iplane,nbtheta+ntheta_b-1,j2+nbphi) = fct*tval;
627           }
628         }
629 
630       vector<T>lnorm(lmax+1);
631       for (size_t i=0; i<=lmax; ++i)
632         lnorm[i]=T(std::sqrt(4*pi/(2*i+1.)));
633 
634       Alm_Base base(lmax,lmax);
635 
636       for (size_t iplane=0; iplane<nplanes; ++iplane)
637         {
638         auto m = subarray<2>(planes, {{iplane}, {nbtheta, nbtheta+ntheta_b}, {nbphi,nbphi+nphi_b}});
639         decorrect(m,mbeam);
640         }
641       auto subplanes=subarray<3>(planes, {{0, nplanes}, {nbtheta, nbtheta+ntheta_s}, {nbphi,nbphi+nphi_s}});
642 
643       vmav<complex<T>,2> aarr({nplanes, base.Num_Alms()});
644       adjoint_synthesis_2d(aarr, subplanes, mbeam, lmax, lmax, "CC", nthreads);
645       for (size_t m=0; m<=lmax; ++m)
646         for (size_t l=m; l<=lmax; ++l)
647           if (l>=mbeam)
648             for (size_t i=0; i<ncomp; ++i)
649               {
650               auto tmp = vblm(i,iblm.index(l,mbeam))*lnorm[l] * T((mbeam==0) ? 1 : (-2));
651               vslm(i,islm.index(l,m)) += aarr(0,base.index(l,m))*tmp.real();
652               if (mbeam>0)
653                 vslm(i,islm.index(l,m)) += aarr(1,base.index(l,m))*tmp.imag();
654               }
655       }
656 
updateSlm(vmav<complex<T>,1> & slm,const cmav<complex<T>,1> & blm,size_t mbeam,vmav<T,3> & planes)657     void updateSlm(vmav<complex<T>,1> &slm, const cmav<complex<T>,1> &blm,
658       size_t mbeam, vmav<T,3> &planes) const
659       {
660       vmav<complex<T>,2> vslm(&slm(0), {1,slm.shape(0)}, {0,slm.stride(0)});
661       cmav<complex<T>,2> vblm(&blm(0), {1,blm.shape(0)}, {0,blm.stride(0)});
662       updateSlm(vslm, vblm, mbeam, planes);
663       }
664 
prepPsi(vmav<T,3> & subcube)665     void prepPsi(vmav<T,3> &subcube) const
666       {
667       MR_assert(subcube.shape(0)==npsi_b, "bad psi dimension");
668       auto newpart = subarray<3>(subcube, {{npsi_s, MAXIDX}, {}, {}});
669       mav_apply([](T &v){v=T(0);}, nthreads, newpart);
670       auto fct = kernel->corfunc(npsi_s/2+1, 1./npsi_b, nthreads);
671       for (size_t k=0; k<npsi_s; ++k)
672         {
673         auto factor = T(fct[(k+1)/2]);
674         for (size_t i=0; i<subcube.shape(1); ++i)
675           for (size_t j=0; j<subcube.shape(2); ++j)
676             subcube(k,i,j) *= factor;
677         }
678       vfmav<T> fsubcube(subcube);
679       r2r_fftpack(fsubcube, fsubcube, {0}, false, true, T(1), nthreads);
680       }
681 
deprepPsi(vmav<T,3> & subcube)682     void deprepPsi(vmav<T,3> &subcube) const
683       {
684       MR_assert(subcube.shape(0)==npsi_b, "bad psi dimension");
685       vfmav<T> fsubcube(subcube);
686       r2r_fftpack(fsubcube, fsubcube, {0}, true, false, T(1), nthreads);
687       auto fct = kernel->corfunc(npsi_s/2+1, 1./npsi_b, nthreads);
688       for (size_t k=0; k<npsi_s; ++k)
689         {
690         auto factor = T(fct[(k+1)/2]);
691         for (size_t i=0; i<subcube.shape(1); ++i)
692           for (size_t j=0; j<subcube.shape(2); ++j)
693             subcube(k,i,j) *= factor;
694         }
695       }
696   };
697 
698 }
699 
700 using detail_totalconvolve::ConvolverPlan;
701 
702 
703 }
704 
705 #endif
706