1 /*
2  *  This file is part of libsharp2.
3  *
4  *  libsharp2 is free software; you can redistribute it and/or modify
5  *  it under the terms of the GNU General Public License as published by
6  *  the Free Software Foundation; either version 2 of the License, or
7  *  (at your option) any later version.
8  *
9  *  libsharp2 is distributed in the hope that it will be useful,
10  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
11  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12  *  GNU General Public License for more details.
13  *
14  *  You should have received a copy of the GNU General Public License
15  *  along with libsharp2; if not, write to the Free Software
16  *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
17  */
18 
19 /* libsharp2 is being developed at the Max-Planck-Institut fuer Astrophysik */
20 
21 /*! \file sharp_mpi.c
22  *  Functionality only needed for MPI-parallel transforms
23  *
24  *  Copyright (C) 2012-2019 Max-Planck-Society
25  *  \author Martin Reinecke \author Dag Sverre Seljebotn
26  */
27 
28 #ifdef USE_MPI
29 
30 #include "libsharp2/sharp_mpi.h"
31 
32 typedef struct
33   {
34   int ntasks;     /* number of tasks */
35   int mytask;     /* own task number */
36   MPI_Comm comm;  /* communicator to use */
37 
38   int *nm;        /* number of m values on every task */
39   int *ofs_m;     /* accumulated nm */
40   int nmtotal;    /* total number of m values (must be mmax+1) */
41   int *mval;      /* array containing all m values of task 0, task 1 etc. */
42   int mmax;
43   int nph;
44 
45   int *npair;     /* number of ring pairs on every task */
46   int *ofs_pair;  /* accumulated npair */
47   int npairtotal; /* total number of ring pairs */
48 
49   double *theta;  /* theta of first ring of every pair on task 0, task 1 etc. */
50   int *ispair;    /* is this really a pair? */
51 
52   int *almcount, *almdisp, *mapcount, *mapdisp; /* for all2all communication */
53   } sharp_mpi_info;
54 
sharp_make_mpi_info(MPI_Comm comm,const sharp_job * job,sharp_mpi_info * minfo)55 static void sharp_make_mpi_info (MPI_Comm comm, const sharp_job *job,
56   sharp_mpi_info *minfo)
57   {
58   minfo->comm = comm;
59   MPI_Comm_size (comm, &minfo->ntasks);
60   MPI_Comm_rank (comm, &minfo->mytask);
61 
62   minfo->nm=RALLOC(int,minfo->ntasks);
63   MPI_Allgather ((int *)(&job->ainfo->nm),1,MPI_INT,minfo->nm,1,MPI_INT,comm);
64   minfo->ofs_m=RALLOC(int,minfo->ntasks+1);
65   minfo->ofs_m[0]=0;
66   for (int i=1; i<=minfo->ntasks; ++i)
67     minfo->ofs_m[i] = minfo->ofs_m[i-1]+minfo->nm[i-1];
68   minfo->nmtotal=minfo->ofs_m[minfo->ntasks];
69   minfo->mval=RALLOC(int,minfo->nmtotal);
70   MPI_Allgatherv(job->ainfo->mval, job->ainfo->nm, MPI_INT, minfo->mval,
71     minfo->nm, minfo->ofs_m, MPI_INT, comm);
72 
73   minfo->mmax=sharp_get_mmax(minfo->mval,minfo->nmtotal);
74 
75   minfo->npair=RALLOC(int,minfo->ntasks);
76   MPI_Allgather ((int *)(&job->ginfo->npairs), 1, MPI_INT, minfo->npair, 1,
77     MPI_INT, comm);
78   minfo->ofs_pair=RALLOC(int,minfo->ntasks+1);
79   minfo->ofs_pair[0]=0;
80   for (int i=1; i<=minfo->ntasks; ++i)
81     minfo->ofs_pair[i] = minfo->ofs_pair[i-1]+minfo->npair[i-1];
82   minfo->npairtotal=minfo->ofs_pair[minfo->ntasks];
83 
84   double *theta_tmp=RALLOC(double,job->ginfo->npairs);
85   int *ispair_tmp=RALLOC(int,job->ginfo->npairs);
86   for (int i=0; i<job->ginfo->npairs; ++i)
87     {
88     theta_tmp[i]=job->ginfo->pair[i].r1.theta;
89     ispair_tmp[i]=job->ginfo->pair[i].r2.nph>0;
90     }
91   minfo->theta=RALLOC(double,minfo->npairtotal);
92   minfo->ispair=RALLOC(int,minfo->npairtotal);
93   MPI_Allgatherv(theta_tmp, job->ginfo->npairs, MPI_DOUBLE, minfo->theta,
94     minfo->npair, minfo->ofs_pair, MPI_DOUBLE, comm);
95   MPI_Allgatherv(ispair_tmp, job->ginfo->npairs, MPI_INT, minfo->ispair,
96     minfo->npair, minfo->ofs_pair, MPI_INT, comm);
97   DEALLOC(theta_tmp);
98   DEALLOC(ispair_tmp);
99 
100   minfo->nph=2*job->nmaps;
101 
102   minfo->almcount=RALLOC(int,minfo->ntasks);
103   minfo->almdisp=RALLOC(int,minfo->ntasks+1);
104   minfo->mapcount=RALLOC(int,minfo->ntasks);
105   minfo->mapdisp=RALLOC(int,minfo->ntasks+1);
106   minfo->almdisp[0]=minfo->mapdisp[0]=0;
107   for (int i=0; i<minfo->ntasks; ++i)
108     {
109     minfo->almcount[i] = 2*minfo->nph*minfo->nm[minfo->mytask]*minfo->npair[i];
110     minfo->almdisp[i+1] = minfo->almdisp[i]+minfo->almcount[i];
111     minfo->mapcount[i] = 2*minfo->nph*minfo->nm[i]*minfo->npair[minfo->mytask];
112     minfo->mapdisp[i+1] = minfo->mapdisp[i]+minfo->mapcount[i];
113     }
114   }
115 
sharp_destroy_mpi_info(sharp_mpi_info * minfo)116 static void sharp_destroy_mpi_info (sharp_mpi_info *minfo)
117   {
118   DEALLOC(minfo->nm);
119   DEALLOC(minfo->ofs_m);
120   DEALLOC(minfo->mval);
121   DEALLOC(minfo->npair);
122   DEALLOC(minfo->ofs_pair);
123   DEALLOC(minfo->theta);
124   DEALLOC(minfo->ispair);
125   DEALLOC(minfo->almcount);
126   DEALLOC(minfo->almdisp);
127   DEALLOC(minfo->mapcount);
128   DEALLOC(minfo->mapdisp);
129   }
130 
sharp_communicate_alm2map(const sharp_mpi_info * minfo,dcmplx ** ph)131 static void sharp_communicate_alm2map (const sharp_mpi_info *minfo, dcmplx **ph)
132   {
133   dcmplx *phas_tmp = RALLOC(dcmplx,minfo->mapdisp[minfo->ntasks]/2);
134 
135   MPI_Alltoallv (*ph,minfo->almcount,minfo->almdisp,MPI_DOUBLE,phas_tmp,
136     minfo->mapcount,minfo->mapdisp,MPI_DOUBLE,minfo->comm);
137 
138   DEALLOC(*ph);
139   ALLOC(*ph,dcmplx,minfo->nph*minfo->npair[minfo->mytask]*minfo->nmtotal);
140 
141   for (int task=0; task<minfo->ntasks; ++task)
142     for (int th=0; th<minfo->npair[minfo->mytask]; ++th)
143       for (int mi=0; mi<minfo->nm[task]; ++mi)
144         {
145         int m = minfo->mval[mi+minfo->ofs_m[task]];
146         int o1 = minfo->nph*(th*(minfo->mmax+1) + m);
147         int o2 = minfo->mapdisp[task]/2+minfo->nph*(mi+th*minfo->nm[task]);
148         for (int i=0; i<minfo->nph; ++i)
149           (*ph)[o1+i] = phas_tmp[o2+i];
150         }
151   DEALLOC(phas_tmp);
152   }
153 
sharp_communicate_map2alm(const sharp_mpi_info * minfo,dcmplx ** ph)154 static void sharp_communicate_map2alm (const sharp_mpi_info *minfo, dcmplx **ph)
155   {
156   dcmplx *phas_tmp = RALLOC(dcmplx,minfo->mapdisp[minfo->ntasks]/2);
157 
158   for (int task=0; task<minfo->ntasks; ++task)
159     for (int th=0; th<minfo->npair[minfo->mytask]; ++th)
160       for (int mi=0; mi<minfo->nm[task]; ++mi)
161         {
162         int m = minfo->mval[mi+minfo->ofs_m[task]];
163         int o1 = minfo->mapdisp[task]/2+minfo->nph*(mi+th*minfo->nm[task]);
164         int o2 = minfo->nph*(th*(minfo->mmax+1) + m);
165         for (int i=0; i<minfo->nph; ++i)
166           phas_tmp[o1+i] = (*ph)[o2+i];
167         }
168 
169   DEALLOC(*ph);
170   ALLOC(*ph,dcmplx,minfo->nph*minfo->nm[minfo->mytask]*minfo->npairtotal);
171 
172   MPI_Alltoallv (phas_tmp,minfo->mapcount,minfo->mapdisp,MPI_DOUBLE,
173     *ph,minfo->almcount,minfo->almdisp,MPI_DOUBLE,minfo->comm);
174 
175   DEALLOC(phas_tmp);
176   }
177 
alloc_phase_mpi(sharp_job * job,int nm,int ntheta,int nmfull,int nthetafull)178 static void alloc_phase_mpi (sharp_job *job, int nm, int ntheta,
179   int nmfull, int nthetafull)
180   {
181   ptrdiff_t phase_size = (job->type==SHARP_MAP2ALM) ?
182     (ptrdiff_t)(nmfull)*ntheta : (ptrdiff_t)(nm)*nthetafull;
183   job->phase=RALLOC(dcmplx,2*job->nmaps*phase_size);
184   job->s_m=2*job->nmaps;
185   job->s_th = job->s_m * ((job->type==SHARP_MAP2ALM) ? nmfull : nm);
186   }
187 
alm2map_comm(sharp_job * job,const sharp_mpi_info * minfo)188 static void alm2map_comm (sharp_job *job, const sharp_mpi_info *minfo)
189   {
190   if (job->type != SHARP_MAP2ALM)
191     {
192     sharp_communicate_alm2map (minfo,&job->phase);
193     job->s_th=job->s_m*minfo->nmtotal;
194     }
195   }
196 
map2alm_comm(sharp_job * job,const sharp_mpi_info * minfo)197 static void map2alm_comm (sharp_job *job, const sharp_mpi_info *minfo)
198   {
199   if (job->type == SHARP_MAP2ALM)
200     {
201     sharp_communicate_map2alm (minfo,&job->phase);
202     job->s_th=job->s_m*minfo->nm[minfo->mytask];
203     }
204   }
205 
sharp_execute_job_mpi(sharp_job * job,MPI_Comm comm)206 static void sharp_execute_job_mpi (sharp_job *job, MPI_Comm comm)
207   {
208   int ntasks;
209   MPI_Comm_size(comm, &ntasks);
210   if (ntasks==1) /* fall back to scalar implementation */
211     { sharp_execute_job (job); return; }
212 
213   MPI_Barrier(comm);
214   double timer=sharp_wallTime();
215   job->opcnt=0;
216   sharp_mpi_info minfo;
217   sharp_make_mpi_info(comm, job, &minfo);
218 
219   if (minfo.npairtotal>minfo.ntasks*300)
220     {
221     int nsub=(minfo.npairtotal+minfo.ntasks*200-1)/(minfo.ntasks*200);
222     for (int isub=0; isub<nsub; ++isub)
223       {
224       sharp_job ljob=*job;
225       // When creating a_lm, every sub-job produces a complete set of
226       // coefficients; they need to be added up.
227       if ((isub>0)&&(job->type==SHARP_MAP2ALM)) ljob.flags|=SHARP_ADD;
228       sharp_geom_info lginfo;
229       lginfo.pair=RALLOC(sharp_ringpair,(job->ginfo->npairs/nsub)+1);
230       lginfo.npairs=0;
231       lginfo.nphmax = job->ginfo->nphmax;
232       while (lginfo.npairs*nsub+isub<job->ginfo->npairs)
233         {
234         lginfo.pair[lginfo.npairs]=job->ginfo->pair[lginfo.npairs*nsub+isub];
235         ++lginfo.npairs;
236         }
237       ljob.ginfo=&lginfo;
238       sharp_execute_job_mpi (&ljob,comm);
239       job->opcnt+=ljob.opcnt;
240       DEALLOC(lginfo.pair);
241       }
242     }
243   else
244     {
245     int lmax = job->ainfo->lmax;
246     job->norm_l = sharp_Ylmgen_get_norm (lmax, job->spin);
247 
248     /* clear output arrays if requested */
249     init_output (job);
250 
251     alloc_phase_mpi (job,job->ainfo->nm,job->ginfo->npairs,minfo.mmax+1,
252       minfo.npairtotal);
253 
254     double *cth = RALLOC(double,minfo.npairtotal),
255           *sth = RALLOC(double,minfo.npairtotal);
256     int *mlim = RALLOC(int,minfo.npairtotal);
257     for (int i=0; i<minfo.npairtotal; ++i)
258       {
259       cth[i] = cos(minfo.theta[i]);
260       sth[i] = sin(minfo.theta[i]);
261       mlim[i] = sharp_get_mlim(lmax, job->spin, sth[i], cth[i]);
262       }
263 
264     /* map->phase where necessary */
265     map2phase (job, minfo.mmax, 0, job->ginfo->npairs);
266 
267     map2alm_comm (job, &minfo);
268 
269 #pragma omp parallel
270 {
271     sharp_job ljob = *job;
272     sharp_Ylmgen_C generator;
273     sharp_Ylmgen_init (&generator,lmax,minfo.mmax,ljob.spin);
274     alloc_almtmp(&ljob,lmax);
275 
276 #pragma omp for schedule(dynamic,1)
277     for (int mi=0; mi<job->ainfo->nm; ++mi)
278       {
279   /* alm->alm_tmp where necessary */
280       alm2almtmp (&ljob, lmax, mi);
281 
282   /* inner conversion loop */
283       inner_loop (&ljob, minfo.ispair, cth, sth, 0, minfo.npairtotal,
284         &generator, mi, mlim);
285 
286   /* alm_tmp->alm where necessary */
287       almtmp2alm (&ljob, lmax, mi);
288       }
289 
290     sharp_Ylmgen_destroy(&generator);
291     dealloc_almtmp(&ljob);
292 
293 #pragma omp critical
294     job->opcnt+=ljob.opcnt;
295 } /* end of parallel region */
296 
297     alm2map_comm (job, &minfo);
298 
299   /* phase->map where necessary */
300     phase2map (job, minfo.mmax, 0, job->ginfo->npairs);
301 
302     DEALLOC(mlim);
303     DEALLOC(cth);
304     DEALLOC(sth);
305     DEALLOC(job->norm_l);
306     dealloc_phase (job);
307     }
308   sharp_destroy_mpi_info(&minfo);
309   job->time=sharp_wallTime()-timer;
310   }
311 
sharp_execute_mpi(MPI_Comm comm,sharp_jobtype type,int spin,void * alm,void * map,const sharp_geom_info * geom_info,const sharp_alm_info * alm_info,int flags,double * time,unsigned long long * opcnt)312 void sharp_execute_mpi (MPI_Comm comm, sharp_jobtype type, int spin,
313   void *alm, void *map, const sharp_geom_info *geom_info,
314   const sharp_alm_info *alm_info, int flags, double *time,
315   unsigned long long *opcnt)
316   {
317   sharp_job job;
318   sharp_build_job_common (&job, type, spin, alm, map, geom_info, alm_info,
319     flags);
320 
321   sharp_execute_job_mpi (&job, comm);
322   if (time!=NULL) *time = job.time;
323   if (opcnt!=NULL) *opcnt = job.opcnt;
324   }
325 
326 /* We declare this only in C file to make symbol available for Fortran wrappers;
327    without declaring it in C header as it should not be available to C code */
328 void sharp_execute_mpi_fortran(MPI_Fint comm, sharp_jobtype type, int spin,
329   void *alm, void *map, const sharp_geom_info *geom_info,
330   const sharp_alm_info *alm_info, int flags, double *time,
331   unsigned long long *opcnt);
sharp_execute_mpi_fortran(MPI_Fint comm,sharp_jobtype type,int spin,void * alm,void * map,const sharp_geom_info * geom_info,const sharp_alm_info * alm_info,int flags,double * time,unsigned long long * opcnt)332 void sharp_execute_mpi_fortran(MPI_Fint comm, sharp_jobtype type, int spin,
333   void *alm, void *map, const sharp_geom_info *geom_info,
334   const sharp_alm_info *alm_info, int flags, double *time,
335   unsigned long long *opcnt)
336   {
337   sharp_execute_mpi(MPI_Comm_f2c(comm), type, spin, alm, map, geom_info,
338                     alm_info, flags, time, opcnt);
339   }
340 
341 #endif
342