#include <petscsys.h>        /*I  "petscsys.h"  I*/
#include <petsc/private/petscimpl.h>

struct _n_PetscShmComm {
  PetscMPIInt *globranks;       /* global ranks of each rank in the shared memory communicator */
  PetscMPIInt shmsize;          /* size of the shared memory communicator */
  MPI_Comm    globcomm,shmcomm; /* global communicator and shared memory communicator (a sub-communicator of the former) */
};

/*
   Private routine to delete the internal shared memory communicator when a communicator is freed.

   This is called by MPI, not by users. MPI_Comm_free() invokes it when the communicator that has this data as an attribute is freed.

   Note: this is declared extern "C" because it is passed to MPI_Comm_create_keyval()

*/
PETSC_EXTERN PetscMPIInt MPIAPI Petsc_ShmComm_Attr_Delete_Fn(MPI_Comm comm,PetscMPIInt keyval,void *val,void *extra_state)
{
  PetscErrorCode ierr;
  PetscShmComm   p = (PetscShmComm)val;

  PetscFunctionBegin;
  ierr = PetscInfo1(NULL,"Deleting shared memory subcommunicator in an MPI_Comm %ld\n",(long)comm);CHKERRMPI(ierr);
  ierr = MPI_Comm_free(&p->shmcomm);CHKERRMPI(ierr);
  ierr = PetscFree(p->globranks);CHKERRMPI(ierr);
  ierr = PetscFree(val);CHKERRMPI(ierr);
  PetscFunctionReturn(MPI_SUCCESS);
}
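
/* For illustration only: the delete callback above is installed as the delete function of
   Petsc_ShmComm_keyval. The keyval itself is created elsewhere during PETSc initialization;
   a minimal sketch of such a registration (not the exact PETSc code) looks like

     PetscMPIInt ierr;
     ierr = MPI_Comm_create_keyval(MPI_COMM_NULL_COPY_FN,Petsc_ShmComm_Attr_Delete_Fn,&Petsc_ShmComm_keyval,NULL);CHKERRMPI(ierr);

   so that MPI_Comm_free() on a communicator carrying the attribute triggers the cleanup above. */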

/*@C
    PetscShmCommGet - Given a PETSc communicator, returns a communicator of all ranks that share a common memory

    Collective.

    Input Parameter:
.   globcomm - MPI_Comm

    Output Parameter:
.   pshmcomm - the PETSc shared memory communicator object

    Level: developer

    Notes:
    This should be called only with a PetscCommDuplicate() communicator

    When used with MPICH, MPICH must be configured with --download-mpich-device=ch3:nemesis

@*/
PetscErrorCode PetscShmCommGet(MPI_Comm globcomm,PetscShmComm *pshmcomm)
{
#ifdef PETSC_HAVE_MPI_PROCESS_SHARED_MEMORY
  PetscErrorCode   ierr;
  MPI_Group        globgroup,shmgroup;
  PetscMPIInt      *shmranks,i,flg;
  PetscCommCounter *counter;

  PetscFunctionBegin;
  ierr = MPI_Comm_get_attr(globcomm,Petsc_Counter_keyval,&counter,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ(globcomm,PETSC_ERR_ARG_CORRUPT,"Bad MPI communicator supplied; must be a PETSc communicator");

  ierr = MPI_Comm_get_attr(globcomm,Petsc_ShmComm_keyval,pshmcomm,&flg);CHKERRQ(ierr);
  if (flg) PetscFunctionReturn(0);

  ierr = PetscNew(pshmcomm);CHKERRQ(ierr);
  (*pshmcomm)->globcomm = globcomm;

  ierr = MPI_Comm_split_type(globcomm,MPI_COMM_TYPE_SHARED,0,MPI_INFO_NULL,&(*pshmcomm)->shmcomm);CHKERRQ(ierr);

  ierr = MPI_Comm_size((*pshmcomm)->shmcomm,&(*pshmcomm)->shmsize);CHKERRQ(ierr);
  ierr = MPI_Comm_group(globcomm,&globgroup);CHKERRQ(ierr);
  ierr = MPI_Comm_group((*pshmcomm)->shmcomm,&shmgroup);CHKERRQ(ierr);
  ierr = PetscMalloc1((*pshmcomm)->shmsize,&shmranks);CHKERRQ(ierr);
  ierr = PetscMalloc1((*pshmcomm)->shmsize,&(*pshmcomm)->globranks);CHKERRQ(ierr);
  for (i=0; i<(*pshmcomm)->shmsize; i++) shmranks[i] = i;
  ierr = MPI_Group_translate_ranks(shmgroup,(*pshmcomm)->shmsize,shmranks,globgroup,(*pshmcomm)->globranks);CHKERRQ(ierr);
  ierr = PetscFree(shmranks);CHKERRQ(ierr);
  ierr = MPI_Group_free(&globgroup);CHKERRQ(ierr);
  ierr = MPI_Group_free(&shmgroup);CHKERRQ(ierr);

  for (i=0; i<(*pshmcomm)->shmsize; i++) {
    ierr = PetscInfo2(NULL,"Shared memory rank %d global rank %d\n",i,(*pshmcomm)->globranks[i]);CHKERRQ(ierr);
  }
  ierr = MPI_Comm_set_attr(globcomm,Petsc_ShmComm_keyval,*pshmcomm);CHKERRQ(ierr);
  PetscFunctionReturn(0);
#else
  SETERRQ(globcomm,PETSC_ERR_SUP,"Shared memory communicators need MPI-3 package support.\nPlease upgrade your MPI or reconfigure with --download-mpich.");
#endif
}
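
/* Example usage (an illustrative sketch, not a complete program; assumes PetscInitialize() has
   been called and ierr is a PetscErrorCode):

     MPI_Comm     pcomm;
     PetscShmComm shmcomm;

     ierr = PetscCommDuplicate(PETSC_COMM_WORLD,&pcomm,NULL);CHKERRQ(ierr);   get a PETSc communicator, as required by the Notes above
     ierr = PetscShmCommGet(pcomm,&shmcomm);CHKERRQ(ierr);                    shmcomm groups the ranks of pcomm that share memory
     ... use shmcomm with the routines below ...
     ierr = PetscCommDestroy(&pcomm);CHKERRQ(ierr);                           shmcomm is stored as an attribute and freed with the communicator
*/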

/*@C
    PetscShmCommGlobalToLocal - Given a global rank, returns the local rank in the shared memory communicator

    Input Parameters:
+   pshmcomm - the shared memory communicator object
-   grank    - the global rank

    Output Parameter:
.   lrank - the local rank, or MPI_PROC_NULL if it does not exist

    Level: developer

    Developer Notes:
    Assumes pshmcomm->globranks[] is sorted

    It may be better to rewrite this to map multiple global ranks to local ranks in the same function call

@*/
PetscErrorCode PetscShmCommGlobalToLocal(PetscShmComm pshmcomm,PetscMPIInt grank,PetscMPIInt *lrank)
{
  PetscMPIInt    low,high,t,i;
  PetscBool      flg = PETSC_FALSE;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  *lrank = MPI_PROC_NULL;
  if (grank < pshmcomm->globranks[0]) PetscFunctionReturn(0);
  if (grank > pshmcomm->globranks[pshmcomm->shmsize-1]) PetscFunctionReturn(0);
  ierr = PetscOptionsGetBool(NULL,NULL,"-noshared",&flg,NULL);CHKERRQ(ierr);
  if (flg) PetscFunctionReturn(0);
  /* bisect to narrow the range, then finish with a short linear scan */
  low  = 0;
  high = pshmcomm->shmsize;
  while (high-low > 5) {
    t = (low+high)/2;
    if (pshmcomm->globranks[t] > grank) high = t;
    else low = t;
  }
  for (i=low; i<high; i++) {
    if (pshmcomm->globranks[i] > grank) PetscFunctionReturn(0);
    if (pshmcomm->globranks[i] == grank) {
      *lrank = i;
      PetscFunctionReturn(0);
    }
  }
  PetscFunctionReturn(0);
}

/*@C
    PetscShmCommLocalToGlobal - Given a local rank in the shared memory communicator, returns the global rank

    Input Parameters:
+   pshmcomm - the shared memory communicator object
-   lrank    - the local rank in the shared memory communicator

    Output Parameter:
.   grank - the global rank in the global communicator where the shared memory communicator is built

    Level: developer

@*/
PetscErrorCode PetscShmCommLocalToGlobal(PetscShmComm pshmcomm,PetscMPIInt lrank,PetscMPIInt *grank)
{
  PetscFunctionBegin;
  if (lrank < 0 || lrank >= pshmcomm->shmsize) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"No rank %d in the shared memory communicator",lrank);
  *grank = pshmcomm->globranks[lrank];
  PetscFunctionReturn(0);
}
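
/* Example usage of the two rank-translation routines above (an illustrative sketch; pshmcomm is
   assumed to have been obtained with PetscShmCommGet() and grank to be a rank in the global communicator):

     PetscMPIInt lrank,grank2;

     ierr = PetscShmCommGlobalToLocal(pshmcomm,grank,&lrank);CHKERRQ(ierr);      lrank is MPI_PROC_NULL if grank does not share memory with this process
     if (lrank != MPI_PROC_NULL) {
       ierr = PetscShmCommLocalToGlobal(pshmcomm,lrank,&grank2);CHKERRQ(ierr);   round trip: grank2 equals grank
     }
*/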

/*@C
    PetscShmCommGetMpiShmComm - Returns the MPI communicator that represents all processes with common shared memory

    Input Parameter:
.   pshmcomm - PetscShmComm object obtained with PetscShmCommGet()

    Output Parameter:
.   comm     - the MPI communicator

    Level: developer

@*/
PetscErrorCode PetscShmCommGetMpiShmComm(PetscShmComm pshmcomm,MPI_Comm *comm)
{
  PetscFunctionBegin;
  *comm = pshmcomm->shmcomm;
  PetscFunctionReturn(0);
}
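
/* Example usage (an illustrative sketch): obtain the raw MPI communicator and query it directly

     MPI_Comm    shm;
     PetscMPIInt shmrank;

     ierr = PetscShmCommGetMpiShmComm(pshmcomm,&shm);CHKERRQ(ierr);
     ierr = MPI_Comm_rank(shm,&shmrank);CHKERRQ(ierr);

   Do not call MPI_Comm_free() on shm; it is owned by pshmcomm and freed in Petsc_ShmComm_Attr_Delete_Fn(). */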

#if defined(PETSC_HAVE_OPENMP_SUPPORT)
#include <pthread.h>
#include <hwloc.h>
#include <omp.h>

/* Use mmap() to allocate shared memory (for the pthread_barrier_t object) if it is available,
   otherwise use MPI_Win_allocate_shared(). They should have the same effect, except MPI-3 is much
   simpler to use. However, on a Cori Haswell node with Cray MPI, MPI-3 worsened a test's performance
   by 50%. Until the reason is found, we use mmap() instead.
*/
#define USE_MMAP_ALLOCATE_SHARED_MEMORY

#if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#endif

struct _n_PetscOmpCtrl {
  MPI_Comm          omp_comm;        /* a shared memory communicator to spawn omp threads */
  MPI_Comm          omp_master_comm; /* a communicator to give to third party libraries */
  PetscMPIInt       omp_comm_size;   /* size of omp_comm, a kind of OMP_NUM_THREADS */
  PetscBool         is_omp_master;   /* true on rank 0 of omp_comm */
  MPI_Win           omp_win;         /* a shared memory window containing a barrier */
  pthread_barrier_t *barrier;        /* pointer to the barrier */
  hwloc_topology_t  topology;
  hwloc_cpuset_t    cpuset;          /* cpu bindings of omp master */
  hwloc_cpuset_t    omp_cpuset;      /* union of cpu bindings of ranks in omp_comm */
};

/* Allocate and initialize a pthread_barrier_t object in memory shared by processes in omp_comm
   contained by the controller.

   PETSc OpenMP controller users do not call this function directly. This function exists
   only because we want to separate shared memory allocation methods from other code.
 */
PETSC_STATIC_INLINE PetscErrorCode PetscOmpCtrlCreateBarrier(PetscOmpCtrl ctrl)
{
  PetscErrorCode        ierr;
  MPI_Aint              size;
  void                  *baseptr;
  pthread_barrierattr_t attr;

#if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
  PetscInt              fd;
  PetscChar             pathname[PETSC_MAX_PATH_LEN];
#else
  PetscMPIInt           disp_unit;
#endif

  PetscFunctionBegin;
#if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
  size = sizeof(pthread_barrier_t);
  if (ctrl->is_omp_master) {
    /* use PETSC_COMM_SELF in PetscGetTmp(), since it is a collective call. Using omp_comm would otherwise bcast the partially populated pathname to slaves */
    ierr    = PetscGetTmp(PETSC_COMM_SELF,pathname,PETSC_MAX_PATH_LEN);CHKERRQ(ierr);
    ierr    = PetscStrlcat(pathname,"/petsc-shm-XXXXXX",PETSC_MAX_PATH_LEN);CHKERRQ(ierr);
    /* mkstemp replaces XXXXXX with a unique file name and opens the file for us */
    fd      = mkstemp(pathname); if (fd == -1) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not create tmp file %s with mkstemp\n",pathname);
    ierr    = ftruncate(fd,size);CHKERRQ(ierr);
    baseptr = mmap(NULL,size,PROT_READ | PROT_WRITE,MAP_SHARED,fd,0); if (baseptr == MAP_FAILED) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"mmap() failed\n");
    ierr    = close(fd);CHKERRQ(ierr);
    ierr    = MPI_Bcast(pathname,PETSC_MAX_PATH_LEN,MPI_CHAR,0,ctrl->omp_comm);CHKERRQ(ierr);
    /* this MPI_Barrier is to wait for slaves to open the file before the master unlinks it */
    ierr    = MPI_Barrier(ctrl->omp_comm);CHKERRQ(ierr);
    ierr    = unlink(pathname);CHKERRQ(ierr);
  } else {
    ierr    = MPI_Bcast(pathname,PETSC_MAX_PATH_LEN,MPI_CHAR,0,ctrl->omp_comm);CHKERRQ(ierr);
    fd      = open(pathname,O_RDWR); if (fd == -1) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not open tmp file %s\n",pathname);
    baseptr = mmap(NULL,size,PROT_READ | PROT_WRITE,MAP_SHARED,fd,0); if (baseptr == MAP_FAILED) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"mmap() failed\n");
    ierr    = close(fd);CHKERRQ(ierr);
    ierr    = MPI_Barrier(ctrl->omp_comm);CHKERRQ(ierr);
  }
#else
  size = ctrl->is_omp_master ? sizeof(pthread_barrier_t) : 0;
  ierr = MPI_Win_allocate_shared(size,1,MPI_INFO_NULL,ctrl->omp_comm,&baseptr,&ctrl->omp_win);CHKERRQ(ierr);
  ierr = MPI_Win_shared_query(ctrl->omp_win,0,&size,&disp_unit,&baseptr);CHKERRQ(ierr);
#endif
  ctrl->barrier = (pthread_barrier_t*)baseptr;

  /* omp master initializes the barrier */
  if (ctrl->is_omp_master) {
    ierr = MPI_Comm_size(ctrl->omp_comm,&ctrl->omp_comm_size);CHKERRQ(ierr);
    ierr = pthread_barrierattr_init(&attr);CHKERRQ(ierr);
    ierr = pthread_barrierattr_setpshared(&attr,PTHREAD_PROCESS_SHARED);CHKERRQ(ierr); /* make the barrier also work for processes */
    ierr = pthread_barrier_init(ctrl->barrier,&attr,(unsigned int)ctrl->omp_comm_size);CHKERRQ(ierr);
    ierr = pthread_barrierattr_destroy(&attr);CHKERRQ(ierr);
  }

  /* this MPI_Barrier is to make sure the omp barrier is initialized before slaves use it */
  ierr = MPI_Barrier(ctrl->omp_comm);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Destroy the pthread barrier in the PETSc OpenMP controller */
PETSC_STATIC_INLINE PetscErrorCode PetscOmpCtrlDestroyBarrier(PetscOmpCtrl ctrl)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* this MPI_Barrier is to make sure slaves have finished using the omp barrier before the master destroys it */
  ierr = MPI_Barrier(ctrl->omp_comm);CHKERRQ(ierr);
  if (ctrl->is_omp_master) { ierr = pthread_barrier_destroy(ctrl->barrier);CHKERRQ(ierr); }

#if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
  ierr = munmap(ctrl->barrier,sizeof(pthread_barrier_t));CHKERRQ(ierr);
#else
  ierr = MPI_Win_free(&ctrl->omp_win);CHKERRQ(ierr);
#endif
  PetscFunctionReturn(0);
}
292 
293 /*@C
294     PetscOmpCtrlCreate - create a PETSc OpenMP controller, which manages PETSc's interaction with third party libraries using OpenMP
295 
296     Input Parameter:
297 +   petsc_comm - a communicator some PETSc object (for example, a matrix) lives in
298 -   nthreads   - number of threads per MPI rank to spawn in a library using OpenMP. If nthreads = -1, let PETSc decide a suitable value
299 
300     Output Parameter:
301 .   pctrl      - a PETSc OpenMP controller
302 
303     Level: developer
304 
305     TODO: Possibly use the variable PetscNumOMPThreads to determine the number for threads to use
306 
307 .seealso PetscOmpCtrlDestroy()
308 @*/
PetscOmpCtrlCreate(MPI_Comm petsc_comm,PetscInt nthreads,PetscOmpCtrl * pctrl)309 PetscErrorCode PetscOmpCtrlCreate(MPI_Comm petsc_comm,PetscInt nthreads,PetscOmpCtrl *pctrl)
310 {
311   PetscErrorCode        ierr;
312   PetscOmpCtrl          ctrl;
313   unsigned long         *cpu_ulongs=NULL;
314   PetscInt              i,nr_cpu_ulongs;
315   PetscShmComm          pshmcomm;
316   MPI_Comm              shm_comm;
317   PetscMPIInt           shm_rank,shm_comm_size,omp_rank,color;
318   PetscInt              num_packages,num_cores;
319 
320   PetscFunctionBegin;
321   ierr = PetscNew(&ctrl);CHKERRQ(ierr);
322 
323   /*=================================================================================
324     Init hwloc
325    ==================================================================================*/
326   ierr = hwloc_topology_init(&ctrl->topology);CHKERRQ(ierr);
327 #if HWLOC_API_VERSION >= 0x00020000
328   /* to filter out unneeded info and have faster hwloc_topology_load */
329   ierr = hwloc_topology_set_all_types_filter(ctrl->topology,HWLOC_TYPE_FILTER_KEEP_NONE);CHKERRQ(ierr);
330   ierr = hwloc_topology_set_type_filter(ctrl->topology,HWLOC_OBJ_CORE,HWLOC_TYPE_FILTER_KEEP_ALL);CHKERRQ(ierr);
331 #endif
332   ierr = hwloc_topology_load(ctrl->topology);CHKERRQ(ierr);
333 
334   /*=================================================================================
335     Split petsc_comm into multiple omp_comms. Ranks in an omp_comm have access to
336     physically shared memory. Rank 0 of each omp_comm is called an OMP master, and
337     others are called slaves. OMP Masters make up a new comm called omp_master_comm,
338     which is usually passed to third party libraries.
339    ==================================================================================*/
340 
341   /* fetch the stored shared memory communicator */
342   ierr = PetscShmCommGet(petsc_comm,&pshmcomm);CHKERRQ(ierr);
343   ierr = PetscShmCommGetMpiShmComm(pshmcomm,&shm_comm);CHKERRQ(ierr);
344 
345   ierr = MPI_Comm_rank(shm_comm,&shm_rank);CHKERRQ(ierr);
346   ierr = MPI_Comm_size(shm_comm,&shm_comm_size);CHKERRQ(ierr);
347 
348   /* PETSc decides nthreads, which is the smaller of shm_comm_size or cores per package(socket) */
349   if (nthreads == -1) {
350     num_packages = hwloc_get_nbobjs_by_type(ctrl->topology,HWLOC_OBJ_PACKAGE) <= 0 ? 1 : hwloc_get_nbobjs_by_type(ctrl->topology,HWLOC_OBJ_PACKAGE);
351     num_cores    = hwloc_get_nbobjs_by_type(ctrl->topology,HWLOC_OBJ_CORE) <= 0 ? 1 :  hwloc_get_nbobjs_by_type(ctrl->topology,HWLOC_OBJ_CORE);
352     nthreads     = num_cores/num_packages;
353     if (nthreads > shm_comm_size) nthreads = shm_comm_size;
354   }
355 
356   if (nthreads < 1 || nthreads > shm_comm_size) SETERRQ2(petsc_comm,PETSC_ERR_ARG_OUTOFRANGE,"number of OpenMP threads %D can not be < 1 or > the MPI shared memory communicator size %d\n",nthreads,shm_comm_size);
357   if (shm_comm_size % nthreads) { ierr = PetscPrintf(petsc_comm,"Warning: number of OpenMP threads %D is not a factor of the MPI shared memory communicator size %d, which may cause load-imbalance!\n",nthreads,shm_comm_size);CHKERRQ(ierr); }
358 
359   /* split shm_comm into a set of omp_comms with each of size nthreads. Ex., if
360      shm_comm_size=16, nthreads=8, then ranks 0~7 get color 0 and ranks 8~15 get
361      color 1. They are put in two omp_comms. Note that petsc_ranks may or may not
362      be consecutive in a shm_comm, but shm_ranks always run from 0 to shm_comm_size-1.
363      Use 0 as key so that rank ordering wont change in new comm.
364    */
365   color = shm_rank / nthreads;
366   ierr  = MPI_Comm_split(shm_comm,color,0/*key*/,&ctrl->omp_comm);CHKERRQ(ierr);
367 
368   /* put rank 0's in omp_comms (i.e., master ranks) into a new comm - omp_master_comm */
369   ierr = MPI_Comm_rank(ctrl->omp_comm,&omp_rank);CHKERRQ(ierr);
370   if (!omp_rank) {
371     ctrl->is_omp_master = PETSC_TRUE;  /* master */
372     color = 0;
373   } else {
374     ctrl->is_omp_master = PETSC_FALSE; /* slave */
375     color = MPI_UNDEFINED; /* to make slaves get omp_master_comm = MPI_COMM_NULL in MPI_Comm_split */
376   }
377   ierr = MPI_Comm_split(petsc_comm,color,0/*key*/,&ctrl->omp_master_comm);CHKERRQ(ierr); /* rank 0 in omp_master_comm is rank 0 in petsc_comm */
378 
379   /*=================================================================================
380     Each omp_comm has a pthread_barrier_t in its shared memory, which is used to put
381     slave ranks in sleep and idle their CPU, so that the master can fork OMP threads
382     and run them on the idle CPUs.
383    ==================================================================================*/
384   ierr = PetscOmpCtrlCreateBarrier(ctrl);CHKERRQ(ierr);
385 
386   /*=================================================================================
387     omp master logs its cpu binding (i.e., cpu set) and computes a new binding that
388     is the union of the bindings of all ranks in the omp_comm
389     =================================================================================*/
390 
391   ctrl->cpuset = hwloc_bitmap_alloc(); if (!ctrl->cpuset) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"hwloc_bitmap_alloc() failed\n");
392   ierr = hwloc_get_cpubind(ctrl->topology,ctrl->cpuset, HWLOC_CPUBIND_PROCESS);CHKERRQ(ierr);
393 
394   /* hwloc main developer said they will add new APIs hwloc_bitmap_{nr,to,from}_ulongs in 2.1 to help us simplify the following bitmap pack/unpack code */
395   nr_cpu_ulongs = (hwloc_bitmap_last(hwloc_topology_get_topology_cpuset (ctrl->topology))+sizeof(unsigned long)*8)/sizeof(unsigned long)/8;
396   ierr = PetscMalloc1(nr_cpu_ulongs,&cpu_ulongs);CHKERRQ(ierr);
397   if (nr_cpu_ulongs == 1) {
398     cpu_ulongs[0] = hwloc_bitmap_to_ulong(ctrl->cpuset);
399   } else {
400     for (i=0; i<nr_cpu_ulongs; i++) cpu_ulongs[i] = hwloc_bitmap_to_ith_ulong(ctrl->cpuset,(unsigned)i);
401   }
402 
403   ierr = MPI_Reduce(ctrl->is_omp_master ? MPI_IN_PLACE : cpu_ulongs, cpu_ulongs,nr_cpu_ulongs, MPI_UNSIGNED_LONG,MPI_BOR,0,ctrl->omp_comm);CHKERRQ(ierr);
404 
405   if (ctrl->is_omp_master) {
406     ctrl->omp_cpuset = hwloc_bitmap_alloc(); if (!ctrl->omp_cpuset) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"hwloc_bitmap_alloc() failed\n");
407     if (nr_cpu_ulongs == 1) {
408 #if HWLOC_API_VERSION >= 0x00020000
409       ierr = hwloc_bitmap_from_ulong(ctrl->omp_cpuset,cpu_ulongs[0]);CHKERRQ(ierr);
410 #else
411       hwloc_bitmap_from_ulong(ctrl->omp_cpuset,cpu_ulongs[0]);
412 #endif
413     } else {
414       for (i=0; i<nr_cpu_ulongs; i++)  {
415 #if HWLOC_API_VERSION >= 0x00020000
416         ierr = hwloc_bitmap_set_ith_ulong(ctrl->omp_cpuset,(unsigned)i,cpu_ulongs[i]);CHKERRQ(ierr);
417 #else
418         hwloc_bitmap_set_ith_ulong(ctrl->omp_cpuset,(unsigned)i,cpu_ulongs[i]);
419 #endif
420       }
421     }
422   }
423 
424   ierr = PetscFree(cpu_ulongs);CHKERRQ(ierr);
425   *pctrl = ctrl;
426   PetscFunctionReturn(0);
427 }
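
/* Example usage (an illustrative sketch; A is a hypothetical Mat used only to supply a PETSc
   communicator, and -1 lets PETSc pick the number of threads as described above):

     PetscOmpCtrl ctrl;

     ierr = PetscOmpCtrlCreate(PetscObjectComm((PetscObject)A),-1,&ctrl);CHKERRQ(ierr);
     ... use ctrl with PetscOmpCtrlGetOmpComms(), PetscOmpCtrlBarrier(), etc. ...
     ierr = PetscOmpCtrlDestroy(&ctrl);CHKERRQ(ierr);
*/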

/*@C
    PetscOmpCtrlDestroy - destroy the PETSc OpenMP controller

    Input Parameter:
.   pctrl  - a PETSc OpenMP controller

    Level: developer

.seealso PetscOmpCtrlCreate()
@*/
PetscErrorCode PetscOmpCtrlDestroy(PetscOmpCtrl *pctrl)
{
  PetscErrorCode ierr;
  PetscOmpCtrl   ctrl = *pctrl;

  PetscFunctionBegin;
  hwloc_bitmap_free(ctrl->cpuset);
  hwloc_topology_destroy(ctrl->topology);
  PetscOmpCtrlDestroyBarrier(ctrl);
  ierr = MPI_Comm_free(&ctrl->omp_comm);CHKERRQ(ierr);
  if (ctrl->is_omp_master) {
    hwloc_bitmap_free(ctrl->omp_cpuset);
    ierr = MPI_Comm_free(&ctrl->omp_master_comm);CHKERRQ(ierr);
  }
  ierr = PetscFree(ctrl);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*@C
    PetscOmpCtrlGetOmpComms - Get MPI communicators from a PETSc OMP controller

    Input Parameter:
.   ctrl - a PETSc OMP controller

    Output Parameters:
+   omp_comm         - a communicator that includes a master rank and slave ranks where the master spawns threads
.   omp_master_comm  - on master ranks, a communicator that includes the master ranks of all omp_comms;
                       on slave ranks, MPI_COMM_NULL is returned
-   is_omp_master    - true if the calling process is an OMP master rank

    Notes:
    Any output parameter can be NULL; such parameters are simply ignored.

    Level: developer
@*/
PetscErrorCode PetscOmpCtrlGetOmpComms(PetscOmpCtrl ctrl,MPI_Comm *omp_comm,MPI_Comm *omp_master_comm,PetscBool *is_omp_master)
{
  PetscFunctionBegin;
  if (omp_comm)        *omp_comm        = ctrl->omp_comm;
  if (omp_master_comm) *omp_master_comm = ctrl->omp_master_comm;
  if (is_omp_master)   *is_omp_master   = ctrl->is_omp_master;
  PetscFunctionReturn(0);
}

/*@C
    PetscOmpCtrlBarrier - Do a barrier on the MPI ranks in the omp_comm contained by the PETSc OMP controller (to let slave ranks free their CPUs)

    Input Parameter:
.   ctrl - a PETSc OMP controller

    Notes:
    This is a pthread barrier on MPI processes. Using MPI_Barrier() instead would be conceptually correct, but the MPI standard does not
    require processes blocked by MPI_Barrier() to free their CPUs to let other processes progress. In practice, to minimize latency,
    MPI processes stuck in MPI_Barrier() keep polling and do not free their CPUs. In contrast, a pthread barrier does free the CPU while waiting.

    Code using PetscOmpCtrlBarrier() looks like this,

    if (is_omp_master) {
      PetscOmpCtrlOmpRegionOnMasterBegin(ctrl);
      Call the library using OpenMP
      PetscOmpCtrlOmpRegionOnMasterEnd(ctrl);
    }
    PetscOmpCtrlBarrier(ctrl);

    Level: developer

.seealso PetscOmpCtrlOmpRegionOnMasterBegin(), PetscOmpCtrlOmpRegionOnMasterEnd()
@*/
PetscErrorCode PetscOmpCtrlBarrier(PetscOmpCtrl ctrl)
{
  int err;

  PetscFunctionBegin;
  err = pthread_barrier_wait(ctrl->barrier);
  if (err && err != PTHREAD_BARRIER_SERIAL_THREAD) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"pthread_barrier_wait failed within PetscOmpCtrlBarrier with return code %d\n",err);
  PetscFunctionReturn(0);
}

/*@C
    PetscOmpCtrlOmpRegionOnMasterBegin - Mark the beginning of an OpenMP library call on master ranks

    Input Parameter:
.   ctrl - a PETSc OMP controller

    Notes:
    Only master ranks can call this function. Call PetscOmpCtrlGetOmpComms() to find out if this is a master rank.
    This function changes the CPU binding of master ranks and the nthreads-var of the OpenMP runtime

    Level: developer

.seealso: PetscOmpCtrlOmpRegionOnMasterEnd()
@*/
PetscErrorCode PetscOmpCtrlOmpRegionOnMasterBegin(PetscOmpCtrl ctrl)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = hwloc_set_cpubind(ctrl->topology,ctrl->omp_cpuset,HWLOC_CPUBIND_PROCESS);CHKERRQ(ierr);
  omp_set_num_threads(ctrl->omp_comm_size); /* may override the OMP_NUM_THREADS env var */
  PetscFunctionReturn(0);
}

/*@C
   PetscOmpCtrlOmpRegionOnMasterEnd - Mark the end of an OpenMP library call on master ranks

   Input Parameter:
.  ctrl - a PETSc OMP controller

   Notes:
   Only master ranks can call this function. Call PetscOmpCtrlGetOmpComms() to find out if this is a master rank.
   This function restores the CPU binding of master ranks and sets the nthreads-var of the OpenMP runtime to 1.

   Level: developer

.seealso: PetscOmpCtrlOmpRegionOnMasterBegin()
@*/
PetscErrorCode PetscOmpCtrlOmpRegionOnMasterEnd(PetscOmpCtrl ctrl)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = hwloc_set_cpubind(ctrl->topology,ctrl->cpuset,HWLOC_CPUBIND_PROCESS);CHKERRQ(ierr);
  omp_set_num_threads(1);
  PetscFunctionReturn(0);
}
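
/* End-to-end usage sketch of the OpenMP controller (illustrative; it follows the pattern shown in
   the PetscOmpCtrlBarrier() man page above, with the setup spelled out; petsc_comm is assumed to be
   the communicator of some PETSc object and error codes are checked with CHKERRQ as usual):

     PetscOmpCtrl ctrl;
     MPI_Comm     omp_comm,omp_master_comm;
     PetscBool    is_master;

     ierr = PetscOmpCtrlCreate(petsc_comm,-1,&ctrl);CHKERRQ(ierr);
     ierr = PetscOmpCtrlGetOmpComms(ctrl,&omp_comm,&omp_master_comm,&is_master);CHKERRQ(ierr);
     if (is_master) {
       ierr = PetscOmpCtrlOmpRegionOnMasterBegin(ctrl);CHKERRQ(ierr);       bind to the union cpuset and raise the OpenMP thread count
       ... call the third party library using OpenMP, passing omp_master_comm ...
       ierr = PetscOmpCtrlOmpRegionOnMasterEnd(ctrl);CHKERRQ(ierr);         restore the binding and set the thread count back to 1
     }
     ierr = PetscOmpCtrlBarrier(ctrl);CHKERRQ(ierr);                        slaves sleep here while the masters run OpenMP
     ierr = PetscOmpCtrlDestroy(&ctrl);CHKERRQ(ierr);
*/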

#undef USE_MMAP_ALLOCATE_SHARED_MEMORY
#endif /* defined(PETSC_HAVE_OPENMP_SUPPORT) */