1 #include <petscsys.h> /*I "petscsys.h" I*/
2 #include <petsc/private/petscimpl.h>
3
4 struct _n_PetscShmComm {
5 PetscMPIInt *globranks; /* global ranks of each rank in the shared memory communicator */
6 PetscMPIInt shmsize; /* size of the shared memory communicator */
7 MPI_Comm globcomm,shmcomm; /* global communicator and shared memory communicator (a sub-communicator of the former) */
8 };
9
10 /*
11 Private routine to delete internal shared memory communicator when a communicator is freed.
12
13 This is called by MPI, not by users. This is called by MPI_Comm_free() when the communicator that has this data as an attribute is freed.
14
15 Note: this is declared extern "C" because it is passed to MPI_Comm_create_keyval()
16
17 */
Petsc_ShmComm_Attr_Delete_Fn(MPI_Comm comm,PetscMPIInt keyval,void * val,void * extra_state)18 PETSC_EXTERN PetscMPIInt MPIAPI Petsc_ShmComm_Attr_Delete_Fn(MPI_Comm comm,PetscMPIInt keyval,void *val,void *extra_state)
19 {
20 PetscErrorCode ierr;
21 PetscShmComm p = (PetscShmComm)val;
22
23 PetscFunctionBegin;
24 ierr = PetscInfo1(NULL,"Deleting shared memory subcommunicator in a MPI_Comm %ld\n",(long)comm);CHKERRMPI(ierr);
25 ierr = MPI_Comm_free(&p->shmcomm);CHKERRMPI(ierr);
26 ierr = PetscFree(p->globranks);CHKERRMPI(ierr);
27 ierr = PetscFree(val);CHKERRMPI(ierr);
28 PetscFunctionReturn(MPI_SUCCESS);
29 }
30
31 /*@C
32 PetscShmCommGet - Given a PETSc communicator returns a communicator of all ranks that share a common memory
33
34
35 Collective.
36
37 Input Parameter:
38 . globcomm - MPI_Comm
39
40 Output Parameter:
41 . pshmcomm - the PETSc shared memory communicator object
42
43 Level: developer
44
45 Notes:
46 This should be called only with an PetscCommDuplicate() communictor
47
48 When used with MPICH, MPICH must be configured with --download-mpich-device=ch3:nemesis
49
50 @*/
PetscShmCommGet(MPI_Comm globcomm,PetscShmComm * pshmcomm)51 PetscErrorCode PetscShmCommGet(MPI_Comm globcomm,PetscShmComm *pshmcomm)
52 {
53 #ifdef PETSC_HAVE_MPI_PROCESS_SHARED_MEMORY
54 PetscErrorCode ierr;
55 MPI_Group globgroup,shmgroup;
56 PetscMPIInt *shmranks,i,flg;
57 PetscCommCounter *counter;
58
59 PetscFunctionBegin;
60 ierr = MPI_Comm_get_attr(globcomm,Petsc_Counter_keyval,&counter,&flg);CHKERRQ(ierr);
61 if (!flg) SETERRQ(globcomm,PETSC_ERR_ARG_CORRUPT,"Bad MPI communicator supplied; must be a PETSc communicator");
62
63 ierr = MPI_Comm_get_attr(globcomm,Petsc_ShmComm_keyval,pshmcomm,&flg);CHKERRQ(ierr);
64 if (flg) PetscFunctionReturn(0);
65
66 ierr = PetscNew(pshmcomm);CHKERRQ(ierr);
67 (*pshmcomm)->globcomm = globcomm;
68
69 ierr = MPI_Comm_split_type(globcomm, MPI_COMM_TYPE_SHARED,0, MPI_INFO_NULL,&(*pshmcomm)->shmcomm);CHKERRQ(ierr);
70
71 ierr = MPI_Comm_size((*pshmcomm)->shmcomm,&(*pshmcomm)->shmsize);CHKERRQ(ierr);
72 ierr = MPI_Comm_group(globcomm, &globgroup);CHKERRQ(ierr);
73 ierr = MPI_Comm_group((*pshmcomm)->shmcomm, &shmgroup);CHKERRQ(ierr);
74 ierr = PetscMalloc1((*pshmcomm)->shmsize,&shmranks);CHKERRQ(ierr);
75 ierr = PetscMalloc1((*pshmcomm)->shmsize,&(*pshmcomm)->globranks);CHKERRQ(ierr);
76 for (i=0; i<(*pshmcomm)->shmsize; i++) shmranks[i] = i;
77 ierr = MPI_Group_translate_ranks(shmgroup, (*pshmcomm)->shmsize, shmranks, globgroup, (*pshmcomm)->globranks);CHKERRQ(ierr);
78 ierr = PetscFree(shmranks);CHKERRQ(ierr);
79 ierr = MPI_Group_free(&globgroup);CHKERRQ(ierr);
80 ierr = MPI_Group_free(&shmgroup);CHKERRQ(ierr);
81
82 for (i=0; i<(*pshmcomm)->shmsize; i++) {
83 ierr = PetscInfo2(NULL,"Shared memory rank %d global rank %d\n",i,(*pshmcomm)->globranks[i]);CHKERRQ(ierr);
84 }
85 ierr = MPI_Comm_set_attr(globcomm,Petsc_ShmComm_keyval,*pshmcomm);CHKERRQ(ierr);
86 PetscFunctionReturn(0);
87 #else
88 SETERRQ(globcomm, PETSC_ERR_SUP, "Shared memory communicators need MPI-3 package support.\nPlease upgrade your MPI or reconfigure with --download-mpich.");
89 #endif
90 }
91
92 /*@C
93 PetscShmCommGlobalToLocal - Given a global rank returns the local rank in the shared memory communicator
94
95 Input Parameters:
96 + pshmcomm - the shared memory communicator object
97 - grank - the global rank
98
99 Output Parameter:
100 . lrank - the local rank, or MPI_PROC_NULL if it does not exist
101
102 Level: developer
103
104 Developer Notes:
105 Assumes the pshmcomm->globranks[] is sorted
106
107 It may be better to rewrite this to map multiple global ranks to local in the same function call
108
109 @*/
PetscShmCommGlobalToLocal(PetscShmComm pshmcomm,PetscMPIInt grank,PetscMPIInt * lrank)110 PetscErrorCode PetscShmCommGlobalToLocal(PetscShmComm pshmcomm,PetscMPIInt grank,PetscMPIInt *lrank)
111 {
112 PetscMPIInt low,high,t,i;
113 PetscBool flg = PETSC_FALSE;
114 PetscErrorCode ierr;
115
116 PetscFunctionBegin;
117 *lrank = MPI_PROC_NULL;
118 if (grank < pshmcomm->globranks[0]) PetscFunctionReturn(0);
119 if (grank > pshmcomm->globranks[pshmcomm->shmsize-1]) PetscFunctionReturn(0);
120 ierr = PetscOptionsGetBool(NULL,NULL,"-noshared",&flg,NULL);CHKERRQ(ierr);
121 if (flg) PetscFunctionReturn(0);
122 low = 0;
123 high = pshmcomm->shmsize;
124 while (high-low > 5) {
125 t = (low+high)/2;
126 if (pshmcomm->globranks[t] > grank) high = t;
127 else low = t;
128 }
129 for (i=low; i<high; i++) {
130 if (pshmcomm->globranks[i] > grank) PetscFunctionReturn(0);
131 if (pshmcomm->globranks[i] == grank) {
132 *lrank = i;
133 PetscFunctionReturn(0);
134 }
135 }
136 PetscFunctionReturn(0);
137 }
138
139 /*@C
140 PetscShmCommLocalToGlobal - Given a local rank in the shared memory communicator returns the global rank
141
142 Input Parameters:
143 + pshmcomm - the shared memory communicator object
144 - lrank - the local rank in the shared memory communicator
145
146 Output Parameter:
147 . grank - the global rank in the global communicator where the shared memory communicator is built
148
149 Level: developer
150
151 @*/
PetscShmCommLocalToGlobal(PetscShmComm pshmcomm,PetscMPIInt lrank,PetscMPIInt * grank)152 PetscErrorCode PetscShmCommLocalToGlobal(PetscShmComm pshmcomm,PetscMPIInt lrank,PetscMPIInt *grank)
153 {
154 PetscFunctionBegin;
155 if (lrank < 0 || lrank >= pshmcomm->shmsize) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"No rank %D in the shared memory communicator",lrank);
156 *grank = pshmcomm->globranks[lrank];
157 PetscFunctionReturn(0);
158 }
159
160 /*@C
161 PetscShmCommGetMpiShmComm - Returns the MPI communicator that represents all processes with common shared memory
162
163 Input Parameter:
164 . pshmcomm - PetscShmComm object obtained with PetscShmCommGet()
165
166 Output Parameter:
167 . comm - the MPI communicator
168
169 Level: developer
170
171 @*/
PetscShmCommGetMpiShmComm(PetscShmComm pshmcomm,MPI_Comm * comm)172 PetscErrorCode PetscShmCommGetMpiShmComm(PetscShmComm pshmcomm,MPI_Comm *comm)
173 {
174 PetscFunctionBegin;
175 *comm = pshmcomm->shmcomm;
176 PetscFunctionReturn(0);
177 }
178
179 #if defined(PETSC_HAVE_OPENMP_SUPPORT)
180 #include <pthread.h>
181 #include <hwloc.h>
182 #include <omp.h>
183
184 /* Use mmap() to allocate shared mmeory (for the pthread_barrier_t object) if it is available,
185 otherwise use MPI_Win_allocate_shared. They should have the same effect except MPI-3 is much
186 simpler to use. However, on a Cori Haswell node with Cray MPI, MPI-3 worsened a test's performance
187 by 50%. Until the reason is found out, we use mmap() instead.
188 */
189 #define USE_MMAP_ALLOCATE_SHARED_MEMORY
190
191 #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
192 #include <sys/mman.h>
193 #include <sys/types.h>
194 #include <sys/stat.h>
195 #include <fcntl.h>
196 #endif
197
198 struct _n_PetscOmpCtrl {
199 MPI_Comm omp_comm; /* a shared memory communicator to spawn omp threads */
200 MPI_Comm omp_master_comm; /* a communicator to give to third party libraries */
201 PetscMPIInt omp_comm_size; /* size of omp_comm, a kind of OMP_NUM_THREADS */
202 PetscBool is_omp_master; /* rank 0's in omp_comm */
203 MPI_Win omp_win; /* a shared memory window containing a barrier */
204 pthread_barrier_t *barrier; /* pointer to the barrier */
205 hwloc_topology_t topology;
206 hwloc_cpuset_t cpuset; /* cpu bindings of omp master */
207 hwloc_cpuset_t omp_cpuset; /* union of cpu bindings of ranks in omp_comm */
208 };
209
210
211 /* Allocate and initialize a pthread_barrier_t object in memory shared by processes in omp_comm
212 contained by the controller.
213
214 PETSc OpenMP controller users do not call this function directly. This function exists
215 only because we want to separate shared memory allocation methods from other code.
216 */
PetscOmpCtrlCreateBarrier(PetscOmpCtrl ctrl)217 PETSC_STATIC_INLINE PetscErrorCode PetscOmpCtrlCreateBarrier(PetscOmpCtrl ctrl)
218 {
219 PetscErrorCode ierr;
220 MPI_Aint size;
221 void *baseptr;
222 pthread_barrierattr_t attr;
223
224 #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
225 PetscInt fd;
226 PetscChar pathname[PETSC_MAX_PATH_LEN];
227 #else
228 PetscMPIInt disp_unit;
229 #endif
230
231 PetscFunctionBegin;
232 #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
233 size = sizeof(pthread_barrier_t);
234 if (ctrl->is_omp_master) {
235 /* use PETSC_COMM_SELF in PetscGetTmp, since it is a collective call. Using omp_comm would otherwise bcast the partially populated pathname to slaves */
236 ierr = PetscGetTmp(PETSC_COMM_SELF,pathname,PETSC_MAX_PATH_LEN);CHKERRQ(ierr);
237 ierr = PetscStrlcat(pathname,"/petsc-shm-XXXXXX",PETSC_MAX_PATH_LEN);CHKERRQ(ierr);
238 /* mkstemp replaces XXXXXX with a unique file name and opens the file for us */
239 fd = mkstemp(pathname); if (fd == -1) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not create tmp file %s with mkstemp\n", pathname);
240 ierr = ftruncate(fd,size);CHKERRQ(ierr);
241 baseptr = mmap(NULL,size,PROT_READ | PROT_WRITE, MAP_SHARED,fd,0); if (baseptr == MAP_FAILED) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"mmap() failed\n");
242 ierr = close(fd);CHKERRQ(ierr);
243 ierr = MPI_Bcast(pathname,PETSC_MAX_PATH_LEN,MPI_CHAR,0,ctrl->omp_comm);CHKERRQ(ierr);
244 /* this MPI_Barrier is to wait slaves to open the file before master unlinks it */
245 ierr = MPI_Barrier(ctrl->omp_comm);CHKERRQ(ierr);
246 ierr = unlink(pathname);CHKERRQ(ierr);
247 } else {
248 ierr = MPI_Bcast(pathname,PETSC_MAX_PATH_LEN,MPI_CHAR,0,ctrl->omp_comm);CHKERRQ(ierr);
249 fd = open(pathname,O_RDWR); if (fd == -1) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not open tmp file %s\n", pathname);
250 baseptr = mmap(NULL,size,PROT_READ | PROT_WRITE, MAP_SHARED,fd,0); if (baseptr == MAP_FAILED) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"mmap() failed\n");
251 ierr = close(fd);CHKERRQ(ierr);
252 ierr = MPI_Barrier(ctrl->omp_comm);CHKERRQ(ierr);
253 }
254 #else
255 size = ctrl->is_omp_master ? sizeof(pthread_barrier_t) : 0;
256 ierr = MPI_Win_allocate_shared(size,1,MPI_INFO_NULL,ctrl->omp_comm,&baseptr,&ctrl->omp_win);CHKERRQ(ierr);
257 ierr = MPI_Win_shared_query(ctrl->omp_win,0,&size,&disp_unit,&baseptr);CHKERRQ(ierr);
258 #endif
259 ctrl->barrier = (pthread_barrier_t*)baseptr;
260
261 /* omp master initializes the barrier */
262 if (ctrl->is_omp_master) {
263 ierr = MPI_Comm_size(ctrl->omp_comm,&ctrl->omp_comm_size);CHKERRQ(ierr);
264 ierr = pthread_barrierattr_init(&attr);CHKERRQ(ierr);
265 ierr = pthread_barrierattr_setpshared(&attr,PTHREAD_PROCESS_SHARED);CHKERRQ(ierr); /* make the barrier also work for processes */
266 ierr = pthread_barrier_init(ctrl->barrier,&attr,(unsigned int)ctrl->omp_comm_size);CHKERRQ(ierr);
267 ierr = pthread_barrierattr_destroy(&attr);CHKERRQ(ierr);
268 }
269
270 /* this MPI_Barrier is to make sure the omp barrier is initialized before slaves use it */
271 ierr = MPI_Barrier(ctrl->omp_comm);CHKERRQ(ierr);
272 PetscFunctionReturn(0);
273 }
274
275 /* Destroy the pthread barrier in the PETSc OpenMP controller */
PetscOmpCtrlDestroyBarrier(PetscOmpCtrl ctrl)276 PETSC_STATIC_INLINE PetscErrorCode PetscOmpCtrlDestroyBarrier(PetscOmpCtrl ctrl)
277 {
278 PetscErrorCode ierr;
279
280 PetscFunctionBegin;
281 /* this MPI_Barrier is to make sure slaves have finished using the omp barrier before master destroys it */
282 ierr = MPI_Barrier(ctrl->omp_comm);CHKERRQ(ierr);
283 if (ctrl->is_omp_master) { ierr = pthread_barrier_destroy(ctrl->barrier);CHKERRQ(ierr); }
284
285 #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
286 ierr = munmap(ctrl->barrier,sizeof(pthread_barrier_t));CHKERRQ(ierr);
287 #else
288 ierr = MPI_Win_free(&ctrl->omp_win);CHKERRQ(ierr);
289 #endif
290 PetscFunctionReturn(0);
291 }
292
293 /*@C
294 PetscOmpCtrlCreate - create a PETSc OpenMP controller, which manages PETSc's interaction with third party libraries using OpenMP
295
296 Input Parameter:
297 + petsc_comm - a communicator some PETSc object (for example, a matrix) lives in
298 - nthreads - number of threads per MPI rank to spawn in a library using OpenMP. If nthreads = -1, let PETSc decide a suitable value
299
300 Output Parameter:
301 . pctrl - a PETSc OpenMP controller
302
303 Level: developer
304
305 TODO: Possibly use the variable PetscNumOMPThreads to determine the number for threads to use
306
307 .seealso PetscOmpCtrlDestroy()
308 @*/
PetscOmpCtrlCreate(MPI_Comm petsc_comm,PetscInt nthreads,PetscOmpCtrl * pctrl)309 PetscErrorCode PetscOmpCtrlCreate(MPI_Comm petsc_comm,PetscInt nthreads,PetscOmpCtrl *pctrl)
310 {
311 PetscErrorCode ierr;
312 PetscOmpCtrl ctrl;
313 unsigned long *cpu_ulongs=NULL;
314 PetscInt i,nr_cpu_ulongs;
315 PetscShmComm pshmcomm;
316 MPI_Comm shm_comm;
317 PetscMPIInt shm_rank,shm_comm_size,omp_rank,color;
318 PetscInt num_packages,num_cores;
319
320 PetscFunctionBegin;
321 ierr = PetscNew(&ctrl);CHKERRQ(ierr);
322
323 /*=================================================================================
324 Init hwloc
325 ==================================================================================*/
326 ierr = hwloc_topology_init(&ctrl->topology);CHKERRQ(ierr);
327 #if HWLOC_API_VERSION >= 0x00020000
328 /* to filter out unneeded info and have faster hwloc_topology_load */
329 ierr = hwloc_topology_set_all_types_filter(ctrl->topology,HWLOC_TYPE_FILTER_KEEP_NONE);CHKERRQ(ierr);
330 ierr = hwloc_topology_set_type_filter(ctrl->topology,HWLOC_OBJ_CORE,HWLOC_TYPE_FILTER_KEEP_ALL);CHKERRQ(ierr);
331 #endif
332 ierr = hwloc_topology_load(ctrl->topology);CHKERRQ(ierr);
333
334 /*=================================================================================
335 Split petsc_comm into multiple omp_comms. Ranks in an omp_comm have access to
336 physically shared memory. Rank 0 of each omp_comm is called an OMP master, and
337 others are called slaves. OMP Masters make up a new comm called omp_master_comm,
338 which is usually passed to third party libraries.
339 ==================================================================================*/
340
341 /* fetch the stored shared memory communicator */
342 ierr = PetscShmCommGet(petsc_comm,&pshmcomm);CHKERRQ(ierr);
343 ierr = PetscShmCommGetMpiShmComm(pshmcomm,&shm_comm);CHKERRQ(ierr);
344
345 ierr = MPI_Comm_rank(shm_comm,&shm_rank);CHKERRQ(ierr);
346 ierr = MPI_Comm_size(shm_comm,&shm_comm_size);CHKERRQ(ierr);
347
348 /* PETSc decides nthreads, which is the smaller of shm_comm_size or cores per package(socket) */
349 if (nthreads == -1) {
350 num_packages = hwloc_get_nbobjs_by_type(ctrl->topology,HWLOC_OBJ_PACKAGE) <= 0 ? 1 : hwloc_get_nbobjs_by_type(ctrl->topology,HWLOC_OBJ_PACKAGE);
351 num_cores = hwloc_get_nbobjs_by_type(ctrl->topology,HWLOC_OBJ_CORE) <= 0 ? 1 : hwloc_get_nbobjs_by_type(ctrl->topology,HWLOC_OBJ_CORE);
352 nthreads = num_cores/num_packages;
353 if (nthreads > shm_comm_size) nthreads = shm_comm_size;
354 }
355
356 if (nthreads < 1 || nthreads > shm_comm_size) SETERRQ2(petsc_comm,PETSC_ERR_ARG_OUTOFRANGE,"number of OpenMP threads %D can not be < 1 or > the MPI shared memory communicator size %d\n",nthreads,shm_comm_size);
357 if (shm_comm_size % nthreads) { ierr = PetscPrintf(petsc_comm,"Warning: number of OpenMP threads %D is not a factor of the MPI shared memory communicator size %d, which may cause load-imbalance!\n",nthreads,shm_comm_size);CHKERRQ(ierr); }
358
359 /* split shm_comm into a set of omp_comms with each of size nthreads. Ex., if
360 shm_comm_size=16, nthreads=8, then ranks 0~7 get color 0 and ranks 8~15 get
361 color 1. They are put in two omp_comms. Note that petsc_ranks may or may not
362 be consecutive in a shm_comm, but shm_ranks always run from 0 to shm_comm_size-1.
363 Use 0 as key so that rank ordering wont change in new comm.
364 */
365 color = shm_rank / nthreads;
366 ierr = MPI_Comm_split(shm_comm,color,0/*key*/,&ctrl->omp_comm);CHKERRQ(ierr);
367
368 /* put rank 0's in omp_comms (i.e., master ranks) into a new comm - omp_master_comm */
369 ierr = MPI_Comm_rank(ctrl->omp_comm,&omp_rank);CHKERRQ(ierr);
370 if (!omp_rank) {
371 ctrl->is_omp_master = PETSC_TRUE; /* master */
372 color = 0;
373 } else {
374 ctrl->is_omp_master = PETSC_FALSE; /* slave */
375 color = MPI_UNDEFINED; /* to make slaves get omp_master_comm = MPI_COMM_NULL in MPI_Comm_split */
376 }
377 ierr = MPI_Comm_split(petsc_comm,color,0/*key*/,&ctrl->omp_master_comm);CHKERRQ(ierr); /* rank 0 in omp_master_comm is rank 0 in petsc_comm */
378
379 /*=================================================================================
380 Each omp_comm has a pthread_barrier_t in its shared memory, which is used to put
381 slave ranks in sleep and idle their CPU, so that the master can fork OMP threads
382 and run them on the idle CPUs.
383 ==================================================================================*/
384 ierr = PetscOmpCtrlCreateBarrier(ctrl);CHKERRQ(ierr);
385
386 /*=================================================================================
387 omp master logs its cpu binding (i.e., cpu set) and computes a new binding that
388 is the union of the bindings of all ranks in the omp_comm
389 =================================================================================*/
390
391 ctrl->cpuset = hwloc_bitmap_alloc(); if (!ctrl->cpuset) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"hwloc_bitmap_alloc() failed\n");
392 ierr = hwloc_get_cpubind(ctrl->topology,ctrl->cpuset, HWLOC_CPUBIND_PROCESS);CHKERRQ(ierr);
393
394 /* hwloc main developer said they will add new APIs hwloc_bitmap_{nr,to,from}_ulongs in 2.1 to help us simplify the following bitmap pack/unpack code */
395 nr_cpu_ulongs = (hwloc_bitmap_last(hwloc_topology_get_topology_cpuset (ctrl->topology))+sizeof(unsigned long)*8)/sizeof(unsigned long)/8;
396 ierr = PetscMalloc1(nr_cpu_ulongs,&cpu_ulongs);CHKERRQ(ierr);
397 if (nr_cpu_ulongs == 1) {
398 cpu_ulongs[0] = hwloc_bitmap_to_ulong(ctrl->cpuset);
399 } else {
400 for (i=0; i<nr_cpu_ulongs; i++) cpu_ulongs[i] = hwloc_bitmap_to_ith_ulong(ctrl->cpuset,(unsigned)i);
401 }
402
403 ierr = MPI_Reduce(ctrl->is_omp_master ? MPI_IN_PLACE : cpu_ulongs, cpu_ulongs,nr_cpu_ulongs, MPI_UNSIGNED_LONG,MPI_BOR,0,ctrl->omp_comm);CHKERRQ(ierr);
404
405 if (ctrl->is_omp_master) {
406 ctrl->omp_cpuset = hwloc_bitmap_alloc(); if (!ctrl->omp_cpuset) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"hwloc_bitmap_alloc() failed\n");
407 if (nr_cpu_ulongs == 1) {
408 #if HWLOC_API_VERSION >= 0x00020000
409 ierr = hwloc_bitmap_from_ulong(ctrl->omp_cpuset,cpu_ulongs[0]);CHKERRQ(ierr);
410 #else
411 hwloc_bitmap_from_ulong(ctrl->omp_cpuset,cpu_ulongs[0]);
412 #endif
413 } else {
414 for (i=0; i<nr_cpu_ulongs; i++) {
415 #if HWLOC_API_VERSION >= 0x00020000
416 ierr = hwloc_bitmap_set_ith_ulong(ctrl->omp_cpuset,(unsigned)i,cpu_ulongs[i]);CHKERRQ(ierr);
417 #else
418 hwloc_bitmap_set_ith_ulong(ctrl->omp_cpuset,(unsigned)i,cpu_ulongs[i]);
419 #endif
420 }
421 }
422 }
423
424 ierr = PetscFree(cpu_ulongs);CHKERRQ(ierr);
425 *pctrl = ctrl;
426 PetscFunctionReturn(0);
427 }
428
429 /*@C
430 PetscOmpCtrlDestroy - destroy the PETSc OpenMP controller
431
432 Input Parameter:
433 . pctrl - a PETSc OpenMP controller
434
435 Level: developer
436
437 .seealso PetscOmpCtrlCreate()
438 @*/
PetscOmpCtrlDestroy(PetscOmpCtrl * pctrl)439 PetscErrorCode PetscOmpCtrlDestroy(PetscOmpCtrl *pctrl)
440 {
441 PetscErrorCode ierr;
442 PetscOmpCtrl ctrl = *pctrl;
443
444 PetscFunctionBegin;
445 hwloc_bitmap_free(ctrl->cpuset);
446 hwloc_topology_destroy(ctrl->topology);
447 PetscOmpCtrlDestroyBarrier(ctrl);
448 ierr = MPI_Comm_free(&ctrl->omp_comm);CHKERRQ(ierr);
449 if (ctrl->is_omp_master) {
450 hwloc_bitmap_free(ctrl->omp_cpuset);
451 ierr = MPI_Comm_free(&ctrl->omp_master_comm);CHKERRQ(ierr);
452 }
453 ierr = PetscFree(ctrl);CHKERRQ(ierr);
454 PetscFunctionReturn(0);
455 }
456
457 /*@C
458 PetscOmpCtrlGetOmpComms - Get MPI communicators from a PETSc OMP controller
459
460 Input Parameter:
461 . ctrl - a PETSc OMP controller
462
463 Output Parameter:
464 + omp_comm - a communicator that includes a master rank and slave ranks where master spawns threads
465 . omp_master_comm - on master ranks, return a communicator that include master ranks of each omp_comm;
466 on slave ranks, MPI_COMM_NULL will be return in reality.
467 - is_omp_master - true if the calling process is an OMP master rank.
468
469 Notes: any output parameter can be NULL. The parameter is just ignored.
470
471 Level: developer
472 @*/
PetscOmpCtrlGetOmpComms(PetscOmpCtrl ctrl,MPI_Comm * omp_comm,MPI_Comm * omp_master_comm,PetscBool * is_omp_master)473 PetscErrorCode PetscOmpCtrlGetOmpComms(PetscOmpCtrl ctrl,MPI_Comm *omp_comm,MPI_Comm *omp_master_comm,PetscBool *is_omp_master)
474 {
475 PetscFunctionBegin;
476 if (omp_comm) *omp_comm = ctrl->omp_comm;
477 if (omp_master_comm) *omp_master_comm = ctrl->omp_master_comm;
478 if (is_omp_master) *is_omp_master = ctrl->is_omp_master;
479 PetscFunctionReturn(0);
480 }
481
482 /*@C
483 PetscOmpCtrlBarrier - Do barrier on MPI ranks in omp_comm contained by the PETSc OMP controller (to let slave ranks free their CPU)
484
485 Input Parameter:
486 . ctrl - a PETSc OMP controller
487
488 Notes:
489 this is a pthread barrier on MPI processes. Using MPI_Barrier instead is conceptually correct. But MPI standard does not
490 require processes blocked by MPI_Barrier free their CPUs to let other processes progress. In practice, to minilize latency,
491 MPI processes stuck in MPI_Barrier keep polling and do not free CPUs. In contrast, pthread_barrier has this requirement.
492
493 A code using PetscOmpCtrlBarrier() would be like this,
494
495 if (is_omp_master) {
496 PetscOmpCtrlOmpRegionOnMasterBegin(ctrl);
497 Call the library using OpenMP
498 PetscOmpCtrlOmpRegionOnMasterEnd(ctrl);
499 }
500 PetscOmpCtrlBarrier(ctrl);
501
502 Level: developer
503
504 .seealso PetscOmpCtrlOmpRegionOnMasterBegin(), PetscOmpCtrlOmpRegionOnMasterEnd()
505 @*/
PetscOmpCtrlBarrier(PetscOmpCtrl ctrl)506 PetscErrorCode PetscOmpCtrlBarrier(PetscOmpCtrl ctrl)
507 {
508 int err;
509
510 PetscFunctionBegin;
511 err = pthread_barrier_wait(ctrl->barrier);
512 if (err && err != PTHREAD_BARRIER_SERIAL_THREAD) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"pthread_barrier_wait failed within PetscOmpCtrlBarrier with return code %D\n", err);
513 PetscFunctionReturn(0);
514 }
515
516 /*@C
517 PetscOmpCtrlOmpRegionOnMasterBegin - Mark the beginning of an OpenMP library call on master ranks
518
519 Input Parameter:
520 . ctrl - a PETSc OMP controller
521
522 Notes:
523 Only master ranks can call this function. Call PetscOmpCtrlGetOmpComms() to know if this is a master rank.
524 This function changes CPU binding of master ranks and nthreads-var of OpenMP runtime
525
526 Level: developer
527
528 .seealso: PetscOmpCtrlOmpRegionOnMasterEnd()
529 @*/
PetscOmpCtrlOmpRegionOnMasterBegin(PetscOmpCtrl ctrl)530 PetscErrorCode PetscOmpCtrlOmpRegionOnMasterBegin(PetscOmpCtrl ctrl)
531 {
532 PetscErrorCode ierr;
533
534 PetscFunctionBegin;
535 ierr = hwloc_set_cpubind(ctrl->topology,ctrl->omp_cpuset,HWLOC_CPUBIND_PROCESS);CHKERRQ(ierr);
536 omp_set_num_threads(ctrl->omp_comm_size); /* may override the OMP_NUM_THREAD env var */
537 PetscFunctionReturn(0);
538 }
539
540 /*@C
541 PetscOmpCtrlOmpRegionOnMasterEnd - Mark the end of an OpenMP library call on master ranks
542
543 Input Parameter:
544 . ctrl - a PETSc OMP controller
545
546 Notes:
547 Only master ranks can call this function. Call PetscOmpCtrlGetOmpComms() to know if this is a master rank.
548 This function restores the CPU binding of master ranks and set and nthreads-var of OpenMP runtime to 1.
549
550 Level: developer
551
552 .seealso: PetscOmpCtrlOmpRegionOnMasterBegin()
553 @*/
PetscOmpCtrlOmpRegionOnMasterEnd(PetscOmpCtrl ctrl)554 PetscErrorCode PetscOmpCtrlOmpRegionOnMasterEnd(PetscOmpCtrl ctrl)
555 {
556 PetscErrorCode ierr;
557
558 PetscFunctionBegin;
559 ierr = hwloc_set_cpubind(ctrl->topology,ctrl->cpuset,HWLOC_CPUBIND_PROCESS);CHKERRQ(ierr);
560 omp_set_num_threads(1);
561 PetscFunctionReturn(0);
562 }
563
564 #undef USE_MMAP_ALLOCATE_SHARED_MEMORY
565 #endif /* defined(PETSC_HAVE_OPENMP_SUPPORT) */
566