/*
 * Copyright (C) by Argonne National Laboratory
 *     See COPYRIGHT in top-level directory
 */

#include <mpidimpl.h>
#include "mpl_shm.h"
#include "mpidu_shm.h"
#include "mpidu_shm_seg.h"

#include <stdlib.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
#include <errno.h>

#if defined (HAVE_SYSV_SHARED_MEM)
#include <sys/ipc.h>
#include <sys/shm.h>
#endif

#include <stddef.h>
/*
=== BEGIN_MPI_T_CVAR_INFO_BLOCK ===

cvars:
    - name        : MPIR_CVAR_SHM_RANDOM_ADDR_RETRY
      category    : MEMORY
      type        : int
      default     : 100
      class       : none
      verbosity   : MPI_T_VERBOSITY_USER_BASIC
      scope       : MPI_T_SCOPE_GROUP
      description : >-
        The default number of retries for generating a random address. A retry
        involves only local operations.

    - name        : MPIR_CVAR_SHM_SYMHEAP_RETRY
      category    : MEMORY
      type        : int
      default     : 100
      class       : none
      verbosity   : MPI_T_VERBOSITY_USER_BASIC
      scope       : MPI_T_SCOPE_GROUP
      description : >-
        The default number of retries for allocating a symmetric heap in shared
        memory. A retry involves collective communication over the group in
        the shared memory.
=== END_MPI_T_CVAR_INFO_BLOCK ===
*/

enum {
    SYMSHM_SUCCESS,
    SYMSHM_MAP_FAIL,            /* mapping failure when specified with a fixed start addr */
    SYMSHM_OTHER_FAIL           /* other failure reported by MPL shm */
};

/* Linked list used internally to keep track
 * of allocated shared memory segments */
typedef struct seg_list {
    void *key;
    MPIDU_shm_seg_t *shm_seg;
    struct seg_list *next;
} seg_list_t;

static seg_list_t *seg_list_head = NULL;
static seg_list_t *seg_list_tail = NULL;

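/* Round size up to a whole number of pages. If *psz is 0, the system page
 * size is queried and returned through *psz. A usage sketch (illustrative
 * values, assuming 4 KiB pages):
 *
 *     size_t psz = 0;
 *     size_t mapsize = MPIDU_shm_get_mapsize(100, &psz);
 *     // psz == 4096, mapsize == 4096
 */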
size_t MPIDU_shm_get_mapsize(size_t size, size_t * psz)
{
    size_t page_sz, mapsize;

    if (*psz == 0)
        page_sz = (size_t) sysconf(_SC_PAGESIZE);
    else
        page_sz = *psz;

    mapsize = (size + (page_sz - 1)) & (~(page_sz - 1));
    *psz = page_sz;

    return mapsize;
}

#ifdef USE_SYM_HEAP
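/* Probe whether the address range [start, start + size) appears unmapped in
 * this process. Each page is probed with msync(ptr, page_sz, 0): msync fails
 * with ENOMEM when a page is not mapped, so ENOMEM on every page means the
 * range is free. Returns nonzero if the range looks available, 0 if some
 * page is already mapped. */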
static int check_maprange_ok(void *start, size_t size)
{
    int rc = 0;
    int ret = 0;
    size_t page_sz = 0;
    size_t mapsize = MPIDU_shm_get_mapsize(size, &page_sz);
    size_t i, num_pages = mapsize / page_sz;
    char *ptr = (char *) start;

    for (i = 0; i < num_pages; i++) {
        rc = msync(ptr, page_sz, 0);

        if (rc == -1) {
            if (errno != ENOMEM)
                goto fn_fail;
            ptr += page_sz;
        } else
            goto fn_exit;
    }

  fn_fail:
    ret = 1;
  fn_exit:
    return ret;
}

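/* Pick a page-aligned random address as a candidate for a fixed-address
 * mapping, retrying (locally) up to MPIR_CVAR_SHM_RANDOM_ADDR_RETRY times
 * until check_maprange_ok() accepts it. Returns UINTPTR_MAX if no free
 * range is found within the retry budget. */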
static void *generate_random_addr(size_t size)
{
    /* Starting position for the pointer to map. This is not generic and
     * probably only works properly on Linux, but that is not fatal since
     * we bail after a fixed number of iterations.
     */
#define MAP_POINTER ((random_unsigned&((0x00006FFFFFFFFFFF&(~(page_sz-1)))|0x0000600000000000)))
    uintptr_t map_pointer;
    char random_state[256];
    size_t page_sz = 0;
    uint64_t random_unsigned;
    size_t mapsize = MPIDU_shm_get_mapsize(size, &page_sz);
    MPL_time_t ts;
    unsigned int ts_32 = 0;
    int iter = MPIR_CVAR_SHM_RANDOM_ADDR_RETRY;
    int32_t rh, rl;
    struct random_data rbuf;

    /* rbuf must be zero-cleared, otherwise it results in SIGSEGV in glibc
     * (http://stackoverflow.com/questions/4167034/c-initstate-r-crashing) */
    memset(&rbuf, 0, sizeof(rbuf));

    MPL_wtime(&ts);
    MPL_wtime_touint(&ts, &ts_32);

    initstate_r(ts_32, random_state, sizeof(random_state), &rbuf);
    random_r(&rbuf, &rh);
    random_r(&rbuf, &rl);
    random_unsigned = ((uint64_t) rh) << 32 | (uint64_t) rl;
    map_pointer = MAP_POINTER;

    while (check_maprange_ok((void *) map_pointer, mapsize) == 0) {
        random_r(&rbuf, &rh);
        random_r(&rbuf, &rl);
        random_unsigned = ((uint64_t) rh) << 32 | (uint64_t) rl;
        map_pointer = MAP_POINTER;
        iter--;

        if (iter == 0) {
            map_pointer = UINTPTR_MAX;
            goto fn_exit;
        }
    }

  fn_exit:
    return (void *) map_pointer;
}

typedef struct {
    unsigned long long sz;
    int loc;
} ull_maxloc_t;

/* Compute maxloc for the unsigned long long type.
 * If more than one max value exists, the loc with the lower rank is returned. */
static void ull_maxloc_op_func(void *invec, void *inoutvec, int *len, MPI_Datatype * datatype)
{
    ull_maxloc_t *inmaxloc = (ull_maxloc_t *) invec;
    ull_maxloc_t *outmaxloc = (ull_maxloc_t *) inoutvec;
    int i;
    for (i = 0; i < *len; i++) {
        if (inmaxloc[i].sz > outmaxloc[i].sz) {
            outmaxloc[i].sz = inmaxloc[i].sz;
            outmaxloc[i].loc = inmaxloc[i].loc;
        } else if (inmaxloc[i].sz == outmaxloc[i].sz && inmaxloc[i].loc < outmaxloc[i].loc)
            outmaxloc[i].loc = inmaxloc[i].loc;
    }
}

/* Allreduce MAXLOC for an unsigned size type using a user-defined operator
 * and a derived datatype. We have to customize it because the standard MAXLOC
 * supports only pairtypes with signed {short, int, long}. We internally
 * cast size_t to unsigned long long, which is large enough to hold the size
 * type and matches an MPI basic datatype. */
static int allreduce_maxloc(size_t mysz, int myloc, MPIR_Comm * comm, size_t * maxsz,
                            int *maxsz_loc)
{
    int mpi_errno = MPI_SUCCESS;
    int blocks[2] = { 1, 1 };
    MPI_Aint disps[2];
    MPI_Datatype types[2], maxloc_type = MPI_DATATYPE_NULL;
    MPI_Op maxloc_op = MPI_OP_NULL;
    ull_maxloc_t maxloc, maxloc_result;
    MPIR_Errflag_t errflag = MPIR_ERR_NONE;

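    /* Build a struct datatype matching ull_maxloc_t: an unsigned long long
     * followed by an int. disps[1] is the byte offset of the loc field,
     * computed via pointer arithmetic (equivalent to
     * offsetof(ull_maxloc_t, loc)). */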
    types[0] = MPI_UNSIGNED_LONG_LONG;
    types[1] = MPI_INT;
    disps[0] = 0;
    disps[1] = (MPI_Aint) ((uintptr_t) & maxloc.loc - (uintptr_t) & maxloc.sz);

    mpi_errno = MPIR_Type_create_struct_impl(2, blocks, disps, types, &maxloc_type);
    MPIR_ERR_CHECK(mpi_errno);

    mpi_errno = MPIR_Type_commit_impl(&maxloc_type);
    MPIR_ERR_CHECK(mpi_errno);

    mpi_errno = MPIR_Op_create_impl(ull_maxloc_op_func, 0, &maxloc_op);
    MPIR_ERR_CHECK(mpi_errno);

    maxloc.sz = (unsigned long long) mysz;
    maxloc.loc = myloc;

    mpi_errno = MPIR_Allreduce(&maxloc, &maxloc_result, 1, maxloc_type, maxloc_op, comm, &errflag);
    MPIR_ERR_CHECK(mpi_errno);

    *maxsz_loc = maxloc_result.loc;
    *maxsz = (size_t) maxloc_result.sz;

  fn_exit:
    if (maxloc_type != MPI_DATATYPE_NULL)
        MPIR_Type_free_impl(&maxloc_type);
    if (maxloc_op != MPI_OP_NULL)
        MPIR_Op_free_impl(&maxloc_op);
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}

/* Collectively allocate a symmetric shared memory region whose start address
 * is given by base_addr. MPL_shm routines and MPI collectives are used internally.
 *
 * The caller should ensure segment_len is page aligned and base_addr
 * is a symmetric non-NULL address on all processes.
 *
 * map_result indicates the mapping result on this node. It can be
 * SYMSHM_SUCCESS, SYMSHM_MAP_FAIL or SYMSHM_OTHER_FAIL.
 * If it is SYMSHM_MAP_FAIL, the caller can try again with a different
 * start address; if it is SYMSHM_OTHER_FAIL, it usually means no more shm
 * segments can be allocated, thus the caller should stop retrying. */
static int map_symm_shm(MPIR_Comm * shm_comm_ptr, MPIDU_shm_seg_t * shm_seg, int *map_result_ptr)
{
    int mpi_errno = MPI_SUCCESS, mpl_err = MPL_SUCCESS;
    int all_map_result = SYMSHM_MAP_FAIL;
    bool mapped_flag = false;
    MPIR_Errflag_t errflag = MPIR_ERR_NONE;

    if (shm_seg->segment_len > 0) {
        if (shm_comm_ptr) {
            if (shm_comm_ptr->rank == 0) {
                char *serialized_hnd = NULL;

                /* try to map at the specified symmetric address */
                mpl_err = MPL_shm_fixed_seg_create_and_attach(shm_seg->hnd, shm_seg->segment_len,
                                                              (void **) &(shm_seg->base_addr), 0);
                if (mpl_err) {
                    *map_result_ptr =
                        (mpl_err == MPL_ERR_SHM_INVAL) ? SYMSHM_MAP_FAIL : SYMSHM_OTHER_FAIL;
                    goto root_sync;
                } else
                    mapped_flag = true;

                mpl_err = MPL_shm_hnd_get_serialized_by_ref(shm_seg->hnd, &serialized_hnd);
                if (mpl_err) {
                    *map_result_ptr = SYMSHM_OTHER_FAIL;
                }

              root_sync:
                /* broadcast the mapping result from rank 0 */
                mpi_errno = MPIR_Bcast(map_result_ptr, 1, MPI_INT, 0, shm_comm_ptr, &errflag);
                MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");

                if (*map_result_ptr != SYMSHM_SUCCESS)
                    goto map_fail;

                mpi_errno =
                    MPIR_Bcast(serialized_hnd, MPL_SHM_GHND_SZ, MPI_BYTE, 0,
                               shm_comm_ptr, &errflag);
                MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");

            } else {
                char serialized_hnd[MPL_SHM_GHND_SZ] = { 0 };

                /* receive the mapping result from rank 0 */
                mpi_errno = MPIR_Bcast(map_result_ptr, 1, MPI_INT, 0, shm_comm_ptr, &errflag);
                MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");

                if (*map_result_ptr != SYMSHM_SUCCESS)
                    goto map_fail;

                /* if rank 0 mapped successfully, the others on the node attach the shared memory region */

                /* get the serialized handle from rank 0 and deserialize it */
                mpi_errno =
                    MPIR_Bcast(serialized_hnd, MPL_SHM_GHND_SZ, MPI_BYTE, 0,
                               shm_comm_ptr, &errflag);
                MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");

                mpl_err =
                    MPL_shm_hnd_deserialize(shm_seg->hnd, serialized_hnd, strlen(serialized_hnd));
                if (mpl_err) {
                    *map_result_ptr = SYMSHM_OTHER_FAIL;
                    goto result_sync;
                }

                /* try to attach at the specified symmetric address */
                mpl_err = MPL_shm_fixed_seg_attach(shm_seg->hnd, shm_seg->segment_len,
                                                   (void **) &(shm_seg->base_addr), 0);
                if (mpl_err) {
                    *map_result_ptr =
                        (mpl_err == MPL_ERR_SHM_INVAL) ? SYMSHM_MAP_FAIL : SYMSHM_OTHER_FAIL;
                } else
                    mapped_flag = true;
            }

          result_sync:
            /* check the results of all processes. If any failure happened (max result > 0),
             * return SYMSHM_OTHER_FAIL if anyone reports it (max result == 2),
             * otherwise return SYMSHM_MAP_FAIL (max result == 1). */
            mpi_errno = MPIR_Allreduce(map_result_ptr, &all_map_result, 1, MPI_INT,
                                       MPI_MAX, shm_comm_ptr, &errflag);
            MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");

            if (all_map_result != SYMSHM_SUCCESS)
                goto map_fail;

            if (shm_comm_ptr->rank == 0) {
                /* unlink the shared memory region so it gets deleted when all processes exit */
                mpl_err = MPL_shm_seg_remove(shm_seg->hnd);
                MPIR_ERR_CHKANDJUMP(mpl_err, mpi_errno, MPI_ERR_OTHER, "**remove_shar_mem");
            }
        } else {
            /* if there is only one process on this node, don't use shared memory */
            int rc = check_maprange_ok(shm_seg->base_addr, shm_seg->segment_len);
            if (rc) {
                shm_seg->base_addr = MPL_mmap(shm_seg->base_addr, shm_seg->segment_len,
                                              PROT_READ | PROT_WRITE,
                                              MAP_PRIVATE | MAP_ANON | MAP_FIXED, -1, 0,
                                              MPL_MEM_SHM);
            } else {
                shm_seg->base_addr = (void *) MAP_FAILED;
            }
            *map_result_ptr = (shm_seg->base_addr == (void *) MAP_FAILED) ?
                SYMSHM_MAP_FAIL : SYMSHM_SUCCESS;
        }
    }

  fn_exit:
    return mpi_errno;
  map_fail:
    if (mapped_flag) {
        /* destroy the successfully mapped shm segment */
        mpl_err =
            MPL_shm_seg_detach(shm_seg->hnd, (void **) &(shm_seg->base_addr), shm_seg->segment_len);
        MPIR_ERR_CHKANDJUMP(mpl_err, mpi_errno, MPI_ERR_OTHER, "**detach_shar_mem");
    }
    *map_result_ptr = all_map_result;
    goto fn_exit;
  fn_fail:
    goto fn_exit;
}

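/* Undo a successful map_symm_shm(): detach the MPL shm segment when multiple
 * processes share the node, or munmap the private anonymous mapping in the
 * single-process case. */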
static int unmap_symm_shm(MPIR_Comm * shm_comm_ptr, MPIDU_shm_seg_t * shm_seg)
{
    int mpi_errno = MPI_SUCCESS, mpl_err = MPL_SUCCESS;

    if (shm_comm_ptr != NULL) {
        /* detach the mapped shm segment */
        mpl_err =
            MPL_shm_seg_detach(shm_seg->hnd, (void **) &(shm_seg->base_addr), shm_seg->segment_len);
        MPIR_ERR_CHKANDJUMP(mpl_err, mpi_errno, MPI_ERR_OTHER, "**detach_shar_mem");
    } else {
        MPL_munmap(shm_seg->base_addr, shm_seg->segment_len, MPL_MEM_SHM);
    }

  fn_exit:
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}

/* Allocate symmetric shared memory across all processes in comm */
static int shm_alloc_symm_all(MPIR_Comm * comm_ptr, size_t offset, MPIDU_shm_seg_t * shm_seg,
                              bool * fail_flag)
{
    int mpi_errno = MPI_SUCCESS;
    int rank = comm_ptr->rank;
    int all_map_result = SYMSHM_MAP_FAIL;
    int iter = MPIR_CVAR_SHM_SYMHEAP_RETRY;
    int maxsz_loc = 0;
    size_t maxsz = 0;
    char *map_pointer = NULL;
    MPIR_Comm *shm_comm_ptr = comm_ptr->node_comm;
    MPIR_Errflag_t errflag = MPIR_ERR_NONE;

    /* We let the process with the largest amount of requested memory calculate
     * the random address. This should reduce the number of attempts at allocating
     * symmetric shared memory, as the other processes are more likely to accept
     * the returned pointer when mapping memory into their address space. */
    mpi_errno = allreduce_maxloc(shm_seg->segment_len, rank, comm_ptr, &maxsz, &maxsz_loc);
    if (mpi_errno != MPI_SUCCESS)
        goto fn_fail;

    if (maxsz == 0)
        goto fn_exit;

    while (all_map_result == SYMSHM_MAP_FAIL && iter-- > 0) {
        int map_result = SYMSHM_SUCCESS;

        /* rank maxsz_loc in comm generates the random address */
        if (comm_ptr->rank == maxsz_loc)
            map_pointer = generate_random_addr(shm_seg->segment_len);

        /* broadcast the fixed address to the other processes in comm */
        mpi_errno =
            MPIR_Bcast(&map_pointer, sizeof(char *), MPI_CHAR, maxsz_loc, comm_ptr, &errflag);
        MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");

        /* optimization: make sure every process's memory in the shared segment is mapped
         * at the same virtual address in the corresponding address space. Example with
         * 4 processes: if after calling MPIDU_shm_alloc_symm_all() process 0 gets addr
         * 0x4000, process 1 will get 0x3000, process 2 will get 0x2000, and process 3
         * will get 0x1000. This way all processes have their own memory inside the shm
         * segment starting at addr 0x4000. Processes on other nodes will also have the
         * same address. */
        shm_seg->base_addr = map_pointer - offset;

        /* try mapping symmetric memory */
        mpi_errno = map_symm_shm(shm_comm_ptr, shm_seg, &map_result);
        MPIR_ERR_CHECK(mpi_errno);

        /* check whether any mapping failure occurred */
        mpi_errno = MPIR_Allreduce(&map_result, &all_map_result, 1, MPI_INT,
                                   MPI_MAX, comm_ptr, &errflag);
        MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");

        /* clean up the local shm segment if mapping failed on another process */
        if (all_map_result != SYMSHM_SUCCESS && map_result == SYMSHM_SUCCESS &&
            shm_seg->segment_len > 0) {
            mpi_errno = unmap_symm_shm(shm_comm_ptr, shm_seg);
            MPIR_ERR_CHECK(mpi_errno);
        }
    }

    if (all_map_result != SYMSHM_SUCCESS) {
        /* if allocation failed, return and let the caller choose another method */
        *fail_flag = true;
    }

  fn_exit:
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}
#endif /* end of USE_SYM_HEAP */

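/* Allocate a (non-symmetric) shared memory segment over shm_comm_ptr. Rank 0
 * creates and attaches the segment and broadcasts the serialized MPL shm
 * handle (an all-zero handle signals failure); the other ranks attach, and
 * an allreduce confirms that every rank succeeded before rank 0 unlinks the
 * segment. On any failure *fail_flag is set and local mappings are undone. */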
static int shm_alloc(MPIR_Comm * shm_comm_ptr, MPIDU_shm_seg_t * shm_seg, bool * fail_flag)
{
    int mpi_errno = MPI_SUCCESS, mpl_err = MPL_SUCCESS;
    bool shm_fail_flag = false;
    bool any_shm_fail_flag = false;
    bool mapped_flag = false;
    MPIR_Errflag_t errflag = MPIR_ERR_NONE;

    if (shm_comm_ptr->rank == 0) {
        char *serialized_hnd = NULL;
        char mpl_err_hnd[MPL_SHM_GHND_SZ] = { 0 };

        /* rank 0 prepares the shm segment */
        mpl_err = MPL_shm_seg_create_and_attach(shm_seg->hnd, shm_seg->segment_len,
                                                (void **) &(shm_seg->base_addr), 0);
        if (mpl_err != MPL_SUCCESS) {
            shm_fail_flag = true;
            goto hnd_sync;
        } else
            mapped_flag = true;

        mpl_err = MPL_shm_hnd_get_serialized_by_ref(shm_seg->hnd, &serialized_hnd);
        if (mpl_err != MPL_SUCCESS)
            shm_fail_flag = true;

      hnd_sync:
        if (shm_fail_flag)
            serialized_hnd = &mpl_err_hnd[0];

        mpi_errno =
            MPIR_Bcast_impl(serialized_hnd, MPL_SHM_GHND_SZ, MPI_BYTE, 0, shm_comm_ptr, &errflag);
        MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");

        if (shm_fail_flag)
            goto map_fail;

        /* ensure all other processes have mapped successfully */
        mpi_errno = MPIR_Allreduce_impl(&shm_fail_flag, &any_shm_fail_flag, 1, MPI_C_BOOL,
                                        MPI_LOR, shm_comm_ptr, &errflag);
        MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");

        /* unlink the shared memory region so it gets deleted when all processes exit */
        mpl_err = MPL_shm_seg_remove(shm_seg->hnd);
        MPIR_ERR_CHKANDJUMP(mpl_err, mpi_errno, MPI_ERR_OTHER, "**remove_shar_mem");

        if (any_shm_fail_flag)
            goto map_fail;

    } else {
        char serialized_hnd[MPL_SHM_GHND_SZ] = { 0 };

        /* get the serialized handle from rank 0 and deserialize it */
        mpi_errno = MPIR_Bcast_impl(serialized_hnd, MPL_SHM_GHND_SZ, MPI_BYTE, 0,
                                    shm_comm_ptr, &errflag);
        MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");

        /* an empty handle means rank 0 failed */
        if (strlen(serialized_hnd) == 0)
            goto map_fail;

        mpl_err = MPL_shm_hnd_deserialize(shm_seg->hnd, serialized_hnd, strlen(serialized_hnd));
        if (mpl_err != MPL_SUCCESS) {
            shm_fail_flag = true;
            goto result_sync;
        }

        mpl_err = MPL_shm_seg_attach(shm_seg->hnd, shm_seg->segment_len,
                                     (void **) &shm_seg->base_addr, 0);
        if (mpl_err != MPL_SUCCESS) {
            shm_fail_flag = true;
            goto result_sync;
        } else
            mapped_flag = true;

      result_sync:
        mpi_errno = MPIR_Allreduce_impl(&shm_fail_flag, &any_shm_fail_flag, 1, MPI_C_BOOL,
                                        MPI_LOR, shm_comm_ptr, &errflag);
        MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");

        if (any_shm_fail_flag)
            goto map_fail;
    }

  fn_exit:
    return mpi_errno;
  map_fail:
    if (mapped_flag) {
        /* destroy the successfully mapped shm segment */
        mpl_err =
            MPL_shm_seg_detach(shm_seg->hnd, (void **) &(shm_seg->base_addr), shm_seg->segment_len);
        MPIR_ERR_CHKANDJUMP(mpl_err, mpi_errno, MPI_ERR_OTHER, "**detach_shar_mem");
    }
    *fail_flag = true;
    goto fn_exit;
  fn_fail:
    goto fn_exit;
}

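/* Public entry point: collectively allocate len bytes of shared memory over
 * comm_ptr such that *ptr + offset is the same virtual address on every
 * process. On success the segment is tracked in the internal list for a
 * later MPIDU_shm_free(); if no symmetric mapping can be found (or
 * USE_SYM_HEAP is not defined), *fail_flag is set and the caller should
 * fall back to another allocation method. */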
int MPIDU_shm_alloc_symm_all(MPIR_Comm * comm_ptr, size_t len, size_t offset, void **ptr,
                             bool * fail_flag)
{
    int mpi_errno = MPI_SUCCESS;
#ifdef USE_SYM_HEAP
    int mpl_err = MPL_SUCCESS;
    MPIDU_shm_seg_t *shm_seg = NULL;
    seg_list_t *el = NULL;
    MPIR_CHKPMEM_DECL(2);

    *ptr = NULL;

    MPIR_CHKPMEM_MALLOC(shm_seg, MPIDU_shm_seg_t *, sizeof(*shm_seg), mpi_errno, "shm_seg_handle",
                        MPL_MEM_OTHER);
    MPIR_CHKPMEM_MALLOC(el, seg_list_t *, sizeof(*el), mpi_errno,
                        "seg_list_element", MPL_MEM_OTHER);

    mpl_err = MPL_shm_hnd_init(&(shm_seg->hnd));
    MPIR_ERR_CHKANDJUMP(mpl_err, mpi_errno, MPI_ERR_OTHER, "**alloc_shar_mem");

    shm_seg->segment_len = len;

    mpi_errno = shm_alloc_symm_all(comm_ptr, offset, shm_seg, fail_flag);
    if (mpi_errno || *fail_flag)
        goto fn_fail;

    if (len == 0) {
        /* the process requested no memory; clean up and return */
        MPL_shm_seg_remove(shm_seg->hnd);
        MPL_shm_hnd_finalize(&(shm_seg->hnd));
        MPIR_CHKPMEM_REAP();
        goto fn_exit;
    }

    *ptr = shm_seg->base_addr;

    /* store the shm_seg handle in the linked list for later retrieval */
    el->key = *ptr;
    el->shm_seg = shm_seg;
    LL_APPEND(seg_list_head, seg_list_tail, el);

    MPIR_CHKPMEM_COMMIT();

  fn_exit:
    return mpi_errno;
  fn_fail:
    /* --BEGIN ERROR HANDLING-- */
    if (shm_seg) {
        MPL_shm_seg_remove(shm_seg->hnd);
        MPL_shm_hnd_finalize(&(shm_seg->hnd));
    }
    MPIR_CHKPMEM_REAP();
    goto fn_exit;
    /* --END ERROR HANDLING-- */
#else
    /* symmetric heaps are not supported: always fail and let the caller
     * choose another method */
    *fail_flag = true;
    return mpi_errno;
#endif /* end of USE_SYM_HEAP */
}

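/* Allocate a plain shared memory segment of len bytes over shm_comm_ptr (no
 * symmetric-address guarantee). A minimal usage sketch, assuming a valid
 * node-level communicator node_comm (illustrative name):
 *
 *     void *buf = NULL;
 *     bool failed = false;
 *     mpi_errno = MPIDU_shm_alloc(node_comm, 4096, &buf, &failed);
 *     if (mpi_errno == MPI_SUCCESS && !failed) {
 *         ... use buf ...
 *         mpi_errno = MPIDU_shm_free(buf);
 *     }
 */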
int MPIDU_shm_alloc(MPIR_Comm * shm_comm_ptr, size_t len, void **ptr, bool * fail_flag)
{
    int mpi_errno = MPI_SUCCESS, mpl_err = MPL_SUCCESS;
    MPIDU_shm_seg_t *shm_seg = NULL;
    seg_list_t *el = NULL;
    MPIR_CHKPMEM_DECL(2);

    *ptr = NULL;

    MPIR_Assert(shm_comm_ptr != NULL);
    MPIR_Assert(len > 0);

    MPIR_CHKPMEM_MALLOC(shm_seg, MPIDU_shm_seg_t *, sizeof(*shm_seg), mpi_errno, "shm_seg_handle",
                        MPL_MEM_OTHER);
    MPIR_CHKPMEM_MALLOC(el, seg_list_t *, sizeof(*el), mpi_errno,
                        "seg_list_element", MPL_MEM_OTHER);

    mpl_err = MPL_shm_hnd_init(&(shm_seg->hnd));
    MPIR_ERR_CHKANDJUMP(mpl_err, mpi_errno, MPI_ERR_OTHER, "**alloc_shar_mem");

    shm_seg->segment_len = len;

    mpi_errno = shm_alloc(shm_comm_ptr, shm_seg, fail_flag);
    if (mpi_errno || *fail_flag)
        goto fn_fail;

    *ptr = shm_seg->base_addr;

    /* store the shm_seg handle in the linked list for later retrieval */
    el->key = *ptr;
    el->shm_seg = shm_seg;
    LL_APPEND(seg_list_head, seg_list_tail, el);

    MPIR_CHKPMEM_COMMIT();

  fn_exit:
    return mpi_errno;
  fn_fail:
    /* --BEGIN ERROR HANDLING-- */
    if (shm_seg) {
        MPL_shm_seg_remove(shm_seg->hnd);
        MPL_shm_hnd_finalize(&(shm_seg->hnd));
    }
    MPIR_CHKPMEM_REAP();
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}

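/* Free a segment previously returned by MPIDU_shm_alloc() or
 * MPIDU_shm_alloc_symm_all(). The pointer is looked up in the internal list
 * to recover the segment handle; passing any other pointer trips the
 * assertion below. */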
int MPIDU_shm_free(void *ptr)
{
    int mpi_errno = MPI_SUCCESS, mpl_err = MPL_SUCCESS;
    MPIDU_shm_seg_t *shm_seg = NULL;
    seg_list_t *el = NULL;

    /* retrieve the memory handle for the base address */
    LL_FOREACH(seg_list_head, el) {
        if (el->key == ptr) {
            shm_seg = el->shm_seg;
            LL_DELETE(seg_list_head, seg_list_tail, el);
            MPL_free(el);
            break;
        }
    }

    MPIR_Assert(shm_seg != NULL);

    /* if there is only one process on the node, the serialized handle points
     * to NULL as there is no shared file backing the memory. This is used to
     * differentiate between shared memory and private memory allocations
     * when symmetric shared memory is being requested. */
    char *serialized_hnd = NULL;
    MPL_shm_hnd_get_serialized_by_ref(shm_seg->hnd, &serialized_hnd);

    if (serialized_hnd) {
        mpl_err = MPL_shm_seg_detach(shm_seg->hnd, (void **) &(shm_seg->base_addr),
                                     shm_seg->segment_len);
        MPIR_ERR_CHKANDJUMP(mpl_err, mpi_errno, MPI_ERR_OTHER, "**detach_shar_mem");
    } else {
        MPL_munmap(shm_seg->base_addr, shm_seg->segment_len, MPL_MEM_SHM);
    }

  fn_exit:
    MPL_shm_hnd_finalize(&(shm_seg->hnd));
    MPL_free(shm_seg);
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}