1 /*
2 * Copyright (C) by Argonne National Laboratory
3 * See COPYRIGHT in top-level directory
4 */
5
6 #include <mpidimpl.h>
7 #include "mpl_shm.h"
8 #include "mpidu_shm.h"
9 #include "mpidu_shm_seg.h"
10
11 #include <stdlib.h>
12 #ifdef HAVE_UNISTD_H
13 #include <unistd.h>
14 #endif
15 #include <errno.h>
16
17 #if defined (HAVE_SYSV_SHARED_MEM)
18 #include <sys/ipc.h>
19 #include <sys/shm.h>
20 #endif
21
22 #include <stddef.h>
23
24 /*
25 === BEGIN_MPI_T_CVAR_INFO_BLOCK ===
26
27 cvars:
28 - name : MPIR_CVAR_SHM_RANDOM_ADDR_RETRY
29 category : MEMORY
30 type : int
31 default : 100
32 class : none
33 verbosity : MPI_T_VERBOSITY_USER_BASIC
34 scope : MPI_T_SCOPE_GROUP
35 description : >-
        The default number of retries for generating a random address. Each retry
        involves only local operations.
38
39 - name : MPIR_CVAR_SHM_SYMHEAP_RETRY
40 category : MEMORY
41 type : int
42 default : 100
43 class : none
44 verbosity : MPI_T_VERBOSITY_USER_BASIC
45 scope : MPI_T_SCOPE_GROUP
46 description : >-
        The default number of retries for allocating a symmetric heap in shared
        memory. Each retry involves collective communication over the group in
        the shared memory.
50 === END_MPI_T_CVAR_INFO_BLOCK ===
51 */
52
/* Result codes for a symmetric shm mapping attempt.  Ordered by severity:
 * the code combines per-process results with an MPI_MAX allreduce (see
 * map_symm_shm), so the largest value is the worst outcome in the group. */
enum {
    SYMSHM_SUCCESS,
    SYMSHM_MAP_FAIL,            /* mapping failure when specified with a fixed start addr */
    SYMSHM_OTHER_FAIL           /* other failure reported by MPL shm */
};
58
/* Linked list internally used to keep track
 * of allocated shared memory segments */
typedef struct seg_list {
    void *key;                  /* base address returned to the user; lookup key in MPIDU_shm_free */
    MPIDU_shm_seg_t *shm_seg;   /* segment descriptor owning the MPL shm handle */
    struct seg_list *next;
} seg_list_t;

/* Process-global head/tail of the segment list.
 * NOTE(review): no locking here — appears to rely on external serialization
 * of the MPIDU_shm_* entry points; confirm against callers. */
static seg_list_t *seg_list_head = NULL;
static seg_list_t *seg_list_tail = NULL;
69
/* Round "size" up to a whole number of pages.
 *
 * On entry *psz is either the page size to use, or 0 to query the system
 * page size via sysconf(_SC_PAGESIZE).  On exit *psz always holds the page
 * size actually used.  Returns the page-aligned mapping size. */
size_t MPIDU_shm_get_mapsize(size_t size, size_t * psz)
{
    size_t page_sz = (*psz != 0) ? *psz : (size_t) sysconf(_SC_PAGESIZE);

    *psz = page_sz;

    /* page size is a power of two, so masking rounds up to a page boundary */
    return (size + page_sz - 1) & ~(page_sz - 1);
}
84
85 #ifdef USE_SYM_HEAP
/* Probe whether [start, start + size) is currently unmapped and therefore
 * usable as a fixed-address mapping target.  Each page is probed with
 * msync(): failure with ENOMEM means the page is not mapped (good); success
 * means the page is already in use.
 *
 * Returns 1 if the range looks available, 0 if some page is already mapped.
 * NOTE(review): a non-ENOMEM msync error also yields 1, matching the
 * original control flow (fn_fail set ret = 1) — confirm that is intended. */
static int check_maprange_ok(void *start, size_t size)
{
    size_t page_sz = 0;
    size_t mapsize = MPIDU_shm_get_mapsize(size, &page_sz);
    size_t i, num_pages = mapsize / page_sz;
    char *probe = (char *) start;

    for (i = 0; i < num_pages; i++) {
        if (msync(probe, page_sz, 0) != -1)
            return 0;   /* page already mapped: range is not usable */
        if (errno != ENOMEM)
            return 1;   /* unexpected msync error: same result as original fn_fail */
        probe += page_sz;
    }

    return 1;   /* every probed page was unmapped */
}
111
generate_random_addr(size_t size)112 static void *generate_random_addr(size_t size)
113 {
114 /* starting position for pointer to map
115 * This is not generic, probably only works properly on Linux
116 * but it's not fatal since we bail after a fixed number of iterations
117 */
118 #define MAP_POINTER ((random_unsigned&((0x00006FFFFFFFFFFF&(~(page_sz-1)))|0x0000600000000000)))
119 uintptr_t map_pointer;
120 char random_state[256];
121 size_t page_sz = 0;
122 uint64_t random_unsigned;
123 size_t mapsize = MPIDU_shm_get_mapsize(size, &page_sz);
124 MPL_time_t ts;
125 unsigned int ts_32 = 0;
126 int iter = MPIR_CVAR_SHM_RANDOM_ADDR_RETRY;
127 int32_t rh, rl;
128 struct random_data rbuf;
129
130 /* rbuf must be zero-cleared otherwise it results in SIGSEGV in glibc
131 * (http://stackoverflow.com/questions/4167034/c-initstate-r-crashing) */
132 memset(&rbuf, 0, sizeof(rbuf));
133
134 MPL_wtime(&ts);
135 MPL_wtime_touint(&ts, &ts_32);
136
137 initstate_r(ts_32, random_state, sizeof(random_state), &rbuf);
138 random_r(&rbuf, &rh);
139 random_r(&rbuf, &rl);
140 random_unsigned = ((uint64_t) rh) << 32 | (uint64_t) rl;
141 map_pointer = MAP_POINTER;
142
143 while (check_maprange_ok((void *) map_pointer, mapsize) == 0) {
144 random_r(&rbuf, &rh);
145 random_r(&rbuf, &rl);
146 random_unsigned = ((uint64_t) rh) << 32 | (uint64_t) rl;
147 map_pointer = MAP_POINTER;
148 iter--;
149
150 if (iter == 0) {
151 map_pointer = UINTPTR_MAX;
152 goto fn_exit;
153 }
154 }
155
156 fn_exit:
157 return (void *) map_pointer;
158 }
159
/* Value/location pair reduced by ull_maxloc_op_func.  Mirrors MPI's MAXLOC
 * pair types but with an unsigned 64-bit value member, which standard
 * MPI_MAXLOC pairtypes do not offer. */
typedef struct {
    unsigned long long sz;      /* segment length requested by a rank */
    int loc;                    /* rank that requested sz */
} ull_maxloc_t;
164
165 /* Compute maxloc for unsigned long long type.
166 * If more than one max value exists, the loc with lower rank is returned. */
ull_maxloc_op_func(void * invec,void * inoutvec,int * len,MPI_Datatype * datatype)167 static void ull_maxloc_op_func(void *invec, void *inoutvec, int *len, MPI_Datatype * datatype)
168 {
169 ull_maxloc_t *inmaxloc = (ull_maxloc_t *) invec;
170 ull_maxloc_t *outmaxloc = (ull_maxloc_t *) inoutvec;
171 int i;
172 for (i = 0; i < *len; i++) {
173 if (inmaxloc->sz > outmaxloc->sz) {
174 outmaxloc->sz = inmaxloc->sz;
175 outmaxloc->loc = inmaxloc->loc;
176 } else if (inmaxloc->sz == outmaxloc->sz && inmaxloc->loc < outmaxloc->loc)
177 outmaxloc->loc = inmaxloc->loc;
178 }
179 }
180
181 /* Allreduce MAXLOC for unsigned size type by using user defined operator
182 * and derived datatype. We have to customize it because standard MAXLOC
183 * supports only pairtypes with signed {short, int, long}. We internally
184 * cast size_t to unsigned long long which is large enough to hold size type
185 * and matches an MPI basic datatype. */
allreduce_maxloc(size_t mysz,int myloc,MPIR_Comm * comm,size_t * maxsz,int * maxsz_loc)186 static int allreduce_maxloc(size_t mysz, int myloc, MPIR_Comm * comm, size_t * maxsz,
187 int *maxsz_loc)
188 {
189 int mpi_errno = MPI_SUCCESS;
190 int blocks[2] = { 1, 1 };
191 MPI_Aint disps[2];
192 MPI_Datatype types[2], maxloc_type = MPI_DATATYPE_NULL;
193 MPI_Op maxloc_op = MPI_OP_NULL;
194 ull_maxloc_t maxloc, maxloc_result;
195 MPIR_Errflag_t errflag = MPIR_ERR_NONE;
196
197 types[0] = MPI_UNSIGNED_LONG_LONG;
198 types[1] = MPI_INT;
199 disps[0] = 0;
200 disps[1] = (MPI_Aint) ((uintptr_t) & maxloc.loc - (uintptr_t) & maxloc.sz);
201
202 mpi_errno = MPIR_Type_create_struct_impl(2, blocks, disps, types, &maxloc_type);
203 MPIR_ERR_CHECK(mpi_errno);
204
205 mpi_errno = MPIR_Type_commit_impl(&maxloc_type);
206 MPIR_ERR_CHECK(mpi_errno);
207
208 mpi_errno = MPIR_Op_create_impl(ull_maxloc_op_func, 0, &maxloc_op);
209 MPIR_ERR_CHECK(mpi_errno);
210
211 maxloc.sz = (unsigned long long) mysz;
212 maxloc.loc = myloc;
213
214 mpi_errno = MPIR_Allreduce(&maxloc, &maxloc_result, 1, maxloc_type, maxloc_op, comm, &errflag);
215 MPIR_ERR_CHECK(mpi_errno);
216
217 *maxsz_loc = maxloc_result.loc;
218 *maxsz = (size_t) maxloc_result.sz;
219
220 fn_exit:
221 if (maxloc_type != MPI_DATATYPE_NULL)
222 MPIR_Type_free_impl(&maxloc_type);
223 if (maxloc_op != MPI_OP_NULL)
224 MPIR_Op_free_impl(&maxloc_op);
225 return mpi_errno;
226 fn_fail:
227 goto fn_exit;
228 }
229
/* Collectively allocate symmetric shared memory region with address
 * defined by base_ptr. MPL_shm routines and MPI collectives are internally used.
 *
 * The caller should ensure segment_len is page aligned and base_addr
 * is a symmetric non-NULL address on all processes.
 *
 * map_result indicates the mapping result of the node. It can be
 * SYMSHM_SUCCESS, SYMSHM_MAP_FAIL or SYMSHM_OTHER_FAIL.
 * If it is SYMSHM_MAP_FAIL, the caller can try it again with a different
 * start address; if it is SYMSHM_OTHER_FAIL, it usually means no more shm
 * segment can be allocated, thus the caller should stop retrying.
 *
 * Collective protocol (all ranks of shm_comm_ptr participate):
 *   rank 0 : fixed-address create+attach -> bcast result -> bcast handle
 *   others : recv result -> recv handle -> deserialize -> fixed-address attach
 *   all    : allreduce(MPI_MAX) of local results to agree on the outcome.
 * Both branches must execute the same collective sequence; the gotos below
 * exist to keep the bcast/allreduce calls matched on failure paths.
 *
 * NOTE(review): when map_fail is reached before the result_sync allreduce,
 * *map_result_ptr is overwritten with all_map_result's initial value
 * SYMSHM_MAP_FAIL even if the local failure was SYMSHM_OTHER_FAIL, so the
 * caller may retry in that case — confirm this is intended. */
static int map_symm_shm(MPIR_Comm * shm_comm_ptr, MPIDU_shm_seg_t * shm_seg, int *map_result_ptr)
{
    int mpi_errno = MPI_SUCCESS, mpl_err = MPL_SUCCESS;
    int all_map_result = SYMSHM_MAP_FAIL;
    bool mapped_flag = false;   /* true once this process has mapped/attached the segment */
    MPIR_Errflag_t errflag = MPIR_ERR_NONE;

    if (shm_seg->segment_len > 0) {
        if (shm_comm_ptr) {
            if (shm_comm_ptr->rank == 0) {
                char *serialized_hnd = NULL;

                /* try to map with specified symmetric address. */
                mpl_err = MPL_shm_fixed_seg_create_and_attach(shm_seg->hnd, shm_seg->segment_len,
                                                              (void **) &(shm_seg->base_addr), 0);
                if (mpl_err) {
                    /* MPL_ERR_SHM_INVAL means the fixed address was rejected (retryable) */
                    *map_result_ptr =
                        (mpl_err == MPL_ERR_SHM_INVAL) ? SYMSHM_MAP_FAIL : SYMSHM_OTHER_FAIL;
                    goto root_sync;
                } else
                    mapped_flag = true;

                mpl_err = MPL_shm_hnd_get_serialized_by_ref(shm_seg->hnd, &serialized_hnd);
                if (mpl_err) {
                    *map_result_ptr = SYMSHM_OTHER_FAIL;
                }

              root_sync:
                /* broadcast the mapping result on rank 0 */
                mpi_errno = MPIR_Bcast(map_result_ptr, 1, MPI_INT, 0, shm_comm_ptr, &errflag);
                MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");

                if (*map_result_ptr != SYMSHM_SUCCESS)
                    goto map_fail;

                mpi_errno =
                    MPIR_Bcast(serialized_hnd, MPL_SHM_GHND_SZ, MPI_BYTE, 0,
                               shm_comm_ptr, &errflag);
                MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");

            } else {
                char serialized_hnd[MPL_SHM_GHND_SZ] = { 0 };

                /* receive the mapping result of rank 0 */
                mpi_errno = MPIR_Bcast(map_result_ptr, 1, MPI_INT, 0, shm_comm_ptr, &errflag);
                MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");

                if (*map_result_ptr != SYMSHM_SUCCESS)
                    goto map_fail;

                /* if rank 0 mapped successfully, others on the node attach shared memory region */

                /* get serialized handle from rank 0 and deserialize it */
                mpi_errno =
                    MPIR_Bcast(serialized_hnd, MPL_SHM_GHND_SZ, MPI_BYTE, 0,
                               shm_comm_ptr, &errflag);
                MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");

                mpl_err =
                    MPL_shm_hnd_deserialize(shm_seg->hnd, serialized_hnd, strlen(serialized_hnd));
                if (mpl_err) {
                    *map_result_ptr = SYMSHM_OTHER_FAIL;
                    goto result_sync;
                }

                /* try to attach with specified symmetric address */
                mpl_err = MPL_shm_fixed_seg_attach(shm_seg->hnd, shm_seg->segment_len,
                                                   (void **) &(shm_seg->base_addr), 0);
                if (mpl_err) {
                    *map_result_ptr =
                        (mpl_err == MPL_ERR_SHM_INVAL) ? SYMSHM_MAP_FAIL : SYMSHM_OTHER_FAIL;
                } else
                    mapped_flag = true;
            }

          result_sync:
            /* check results of all processes. If any failure happens (max result > 0),
             * return SYMSHM_OTHER_FAIL if anyone reports it (max result == 2).
             * Otherwise return SYMSHM_MAP_FAIL (max result == 1). */
            mpi_errno = MPIR_Allreduce(map_result_ptr, &all_map_result, 1, MPI_INT,
                                       MPI_MAX, shm_comm_ptr, &errflag);
            MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");

            if (all_map_result != SYMSHM_SUCCESS)
                goto map_fail;

            if (shm_comm_ptr->rank == 0) {
                /* unlink shared memory region so it gets deleted when all processes exit */
                mpl_err = MPL_shm_seg_remove(shm_seg->hnd);
                MPIR_ERR_CHKANDJUMP(mpl_err, mpi_errno, MPI_ERR_OTHER, "**remove_shar_mem");
            }
        } else {
            /* if there is only one process on this processor, don't use shared memory */
            int rc = check_maprange_ok(shm_seg->base_addr, shm_seg->segment_len);
            if (rc) {
                shm_seg->base_addr = MPL_mmap(shm_seg->base_addr, shm_seg->segment_len,
                                              PROT_READ | PROT_WRITE,
                                              MAP_PRIVATE | MAP_ANON | MAP_FIXED, -1, 0,
                                              MPL_MEM_SHM);
            } else {
                shm_seg->base_addr = (void *) MAP_FAILED;
            }
            *map_result_ptr = (shm_seg->base_addr == (void *) MAP_FAILED) ?
                SYMSHM_MAP_FAIL : SYMSHM_SUCCESS;
        }
    }

  fn_exit:
    return mpi_errno;
  map_fail:
    if (mapped_flag) {
        /* destroy successful shm segment */
        mpl_err =
            MPL_shm_seg_detach(shm_seg->hnd, (void **) &(shm_seg->base_addr), shm_seg->segment_len);
        MPIR_ERR_CHKANDJUMP(mpl_err, mpi_errno, MPI_ERR_OTHER, "**detach_shar_mem");
    }
    *map_result_ptr = all_map_result;
    goto fn_exit;
  fn_fail:
    goto fn_exit;
}
362
/* Undo a successful symmetric mapping made by map_symm_shm: detach the shm
 * segment when the node has a shared-memory communicator, or munmap the
 * private mapping used by a single-process node (shm_comm_ptr == NULL). */
static int unmap_symm_shm(MPIR_Comm * shm_comm_ptr, MPIDU_shm_seg_t * shm_seg)
{
    int mpi_errno = MPI_SUCCESS;
    int mpl_err = MPL_SUCCESS;

    if (shm_comm_ptr == NULL) {
        /* single process on the node: the region was privately mmap'ed */
        MPL_munmap(shm_seg->base_addr, shm_seg->segment_len, MPL_MEM_SHM);
    } else {
        /* destroy successful shm segment */
        mpl_err = MPL_shm_seg_detach(shm_seg->hnd, (void **) &(shm_seg->base_addr),
                                     shm_seg->segment_len);
        MPIR_ERR_CHKANDJUMP(mpl_err, mpi_errno, MPI_ERR_OTHER, "**detach_shar_mem");
    }

  fn_exit:
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}
381
/* Allocate symmetric shared memory across all processes in comm.
 *
 * Protocol: the rank with the largest requested size picks a random
 * candidate address, broadcasts it to comm, and every node then tries to
 * map its segment at (candidate - offset) via map_symm_shm.  On any
 * mapping failure the successfully-mapped nodes unmap and the loop retries
 * with a new candidate, up to MPIR_CVAR_SHM_SYMHEAP_RETRY times.  If no
 * attempt succeeds, *fail_flag is set so the caller can fall back to a
 * non-symmetric method; mpi_errno reports only hard communication errors.
 *
 * NOTE(review): the candidate address is broadcast as the raw bytes of a
 * pointer (sizeof(char *) x MPI_CHAR), which assumes identical pointer
 * representation on all processes — confirm homogeneity is guaranteed. */
static int shm_alloc_symm_all(MPIR_Comm * comm_ptr, size_t offset, MPIDU_shm_seg_t * shm_seg,
                              bool * fail_flag)
{
    int mpi_errno = MPI_SUCCESS;
    int rank = comm_ptr->rank;
    int all_map_result = SYMSHM_MAP_FAIL;
    int iter = MPIR_CVAR_SHM_SYMHEAP_RETRY;     /* bounded number of address attempts */
    int maxsz_loc = 0;          /* rank owning the largest request */
    size_t maxsz = 0;           /* largest requested segment length */
    char *map_pointer = NULL;
    MPIR_Comm *shm_comm_ptr = comm_ptr->node_comm;
    MPIR_Errflag_t errflag = MPIR_ERR_NONE;

    /* we let the process with larger amount of requested memory to calculate
     * the random address. This should reduce the number of attempts allocating
     * symmetric shared memory as the other processes are more likely to accept
     * the returned pointer when mapping memory into their address space. */
    mpi_errno = allreduce_maxloc(shm_seg->segment_len, rank, comm_ptr, &maxsz, &maxsz_loc);
    if (mpi_errno != MPI_SUCCESS)
        goto fn_fail;

    /* nobody requested memory: nothing to do */
    if (maxsz == 0)
        goto fn_exit;

    while (all_map_result == SYMSHM_MAP_FAIL && iter-- > 0) {
        int map_result = SYMSHM_SUCCESS;

        /* rank maxsz_loc in comm generates random address */
        if (comm_ptr->rank == maxsz_loc)
            map_pointer = generate_random_addr(shm_seg->segment_len);

        /* broadcast fixed address to the other processes in comm */
        mpi_errno =
            MPIR_Bcast(&map_pointer, sizeof(char *), MPI_CHAR, maxsz_loc, comm_ptr, &errflag);
        MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");

        /* optimization: make sure every process memory in the shared segment is mapped
         * at the same virtual address in the corresponding address space. Example with
         * 4 processes: if after calling MPIDU_shm_alloc_symm_all() process 0 gets addr
         * 0x4000, process 1 will get 0x3000, process 2 will get 0x2000, and process 3
         * will get 0x1000. This way all processes have their own memory inside the shm
         * segment starting at addr 0x4000. Processes in other nodes will also have the
         * same address. */
        shm_seg->base_addr = map_pointer - offset;

        /* try mapping symmetric memory */
        mpi_errno = map_symm_shm(shm_comm_ptr, shm_seg, &map_result);
        MPIR_ERR_CHECK(mpi_errno);

        /* check if any mapping failure occurs */
        mpi_errno = MPIR_Allreduce(&map_result, &all_map_result, 1, MPI_INT,
                                   MPI_MAX, comm_ptr, &errflag);
        MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");

        /* cleanup local shm segment if mapping failed on other process */
        if (all_map_result != SYMSHM_SUCCESS && map_result == SYMSHM_SUCCESS &&
            shm_seg->segment_len > 0) {
            mpi_errno = unmap_symm_shm(shm_comm_ptr, shm_seg);
            MPIR_ERR_CHECK(mpi_errno);
        }
    }

    if (all_map_result != SYMSHM_SUCCESS) {
        /* if fail to allocate, return and let the caller choose another method */
        *fail_flag = true;
    }

  fn_exit:
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}
455 #endif /* end of USE_SYM_HEAP */
456
/* Allocate a (non-symmetric) shared memory segment over shm_comm_ptr.
 *
 * Protocol: rank 0 creates and attaches the segment, then broadcasts the
 * serialized handle; an all-zero handle signals that rank 0 failed.  The
 * other ranks deserialize and attach.  All ranks then allreduce a failure
 * flag so everyone agrees, and rank 0 unlinks the segment so it disappears
 * once all processes exit.  On any shm failure the locally mapped segment
 * is detached and *fail_flag is set so the caller can fall back; mpi_errno
 * reports only hard communication errors.
 *
 * Fix: the non-root Bcast received the handle as MPI_CHAR while rank 0
 * sends MPI_BYTE; matched collectives should use the same datatype, so the
 * receive side now uses MPI_BYTE as well (same size, byte-oriented data). */
static int shm_alloc(MPIR_Comm * shm_comm_ptr, MPIDU_shm_seg_t * shm_seg, bool * fail_flag)
{
    int mpi_errno = MPI_SUCCESS, mpl_err = MPL_SUCCESS;
    bool shm_fail_flag = false;         /* local shm failure */
    bool any_shm_fail_flag = false;     /* group-wide failure (allreduce LOR) */
    bool mapped_flag = false;           /* true once this process mapped/attached */
    MPIR_Errflag_t errflag = MPIR_ERR_NONE;

    if (shm_comm_ptr->rank == 0) {
        char *serialized_hnd = NULL;
        char mpl_err_hnd[MPL_SHM_GHND_SZ] = { 0 };

        /* root prepare shm segment */
        mpl_err = MPL_shm_seg_create_and_attach(shm_seg->hnd, shm_seg->segment_len,
                                                (void **) &(shm_seg->base_addr), 0);
        if (mpl_err != MPL_SUCCESS) {
            shm_fail_flag = true;
            goto hnd_sync;
        } else
            mapped_flag = true;

        mpl_err = MPL_shm_hnd_get_serialized_by_ref(shm_seg->hnd, &serialized_hnd);
        if (mpl_err != MPL_SUCCESS)
            shm_fail_flag = true;

      hnd_sync:
        /* on failure broadcast an all-zero handle so the others bail out */
        if (shm_fail_flag)
            serialized_hnd = &mpl_err_hnd[0];

        mpi_errno =
            MPIR_Bcast_impl(serialized_hnd, MPL_SHM_GHND_SZ, MPI_BYTE, 0, shm_comm_ptr, &errflag);
        MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");

        if (shm_fail_flag)
            goto map_fail;

        /* ensure all other processes have mapped successfully */
        mpi_errno = MPIR_Allreduce_impl(&shm_fail_flag, &any_shm_fail_flag, 1, MPI_C_BOOL,
                                        MPI_LOR, shm_comm_ptr, &errflag);
        MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");

        /* unlink shared memory region so it gets deleted when all processes exit */
        mpl_err = MPL_shm_seg_remove(shm_seg->hnd);
        MPIR_ERR_CHKANDJUMP(mpl_err, mpi_errno, MPI_ERR_OTHER, "**remove_shar_mem");

        if (any_shm_fail_flag)
            goto map_fail;

    } else {
        char serialized_hnd[MPL_SHM_GHND_SZ] = { 0 };

        /* get serialized handle from rank 0 and deserialize it
         * (MPI_BYTE to match the datatype used by the root's Bcast) */
        mpi_errno = MPIR_Bcast_impl(serialized_hnd, MPL_SHM_GHND_SZ, MPI_BYTE, 0,
                                    shm_comm_ptr, &errflag);
        MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");

        /* empty handler means root fails */
        if (strlen(serialized_hnd) == 0)
            goto map_fail;

        mpl_err = MPL_shm_hnd_deserialize(shm_seg->hnd, serialized_hnd, strlen(serialized_hnd));
        if (mpl_err != MPL_SUCCESS) {
            shm_fail_flag = true;
            goto result_sync;
        }

        mpl_err = MPL_shm_seg_attach(shm_seg->hnd, shm_seg->segment_len,
                                     (void **) &shm_seg->base_addr, 0);
        if (mpl_err != MPL_SUCCESS) {
            shm_fail_flag = true;
            goto result_sync;
        } else
            mapped_flag = true;

      result_sync:
        mpi_errno = MPIR_Allreduce_impl(&shm_fail_flag, &any_shm_fail_flag, 1, MPI_C_BOOL,
                                        MPI_LOR, shm_comm_ptr, &errflag);
        MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");

        if (any_shm_fail_flag)
            goto map_fail;
    }

  fn_exit:
    return mpi_errno;
  map_fail:
    if (mapped_flag) {
        /* destroy successful shm segment */
        mpl_err =
            MPL_shm_seg_detach(shm_seg->hnd, (void **) &(shm_seg->base_addr), shm_seg->segment_len);
        MPIR_ERR_CHKANDJUMP(mpl_err, mpi_errno, MPI_ERR_OTHER, "**detach_shar_mem");
    }
    *fail_flag = true;
    goto fn_exit;
  fn_fail:
    goto fn_exit;
}
554
/* Allocate a symmetric shared memory segment of "len" bytes over all
 * processes in comm_ptr and return its base address in *ptr.
 *
 * "offset" is this process's displacement inside the segment; the mapping
 * address is shifted by it so each process sees its own region at the same
 * virtual address on every process (see shm_alloc_symm_all).
 *
 * On success the segment descriptor is appended to the global segment list
 * so MPIDU_shm_free() can later find it by base address.  When built
 * without USE_SYM_HEAP, or when allocation fails, *fail_flag is set to
 * true so the caller can choose another method. */
int MPIDU_shm_alloc_symm_all(MPIR_Comm * comm_ptr, size_t len, size_t offset, void **ptr,
                             bool * fail_flag)
{
    int mpi_errno = MPI_SUCCESS;
#ifdef USE_SYM_HEAP
    int mpl_err = MPL_SUCCESS;
    MPIDU_shm_seg_t *shm_seg = NULL;
    seg_list_t *el = NULL;
    MPIR_CHKPMEM_DECL(2);

    *ptr = NULL;

    MPIR_CHKPMEM_MALLOC(shm_seg, MPIDU_shm_seg_t *, sizeof(*shm_seg), mpi_errno, "shm_seg_handle",
                        MPL_MEM_OTHER);
    MPIR_CHKPMEM_MALLOC(el, seg_list_t *, sizeof(*el), mpi_errno,
                        "seg_list_element", MPL_MEM_OTHER);

    mpl_err = MPL_shm_hnd_init(&(shm_seg->hnd));
    MPIR_ERR_CHKANDJUMP(mpl_err, mpi_errno, MPI_ERR_OTHER, "**alloc_shar_mem");

    shm_seg->segment_len = len;

    mpi_errno = shm_alloc_symm_all(comm_ptr, offset, shm_seg, fail_flag);
    if (mpi_errno || *fail_flag)
        goto fn_fail;

    if (len == 0) {
        /* process requested no memory, cleanup and return */
        MPL_shm_seg_remove(shm_seg->hnd);
        MPL_shm_hnd_finalize(&(shm_seg->hnd));
        MPIR_CHKPMEM_REAP();
        goto fn_exit;
    }

    *ptr = shm_seg->base_addr;

    /* store shm_seg handle in linked list for later retrieval */
    el->key = *ptr;
    el->shm_seg = shm_seg;
    LL_APPEND(seg_list_head, seg_list_tail, el);

    MPIR_CHKPMEM_COMMIT();

  fn_exit:
    return mpi_errno;
  fn_fail:
    /* --BEGIN ERROR HANDLING-- */
    if (shm_seg) {
        /* NOTE(review): shm_seg->hnd is used here even if MPL_shm_hnd_init
         * failed — presumably hnd is still set in that case; confirm. */
        MPL_shm_seg_remove(shm_seg->hnd);
        MPL_shm_hnd_finalize(&(shm_seg->hnd));
    }
    MPIR_CHKPMEM_REAP();
    goto fn_exit;
    /* --END ERROR HANDLING-- */
#else
    /* always fail, return and let the caller choose another method */
    *fail_flag = true;
    return mpi_errno;
#endif /* end of USE_SYM_HEAP */
}
615
/* Allocate a shared memory segment of "len" bytes over the node-local
 * communicator shm_comm_ptr and return its base address in *ptr.
 *
 * Unlike MPIDU_shm_alloc_symm_all, the mapping address may differ between
 * processes.  Requires a non-NULL shm_comm_ptr and len > 0 (asserted).
 * On success the segment descriptor is appended to the global segment list
 * for MPIDU_shm_free(); on shm failure *fail_flag is set to true so the
 * caller can fall back to another method. */
int MPIDU_shm_alloc(MPIR_Comm * shm_comm_ptr, size_t len, void **ptr, bool * fail_flag)
{
    int mpi_errno = MPI_SUCCESS, mpl_err = MPL_SUCCESS;
    MPIDU_shm_seg_t *shm_seg = NULL;
    seg_list_t *el = NULL;
    MPIR_CHKPMEM_DECL(2);

    *ptr = NULL;

    MPIR_Assert(shm_comm_ptr != NULL);
    MPIR_Assert(len > 0);

    MPIR_CHKPMEM_MALLOC(shm_seg, MPIDU_shm_seg_t *, sizeof(*shm_seg), mpi_errno, "shm_seg_handle",
                        MPL_MEM_OTHER);
    MPIR_CHKPMEM_MALLOC(el, seg_list_t *, sizeof(*el), mpi_errno,
                        "seg_list_element", MPL_MEM_OTHER);

    mpl_err = MPL_shm_hnd_init(&(shm_seg->hnd));
    MPIR_ERR_CHKANDJUMP(mpl_err, mpi_errno, MPI_ERR_OTHER, "**alloc_shar_mem");

    shm_seg->segment_len = len;

    mpi_errno = shm_alloc(shm_comm_ptr, shm_seg, fail_flag);
    if (mpi_errno || *fail_flag)
        goto fn_fail;

    *ptr = shm_seg->base_addr;

    /* store shm_seg handle in linked list for later retrieval */
    el->key = *ptr;
    el->shm_seg = shm_seg;
    LL_APPEND(seg_list_head, seg_list_tail, el);

    MPIR_CHKPMEM_COMMIT();

  fn_exit:
    return mpi_errno;
  fn_fail:
    /* --BEGIN ERROR HANDLING-- */
    if (shm_seg) {
        MPL_shm_seg_remove(shm_seg->hnd);
        MPL_shm_hnd_finalize(&(shm_seg->hnd));
    }
    MPIR_CHKPMEM_REAP();
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}
663
/* Free a shared memory segment previously returned by MPIDU_shm_alloc or
 * MPIDU_shm_alloc_symm_all, identified by its base address "ptr".
 *
 * The matching element is removed from the global segment list; passing an
 * address that was never returned by the allocation routines is a
 * programming error (asserted). */
int MPIDU_shm_free(void *ptr)
{
    int mpi_errno = MPI_SUCCESS, mpl_err = MPL_SUCCESS;
    MPIDU_shm_seg_t *shm_seg = NULL;
    seg_list_t *el = NULL;

    /* retrieve memory handle for baseaddr */
    LL_FOREACH(seg_list_head, el) {
        if (el->key == ptr) {
            shm_seg = el->shm_seg;
            LL_DELETE(seg_list_head, seg_list_tail, el);
            MPL_free(el);
            break;
        }
    }

    MPIR_Assert(shm_seg != NULL);

    /* if there is only one process in the node the serialized handle points
     * to NULL as there is no shared file backing up memory. This is used to
     * differentiate between shared memory and private memory allocations
     * when symmetric shared memory is being requested. */
    char *serialized_hnd = NULL;
    MPL_shm_hnd_get_serialized_by_ref(shm_seg->hnd, &serialized_hnd);

    if (serialized_hnd) {
        /* real shm segment: detach it */
        mpl_err = MPL_shm_seg_detach(shm_seg->hnd, (void **) &(shm_seg->base_addr),
                                     shm_seg->segment_len);
        MPIR_ERR_CHKANDJUMP(mpl_err, mpi_errno, MPI_ERR_OTHER, "**detach_shar_mem");
    } else {
        /* private mapping made by the single-process path: just munmap */
        MPL_munmap(shm_seg->base_addr, shm_seg->segment_len, MPL_MEM_SHM);
    }

  fn_exit:
    /* release the handle and descriptor even when detach failed */
    MPL_shm_hnd_finalize(&(shm_seg->hnd));
    MPL_free(shm_seg);
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}
704