/*
 * Copyright (C) by Argonne National Laboratory
 *     See COPYRIGHT in top-level directory
 */

#include "mpidimpl.h"
#include "ipc_noinline.h"
#include "ipc_types.h"
#include "ipc_mem.h"

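/* Window create/free hooks for the IPC submodule. At window creation the
 * node-local processes exchange IPC handles for their window buffers and
 * build the CH4 shared table so that RMA operations can directly access
 * remote buffers mapped via XPMEM or GPU IPC; the free hook releases that
 * table. */
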
/* Return global node rank of each process in the shared communicator.
 * I.e., rank in MPIR_Process.comm_world->node_comm. The caller routine
 * must allocate/free each buffer. */
static int get_node_ranks(MPIR_Comm * shm_comm_ptr, int *shm_ranks, int *node_ranks)
{
    int i;
    int mpi_errno = MPI_SUCCESS;
    MPIR_Group *shm_group_ptr;

    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_GET_NODE_RANKS);
    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_GET_NODE_RANKS);

    for (i = 0; i < shm_comm_ptr->local_size; i++)
        shm_ranks[i] = i;

    mpi_errno = MPIR_Comm_group_impl(shm_comm_ptr, &shm_group_ptr);
    MPIR_ERR_CHECK(mpi_errno);

    /* Get node group if it is not yet initialized */
    if (!MPIDI_IPCI_global.node_group_ptr) {
        mpi_errno = MPIR_Comm_group_impl(MPIR_Process.comm_world->node_comm,
                                         &MPIDI_IPCI_global.node_group_ptr);
        MPIR_ERR_CHECK(mpi_errno);
    }

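    /* Translate each rank in the shm communicator's group to its rank in the
     * cached node group (comm_world->node_comm). */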
    mpi_errno = MPIR_Group_translate_ranks_impl(shm_group_ptr, shm_comm_ptr->local_size,
                                                shm_ranks, MPIDI_IPCI_global.node_group_ptr,
                                                node_ranks);
    MPIR_ERR_CHECK(mpi_errno);

    mpi_errno = MPIR_Group_free_impl(shm_group_ptr);
    MPIR_ERR_CHECK(mpi_errno);

  fn_exit:
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_GET_NODE_RANKS);
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}

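/* Per-process window information exchanged among the node-local processes at
 * window creation; each rank contributes one entry to the allgather below. */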
typedef struct win_shared_info {
    uint32_t disp_unit;
    size_t size;
    MPIDI_IPCI_type_t ipc_type;
    MPIDI_IPCI_ipc_handle_t ipc_handle;
} win_shared_info_t;

int MPIDI_IPC_mpi_win_create_hook(MPIR_Win * win)
{
    int mpi_errno = MPI_SUCCESS;
    MPIR_Comm *shm_comm_ptr = win->comm_ptr->node_comm;
    MPIR_Errflag_t errflag = MPIR_ERR_NONE;
    int i;
    size_t total_shm_size = 0;
    MPIDIG_win_shared_info_t *shared_table = NULL;
    win_shared_info_t *ipc_shared_table = NULL;        /* temporary exchange buffer */
    int *ranks_in_shm_grp = NULL;
    MPIDI_IPCI_ipc_attr_t ipc_attr;

    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_IPC_MPI_WIN_CREATE_HOOK);
    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_IPC_MPI_WIN_CREATE_HOOK);
    MPIR_CHKPMEM_DECL(2);
    MPIR_CHKLMEM_DECL(2);

    /* Skip IPC initialization if no local process */
    if (!shm_comm_ptr)
        goto fn_exit;

    /* Determine IPC type based on buffer type and submodule availability.
     * We always exchange in case any remote buffer can be shared by IPC. */
    MPIR_GPU_query_pointer_attr(win->base, &ipc_attr.gpu_attr);

    if (ipc_attr.gpu_attr.type == MPL_GPU_POINTER_DEV) {
        mpi_errno = MPIDI_GPU_get_ipc_attr(win->base, shm_comm_ptr->rank, shm_comm_ptr, &ipc_attr);
        MPIR_ERR_CHECK(mpi_errno);
    } else {
        mpi_errno = MPIDI_XPMEM_get_ipc_attr(win->base, win->size, &ipc_attr);
        MPIR_ERR_CHECK(mpi_errno);
    }

    /* Disable local IPC for zero buffer */
    if (win->size == 0)
        ipc_attr.ipc_type = MPIDI_IPCI_TYPE__NONE;

    /* Exchange shared memory region information */
    MPIR_T_PVAR_TIMER_START(RMA, rma_wincreate_allgather);

    /* Create shared table for later RMA access. Note that shared_table is
     * a CH4-level structure also used by other window types. CH4 always
     * initializes the shared table for win_allocate and win_allocate_shared
     * because their shm regions are backed by POSIX shared memory. The other
     * window types can only optionally initialize it in a shmmod. */
    MPIR_CHKPMEM_CALLOC(MPIDIG_WIN(win, shared_table), MPIDIG_win_shared_info_t *,
                        sizeof(MPIDIG_win_shared_info_t) * shm_comm_ptr->local_size,
                        mpi_errno, "shared table", MPL_MEM_RMA);
    shared_table = MPIDIG_WIN(win, shared_table);

    MPIR_CHKLMEM_MALLOC(ipc_shared_table, win_shared_info_t *,
                        sizeof(win_shared_info_t) * shm_comm_ptr->local_size,
                        mpi_errno, "IPC temporary shared table", MPL_MEM_RMA);

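    /* Fill the local entry; the in-place allgather below collects the
     * corresponding entry from every node-local process. */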
    memset(&ipc_shared_table[shm_comm_ptr->rank], 0, sizeof(win_shared_info_t));
    ipc_shared_table[shm_comm_ptr->rank].size = win->size;
    ipc_shared_table[shm_comm_ptr->rank].disp_unit = win->disp_unit;
    ipc_shared_table[shm_comm_ptr->rank].ipc_type = ipc_attr.ipc_type;
    ipc_shared_table[shm_comm_ptr->rank].ipc_handle = ipc_attr.ipc_handle;

    mpi_errno = MPIR_Allgather(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, ipc_shared_table,
                               sizeof(win_shared_info_t), MPI_BYTE, shm_comm_ptr, &errflag);
    MPIR_T_PVAR_TIMER_END(RMA, rma_wincreate_allgather);
    MPIR_ERR_CHECK(mpi_errno);

    /* Skip shared_table initialization if:
     * 1. All local processes are zero size, or
     * 2. Any process has a non-zero buffer but the IPC type is NONE (i.e.
     *    no submodule supports that buffer) */
    for (i = 0; i < shm_comm_ptr->local_size; i++) {
        total_shm_size += ipc_shared_table[i].size;
        if (ipc_shared_table[i].size > 0 && ipc_shared_table[i].ipc_type == MPIDI_IPCI_TYPE__NONE) {
            total_shm_size = 0;
            break;
        }
    }
    if (total_shm_size == 0) {
        MPL_free(MPIDIG_WIN(win, shared_table));
        MPIDIG_WIN(win, shared_table) = NULL;
        goto fn_exit;
    }

    /* Attach remote memory regions based on their IPC types */
    MPIR_CHKLMEM_MALLOC(ranks_in_shm_grp, int *, shm_comm_ptr->local_size * sizeof(int) * 2,
                        mpi_errno, "ranks in shm group", MPL_MEM_RMA);
    mpi_errno = get_node_ranks(shm_comm_ptr, ranks_in_shm_grp,
                               &ranks_in_shm_grp[shm_comm_ptr->local_size]);
    MPIR_ERR_CHECK(mpi_errno);

    for (i = 0; i < shm_comm_ptr->local_size; i++) {
        shared_table[i].size = ipc_shared_table[i].size;
        shared_table[i].disp_unit = ipc_shared_table[i].disp_unit;
        shared_table[i].shm_base_addr = NULL;

        if (i == shm_comm_ptr->rank) {
            shared_table[i].shm_base_addr = win->base;
        } else {
            /* attach remote buffer */
            switch (ipc_shared_table[i].ipc_type) {
                case MPIDI_IPCI_TYPE__XPMEM:
                    mpi_errno =
                        MPIDI_XPMEM_ipc_handle_map(ipc_shared_table[i].ipc_handle.xpmem,
                                                   &shared_table[i].shm_base_addr);
                    MPIR_ERR_CHECK(mpi_errno);
                    break;
                case MPIDI_IPCI_TYPE__GPU:
                    /* FIXME: each remote win buffer should be mapped to its
                     * corresponding local GPU device. */
                    mpi_errno =
                        MPIDI_GPU_ipc_handle_map(ipc_shared_table[i].ipc_handle.gpu,
                                                 ipc_attr.gpu_attr.device, MPI_BYTE,
                                                 &shared_table[i].shm_base_addr);
                    MPIR_ERR_CHECK(mpi_errno);
                    break;
                case MPIDI_IPCI_TYPE__NONE:
                    /* no-op */
                    break;
                default:
                    /* Unknown IPC type */
                    MPIR_Assert(0);
                    break;
            }
        }
        IPC_TRACE("shared_table[%d]: size=0x%lx, disp_unit=0x%x, shm_base_addr=%p (ipc_type=%d)\n",
                  i, shared_table[i].size, shared_table[i].disp_unit,
                  shared_table[i].shm_base_addr, ipc_shared_table[i].ipc_type);
    }

    /* Initialize POSIX shm window components (e.g., shared mutex) so that we
     * can reuse the POSIX operation routines. */
    mpi_errno = MPIDI_POSIX_shm_win_init_hook(win);
    MPIR_ERR_CHECK(mpi_errno);

  fn_exit:
    MPIR_CHKLMEM_FREEALL();
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_IPC_MPI_WIN_CREATE_HOOK);
    return mpi_errno;
  fn_fail:
    MPIR_CHKPMEM_REAP();
    goto fn_exit;
}

int MPIDI_IPC_mpi_win_free_hook(MPIR_Win * win)
{
    int mpi_errno = MPI_SUCCESS;
    MPIR_Comm *shm_comm_ptr = win->comm_ptr->node_comm;
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_IPC_MPI_WIN_FREE_HOOK);
    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_IPC_MPI_WIN_FREE_HOOK);

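    /* An IPC shared_table is set up only for MPI_Win_create windows on
     * communicators with node-local peers; otherwise there is nothing to free. */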
    if (win->create_flavor != MPI_WIN_FLAVOR_CREATE ||
        !shm_comm_ptr || !MPIDIG_WIN(win, shared_table))
        goto fn_exit;

    MPL_free(MPIDIG_WIN(win, shared_table));

  fn_exit:
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_IPC_MPI_WIN_FREE_HOOK);
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}