/*
 * Copyright (C) by Argonne National Laboratory
 *     See COPYRIGHT in top-level directory
 */

#include "mpidimpl.h"
#include "ipc_noinline.h"
#include "ipc_types.h"
#include "ipc_mem.h"

/* Return the global node rank (i.e., the rank in
 * MPIR_Process.comm_world->node_comm) of each process in the shared-memory
 * communicator. The caller must allocate and free both rank buffers. */
static int get_node_ranks(MPIR_Comm * shm_comm_ptr, int *shm_ranks, int *node_ranks)
{
    int i;
    int mpi_errno = MPI_SUCCESS;
    MPIR_Group *shm_group_ptr;

    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_GET_NODE_RANKS);
    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_GET_NODE_RANKS);

    /* Enumerate every rank in the shared-memory communicator */
    for (i = 0; i < shm_comm_ptr->local_size; i++)
        shm_ranks[i] = i;

    mpi_errno = MPIR_Comm_group_impl(shm_comm_ptr, &shm_group_ptr);
    MPIR_ERR_CHECK(mpi_errno);

    /* Create the node group if it is not yet initialized */
    if (!MPIDI_IPCI_global.node_group_ptr) {
        mpi_errno = MPIR_Comm_group_impl(MPIR_Process.comm_world->node_comm,
                                         &MPIDI_IPCI_global.node_group_ptr);
        MPIR_ERR_CHECK(mpi_errno);
    }
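    /* The group created above is cached in MPIDI_IPCI_global, so later
     * windows on this node can reuse it without recreating the group. */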

    mpi_errno = MPIR_Group_translate_ranks_impl(shm_group_ptr, shm_comm_ptr->local_size,
                                                shm_ranks, MPIDI_IPCI_global.node_group_ptr,
                                                node_ranks);
    MPIR_ERR_CHECK(mpi_errno);

    mpi_errno = MPIR_Group_free_impl(shm_group_ptr);
    MPIR_ERR_CHECK(mpi_errno);

  fn_exit:
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_GET_NODE_RANKS);
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}
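
/* Illustrative example: if a node hosts four processes but the window's
 * communicator spans only two of them, shm rank 1 may translate to node
 * rank 3; the two numbering spaces coincide only when the window covers
 * every process on the node. */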
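/* Per-process window information exchanged across the node at window
 * creation. One record per local rank is gathered with MPIR_Allgather below
 * and then mirrored into the CH4-level MPIDIG shared_table. */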
typedef struct win_shared_info {
    uint32_t disp_unit;
    size_t size;
    MPIDI_IPCI_type_t ipc_type;
    MPIDI_IPCI_ipc_handle_t ipc_handle;
} win_shared_info_t;

int MPIDI_IPC_mpi_win_create_hook(MPIR_Win * win)
{
    int mpi_errno = MPI_SUCCESS;
    MPIR_Comm *shm_comm_ptr = win->comm_ptr->node_comm;
    MPIR_Errflag_t errflag = MPIR_ERR_NONE;
    int i;
    size_t total_shm_size = 0;
    MPIDIG_win_shared_info_t *shared_table = NULL;
    win_shared_info_t *ipc_shared_table = NULL; /* temporary exchange buffer */
    int *ranks_in_shm_grp = NULL;
    MPIDI_IPCI_ipc_attr_t ipc_attr;

    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_IPC_MPI_WIN_CREATE_HOOK);
    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_IPC_MPI_WIN_CREATE_HOOK);
    MPIR_CHKPMEM_DECL(2);
    MPIR_CHKLMEM_DECL(2);

    /* Skip IPC initialization if there is no node-local communicator
     * (no other local processes) */
    if (!shm_comm_ptr)
        goto fn_exit;

    /* Determine the IPC type based on the buffer type and submodule
     * availability. We always perform the exchange in case any remote
     * buffer can be shared via IPC. */
    MPIR_GPU_query_pointer_attr(win->base, &ipc_attr.gpu_attr);

    if (ipc_attr.gpu_attr.type == MPL_GPU_POINTER_DEV) {
        mpi_errno = MPIDI_GPU_get_ipc_attr(win->base, shm_comm_ptr->rank, shm_comm_ptr, &ipc_attr);
        MPIR_ERR_CHECK(mpi_errno);
    } else {
        mpi_errno = MPIDI_XPMEM_get_ipc_attr(win->base, win->size, &ipc_attr);
        MPIR_ERR_CHECK(mpi_errno);
    }

    /* Disable local IPC for a zero-size buffer */
    if (win->size == 0)
        ipc_attr.ipc_type = MPIDI_IPCI_TYPE__NONE;
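    /* Even with a zero-size buffer (nothing to map), this rank still
     * participates in the exchange below so that its peers learn that its
     * IPC type is NONE. */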

    /* Exchange shared memory region information */
    MPIR_T_PVAR_TIMER_START(RMA, rma_wincreate_allgather);

    /* Create the shared table for later RMA access. Note that shared_table is
     * a CH4-level structure also used by other window types. CH4 always
     * initializes the shared table for win_allocate and win_allocate_shared
     * because their shm regions are guaranteed by POSIX. Other window types
     * may optionally initialize it in a shmmod. */
    MPIR_CHKPMEM_CALLOC(MPIDIG_WIN(win, shared_table), MPIDIG_win_shared_info_t *,
                        sizeof(MPIDIG_win_shared_info_t) * shm_comm_ptr->local_size,
                        mpi_errno, "shared table", MPL_MEM_RMA);
    shared_table = MPIDIG_WIN(win, shared_table);

    MPIR_CHKLMEM_MALLOC(ipc_shared_table, win_shared_info_t *,
                        sizeof(win_shared_info_t) * shm_comm_ptr->local_size,
                        mpi_errno, "IPC temporary shared table", MPL_MEM_RMA);

    /* Fill the local slot of the exchange buffer */
    memset(&ipc_shared_table[shm_comm_ptr->rank], 0, sizeof(win_shared_info_t));
    ipc_shared_table[shm_comm_ptr->rank].size = win->size;
    ipc_shared_table[shm_comm_ptr->rank].disp_unit = win->disp_unit;
    ipc_shared_table[shm_comm_ptr->rank].ipc_type = ipc_attr.ipc_type;
    ipc_shared_table[shm_comm_ptr->rank].ipc_handle = ipc_attr.ipc_handle;

    mpi_errno = MPIR_Allgather(MPI_IN_PLACE,
                               0,
                               MPI_DATATYPE_NULL,
                               ipc_shared_table,
                               sizeof(win_shared_info_t), MPI_BYTE, shm_comm_ptr, &errflag);
    MPIR_T_PVAR_TIMER_END(RMA, rma_wincreate_allgather);
    MPIR_ERR_CHECK(mpi_errno);
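    /* With MPI_IN_PLACE, each rank contributes the sizeof(win_shared_info_t)
     * bytes already stored in its own slot. Exchanging the struct as raw
     * MPI_BYTE data assumes an identical layout on every participant, which
     * is reasonable here since all participants run on the same node. */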

    /* Skip shared_table initialization if:
     *  1. all local buffers are zero size, or
     *  2. any process has a non-zero buffer but an IPC type of NONE (i.e.,
     *     no submodule supports that buffer). */
    for (i = 0; i < shm_comm_ptr->local_size; i++) {
        total_shm_size += ipc_shared_table[i].size;
        if (ipc_shared_table[i].size > 0 && ipc_shared_table[i].ipc_type == MPIDI_IPCI_TYPE__NONE) {
            total_shm_size = 0;
            break;
        }
    }
    if (total_shm_size == 0) {
        MPL_free(MPIDIG_WIN(win, shared_table));
        MPIDIG_WIN(win, shared_table) = NULL;
        goto fn_exit;
    }
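    /* Reaching this point means every non-empty local buffer can be shared
     * through some IPC submodule, so the window qualifies for IPC-based
     * RMA. */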

    /* Attach each remote memory region based on its IPC type. The single
     * ranks_in_shm_grp buffer packs two arrays: the first local_size ints
     * hold the shm ranks, the second local_size ints the translated node
     * ranks. */
    MPIR_CHKLMEM_MALLOC(ranks_in_shm_grp, int *, shm_comm_ptr->local_size * sizeof(int) * 2,
                        mpi_errno, "ranks in shm group", MPL_MEM_RMA);
    mpi_errno = get_node_ranks(shm_comm_ptr, ranks_in_shm_grp,
                               &ranks_in_shm_grp[shm_comm_ptr->local_size]);
    MPIR_ERR_CHECK(mpi_errno);

    for (i = 0; i < shm_comm_ptr->local_size; i++) {
        shared_table[i].size = ipc_shared_table[i].size;
        shared_table[i].disp_unit = ipc_shared_table[i].disp_unit;
        shared_table[i].shm_base_addr = NULL;

        if (i == shm_comm_ptr->rank) {
            shared_table[i].shm_base_addr = win->base;
        } else {
            /* attach remote buffer */
            switch (ipc_shared_table[i].ipc_type) {
                case MPIDI_IPCI_TYPE__XPMEM:
                    mpi_errno =
                        MPIDI_XPMEM_ipc_handle_map(ipc_shared_table[i].ipc_handle.xpmem,
                                                   &shared_table[i].shm_base_addr);
                    MPIR_ERR_CHECK(mpi_errno);
                    break;
                case MPIDI_IPCI_TYPE__GPU:
                    /* FIXME: remote win buffer should be mapped to each of their corresponding
                     * local GPU device. */
                    mpi_errno =
                        MPIDI_GPU_ipc_handle_map(ipc_shared_table[i].ipc_handle.gpu,
                                                 ipc_attr.gpu_attr.device, MPI_BYTE,
                                                 &shared_table[i].shm_base_addr);
                    MPIR_ERR_CHECK(mpi_errno);
                    break;
                case MPIDI_IPCI_TYPE__NONE:
                    /* no-op */
                    break;
                default:
                    /* Unknown IPC type */
                    MPIR_Assert(0);
                    break;
            }
        }
        IPC_TRACE("shared_table[%d]: size=0x%lx, disp_unit=0x%x, shm_base_addr=%p (ipc_type=%d)\n",
                  i, shared_table[i].size, shared_table[i].disp_unit,
                  shared_table[i].shm_base_addr, ipc_shared_table[i].ipc_type);
    }
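    /* After this loop, every non-NULL shm_base_addr is a locally
     * dereferenceable alias of a peer's window buffer, which lets RMA
     * operations use direct load/store instead of messaging. */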

    /* Initialize the POSIX shm window components (e.g., the shared mutex) so
     * that the POSIX operation routines can be reused. */
    mpi_errno = MPIDI_POSIX_shm_win_init_hook(win);
    MPIR_ERR_CHECK(mpi_errno);

  fn_exit:
    MPIR_CHKLMEM_FREEALL();
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_IPC_MPI_WIN_CREATE_HOOK);
    return mpi_errno;
  fn_fail:
    MPIR_CHKPMEM_REAP();
    goto fn_exit;
}

int MPIDI_IPC_mpi_win_free_hook(MPIR_Win * win)
{
    int mpi_errno = MPI_SUCCESS;
    MPIR_Comm *shm_comm_ptr = win->comm_ptr->node_comm;
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_IPC_MPI_WIN_FREE_HOOK);
    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_IPC_MPI_WIN_FREE_HOOK);

    /* Only created windows with node-local peers set up a shared table */
    if (win->create_flavor != MPI_WIN_FLAVOR_CREATE ||
        !shm_comm_ptr || !MPIDIG_WIN(win, shared_table))
        goto fn_exit;

    MPL_free(MPIDIG_WIN(win, shared_table));

  fn_exit:
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_IPC_MPI_WIN_FREE_HOOK);
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}