/*
 * Copyright (C) by Argonne National Laboratory
 * See COPYRIGHT in top-level directory
 */

/*
=== BEGIN_MPI_T_CVAR_INFO_BLOCK ===

cvars:
    - name        : MPIR_CVAR_CH4_IPC_GPU_HANDLE_CACHE
      category    : CH4
      type        : int
      default     : 1
      class       : none
      verbosity   : MPI_T_VERBOSITY_USER_BASIC
      scope       : MPI_T_SCOPE_ALL_EQ
      description : >-
        By default, IPC handles are cached. To disable IPC handle
        caching, set this variable to 0.

=== END_MPI_T_CVAR_INFO_BLOCK ===
*/

#include "mpidimpl.h"
#include "gpu_pre.h"
#include "gpu_types.h"

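/* Search the given AVL tree for a cached IPC handle object covering
 * [addr, addr + len). On a miss, or when caching is disabled via
 * MPIR_CVAR_CH4_IPC_GPU_HANDLE_CACHE=0, *handle_obj is returned as NULL. */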
static int ipc_handle_cache_search(MPL_gavl_tree_t gavl_tree, const void *addr, uintptr_t len,
                                   void **handle_obj)
{
    int mpi_errno = MPI_SUCCESS;

    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_IPC_HANDLE_CACHE_SEARCH);
    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_IPC_HANDLE_CACHE_SEARCH);

    *handle_obj = NULL;
#ifdef MPIDI_CH4_SHM_ENABLE_GPU
    if (MPIR_CVAR_CH4_IPC_GPU_HANDLE_CACHE) {
        int mpl_err = MPL_SUCCESS;
        mpl_err = MPL_gavl_tree_search(gavl_tree, addr, len, handle_obj);
        MPIR_ERR_CHKANDJUMP(mpl_err != MPL_SUCCESS, mpi_errno, MPI_ERR_OTHER, "**mpl_gavl_search");
    }
#endif

  fn_exit:
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_IPC_HANDLE_CACHE_SEARCH);
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}

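/* Insert a handle object keyed by [addr, addr + len) into the given AVL tree.
 * *insert_successful reports whether the object was actually stored; it stays
 * false when caching is disabled, in which case the caller retains ownership
 * of handle_obj and must free it. */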
static int ipc_handle_cache_insert(MPL_gavl_tree_t gavl_tree, const void *addr, uintptr_t len,
                                   const void *handle_obj, bool * insert_successful)
{
    int mpi_errno = MPI_SUCCESS;

    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_IPC_HANDLE_CACHE_INSERT);
    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_IPC_HANDLE_CACHE_INSERT);

    *insert_successful = false;
#ifdef MPIDI_CH4_SHM_ENABLE_GPU
    if (MPIR_CVAR_CH4_IPC_GPU_HANDLE_CACHE) {
        int mpl_err = MPL_SUCCESS;
        mpl_err = MPL_gavl_tree_insert(gavl_tree, addr, len, handle_obj);
        MPIR_ERR_CHKANDJUMP(mpl_err != MPL_SUCCESS, mpi_errno, MPI_ERR_OTHER, "**mpl_gavl_insert");
        *insert_successful = true;
    }
#endif

  fn_exit:
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_IPC_HANDLE_CACHE_INSERT);
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}

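/* Choose the local device on which to map the remote memory. For a contiguous
 * receive type, the local device corresponding to the remote (sender-side)
 * device is preferred when one exists; otherwise the receive buffer's own
 * device is used. A host receive buffer (negative device id) falls back to
 * device 0. */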
static int get_map_device(int remote_global_dev_id,
                          MPL_gpu_device_handle_t local_dev_handle,
                          MPI_Datatype recv_type, int *dev_id)
{
    int mpi_errno = MPI_SUCCESS;

    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_GET_MAP_DEVICE);
    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_GET_MAP_DEVICE);

#ifdef MPIDI_CH4_SHM_ENABLE_GPU
    int recv_dev_id;
    int recv_dt_contig;
    MPIDI_GPUI_dev_id_t *avail_id = NULL;

    MPIDI_Datatype_check_contig(recv_type, recv_dt_contig);

    HASH_FIND_INT(MPIDI_GPUI_global.global_to_local_map, &remote_global_dev_id, avail_id);
    MPL_gpu_get_dev_id(local_dev_handle, &recv_dev_id);
    if (recv_dev_id < 0) {
        /* When the receiver's buffer is in host memory, recv_dev_id is less than 0.
         * Such an id is not valid for mapping the buffer onto a receiver-side device,
         * so a default GPU must be assigned instead; for now we assume every process
         * can access at least one GPU and use device id 0. */
        recv_dev_id = 0;
    }

    if (avail_id == NULL) {
        *dev_id = recv_dev_id;
    } else {
        if (!recv_dt_contig)
            *dev_id = recv_dev_id;
        else
            *dev_id = avail_id->local_dev_id;
    }
#endif

  fn_exit:
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_GET_MAP_DEVICE);
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}

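/* Remove cached handle objects in the range [addr, addr + len) from the given
 * AVL tree. This is a no-op when caching is disabled. */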
static int ipc_handle_cache_delete(MPL_gavl_tree_t gavl_tree, const void *addr, uintptr_t len)
{
    int mpi_errno = MPI_SUCCESS;

    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_IPC_HANDLE_DELETE);
    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_IPC_HANDLE_DELETE);

#ifdef MPIDI_CH4_SHM_ENABLE_GPU
    int mpl_err = MPL_SUCCESS;
    if (MPIR_CVAR_CH4_IPC_GPU_HANDLE_CACHE) {
        mpl_err = MPL_gavl_tree_delete_range(gavl_tree, addr, len);
        MPIR_ERR_CHKANDJUMP(mpl_err != MPL_SUCCESS, mpi_errno, MPI_ERR_OTHER,
                            "**mpl_gavl_delete_range");
    }
#endif

  fn_exit:
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_IPC_HANDLE_DELETE);
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}

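/* Record an IPC handle in the tracking tree keyed by destination local rank
 * and device. Only newly created handles (status
 * MPIDI_GPU_IPC_HANDLE_REMAP_REQUIRED) are inserted; the cached copy is stored
 * as MPIDI_GPU_IPC_HANDLE_VALID so that later lookups for the same buffer (see
 * MPIDI_GPU_get_ipc_attr) find it and can skip handle creation. If the copy is
 * not inserted (e.g., caching is disabled), it is freed here. */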
int MPIDI_GPU_ipc_handle_cache_insert(int rank, MPIR_Comm * comm, MPIDI_GPU_ipc_handle_t handle)
{
    int mpi_errno = MPI_SUCCESS;

    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_GPU_IPC_HANDLE_CACHE);
    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_GPU_IPC_HANDLE_CACHE);

#ifdef MPIDI_CH4_SHM_ENABLE_GPU
    if (handle.handle_status == MPIDI_GPU_IPC_HANDLE_REMAP_REQUIRED) {
        bool insert_successful = false;
        int recv_lrank = MPIDI_GPUI_global.local_ranks[MPIDIU_rank_to_lpid(rank, comm)];

        MPIDI_GPU_ipc_handle_t *handle_obj =
            MPL_malloc(sizeof(MPIDI_GPU_ipc_handle_t), MPL_MEM_OTHER);
        MPIR_Assert(handle_obj != NULL);
        *handle_obj = handle;
        handle_obj->handle_status = MPIDI_GPU_IPC_HANDLE_VALID;

        mpi_errno = ipc_handle_cache_insert(MPIDI_GPUI_global.ipc_handle_track_trees[recv_lrank]
                                            [handle.global_dev_id],
                                            (void *) handle.remote_base_addr, handle.len,
                                            handle_obj, &insert_successful);
        MPIR_ERR_CHECK(mpi_errno);

        if (insert_successful == false) {
            MPL_free(handle_obj);
        }
    }
#endif

  fn_exit:
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_GPU_IPC_HANDLE_CACHE);
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}

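/* Called on the sender side to fill ipc_attr for the GPU buffer containing
 * vaddr, destined for (rank, comm). A new IPC handle is created only when the
 * underlying allocation is not found in the tracking tree; such a handle is
 * marked MPIDI_GPU_IPC_HANDLE_REMAP_REQUIRED so the receiver drops any stale
 * cached mapping. When GPU IPC support is not compiled in, the ipc_type is set
 * to MPIDI_IPCI_TYPE__NONE. */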
int MPIDI_GPU_get_ipc_attr(const void *vaddr, int rank, MPIR_Comm * comm,
                           MPIDI_IPCI_ipc_attr_t * ipc_attr)
{
    int mpi_errno = MPI_SUCCESS;
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_GPU_GET_IPC_ATTR);
    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_GPU_GET_IPC_ATTR);

#ifdef MPIDI_CH4_SHM_ENABLE_GPU
    int local_dev_id;
    MPIDI_GPUI_dev_id_t *tmp;
    void *pbase;
    uintptr_t len;
    int mpl_err = MPL_SUCCESS;
    MPIDI_GPU_ipc_handle_t *handle_obj = NULL;
    int recv_lrank;

    recv_lrank = MPIDI_GPUI_global.local_ranks[MPIDIU_rank_to_lpid(rank, comm)];
    ipc_attr->ipc_type = MPIDI_IPCI_TYPE__GPU;

    MPL_gpu_get_dev_id(ipc_attr->gpu_attr.device, &local_dev_id);
    HASH_FIND_INT(MPIDI_GPUI_global.local_to_global_map, &local_dev_id, tmp);

    mpl_err = MPL_gpu_get_buffer_bounds(vaddr, &pbase, &len);
    MPIR_ERR_CHKANDJUMP(mpl_err != MPL_SUCCESS, mpi_errno, MPI_ERR_OTHER, "**gpu_get_buffer_info");

    mpi_errno = ipc_handle_cache_search(MPIDI_GPUI_global.ipc_handle_track_trees[recv_lrank]
                                        [tmp->global_dev_id], pbase, len, (void **) &handle_obj);
    MPIR_ERR_CHECK(mpi_errno);

    if (handle_obj == NULL) {
        mpl_err = MPL_gpu_ipc_handle_create(pbase, &ipc_attr->ipc_handle.gpu.ipc_handle);
        MPIR_ERR_CHKANDJUMP(mpl_err != MPL_SUCCESS, mpi_errno, MPI_ERR_OTHER,
                            "**gpu_ipc_handle_create");
        ipc_attr->ipc_handle.gpu.handle_status = MPIDI_GPU_IPC_HANDLE_REMAP_REQUIRED;
    } else {
        ipc_attr->ipc_handle.gpu.handle_status = MPIDI_GPU_IPC_HANDLE_VALID;
    }

    /* MPIDI_GPU_get_ipc_attr is called by the sender to create an IPC handle.
     * The remote_base_addr, len, and node_rank attributes of the handle are sent
     * to the receiver and used there to look up a cached IPC handle and/or to
     * insert a newly allocated handle object. The offset attribute is always
     * needed, with or without caching, to compute the correct user address. */
    ipc_attr->ipc_handle.gpu.remote_base_addr = (uintptr_t) pbase;
    ipc_attr->ipc_handle.gpu.len = len;
    ipc_attr->ipc_handle.gpu.node_rank = MPIR_Process.local_rank;
    ipc_attr->ipc_handle.gpu.offset = (uintptr_t) vaddr - (uintptr_t) pbase;

    ipc_attr->ipc_handle.gpu.global_dev_id = tmp->global_dev_id;
    ipc_attr->threshold.send_lmt_sz = MPIR_CVAR_CH4_IPC_GPU_P2P_THRESHOLD;
#else
    /* IPC data transfer is not supported. */
    ipc_attr->ipc_type = MPIDI_IPCI_TYPE__NONE;
    ipc_attr->threshold.send_lmt_sz = MPIR_AINT_MAX;
#endif

  fn_exit:
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_GPU_GET_IPC_ATTR);
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}

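/* Map the memory described by an IPC handle into the receiving process's
 * address space. If the handle is marked MPIDI_GPU_IPC_HANDLE_REMAP_REQUIRED,
 * stale cached mappings for that allocation are deleted first. New mappings
 * are cached in the mapped trees keyed by (remote node rank, remote device,
 * local map device) so later transfers from the same allocation can reuse
 * them; *vaddr is the mapped base address plus the handle's offset. */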
int MPIDI_GPU_ipc_handle_map(MPIDI_GPU_ipc_handle_t handle,
                             MPL_gpu_device_handle_t dev_handle,
                             MPI_Datatype recv_type, void **vaddr)
{
    int mpi_errno = MPI_SUCCESS;

    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_GPU_IPC_HANDLE_MAP);
    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_GPU_IPC_HANDLE_MAP);

#ifdef MPIDI_CH4_SHM_ENABLE_GPU
    void *pbase;
    int mpl_err = MPL_SUCCESS;
    int dev_id;
    MPIDI_GPUI_handle_obj_s *handle_obj = NULL;

    if (handle.handle_status == MPIDI_GPU_IPC_HANDLE_REMAP_REQUIRED) {
        for (int i = 0; i < MPIDI_GPUI_global.local_device_count; ++i) {
            mpi_errno =
                ipc_handle_cache_delete(MPIDI_GPUI_global.ipc_handle_mapped_trees[handle.node_rank]
                                        [handle.global_dev_id][i],
                                        (void *) handle.remote_base_addr, handle.len);
            MPIR_ERR_CHECK(mpi_errno);
        }
    }

    mpi_errno = get_map_device(handle.global_dev_id, dev_handle, recv_type, &dev_id);
    MPIR_ERR_CHECK(mpi_errno);

    mpi_errno = ipc_handle_cache_search(MPIDI_GPUI_global.ipc_handle_mapped_trees[handle.node_rank]
                                        [handle.global_dev_id][dev_id],
                                        (void *) handle.remote_base_addr, handle.len,
                                        (void **) &handle_obj);
    MPIR_ERR_CHECK(mpi_errno);

    if (handle_obj == NULL) {
        bool insert_successful = false;
        MPL_gpu_get_dev_handle(dev_id, &dev_handle);
        mpl_err = MPL_gpu_ipc_handle_map(handle.ipc_handle, dev_handle, &pbase);
        MPIR_ERR_CHKANDJUMP(mpl_err != MPL_SUCCESS, mpi_errno, MPI_ERR_OTHER,
                            "**gpu_ipc_handle_map");

        *vaddr = (void *) ((uintptr_t) pbase + handle.offset);

        handle_obj =
            (MPIDI_GPUI_handle_obj_s *) MPL_malloc(sizeof(MPIDI_GPUI_handle_obj_s), MPL_MEM_OTHER);
        MPIR_Assert(handle_obj != NULL);
        handle_obj->mapped_base_addr = (uintptr_t) pbase;
        mpi_errno =
            ipc_handle_cache_insert(MPIDI_GPUI_global.ipc_handle_mapped_trees[handle.node_rank]
                                    [handle.global_dev_id][dev_id],
                                    (void *) handle.remote_base_addr, handle.len, handle_obj,
                                    &insert_successful);
        MPIR_ERR_CHECK(mpi_errno);
        if (insert_successful == false)
            MPL_free(handle_obj);
    } else {
        *vaddr = (void *) (handle_obj->mapped_base_addr + handle.offset);
    }
#endif

  fn_exit:
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_GPU_IPC_HANDLE_MAP);
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}

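/* Unmap a previously mapped IPC buffer. When handle caching is enabled, the
 * mapping is kept in the cache for reuse, so the actual unmap is performed
 * only when MPIR_CVAR_CH4_IPC_GPU_HANDLE_CACHE is disabled. */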
int MPIDI_GPU_ipc_handle_unmap(void *vaddr, MPIDI_GPU_ipc_handle_t handle)
{
    int mpi_errno = MPI_SUCCESS;
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_GPU_IPC_HANDLE_UNMAP);
    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_GPU_IPC_HANDLE_UNMAP);

#ifdef MPIDI_CH4_SHM_ENABLE_GPU
    if (!MPIR_CVAR_CH4_IPC_GPU_HANDLE_CACHE) {
        int mpl_err = MPL_SUCCESS;
        mpl_err = MPL_gpu_ipc_handle_unmap((void *) ((uintptr_t) vaddr - handle.offset));
        MPIR_ERR_CHKANDJUMP(mpl_err != MPL_SUCCESS, mpi_errno, MPI_ERR_OTHER,
                            "**gpu_ipc_handle_unmap");
    }
#endif

  fn_exit:
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_GPU_IPC_HANDLE_UNMAP);
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}