/*
 * Copyright (C) by Argonne National Laboratory
 *     See COPYRIGHT in top-level directory
 */

/*
=== BEGIN_MPI_T_CVAR_INFO_BLOCK ===
cvars:
    - name        : MPIR_CVAR_CH4_IPC_GPU_HANDLE_CACHE
      category    : CH4
      type        : int
      default     : 1
      class       : none
      verbosity   : MPI_T_VERBOSITY_USER_BASIC
      scope       : MPI_T_SCOPE_ALL_EQ
      description : >-
        By default, IPC handles are cached. To disable IPC handle
        caching, set this variable to 0.
=== END_MPI_T_CVAR_INFO_BLOCK ===
*/

#include "mpidimpl.h"
#include "gpu_pre.h"
#include "gpu_types.h"

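/* Search the given AVL tree for a cached IPC handle object covering
 * [addr, addr + len). On a miss, or when caching is disabled via
 * MPIR_CVAR_CH4_IPC_GPU_HANDLE_CACHE, *handle_obj is set to NULL. */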
static int ipc_handle_cache_search(MPL_gavl_tree_t gavl_tree, const void *addr, uintptr_t len,
                                   void **handle_obj)
{
    int mpi_errno = MPI_SUCCESS;

    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_IPC_HANDLE_CACHE_SEARCH);
    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_IPC_HANDLE_CACHE_SEARCH);

    *handle_obj = NULL;
#ifdef MPIDI_CH4_SHM_ENABLE_GPU
    if (MPIR_CVAR_CH4_IPC_GPU_HANDLE_CACHE) {
        int mpl_err = MPL_SUCCESS;
        mpl_err = MPL_gavl_tree_search(gavl_tree, addr, len, handle_obj);
        MPIR_ERR_CHKANDJUMP(mpl_err != MPL_SUCCESS, mpi_errno, MPI_ERR_OTHER, "**mpl_gavl_search");
    }
#endif

  fn_exit:
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_IPC_HANDLE_CACHE_SEARCH);
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}

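/* Insert a handle object keyed by [addr, addr + len) into the given AVL
 * tree. *insert_successful reports whether the tree took ownership of
 * handle_obj; when caching is disabled the insertion is skipped,
 * *insert_successful stays false, and ownership remains with the caller. */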
static int ipc_handle_cache_insert(MPL_gavl_tree_t gavl_tree, const void *addr, uintptr_t len,
                                   const void *handle_obj, bool * insert_successful)
{
    int mpi_errno = MPI_SUCCESS;

    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_IPC_HANDLE_CACHE_INSERT);
    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_IPC_HANDLE_CACHE_INSERT);

    *insert_successful = false;
#ifdef MPIDI_CH4_SHM_ENABLE_GPU
    if (MPIR_CVAR_CH4_IPC_GPU_HANDLE_CACHE) {
        int mpl_err = MPL_SUCCESS;
        mpl_err = MPL_gavl_tree_insert(gavl_tree, addr, len, handle_obj);
        MPIR_ERR_CHKANDJUMP(mpl_err != MPL_SUCCESS, mpi_errno, MPI_ERR_OTHER, "**mpl_gavl_insert");
        *insert_successful = true;
    }
#endif

  fn_exit:
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_IPC_HANDLE_CACHE_INSERT);
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}

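/* Choose the local device on which a remote IPC handle should be mapped:
 * use the sender's device when it is visible from this process and the
 * receive datatype is contiguous; otherwise fall back to the receiver's
 * own device. */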
static int get_map_device(int remote_global_dev_id,
                          MPL_gpu_device_handle_t local_dev_handle,
                          MPI_Datatype recv_type, int *dev_id)
{
    int mpi_errno = MPI_SUCCESS;

    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_GET_MAP_DEVICE);
    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_GET_MAP_DEVICE);

#ifdef MPIDI_CH4_SHM_ENABLE_GPU
    int recv_dev_id;
    int recv_dt_contig;
    MPIDI_GPUI_dev_id_t *avail_id = NULL;

    MPIDI_Datatype_check_contig(recv_type, recv_dt_contig);

    HASH_FIND_INT(MPIDI_GPUI_global.global_to_local_map, &remote_global_dev_id, avail_id);
    MPL_gpu_get_dev_id(local_dev_handle, &recv_dev_id);
    if (recv_dev_id < 0) {
        /* When the receiver's buffer is on host memory, recv_dev_id will be
         * less than 0. Such an id is not valid for mapping the buffer onto
         * the receiver's device, so we assign a default GPU instead; for now
         * we assume each process can access at least one GPU, so device id 0
         * is used. */
        recv_dev_id = 0;
    }

    if (avail_id == NULL) {
        *dev_id = recv_dev_id;
    } else {
        if (!recv_dt_contig)
            *dev_id = recv_dev_id;
        else
            *dev_id = avail_id->local_dev_id;
    }
#endif

  fn_exit:
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_GET_MAP_DEVICE);
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}

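/* Delete all cached handle objects whose key range intersects
 * [addr, addr + len) from the given AVL tree. This is used to purge
 * mappings that a sender has flagged as stale via
 * MPIDI_GPU_IPC_HANDLE_REMAP_REQUIRED. */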
static int ipc_handle_cache_delete(MPL_gavl_tree_t gavl_tree, const void *addr, uintptr_t len)
{
    int mpi_errno = MPI_SUCCESS;

    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_IPC_HANDLE_DELETE);
    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_IPC_HANDLE_DELETE);

#ifdef MPIDI_CH4_SHM_ENABLE_GPU
    int mpl_err = MPL_SUCCESS;
    if (MPIR_CVAR_CH4_IPC_GPU_HANDLE_CACHE) {
        mpl_err = MPL_gavl_tree_delete_range(gavl_tree, addr, len);
        MPIR_ERR_CHKANDJUMP(mpl_err != MPL_SUCCESS, mpi_errno, MPI_ERR_OTHER,
                            "**mpl_gavl_delete_range");
    }
#endif

  fn_exit:
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_IPC_HANDLE_DELETE);
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}

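/* Record a newly created IPC handle in the tracking tree for the given
 * (receiver, device) pair so that later transfers of the same buffer can
 * skip handle creation. Handles that were already cached (status
 * MPIDI_GPU_IPC_HANDLE_VALID) are ignored; fresh ones are stored with the
 * status reset to MPIDI_GPU_IPC_HANDLE_VALID. */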
int MPIDI_GPU_ipc_handle_cache_insert(int rank, MPIR_Comm * comm, MPIDI_GPU_ipc_handle_t handle)
{
    int mpi_errno = MPI_SUCCESS;

    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_GPU_IPC_HANDLE_CACHE);
    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_GPU_IPC_HANDLE_CACHE);

#ifdef MPIDI_CH4_SHM_ENABLE_GPU
    if (handle.handle_status == MPIDI_GPU_IPC_HANDLE_REMAP_REQUIRED) {
        bool insert_successful = false;
        int recv_lrank = MPIDI_GPUI_global.local_ranks[MPIDIU_rank_to_lpid(rank, comm)];

        MPIDI_GPU_ipc_handle_t *handle_obj =
            MPL_malloc(sizeof(MPIDI_GPU_ipc_handle_t), MPL_MEM_OTHER);
        MPIR_Assert(handle_obj != NULL);
        *handle_obj = handle;
        handle_obj->handle_status = MPIDI_GPU_IPC_HANDLE_VALID;

        mpi_errno = ipc_handle_cache_insert(MPIDI_GPUI_global.ipc_handle_track_trees[recv_lrank]
                                            [handle.global_dev_id],
                                            (void *) handle.remote_base_addr, handle.len,
                                            handle_obj, &insert_successful);
        MPIR_ERR_CHECK(mpi_errno);

        if (insert_successful == false) {
            MPL_free(handle_obj);
        }
    }
#endif

  fn_exit:
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_GPU_IPC_HANDLE_CACHE);
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}

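/* Sender-side entry point: fill ipc_attr with everything the receiver needs
 * to map the GPU buffer containing vaddr. If the underlying allocation
 * already has a cached handle for this receiver, it is reused; otherwise a
 * new handle is created and marked MPIDI_GPU_IPC_HANDLE_REMAP_REQUIRED so
 * the receiver purges stale mappings before mapping it.
 *
 * A minimal sketch of the intended call sequence, assuming the caller has
 * already filled ipc_attr->gpu_attr for the send buffer (the real call
 * sites live in the CH4 IPC protocol layer; the transport step shown here
 * is illustrative only):
 *
 *   sender:
 *     MPIDI_IPCI_ipc_attr_t ipc_attr;
 *     mpi_errno = MPIDI_GPU_get_ipc_attr(buf, receiver_rank, comm, &ipc_attr);
 *     ... send ipc_attr.ipc_handle.gpu to the receiver ...
 *
 *   receiver (with the received handle):
 *     void *mapped;
 *     mpi_errno = MPIDI_GPU_ipc_handle_map(handle, local_dev, recv_type, &mapped);
 *     ... copy data out of mapped ...
 *     mpi_errno = MPIDI_GPU_ipc_handle_unmap(mapped, handle);
 */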
int MPIDI_GPU_get_ipc_attr(const void *vaddr, int rank, MPIR_Comm * comm,
                           MPIDI_IPCI_ipc_attr_t * ipc_attr)
{
    int mpi_errno = MPI_SUCCESS;
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_GPU_GET_IPC_ATTR);
    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_GPU_GET_IPC_ATTR);

#ifdef MPIDI_CH4_SHM_ENABLE_GPU
    int local_dev_id;
    MPIDI_GPUI_dev_id_t *tmp;
    void *pbase;
    uintptr_t len;
    int mpl_err = MPL_SUCCESS;
    MPIDI_GPU_ipc_handle_t *handle_obj = NULL;
    int recv_lrank;

    recv_lrank = MPIDI_GPUI_global.local_ranks[MPIDIU_rank_to_lpid(rank, comm)];
    ipc_attr->ipc_type = MPIDI_IPCI_TYPE__GPU;

    MPL_gpu_get_dev_id(ipc_attr->gpu_attr.device, &local_dev_id);
    HASH_FIND_INT(MPIDI_GPUI_global.local_to_global_map, &local_dev_id, tmp);

    mpl_err = MPL_gpu_get_buffer_bounds(vaddr, &pbase, &len);
    MPIR_ERR_CHKANDJUMP(mpl_err != MPL_SUCCESS, mpi_errno, MPI_ERR_OTHER, "**gpu_get_buffer_info");

    mpi_errno = ipc_handle_cache_search(MPIDI_GPUI_global.ipc_handle_track_trees[recv_lrank]
                                        [tmp->global_dev_id], pbase, len, (void **) &handle_obj);
    MPIR_ERR_CHECK(mpi_errno);

    if (handle_obj == NULL) {
        mpl_err = MPL_gpu_ipc_handle_create(pbase, &ipc_attr->ipc_handle.gpu.ipc_handle);
        MPIR_ERR_CHKANDJUMP(mpl_err != MPL_SUCCESS, mpi_errno, MPI_ERR_OTHER,
                            "**gpu_ipc_handle_create");
        ipc_attr->ipc_handle.gpu.handle_status = MPIDI_GPU_IPC_HANDLE_REMAP_REQUIRED;
    } else {
        ipc_attr->ipc_handle.gpu.handle_status = MPIDI_GPU_IPC_HANDLE_VALID;
    }

    /* MPIDI_GPU_get_ipc_attr is called by the sender to create an ipc handle.
     * The remote_base_addr, len and node_rank attributes in the ipc handle are
     * sent to the receiver, which uses them to look up cached ipc handles
     * and/or insert newly allocated handle objects. The offset attribute is
     * always needed, with or without caching, to compute the correct user
     * address. */
    ipc_attr->ipc_handle.gpu.remote_base_addr = (uintptr_t) pbase;
    ipc_attr->ipc_handle.gpu.len = len;
    ipc_attr->ipc_handle.gpu.node_rank = MPIR_Process.local_rank;
    ipc_attr->ipc_handle.gpu.offset = (uintptr_t) vaddr - (uintptr_t) pbase;

    ipc_attr->ipc_handle.gpu.global_dev_id = tmp->global_dev_id;
    ipc_attr->threshold.send_lmt_sz = MPIR_CVAR_CH4_IPC_GPU_P2P_THRESHOLD;
#else
    /* IPC data transfer is not supported */
    ipc_attr->ipc_type = MPIDI_IPCI_TYPE__NONE;
    ipc_attr->threshold.send_lmt_sz = MPIR_AINT_MAX;
#endif

  fn_exit:
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_GPU_GET_IPC_ATTR);
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}

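/* Receiver-side entry point: map a remote IPC handle into this process's
 * address space and return the user address in *vaddr. A handle marked
 * MPIDI_GPU_IPC_HANDLE_REMAP_REQUIRED first purges stale cached mappings
 * for the range on every local device. The mapped base address is cached
 * per (node_rank, remote device, local device), so repeated transfers of
 * the same buffer avoid remapping. */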
int MPIDI_GPU_ipc_handle_map(MPIDI_GPU_ipc_handle_t handle,
                             MPL_gpu_device_handle_t dev_handle,
                             MPI_Datatype recv_type, void **vaddr)
{
    int mpi_errno = MPI_SUCCESS;

    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_GPU_IPC_HANDLE_MAP);
    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_GPU_IPC_HANDLE_MAP);

#ifdef MPIDI_CH4_SHM_ENABLE_GPU
    void *pbase;
    int mpl_err = MPL_SUCCESS;
    int dev_id;
    MPIDI_GPUI_handle_obj_s *handle_obj = NULL;

    if (handle.handle_status == MPIDI_GPU_IPC_HANDLE_REMAP_REQUIRED) {
        for (int i = 0; i < MPIDI_GPUI_global.local_device_count; ++i) {
            mpi_errno =
                ipc_handle_cache_delete(MPIDI_GPUI_global.ipc_handle_mapped_trees[handle.node_rank]
                                        [handle.global_dev_id][i], (void *) handle.remote_base_addr,
                                        handle.len);
            MPIR_ERR_CHECK(mpi_errno);
        }
    }

    mpi_errno = get_map_device(handle.global_dev_id, dev_handle, recv_type, &dev_id);
    MPIR_ERR_CHECK(mpi_errno);

    mpi_errno = ipc_handle_cache_search(MPIDI_GPUI_global.ipc_handle_mapped_trees[handle.node_rank]
                                        [handle.global_dev_id][dev_id],
                                        (void *) handle.remote_base_addr, handle.len,
                                        (void **) &handle_obj);
    MPIR_ERR_CHECK(mpi_errno);

    if (handle_obj == NULL) {
        bool insert_successful = false;
        MPL_gpu_get_dev_handle(dev_id, &dev_handle);
        mpl_err = MPL_gpu_ipc_handle_map(handle.ipc_handle, dev_handle, &pbase);
        MPIR_ERR_CHKANDJUMP(mpl_err != MPL_SUCCESS, mpi_errno, MPI_ERR_OTHER,
                            "**gpu_ipc_handle_map");

        *vaddr = (void *) ((uintptr_t) pbase + handle.offset);

        handle_obj =
            (MPIDI_GPUI_handle_obj_s *) MPL_malloc(sizeof(MPIDI_GPUI_handle_obj_s), MPL_MEM_OTHER);
        MPIR_Assert(handle_obj != NULL);
        handle_obj->mapped_base_addr = (uintptr_t) pbase;
        mpi_errno =
            ipc_handle_cache_insert(MPIDI_GPUI_global.ipc_handle_mapped_trees[handle.node_rank]
                                    [handle.global_dev_id][dev_id],
                                    (void *) handle.remote_base_addr, handle.len, handle_obj,
                                    &insert_successful);
        MPIR_ERR_CHECK(mpi_errno);
        if (insert_successful == false)
            MPL_free(handle_obj);
    } else {
        *vaddr = (void *) (handle_obj->mapped_base_addr + handle.offset);
    }
#endif

  fn_exit:
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_GPU_IPC_HANDLE_MAP);
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}

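/* Unmap a buffer previously mapped by MPIDI_GPU_ipc_handle_map. When the
 * handle cache is enabled, the mapping is kept alive for reuse and this is
 * a no-op; the mapping is only torn down eagerly when caching is disabled. */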
int MPIDI_GPU_ipc_handle_unmap(void *vaddr, MPIDI_GPU_ipc_handle_t handle)
{
    int mpi_errno = MPI_SUCCESS;
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_GPU_IPC_HANDLE_UNMAP);
    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_GPU_IPC_HANDLE_UNMAP);

#ifdef MPIDI_CH4_SHM_ENABLE_GPU
    if (!MPIR_CVAR_CH4_IPC_GPU_HANDLE_CACHE) {
        int mpl_err = MPL_SUCCESS;
        mpl_err = MPL_gpu_ipc_handle_unmap((void *) ((uintptr_t) vaddr - handle.offset));
        MPIR_ERR_CHKANDJUMP(mpl_err != MPL_SUCCESS, mpi_errno, MPI_ERR_OTHER,
                            "**gpu_ipc_handle_unmap");
    }
#endif

  fn_exit:
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_GPU_IPC_HANDLE_UNMAP);
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}