1 /*
2  * Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana
3  *                         University Research and Technology
4  *                         Corporation.  All rights reserved.
5  * Copyright (c) 2004-2014 The University of Tennessee and The University
6  *                         of Tennessee Research Foundation.  All rights
7  *                         reserved.
8  * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
9  *                         University of Stuttgart.  All rights reserved.
10  * Copyright (c) 2004-2006 The Regents of the University of California.
11  *                         All rights reserved.
12  * Copyright (c) 2011-2015 NVIDIA Corporation.  All rights reserved.
13  * Copyright (c) 2015      Cisco Systems, Inc.  All rights reserved.
14  * Copyright (c) 2015      Research Organization for Information Science
15  *                         and Technology (RIST). All rights reserved.
16  * $COPYRIGHT$
17  *
18  * Additional copyrights may follow
19  *
20  * $HEADER$
21  */
22 
23 /**
24  * This file contains various support functions for doing CUDA
25  * operations.
26  */
27 #include "opal_config.h"
28 
29 #include <errno.h>
30 #include <unistd.h>
31 #include <cuda.h>
32 
33 #include "opal/align.h"
34 #include "opal/datatype/opal_convertor.h"
35 #include "opal/datatype/opal_datatype_cuda.h"
36 #include "opal/util/output.h"
37 #include "opal/util/show_help.h"
38 #include "opal/util/proc.h"
39 #include "opal/util/argv.h"
40 
41 #include "opal/mca/rcache/base/base.h"
42 #include "opal/runtime/opal_params.h"
43 #include "opal/mca/timer/base/base.h"
44 #include "opal/mca/dl/base/base.h"
45 
46 #include "common_cuda.h"
47 
/**
 * Since function names can get redefined in cuda.h file, we need to do this
 * stringifying to get the latest function name from the header file.  For
 * example, cuda.h may have something like this:
 * #define cuMemFree cuMemFree_v2
 * We want to make sure we find cuMemFree_v2, not cuMemFree.
 */
#define STRINGIFY2(x) #x
#define STRINGIFY(x) STRINGIFY2(x)

/*
 * Look up funcName in libhandle and store the resulting pointer in the
 * matching slot of the global cuFunc table.  On lookup failure, an error is
 * reported via opal_show_help and the ENCLOSING function returns 1, so this
 * macro may only be used inside a function that returns int.
 */
#define OPAL_CUDA_DLSYM(libhandle, funcName)                                         \
do {                                                                                 \
 char *err_msg;                                                                      \
 void *ptr;                                                                          \
 if (OPAL_SUCCESS !=                                                                 \
     opal_dl_lookup(libhandle, STRINGIFY(funcName), &ptr, &err_msg)) {               \
        opal_show_help("help-mpi-common-cuda.txt", "dlsym failed", true,             \
                       STRINGIFY(funcName), err_msg);                                \
        return 1;                                                                    \
    } else {                                                                         \
        *(void **)(&cuFunc.funcName) = ptr;                                          \
        opal_output_verbose(15, mca_common_cuda_output,                              \
                            "CUDA: successful dlsym of %s",                          \
                            STRINGIFY(funcName));                                    \
    }                                                                                \
} while (0)
74 
/* Structure to hold CUDA function pointers that get dynamically loaded.
 * Each member corresponds to one CUDA driver API entry point; the pointers
 * are filled in by OPAL_CUDA_DLSYM during stage one initialization.  The
 * return types are declared as plain int (the driver returns CUresult, an
 * enum) so callers compare against CUDA_SUCCESS etc. */
struct cudaFunctionTable {
    int (*cuPointerGetAttribute)(void *, CUpointer_attribute, CUdeviceptr);
    int (*cuMemcpyAsync)(CUdeviceptr, CUdeviceptr, size_t, CUstream);
    int (*cuMemcpy)(CUdeviceptr, CUdeviceptr, size_t);
    int (*cuMemAlloc)(CUdeviceptr *, unsigned int);
    int (*cuMemFree)(CUdeviceptr buf);
    int (*cuCtxGetCurrent)(void *cuContext);
    int (*cuStreamCreate)(CUstream *, int);
    int (*cuEventCreate)(CUevent *, int);
    int (*cuEventRecord)(CUevent, CUstream);
    int (*cuMemHostRegister)(void *, size_t, unsigned int);
    int (*cuMemHostUnregister)(void *);
    int (*cuEventQuery)(CUevent);
    int (*cuEventDestroy)(CUevent);
    int (*cuStreamWaitEvent)(CUstream, CUevent, unsigned int);
    int (*cuMemGetAddressRange)(CUdeviceptr*, size_t*, CUdeviceptr);
    /* CUDA IPC support, used for intra-node GPU-to-GPU transfers. */
    int (*cuIpcGetEventHandle)(CUipcEventHandle*, CUevent);
    int (*cuIpcOpenEventHandle)(CUevent*, CUipcEventHandle);
    int (*cuIpcOpenMemHandle)(CUdeviceptr*, CUipcMemHandle, unsigned int);
    int (*cuIpcCloseMemHandle)(CUdeviceptr);
    int (*cuIpcGetMemHandle)(CUipcMemHandle*, CUdeviceptr);
    int (*cuCtxGetDevice)(CUdevice *);
    int (*cuDeviceCanAccessPeer)(int *, CUdevice, CUdevice);
    int (*cuDeviceGet)(CUdevice *, int);
#if OPAL_CUDA_GDR_SUPPORT
    int (*cuPointerSetAttribute)(const void *, CUpointer_attribute, CUdeviceptr);
#endif /* OPAL_CUDA_GDR_SUPPORT */
    int (*cuCtxSetCurrent)(CUcontext);
    int (*cuEventSynchronize)(CUevent);
    int (*cuStreamSynchronize)(CUstream);
    int (*cuStreamDestroy)(CUstream);
#if OPAL_CUDA_GET_ATTRIBUTES
    int (*cuPointerGetAttributes)(unsigned int, CUpointer_attribute *, void **, CUdeviceptr);
#endif /* OPAL_CUDA_GET_ATTRIBUTES */
};
typedef struct cudaFunctionTable cudaFunctionTable_t;
/* Single global dispatch table; populated once by stage one init. */
static cudaFunctionTable_t cuFunc;
113 
/* Initialization state.  Stage one runs at component open (library load and
 * symbol lookup), stage two plugs function pointers into the opal datatype
 * engine, stage three runs lazily when the first GPU buffer is seen. */
static int stage_one_init_ref_count = 0;
static bool stage_three_init_complete = false;
static bool common_cuda_initialized = false;
static bool common_cuda_mca_parames_registered = false;
/* Verbosity level and output stream shared by all common cuda debug output. */
static int mca_common_cuda_verbose;
static int mca_common_cuda_output = 0;
/* True once stage three init confirms we are running inside a CUDA context. */
bool mca_common_cuda_enabled = false;
static bool mca_common_cuda_register_memory = true;
static bool mca_common_cuda_warning = false;
/* List of common_cuda_mem_regs_t registrations deferred until first GPU use. */
static opal_list_t common_cuda_memory_registrations;
/* Streams for the different copy directions: IPC, device-to-host,
 * host-to-device, and generic cuMemcpyAsync copies. */
static CUstream ipcStream = NULL;
static CUstream dtohStream = NULL;
static CUstream htodStream = NULL;
static CUstream memcpyStream = NULL;
/* Workaround defaults to on only for CUDA 7.0 and earlier drivers. */
static int mca_common_cuda_gpu_mem_check_workaround = (CUDA_VERSION > 7000) ? 0 : 1;
static opal_mutex_t common_cuda_init_lock;
static opal_mutex_t common_cuda_htod_lock;
static opal_mutex_t common_cuda_dtoh_lock;
static opal_mutex_t common_cuda_ipc_lock;

/* Functions called by opal layer - plugged into opal function table */
static int mca_common_cuda_is_gpu_buffer(const void*, opal_convertor_t*);
static int mca_common_cuda_memmove(void*, void*, size_t);
static int mca_common_cuda_cu_memcpy_async(void*, const void*, size_t, opal_convertor_t*);
static int mca_common_cuda_cu_memcpy(void*, const void*, size_t);

/* Function that gets plugged into opal layer */
static int mca_common_cuda_stage_two_init(opal_common_cuda_function_table_t *);

/* Structure to hold memory registrations that are delayed until first
 * call to send or receive a GPU pointer */
struct common_cuda_mem_regs_t {
    opal_list_item_t super;
    void *ptr;       /* base address to hand to cuMemHostRegister */
    size_t amount;   /* number of bytes to register */
    char *msg;       /* label used in error messages for this registration */
};
typedef struct common_cuda_mem_regs_t common_cuda_mem_regs_t;
OBJ_CLASS_DECLARATION(common_cuda_mem_regs_t);
OBJ_CLASS_INSTANCE(common_cuda_mem_regs_t,
                   opal_list_item_t,
                   NULL,
                   NULL);

/* Nonzero selects asynchronous copies; both are overridable via MCA vars. */
static int mca_common_cuda_async = 1;
static int mca_common_cuda_cumemcpy_async;
#if OPAL_ENABLE_DEBUG
static int mca_common_cuda_cumemcpy_timing;
#endif /* OPAL_ENABLE_DEBUG */

/* Array of CUDA events to be queried for IPC stream, sending side and
 * receiving side. */
CUevent *cuda_event_ipc_array = NULL;
CUevent *cuda_event_dtoh_array = NULL;
CUevent *cuda_event_htod_array = NULL;

/* Array of fragments currently being moved by cuda async non-blocking
 * operations */
struct mca_btl_base_descriptor_t **cuda_event_ipc_frag_array = NULL;
struct mca_btl_base_descriptor_t **cuda_event_dtoh_frag_array = NULL;
struct mca_btl_base_descriptor_t **cuda_event_htod_frag_array = NULL;

/* First free/available location in cuda_event_status_array */
static int cuda_event_ipc_first_avail, cuda_event_dtoh_first_avail, cuda_event_htod_first_avail;

/* First currently-being used location in the cuda_event_status_array */
static int cuda_event_ipc_first_used, cuda_event_dtoh_first_used, cuda_event_htod_first_used;

/* Number of status items currently in use */
static int cuda_event_ipc_num_used, cuda_event_dtoh_num_used, cuda_event_htod_num_used;

/* Size of array holding events */
int cuda_event_max = 400;
/* High-water marks of event usage, kept for diagnostics. */
static int cuda_event_ipc_most = 0;
static int cuda_event_dtoh_most = 0;
static int cuda_event_htod_most = 0;

/* Handle to libcuda.so */
opal_dl_handle_t *libcuda_handle = NULL;

/* Unused variable that we register at init time and unregister at fini time.
 * This is used to detect if user has done a device reset prior to MPI_Finalize.
 * This is a workaround to avoid SEGVs.
 */
static int checkmem;
static int ctx_ok = 1;

#define CUDA_COMMON_TIMING 0
#if OPAL_ENABLE_DEBUG
/* Some timing support structures.  Enable this to help analyze
 * internal performance issues. */
static opal_timer_t ts_start;
static opal_timer_t ts_end;
static double accum;
#define THOUSAND  1000L
#define MILLION   1000000L
static float mydifftime(opal_timer_t ts_start, opal_timer_t ts_end);
#endif /* OPAL_ENABLE_DEBUG */

/* These functions are typically unused in the optimized builds. */
static void cuda_dump_evthandle(int, void *, char *) __opal_attribute_unused__ ;
static void cuda_dump_memhandle(int, void *, char *) __opal_attribute_unused__ ;
#if OPAL_ENABLE_DEBUG
#define CUDA_DUMP_MEMHANDLE(a) cuda_dump_memhandle a
#define CUDA_DUMP_EVTHANDLE(a) cuda_dump_evthandle a
#else
#define CUDA_DUMP_MEMHANDLE(a)
#define CUDA_DUMP_EVTHANDLE(a)
#endif /* OPAL_ENABLE_DEBUG */
223 
/* This is a separate function so we can see these variables with ompi_info and
 * also set them with the tools interface */
mca_common_cuda_register_mca_variables(void)226 void mca_common_cuda_register_mca_variables(void)
227 {
228 
229     if (false == common_cuda_mca_parames_registered) {
230         common_cuda_mca_parames_registered = true;
231     }
232     /* Set different levels of verbosity in the cuda related code. */
233     mca_common_cuda_verbose = 0;
234     (void) mca_base_var_register("ompi", "mpi", "common_cuda", "verbose",
235                                  "Set level of common cuda verbosity",
236                                  MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
237                                  OPAL_INFO_LVL_9,
238                                  MCA_BASE_VAR_SCOPE_READONLY,
239                                  &mca_common_cuda_verbose);
240 
241     /* Control whether system buffers get CUDA pinned or not.  Allows for
242      * performance analysis. */
243     mca_common_cuda_register_memory = true;
244     (void) mca_base_var_register("ompi", "mpi", "common_cuda", "register_memory",
245                                  "Whether to cuMemHostRegister preallocated BTL buffers",
246                                  MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
247                                  OPAL_INFO_LVL_9,
248                                  MCA_BASE_VAR_SCOPE_READONLY,
249                                  &mca_common_cuda_register_memory);
250 
251     /* Control whether we see warnings when CUDA memory registration fails.  This is
252      * useful when CUDA support is configured in, but we are running a regular MPI
253      * application without CUDA. */
254     mca_common_cuda_warning = true;
255     (void) mca_base_var_register("ompi", "mpi", "common_cuda", "warning",
256                                  "Whether to print warnings when CUDA registration fails",
257                                  MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
258                                  OPAL_INFO_LVL_9,
259                                  MCA_BASE_VAR_SCOPE_READONLY,
260                                  &mca_common_cuda_warning);
261 
262     /* Use this flag to test async vs sync copies */
263     mca_common_cuda_async = 1;
264     (void) mca_base_var_register("ompi", "mpi", "common_cuda", "memcpy_async",
265                                  "Set to 0 to force CUDA sync copy instead of async",
266                                  MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
267                                  OPAL_INFO_LVL_9,
268                                  MCA_BASE_VAR_SCOPE_READONLY,
269                                  &mca_common_cuda_async);
270 
271     /* Use this parameter to increase the number of outstanding events allows */
272     (void) mca_base_var_register("ompi", "mpi", "common_cuda", "event_max",
273                                  "Set number of oustanding CUDA events",
274                                  MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
275                                  OPAL_INFO_LVL_9,
276                                  MCA_BASE_VAR_SCOPE_READONLY,
277                                  &cuda_event_max);
278 
279     /* Use this flag to test cuMemcpyAsync vs cuMemcpy */
280     mca_common_cuda_cumemcpy_async = 1;
281     (void) mca_base_var_register("ompi", "mpi", "common_cuda", "cumemcpy_async",
282                                  "Set to 0 to force CUDA cuMemcpy instead of cuMemcpyAsync/cuStreamSynchronize",
283                                  MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
284                                  OPAL_INFO_LVL_5,
285                                  MCA_BASE_VAR_SCOPE_READONLY,
286                                  &mca_common_cuda_cumemcpy_async);
287 
288 #if OPAL_ENABLE_DEBUG
289     /* Use this flag to dump out timing of cumempcy sync and async */
290     mca_common_cuda_cumemcpy_timing = 0;
291     (void) mca_base_var_register("ompi", "mpi", "common_cuda", "cumemcpy_timing",
292                                  "Set to 1 to dump timing of eager copies",
293                                  MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
294                                  OPAL_INFO_LVL_5,
295                                  MCA_BASE_VAR_SCOPE_READONLY,
296                                  &mca_common_cuda_cumemcpy_timing);
297 #endif /* OPAL_ENABLE_DEBUG */
298 
299     (void) mca_base_var_register("ompi", "mpi", "common_cuda", "gpu_mem_check_workaround",
300                                  "Set to 0 to disable GPU memory check workaround. A user would rarely have to do this.",
301                                  MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
302                                  OPAL_INFO_LVL_9,
303                                  MCA_BASE_VAR_SCOPE_READONLY,
304                                  &mca_common_cuda_gpu_mem_check_workaround);
305 }
306 
/**
 * This is the first stage of initialization.  This function is called
 * explicitly by any BTLs that can support CUDA-aware. It is called during
 * the component open phase of initialization. This function will look for
 * the SONAME of the library, which is libcuda.so.1. In most cases, this will
 * result in the library being found.  However, there are some setups that
 * require extra steps for searching. This function will then load the symbols
 * needed from the CUDA driver library. Any failure will result in this
 * initialization failing and status will be set showing that.
 */
mca_common_cuda_stage_one_init(void)317 int mca_common_cuda_stage_one_init(void)
318 {
319     int retval, i, j;
320     char *cudalibs[] = {"libcuda.so.1", "libcuda.dylib", NULL};
321     char *searchpaths[] = {"", "/usr/lib64", NULL};
322     char **errmsgs = NULL;
323     char *errmsg = NULL;
324     int errsize;
325     bool stage_one_init_passed = false;
326 
327     stage_one_init_ref_count++;
328     if (stage_one_init_ref_count > 1) {
329         opal_output_verbose(10, mca_common_cuda_output,
330                             "CUDA: stage_one_init_ref_count is now %d, no need to init",
331                             stage_one_init_ref_count);
332         return OPAL_SUCCESS;
333     }
334 
335     /* This is a no-op in most cases as the parameters were registered earlier */
336     mca_common_cuda_register_mca_variables();
337 
338     OBJ_CONSTRUCT(&common_cuda_init_lock, opal_mutex_t);
339     OBJ_CONSTRUCT(&common_cuda_htod_lock, opal_mutex_t);
340     OBJ_CONSTRUCT(&common_cuda_dtoh_lock, opal_mutex_t);
341     OBJ_CONSTRUCT(&common_cuda_ipc_lock, opal_mutex_t);
342 
343     mca_common_cuda_output = opal_output_open(NULL);
344     opal_output_set_verbosity(mca_common_cuda_output, mca_common_cuda_verbose);
345 
346     opal_output_verbose(10, mca_common_cuda_output,
347                         "CUDA: stage_one_init_ref_count is now %d, initializing",
348                         stage_one_init_ref_count);
349 
350     /* First check if the support is enabled.  In the case that the user has
351      * turned it off, we do not need to continue with any CUDA specific
352      * initialization.  Do this after MCA parameter registration. */
353     if (!opal_cuda_support) {
354         return 1;
355     }
356 
357     if (!OPAL_HAVE_DL_SUPPORT) {
358         opal_show_help("help-mpi-common-cuda.txt", "dlopen disabled", true);
359         return 1;
360     }
361 
362     /* Now walk through all the potential names libcuda and find one
363      * that works.  If it does, all is good.  If not, print out all
364      * the messages about why things failed.  This code was careful
365      * to try and save away all error messages if the loading ultimately
366      * failed to help with debugging.
367      *
368      * NOTE: On the first loop we just utilize the default loading
369      * paths from the system.  For the second loop, set /usr/lib64 to
370      * the search path and try again.  This is done to handle the case
371      * where we have both 32 and 64 bit libcuda.so libraries
372      * installed.  Even when running in 64-bit mode, the /usr/lib
373      * directory is searched first and we may find a 32-bit
374      * libcuda.so.1 library.  Loading of this library will fail as the
375      * OPAL DL framework does not handle having the wrong ABI in the
376      * search path (unlike ld or ld.so).  Note that we only set this
377      * search path after the original search.  This is so that
378      * LD_LIBRARY_PATH and run path settings are respected.  Setting
379      * this search path overrides them (rather then being
380      * appended). */
381     j = 0;
382     while (searchpaths[j] != NULL) {
383         i = 0;
384         while (cudalibs[i] != NULL) {
385             char *filename = NULL;
386             char *str = NULL;
387 
388             /* If there's a non-empty search path, prepend it
389                to the library filename */
390             if (strlen(searchpaths[j]) > 0) {
391                 asprintf(&filename, "%s/%s", searchpaths[j], cudalibs[i]);
392             } else {
393                 filename = strdup(cudalibs[i]);
394             }
395             if (NULL == filename) {
396                 opal_show_help("help-mpi-common-cuda.txt", "No memory",
397                                true, OPAL_PROC_MY_HOSTNAME);
398                 return 1;
399             }
400 
401             retval = opal_dl_open(filename, false, false,
402                                   &libcuda_handle, &str);
403             if (OPAL_SUCCESS != retval || NULL == libcuda_handle) {
404                 if (NULL != str) {
405                     opal_argv_append(&errsize, &errmsgs, str);
406                 } else {
407                     opal_argv_append(&errsize, &errmsgs,
408                                      "opal_dl_open() returned NULL.");
409                 }
410                 opal_output_verbose(10, mca_common_cuda_output,
411                                     "CUDA: Library open error: %s",
412                                     errmsgs[errsize-1]);
413             } else {
414                 opal_output_verbose(10, mca_common_cuda_output,
415                                     "CUDA: Library successfully opened %s",
416                                     cudalibs[i]);
417                 stage_one_init_passed = true;
418                 break;
419             }
420             i++;
421 
422             free(filename);
423         }
424         if (true == stage_one_init_passed) {
425             break; /* Break out of outer loop */
426         }
427         j++;
428     }
429 
430     if (true != stage_one_init_passed) {
431         errmsg = opal_argv_join(errmsgs, '\n');
432         opal_show_help("help-mpi-common-cuda.txt", "dlopen failed", true,
433                        errmsg);
434         opal_cuda_support = 0;
435     }
436     opal_argv_free(errmsgs);
437     free(errmsg);
438 
439     if (true != stage_one_init_passed) {
440         return 1;
441     }
442     opal_cuda_add_initialization_function(&mca_common_cuda_stage_two_init);
443     OBJ_CONSTRUCT(&common_cuda_memory_registrations, opal_list_t);
444 
445     /* Map in the functions that we need.  Note that if there is an error
446      * the macro OPAL_CUDA_DLSYM will print an error and call return.  */
447     OPAL_CUDA_DLSYM(libcuda_handle, cuStreamCreate);
448     OPAL_CUDA_DLSYM(libcuda_handle, cuCtxGetCurrent);
449     OPAL_CUDA_DLSYM(libcuda_handle, cuEventCreate);
450     OPAL_CUDA_DLSYM(libcuda_handle, cuEventRecord);
451     OPAL_CUDA_DLSYM(libcuda_handle, cuMemHostRegister);
452     OPAL_CUDA_DLSYM(libcuda_handle, cuMemHostUnregister);
453     OPAL_CUDA_DLSYM(libcuda_handle, cuPointerGetAttribute);
454     OPAL_CUDA_DLSYM(libcuda_handle, cuEventQuery);
455     OPAL_CUDA_DLSYM(libcuda_handle, cuEventDestroy);
456     OPAL_CUDA_DLSYM(libcuda_handle, cuStreamWaitEvent);
457     OPAL_CUDA_DLSYM(libcuda_handle, cuMemcpyAsync);
458     OPAL_CUDA_DLSYM(libcuda_handle, cuMemcpy);
459     OPAL_CUDA_DLSYM(libcuda_handle, cuMemFree);
460     OPAL_CUDA_DLSYM(libcuda_handle, cuMemAlloc);
461     OPAL_CUDA_DLSYM(libcuda_handle, cuMemGetAddressRange);
462     OPAL_CUDA_DLSYM(libcuda_handle, cuIpcGetEventHandle);
463     OPAL_CUDA_DLSYM(libcuda_handle, cuIpcOpenEventHandle);
464     OPAL_CUDA_DLSYM(libcuda_handle, cuIpcOpenMemHandle);
465     OPAL_CUDA_DLSYM(libcuda_handle, cuIpcCloseMemHandle);
466     OPAL_CUDA_DLSYM(libcuda_handle, cuIpcGetMemHandle);
467     OPAL_CUDA_DLSYM(libcuda_handle, cuCtxGetDevice);
468     OPAL_CUDA_DLSYM(libcuda_handle, cuDeviceCanAccessPeer);
469     OPAL_CUDA_DLSYM(libcuda_handle, cuDeviceGet);
470 #if OPAL_CUDA_GDR_SUPPORT
471     OPAL_CUDA_DLSYM(libcuda_handle, cuPointerSetAttribute);
472 #endif /* OPAL_CUDA_GDR_SUPPORT */
473     OPAL_CUDA_DLSYM(libcuda_handle, cuCtxSetCurrent);
474     OPAL_CUDA_DLSYM(libcuda_handle, cuEventSynchronize);
475     OPAL_CUDA_DLSYM(libcuda_handle, cuStreamSynchronize);
476     OPAL_CUDA_DLSYM(libcuda_handle, cuStreamDestroy);
477 #if OPAL_CUDA_GET_ATTRIBUTES
478     OPAL_CUDA_DLSYM(libcuda_handle, cuPointerGetAttributes);
479 #endif /* OPAL_CUDA_GET_ATTRIBUTES */
480     return 0;
481 }
482 
483 /**
484  * This function is registered with the OPAL CUDA support.  In that way,
485  * these function pointers will be loaded into the OPAL CUDA code when
486  * the first convertor is initialized.  This does not trigger any CUDA
487  * specific initialization as this may just be a host buffer that is
488  * triggering this call.
489  */
static int mca_common_cuda_stage_two_init(opal_common_cuda_function_table_t *ftable)
{
    /* Nothing to plug in when CUDA support is disabled or was never found. */
    if (OPAL_UNLIKELY(!opal_cuda_support)) {
        return OPAL_ERROR;
    }

    /* Hand our implementations to the opal datatype engine.  A function
     * designator decays to a pointer, so no explicit '&' is needed. */
    ftable->gpu_is_gpu_buffer   = mca_common_cuda_is_gpu_buffer;
    ftable->gpu_cu_memcpy_async = mca_common_cuda_cu_memcpy_async;
    ftable->gpu_cu_memcpy       = mca_common_cuda_cu_memcpy;
    ftable->gpu_memmove         = mca_common_cuda_memmove;

    opal_output_verbose(30, mca_common_cuda_output,
                        "CUDA: support functions initialized");
    return OPAL_SUCCESS;
}
505 
506 /**
507  * This is the last phase of initialization.  This is triggered when we examine
508  * a buffer pointer and determine it is a GPU buffer.  We then assume the user
509  * has selected their GPU and we can go ahead with all the CUDA related
510  * initializations.  If we get an error, just return.  Cleanup of resources
511  * will happen when fini is called.
512  */
mca_common_cuda_stage_three_init(void)513 static int mca_common_cuda_stage_three_init(void)
514 {
515     int i, s, rc;
516     CUresult res;
517     CUcontext cuContext;
518     common_cuda_mem_regs_t *mem_reg;
519 
520     OPAL_THREAD_LOCK(&common_cuda_init_lock);
521     opal_output_verbose(20, mca_common_cuda_output,
522                         "CUDA: entering stage three init");
523 
524 /* Compiled without support or user disabled support */
525     if (OPAL_UNLIKELY(!opal_cuda_support)) {
526         opal_output_verbose(20, mca_common_cuda_output,
527                             "CUDA: No mpi cuda support, exiting stage three init");
528         stage_three_init_complete = true;
529         OPAL_THREAD_UNLOCK(&common_cuda_init_lock);
530         return OPAL_ERROR;
531     }
532 
533     /* In case another thread snuck in and completed the initialization */
534     if (true == stage_three_init_complete) {
535         if (common_cuda_initialized) {
536             opal_output_verbose(20, mca_common_cuda_output,
537                                 "CUDA: Stage three already complete, exiting stage three init");
538             OPAL_THREAD_UNLOCK(&common_cuda_init_lock);
539             return OPAL_SUCCESS;
540         } else {
541             opal_output_verbose(20, mca_common_cuda_output,
542                                 "CUDA: Stage three already complete, failed during the init");
543             OPAL_THREAD_UNLOCK(&common_cuda_init_lock);
544             return OPAL_ERROR;
545         }
546     }
547 
548     /* Check to see if this process is running in a CUDA context.  If
549      * so, all is good.  If not, then disable registration of memory. */
550     res = cuFunc.cuCtxGetCurrent(&cuContext);
551     if (CUDA_SUCCESS != res) {
552         if (mca_common_cuda_warning) {
553             /* Check for the not initialized error since we can make suggestions to
554              * user for this error. */
555             if (CUDA_ERROR_NOT_INITIALIZED == res) {
556                 opal_show_help("help-mpi-common-cuda.txt", "cuCtxGetCurrent failed not initialized",
557                                true);
558             } else {
559                 opal_show_help("help-mpi-common-cuda.txt", "cuCtxGetCurrent failed",
560                                true, res);
561             }
562         }
563         mca_common_cuda_enabled = false;
564         mca_common_cuda_register_memory = false;
565     } else if ((CUDA_SUCCESS == res) && (NULL == cuContext)) {
566         if (mca_common_cuda_warning) {
567             opal_show_help("help-mpi-common-cuda.txt", "cuCtxGetCurrent returned NULL",
568                            true);
569         }
570         mca_common_cuda_enabled = false;
571         mca_common_cuda_register_memory = false;
572     } else {
573         /* All is good.  mca_common_cuda_register_memory will retain its original
574          * value.  Normally, that is 1, but the user can override it to disable
575          * registration of the internal buffers. */
576         mca_common_cuda_enabled = true;
577         opal_output_verbose(20, mca_common_cuda_output,
578                             "CUDA: cuCtxGetCurrent succeeded");
579     }
580 
581     /* No need to go on at this point.  If we cannot create a context and we are at
582      * the point where we are making MPI calls, it is time to fully disable
583      * CUDA support.
584      */
585     if (false == mca_common_cuda_enabled) {
586         OPAL_THREAD_UNLOCK(&common_cuda_init_lock);
587         return OPAL_ERROR;
588     }
589 
590     if (true == mca_common_cuda_enabled) {
591         /* Set up an array to store outstanding IPC async copy events */
592         cuda_event_ipc_num_used = 0;
593         cuda_event_ipc_first_avail = 0;
594         cuda_event_ipc_first_used = 0;
595 
596         cuda_event_ipc_array = (CUevent *) calloc(cuda_event_max, sizeof(CUevent *));
597         if (NULL == cuda_event_ipc_array) {
598             opal_show_help("help-mpi-common-cuda.txt", "No memory",
599                            true, OPAL_PROC_MY_HOSTNAME);
600             rc = OPAL_ERROR;
601             goto cleanup_and_error;
602         }
603 
604         /* Create the events since they can be reused. */
605         for (i = 0; i < cuda_event_max; i++) {
606             res = cuFunc.cuEventCreate(&cuda_event_ipc_array[i], CU_EVENT_DISABLE_TIMING);
607             if (CUDA_SUCCESS != res) {
608                 opal_show_help("help-mpi-common-cuda.txt", "cuEventCreate failed",
609                                true, OPAL_PROC_MY_HOSTNAME, res);
610                 rc = OPAL_ERROR;
611                 goto cleanup_and_error;
612             }
613         }
614 
615         /* The first available status index is 0.  Make an empty frag
616            array. */
617         cuda_event_ipc_frag_array = (struct mca_btl_base_descriptor_t **)
618             malloc(sizeof(struct mca_btl_base_descriptor_t *) * cuda_event_max);
619         if (NULL == cuda_event_ipc_frag_array) {
620             opal_show_help("help-mpi-common-cuda.txt", "No memory",
621                            true, OPAL_PROC_MY_HOSTNAME);
622             rc = OPAL_ERROR;
623             goto cleanup_and_error;
624         }
625     }
626 
627     if (true == mca_common_cuda_enabled) {
628         /* Set up an array to store outstanding async dtoh events.  Used on the
629          * sending side for asynchronous copies. */
630         cuda_event_dtoh_num_used = 0;
631         cuda_event_dtoh_first_avail = 0;
632         cuda_event_dtoh_first_used = 0;
633 
634         cuda_event_dtoh_array = (CUevent *) calloc(cuda_event_max, sizeof(CUevent *));
635         if (NULL == cuda_event_dtoh_array) {
636             opal_show_help("help-mpi-common-cuda.txt", "No memory",
637                            true, OPAL_PROC_MY_HOSTNAME);
638             rc = OPAL_ERROR;
639             goto cleanup_and_error;
640         }
641 
642         /* Create the events since they can be reused. */
643         for (i = 0; i < cuda_event_max; i++) {
644             res = cuFunc.cuEventCreate(&cuda_event_dtoh_array[i], CU_EVENT_DISABLE_TIMING);
645             if (CUDA_SUCCESS != res) {
646                 opal_show_help("help-mpi-common-cuda.txt", "cuEventCreate failed",
647                                true, OPAL_PROC_MY_HOSTNAME, res);
648                 rc = OPAL_ERROR;
649                 goto cleanup_and_error;
650             }
651         }
652 
653         /* The first available status index is 0.  Make an empty frag
654            array. */
655         cuda_event_dtoh_frag_array = (struct mca_btl_base_descriptor_t **)
656             malloc(sizeof(struct mca_btl_base_descriptor_t *) * cuda_event_max);
657         if (NULL == cuda_event_dtoh_frag_array) {
658             opal_show_help("help-mpi-common-cuda.txt", "No memory",
659                            true, OPAL_PROC_MY_HOSTNAME);
660             rc = OPAL_ERROR;
661             goto cleanup_and_error;
662         }
663 
664         /* Set up an array to store outstanding async htod events.  Used on the
665          * receiving side for asynchronous copies. */
666         cuda_event_htod_num_used = 0;
667         cuda_event_htod_first_avail = 0;
668         cuda_event_htod_first_used = 0;
669 
670         cuda_event_htod_array = (CUevent *) calloc(cuda_event_max, sizeof(CUevent *));
671         if (NULL == cuda_event_htod_array) {
672             opal_show_help("help-mpi-common-cuda.txt", "No memory",
673                            true, OPAL_PROC_MY_HOSTNAME);
674            rc = OPAL_ERROR;
675            goto cleanup_and_error;
676         }
677 
678         /* Create the events since they can be reused. */
679         for (i = 0; i < cuda_event_max; i++) {
680             res = cuFunc.cuEventCreate(&cuda_event_htod_array[i], CU_EVENT_DISABLE_TIMING);
681             if (CUDA_SUCCESS != res) {
682                 opal_show_help("help-mpi-common-cuda.txt", "cuEventCreate failed",
683                                true, OPAL_PROC_MY_HOSTNAME, res);
684                rc = OPAL_ERROR;
685                goto cleanup_and_error;
686             }
687         }
688 
689         /* The first available status index is 0.  Make an empty frag
690            array. */
691         cuda_event_htod_frag_array = (struct mca_btl_base_descriptor_t **)
692             malloc(sizeof(struct mca_btl_base_descriptor_t *) * cuda_event_max);
693         if (NULL == cuda_event_htod_frag_array) {
694             opal_show_help("help-mpi-common-cuda.txt", "No memory",
695                            true, OPAL_PROC_MY_HOSTNAME);
696            rc = OPAL_ERROR;
697            goto cleanup_and_error;
698         }
699     }
700 
701     s = opal_list_get_size(&common_cuda_memory_registrations);
702     for(i = 0; i < s; i++) {
703         mem_reg = (common_cuda_mem_regs_t *)
704             opal_list_remove_first(&common_cuda_memory_registrations);
705         if (mca_common_cuda_enabled && mca_common_cuda_register_memory) {
706             res = cuFunc.cuMemHostRegister(mem_reg->ptr, mem_reg->amount, 0);
707             if (res != CUDA_SUCCESS) {
708                 /* If registering the memory fails, print a message and continue.
709                  * This is not a fatal error. */
710                 opal_show_help("help-mpi-common-cuda.txt", "cuMemHostRegister during init failed",
711                                true, mem_reg->ptr, mem_reg->amount,
712                                OPAL_PROC_MY_HOSTNAME, res, mem_reg->msg);
713             } else {
714                 opal_output_verbose(20, mca_common_cuda_output,
715                                     "CUDA: cuMemHostRegister OK on rcache %s: "
716                                     "address=%p, bufsize=%d",
717                                     mem_reg->msg, mem_reg->ptr, (int)mem_reg->amount);
718             }
719         }
720         free(mem_reg->msg);
721         OBJ_RELEASE(mem_reg);
722     }
723 
724     /* Create stream for use in ipc asynchronous copies */
725     res = cuFunc.cuStreamCreate(&ipcStream, 0);
726     if (OPAL_UNLIKELY(res != CUDA_SUCCESS)) {
727         opal_show_help("help-mpi-common-cuda.txt", "cuStreamCreate failed",
728                        true, OPAL_PROC_MY_HOSTNAME, res);
729         rc = OPAL_ERROR;
730         goto cleanup_and_error;
731     }
732 
733     /* Create stream for use in dtoh asynchronous copies */
734     res = cuFunc.cuStreamCreate(&dtohStream, 0);
735     if (OPAL_UNLIKELY(res != CUDA_SUCCESS)) {
736         opal_show_help("help-mpi-common-cuda.txt", "cuStreamCreate failed",
737                        true, OPAL_PROC_MY_HOSTNAME, res);
738         rc = OPAL_ERROR;
739         goto cleanup_and_error;
740     }
741 
742     /* Create stream for use in htod asynchronous copies */
743     res = cuFunc.cuStreamCreate(&htodStream, 0);
744     if (OPAL_UNLIKELY(res != CUDA_SUCCESS)) {
745         opal_show_help("help-mpi-common-cuda.txt", "cuStreamCreate failed",
746                        true, OPAL_PROC_MY_HOSTNAME, res);
747         rc = OPAL_ERROR;
748         goto cleanup_and_error;
749     }
750 
751     if (mca_common_cuda_cumemcpy_async) {
752         /* Create stream for use in cuMemcpyAsync synchronous copies */
753         res = cuFunc.cuStreamCreate(&memcpyStream, 0);
754         if (OPAL_UNLIKELY(res != CUDA_SUCCESS)) {
755             opal_show_help("help-mpi-common-cuda.txt", "cuStreamCreate failed",
756                            true, OPAL_PROC_MY_HOSTNAME, res);
757             rc = OPAL_ERROR;
758             goto cleanup_and_error;
759         }
760     }
761 
762     res = cuFunc.cuMemHostRegister(&checkmem, sizeof(int), 0);
763     if (res != CUDA_SUCCESS) {
764         /* If registering the memory fails, print a message and continue.
765          * This is not a fatal error. */
766         opal_show_help("help-mpi-common-cuda.txt", "cuMemHostRegister during init failed",
767                        true, &checkmem, sizeof(int),
768                        OPAL_PROC_MY_HOSTNAME, res, "checkmem");
769 
770     } else {
771         opal_output_verbose(20, mca_common_cuda_output,
772                             "CUDA: cuMemHostRegister OK on test region");
773     }
774 
775     opal_output_verbose(20, mca_common_cuda_output,
776                         "CUDA: the extra gpu memory check is %s", (mca_common_cuda_gpu_mem_check_workaround == 1) ? "on":"off");
777 
778     opal_output_verbose(30, mca_common_cuda_output,
779                         "CUDA: initialized");
780     opal_atomic_mb();  /* Make sure next statement does not get reordered */
781     common_cuda_initialized = true;
782     stage_three_init_complete = true;
783     OPAL_THREAD_UNLOCK(&common_cuda_init_lock);
784     return OPAL_SUCCESS;
785 
786     /* If we are here, something went wrong.  Cleanup and return an error. */
787  cleanup_and_error:
788     opal_atomic_mb(); /* Make sure next statement does not get reordered */
789     stage_three_init_complete = true;
790     OPAL_THREAD_UNLOCK(&common_cuda_init_lock);
791     return rc;
792 }
793 
794 /**
795  * Cleanup all CUDA resources.
796  *
797  * Note: Still figuring out how to get cuMemHostUnregister called from the smcuda sm
798  * rcache.  Looks like with the memory pool from openib (grdma), the unregistering is
799  * called as the free list is destructed.  Not true for the sm mpool.  This means we
800  * are currently still leaking some host memory we registered with CUDA.
801  */
mca_common_cuda_fini(void)802 void mca_common_cuda_fini(void)
803 {
804     int i;
805     CUresult res;
806 
807     if (false == common_cuda_initialized) {
808         stage_one_init_ref_count--;
809         opal_output_verbose(20, mca_common_cuda_output,
810                             "CUDA: mca_common_cuda_fini, never completed initialization so "
811                             "skipping fini, ref_count is now %d", stage_one_init_ref_count);
812         return;
813     }
814 
815     if (0 == stage_one_init_ref_count) {
816         opal_output_verbose(20, mca_common_cuda_output,
817                             "CUDA: mca_common_cuda_fini, ref_count=%d, fini is already complete",
818                             stage_one_init_ref_count);
819         return;
820     }
821 
822     if (1 == stage_one_init_ref_count) {
823         opal_output_verbose(20, mca_common_cuda_output,
824                             "CUDA: mca_common_cuda_fini, ref_count=%d, cleaning up started",
825                             stage_one_init_ref_count);
826 
827         /* This call is in here to make sure the context is still valid.
828          * This was the one way of checking which did not cause problems
829          * while calling into the CUDA library.  This check will detect if
830          * a user has called cudaDeviceReset prior to MPI_Finalize. If so,
831          * then this call will fail and we skip cleaning up CUDA resources. */
832         res = cuFunc.cuMemHostUnregister(&checkmem);
833         if (CUDA_SUCCESS != res) {
834             ctx_ok = 0;
835         }
836         opal_output_verbose(20, mca_common_cuda_output,
837                             "CUDA: mca_common_cuda_fini, cuMemHostUnregister returned %d, ctx_ok=%d",
838                             res, ctx_ok);
839 
840         if (NULL != cuda_event_ipc_array) {
841             if (ctx_ok) {
842                 for (i = 0; i < cuda_event_max; i++) {
843                     if (NULL != cuda_event_ipc_array[i]) {
844                         cuFunc.cuEventDestroy(cuda_event_ipc_array[i]);
845                     }
846                 }
847             }
848             free(cuda_event_ipc_array);
849         }
850         if (NULL != cuda_event_htod_array) {
851             if (ctx_ok) {
852                 for (i = 0; i < cuda_event_max; i++) {
853                     if (NULL != cuda_event_htod_array[i]) {
854                         cuFunc.cuEventDestroy(cuda_event_htod_array[i]);
855                     }
856                 }
857             }
858             free(cuda_event_htod_array);
859         }
860 
861         if (NULL != cuda_event_dtoh_array) {
862             if (ctx_ok) {
863                 for (i = 0; i < cuda_event_max; i++) {
864                     if (NULL != cuda_event_dtoh_array[i]) {
865                         cuFunc.cuEventDestroy(cuda_event_dtoh_array[i]);
866                     }
867                 }
868             }
869             free(cuda_event_dtoh_array);
870         }
871 
872         if (NULL != cuda_event_ipc_frag_array) {
873             free(cuda_event_ipc_frag_array);
874         }
875         if (NULL != cuda_event_htod_frag_array) {
876             free(cuda_event_htod_frag_array);
877         }
878         if (NULL != cuda_event_dtoh_frag_array) {
879             free(cuda_event_dtoh_frag_array);
880         }
881         if ((NULL != ipcStream) && ctx_ok) {
882             cuFunc.cuStreamDestroy(ipcStream);
883         }
884         if ((NULL != dtohStream) && ctx_ok) {
885             cuFunc.cuStreamDestroy(dtohStream);
886         }
887         if ((NULL != htodStream) && ctx_ok) {
888             cuFunc.cuStreamDestroy(htodStream);
889         }
890         if ((NULL != memcpyStream) && ctx_ok) {
891             cuFunc.cuStreamDestroy(memcpyStream);
892         }
893         OBJ_DESTRUCT(&common_cuda_init_lock);
894         OBJ_DESTRUCT(&common_cuda_htod_lock);
895         OBJ_DESTRUCT(&common_cuda_dtoh_lock);
896         OBJ_DESTRUCT(&common_cuda_ipc_lock);
897         if (NULL != libcuda_handle) {
898             opal_dl_close(libcuda_handle);
899         }
900 
901         opal_output_verbose(20, mca_common_cuda_output,
902                             "CUDA: mca_common_cuda_fini, ref_count=%d, cleaning up all done",
903                             stage_one_init_ref_count);
904 
905         opal_output_close(mca_common_cuda_output);
906 
907     } else {
908         opal_output_verbose(20, mca_common_cuda_output,
909                             "CUDA: mca_common_cuda_fini, ref_count=%d, cuda still in use",
910                             stage_one_init_ref_count);
911     }
912     stage_one_init_ref_count--;
913 }
914 
915 /**
916  * Call the CUDA register function so we pin the memory in the CUDA
917  * space.
918  */
mca_common_cuda_register(void * ptr,size_t amount,char * msg)919 void mca_common_cuda_register(void *ptr, size_t amount, char *msg) {
920     int res;
921 
922     /* Always first check if the support is enabled.  If not, just return */
923     if (!opal_cuda_support)
924         return;
925 
926     if (!common_cuda_initialized) {
927         OPAL_THREAD_LOCK(&common_cuda_init_lock);
928         if (!common_cuda_initialized) {
929             common_cuda_mem_regs_t *regptr;
930             regptr = OBJ_NEW(common_cuda_mem_regs_t);
931             regptr->ptr = ptr;
932             regptr->amount = amount;
933             regptr->msg = strdup(msg);
934             opal_list_append(&common_cuda_memory_registrations,
935                              (opal_list_item_t*)regptr);
936             OPAL_THREAD_UNLOCK(&common_cuda_init_lock);
937             return;
938         }
939         OPAL_THREAD_UNLOCK(&common_cuda_init_lock);
940     }
941 
942     if (mca_common_cuda_enabled && mca_common_cuda_register_memory) {
943         res = cuFunc.cuMemHostRegister(ptr, amount, 0);
944         if (OPAL_UNLIKELY(res != CUDA_SUCCESS)) {
945             /* If registering the memory fails, print a message and continue.
946              * This is not a fatal error. */
947             opal_show_help("help-mpi-common-cuda.txt", "cuMemHostRegister failed",
948                            true, ptr, amount,
949                            OPAL_PROC_MY_HOSTNAME, res, msg);
950         } else {
951             opal_output_verbose(20, mca_common_cuda_output,
952                                 "CUDA: cuMemHostRegister OK on rcache %s: "
953                                 "address=%p, bufsize=%d",
954                                 msg, ptr, (int)amount);
955         }
956     }
957 }
958 
959 /**
960  * Call the CUDA unregister function so we unpin the memory in the CUDA
961  * space.
962  */
mca_common_cuda_unregister(void * ptr,char * msg)963 void mca_common_cuda_unregister(void *ptr, char *msg) {
964     int res, i, s;
965     common_cuda_mem_regs_t *mem_reg;
966 
967     /* This can happen if memory was queued up to be registered, but
968      * no CUDA operations happened, so it never was registered.
969      * Therefore, just release any of the resources. */
970     if (!common_cuda_initialized) {
971         s = opal_list_get_size(&common_cuda_memory_registrations);
972         for(i = 0; i < s; i++) {
973             mem_reg = (common_cuda_mem_regs_t *)
974                 opal_list_remove_first(&common_cuda_memory_registrations);
975             free(mem_reg->msg);
976             OBJ_RELEASE(mem_reg);
977         }
978         return;
979     }
980 
981     if (mca_common_cuda_enabled && mca_common_cuda_register_memory) {
982         res = cuFunc.cuMemHostUnregister(ptr);
983         if (OPAL_UNLIKELY(res != CUDA_SUCCESS)) {
984             /* If unregistering the memory fails, just continue.  This is during
985              * shutdown.  Only print when running in verbose mode. */
986             opal_output_verbose(20, mca_common_cuda_output,
987                                 "CUDA: cuMemHostUnregister failed: ptr=%p, res=%d, rcache=%s",
988                                 ptr, res, msg);
989 
990         } else {
991             opal_output_verbose(20, mca_common_cuda_output,
992                                 "CUDA: cuMemHostUnregister OK on rcache %s: "
993                                 "address=%p",
994                                 msg, ptr);
995         }
996     }
997 }
998 
999 /*
1000  * Get the memory handle of a local section of memory that can be sent
1001  * to the remote size so it can access the memory.  This is the
1002  * registration function for the sending side of a message transfer.
1003  */
cuda_getmemhandle(void * base,size_t size,mca_rcache_base_registration_t * newreg,mca_rcache_base_registration_t * hdrreg)1004 int cuda_getmemhandle(void *base, size_t size, mca_rcache_base_registration_t *newreg,
1005                       mca_rcache_base_registration_t *hdrreg)
1006 
1007 {
1008     CUmemorytype memType;
1009     CUresult result;
1010     CUipcMemHandle *memHandle;
1011     CUdeviceptr pbase;
1012     size_t psize;
1013 
1014     mca_rcache_common_cuda_reg_t *cuda_reg = (mca_rcache_common_cuda_reg_t*)newreg;
1015     memHandle = (CUipcMemHandle *)cuda_reg->data.memHandle;
1016 
1017     /* We should only be there if this is a CUDA device pointer */
1018     result = cuFunc.cuPointerGetAttribute(&memType,
1019                                           CU_POINTER_ATTRIBUTE_MEMORY_TYPE, (CUdeviceptr)base);
1020     assert(CUDA_SUCCESS == result);
1021     assert(CU_MEMORYTYPE_DEVICE == memType);
1022 
1023     /* Get the memory handle so we can send it to the remote process. */
1024     result = cuFunc.cuIpcGetMemHandle(memHandle, (CUdeviceptr)base);
1025     CUDA_DUMP_MEMHANDLE((100, memHandle, "GetMemHandle-After"));
1026 
1027     if (CUDA_SUCCESS != result) {
1028         opal_show_help("help-mpi-common-cuda.txt", "cuIpcGetMemHandle failed",
1029                        true, result, base);
1030         return OPAL_ERROR;
1031     } else {
1032         opal_output_verbose(20, mca_common_cuda_output,
1033                             "CUDA: cuIpcGetMemHandle passed: base=%p size=%d",
1034                             base, (int)size);
1035     }
1036 
1037     /* Need to get the real base and size of the memory handle.  This is
1038      * how the remote side saves the handles in a cache. */
1039     result = cuFunc.cuMemGetAddressRange(&pbase, &psize, (CUdeviceptr)base);
1040     if (CUDA_SUCCESS != result) {
1041         opal_show_help("help-mpi-common-cuda.txt", "cuMemGetAddressRange failed",
1042                        true, result, base);
1043         return OPAL_ERROR;
1044     } else {
1045         opal_output_verbose(10, mca_common_cuda_output,
1046                             "CUDA: cuMemGetAddressRange passed: addr=%p, size=%d, pbase=%p, psize=%d ",
1047                             base, (int)size, (void *)pbase, (int)psize);
1048     }
1049 
1050     /* Store all the information in the registration */
1051     cuda_reg->base.base = (void *)pbase;
1052     cuda_reg->base.bound = (unsigned char *)pbase + psize - 1;
1053     cuda_reg->data.memh_seg_addr.pval = (void *) pbase;
1054     cuda_reg->data.memh_seg_len = psize;
1055 
1056 #if OPAL_CUDA_SYNC_MEMOPS
1057     /* With CUDA 6.0, we can set an attribute on the memory pointer that will
1058      * ensure any synchronous copies are completed prior to any other access
1059      * of the memory region.  This means we do not need to record an event
1060      * and send to the remote side.
1061      */
1062     memType = 1; /* Just use this variable since we already have it */
1063     result = cuFunc.cuPointerSetAttribute(&memType, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,
1064                                           (CUdeviceptr)base);
1065     if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
1066         opal_show_help("help-mpi-common-cuda.txt", "cuPointerSetAttribute failed",
1067                        true, OPAL_PROC_MY_HOSTNAME, result, base);
1068         return OPAL_ERROR;
1069     }
1070 #else
1071     /* Need to record the event to ensure that any memcopies into the
1072      * device memory have completed.  The event handle associated with
1073      * this event is sent to the remote process so that it will wait
1074      * on this event prior to copying data out of the device memory.
1075      * Note that this needs to be the NULL stream to make since it is
1076      * unknown what stream any copies into the device memory were done
1077      * with. */
1078     result = cuFunc.cuEventRecord((CUevent)cuda_reg->data.event, 0);
1079     if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
1080         opal_show_help("help-mpi-common-cuda.txt", "cuEventRecord failed",
1081                        true, result, base);
1082         return OPAL_ERROR;
1083     }
1084 #endif /* OPAL_CUDA_SYNC_MEMOPS */
1085 
1086     return OPAL_SUCCESS;
1087 }
1088 
1089 /*
1090  * This function is called by the local side that called the cuda_getmemhandle.
1091  * There is nothing to be done so just return.
1092  */
cuda_ungetmemhandle(void * reg_data,mca_rcache_base_registration_t * reg)1093 int cuda_ungetmemhandle(void *reg_data, mca_rcache_base_registration_t *reg)
1094 {
1095     opal_output_verbose(10, mca_common_cuda_output,
1096                         "CUDA: cuda_ungetmemhandle (no-op): base=%p", reg->base);
1097     CUDA_DUMP_MEMHANDLE((100, ((mca_rcache_common_cuda_reg_t *)reg)->data.memHandle, "cuda_ungetmemhandle"));
1098 
1099     return OPAL_SUCCESS;
1100 }
1101 
1102 /*
1103  * Open a memory handle that refers to remote memory so we can get an address
1104  * that works on the local side.  This is the registration function for the
1105  * remote side of a transfer.  newreg contains the new handle.  hddrreg contains
1106  * the memory handle that was received from the remote side.
1107  */
cuda_openmemhandle(void * base,size_t size,mca_rcache_base_registration_t * newreg,mca_rcache_base_registration_t * hdrreg)1108 int cuda_openmemhandle(void *base, size_t size, mca_rcache_base_registration_t *newreg,
1109                        mca_rcache_base_registration_t *hdrreg)
1110 {
1111     CUresult result;
1112     CUipcMemHandle *memHandle;
1113     mca_rcache_common_cuda_reg_t *cuda_newreg = (mca_rcache_common_cuda_reg_t*)newreg;
1114 
1115     /* Save in local variable to avoid ugly casting */
1116     memHandle = (CUipcMemHandle *)cuda_newreg->data.memHandle;
1117     CUDA_DUMP_MEMHANDLE((100, memHandle, "Before call to cuIpcOpenMemHandle"));
1118 
1119     /* Open the memory handle and store it into the registration structure. */
1120     result = cuFunc.cuIpcOpenMemHandle((CUdeviceptr *)&newreg->alloc_base, *memHandle,
1121                                        CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS);
1122 
1123     /* If there are some stale entries in the cache, they can cause other
1124      * registrations to fail.  Let the caller know that so that can attempt
1125      * to clear them out. */
1126     if (CUDA_ERROR_ALREADY_MAPPED == result) {
1127         opal_output_verbose(10, mca_common_cuda_output,
1128                             "CUDA: cuIpcOpenMemHandle returned CUDA_ERROR_ALREADY_MAPPED for "
1129                             "p=%p,size=%d: notify memory pool\n", base, (int)size);
1130         return OPAL_ERR_WOULD_BLOCK;
1131     }
1132     if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
1133         opal_show_help("help-mpi-common-cuda.txt", "cuIpcOpenMemHandle failed",
1134                        true, OPAL_PROC_MY_HOSTNAME, result, base);
1135         /* Currently, this is a non-recoverable error */
1136         return OPAL_ERROR;
1137     } else {
1138         opal_output_verbose(10, mca_common_cuda_output,
1139                             "CUDA: cuIpcOpenMemHandle passed: base=%p (remote base=%p,size=%d)",
1140                             newreg->alloc_base, base, (int)size);
1141         CUDA_DUMP_MEMHANDLE((200, memHandle, "cuIpcOpenMemHandle"));
1142     }
1143 
1144     return OPAL_SUCCESS;
1145 }
1146 
1147 /*
1148  * Close a memory handle that refers to remote memory.
1149  */
cuda_closememhandle(void * reg_data,mca_rcache_base_registration_t * reg)1150 int cuda_closememhandle(void *reg_data, mca_rcache_base_registration_t *reg)
1151 {
1152     CUresult result;
1153     mca_rcache_common_cuda_reg_t *cuda_reg = (mca_rcache_common_cuda_reg_t*)reg;
1154 
1155     /* Only attempt to close if we have valid context.  This can change if a call
1156      * to the fini function is made and we discover context is gone. */
1157     if (ctx_ok) {
1158         result = cuFunc.cuIpcCloseMemHandle((CUdeviceptr)cuda_reg->base.alloc_base);
1159         if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
1160             if (CUDA_ERROR_DEINITIALIZED != result) {
1161                 opal_show_help("help-mpi-common-cuda.txt", "cuIpcCloseMemHandle failed",
1162                 true, result, cuda_reg->base.alloc_base);
1163             }
1164             /* We will just continue on and hope things continue to work. */
1165         } else {
1166             opal_output_verbose(10, mca_common_cuda_output,
1167                                 "CUDA: cuIpcCloseMemHandle passed: base=%p",
1168                                 cuda_reg->base.alloc_base);
1169             CUDA_DUMP_MEMHANDLE((100, cuda_reg->data.memHandle, "cuIpcCloseMemHandle"));
1170         }
1171     }
1172 
1173     return OPAL_SUCCESS;
1174 }
1175 
mca_common_cuda_construct_event_and_handle(uintptr_t * event,void * handle)1176 void mca_common_cuda_construct_event_and_handle(uintptr_t *event, void *handle)
1177 {
1178     CUresult result;
1179 
1180     result = cuFunc.cuEventCreate((CUevent *)event, CU_EVENT_INTERPROCESS | CU_EVENT_DISABLE_TIMING);
1181     if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
1182         opal_show_help("help-mpi-common-cuda.txt", "cuEventCreate failed",
1183                        true, OPAL_PROC_MY_HOSTNAME, result);
1184     }
1185 
1186     result = cuFunc.cuIpcGetEventHandle((CUipcEventHandle *)handle, (CUevent)*event);
1187     if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
1188         opal_show_help("help-mpi-common-cuda.txt", "cuIpcGetEventHandle failed",
1189                        true, result);
1190     }
1191 
1192     CUDA_DUMP_EVTHANDLE((10, handle, "construct_event_and_handle"));
1193 
1194 }
1195 
mca_common_cuda_destruct_event(uintptr_t event)1196 void mca_common_cuda_destruct_event(uintptr_t event)
1197 {
1198     CUresult result;
1199 
1200     /* Only attempt to destroy if we have valid context.  This can change if a call
1201      * to the fini function is made and we discover context is gone. */
1202     if (ctx_ok) {
1203         result = cuFunc.cuEventDestroy((CUevent)event);
1204         if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
1205             opal_show_help("help-mpi-common-cuda.txt", "cuEventDestroy failed",
1206                            true, result);
1207         }
1208     }
1209 }
1210 
1211 
1212 /*
1213  * Put remote event on stream to ensure that the the start of the
1214  * copy does not start until the completion of the event.
1215  */
mca_common_wait_stream_synchronize(mca_rcache_common_cuda_reg_t * rget_reg)1216 void mca_common_wait_stream_synchronize(mca_rcache_common_cuda_reg_t *rget_reg)
1217 {
1218 #if OPAL_CUDA_SYNC_MEMOPS
1219     /* No need for any of this with SYNC_MEMOPS feature */
1220     return;
1221 #else /* OPAL_CUDA_SYNC_MEMOPS */
1222     CUipcEventHandle evtHandle;
1223     CUevent event;
1224     CUresult result;
1225 
1226     memcpy(&evtHandle, rget_reg->data.evtHandle, sizeof(evtHandle));
1227     CUDA_DUMP_EVTHANDLE((100, &evtHandle, "stream_synchronize"));
1228 
1229     result = cuFunc.cuIpcOpenEventHandle(&event, evtHandle);
1230     if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
1231         opal_show_help("help-mpi-common-cuda.txt", "cuIpcOpenEventHandle failed",
1232                        true, result);
1233     }
1234 
1235     /* BEGIN of Workaround - There is a bug in CUDA 4.1 RC2 and earlier
1236      * versions.  Need to record an event on the stream, even though
1237      * it is not used, to make sure we do not short circuit our way
1238      * out of the cuStreamWaitEvent test.
1239      */
1240     result = cuFunc.cuEventRecord(event, 0);
1241     if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
1242         opal_show_help("help-mpi-common-cuda.txt", "cuEventRecord failed",
1243                        true, OPAL_PROC_MY_HOSTNAME, result);
1244     }
1245     /* END of Workaround */
1246 
1247     result = cuFunc.cuStreamWaitEvent(0, event, 0);
1248     if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
1249         opal_show_help("help-mpi-common-cuda.txt", "cuStreamWaitEvent failed",
1250                        true, result);
1251     }
1252 
1253     /* All done with this event. */
1254     result = cuFunc.cuEventDestroy(event);
1255     if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
1256         opal_show_help("help-mpi-common-cuda.txt", "cuEventDestroy failed",
1257                        true, result);
1258     }
1259 #endif /* OPAL_CUDA_SYNC_MEMOPS */
1260 }
1261 
1262 /*
1263  * Start the asynchronous copy.  Then record and save away an event that will
1264  * be queried to indicate the copy has completed.
1265  */
mca_common_cuda_memcpy(void * dst,void * src,size_t amount,char * msg,struct mca_btl_base_descriptor_t * frag,int * done)1266 int mca_common_cuda_memcpy(void *dst, void *src, size_t amount, char *msg,
1267                            struct mca_btl_base_descriptor_t *frag, int *done)
1268 {
1269     CUresult result;
1270     int iter;
1271 
1272     OPAL_THREAD_LOCK(&common_cuda_ipc_lock);
1273     /* First make sure there is room to store the event.  If not, then
1274      * return an error.  The error message will tell the user to try and
1275      * run again, but with a larger array for storing events. */
1276     if (cuda_event_ipc_num_used == cuda_event_max) {
1277         opal_show_help("help-mpi-common-cuda.txt", "Out of cuEvent handles",
1278                        true, cuda_event_max, cuda_event_max+100, cuda_event_max+100);
1279         OPAL_THREAD_UNLOCK(&common_cuda_ipc_lock);
1280         return OPAL_ERR_OUT_OF_RESOURCE;
1281     }
1282 
1283     if (cuda_event_ipc_num_used > cuda_event_ipc_most) {
1284         cuda_event_ipc_most = cuda_event_ipc_num_used;
1285         /* Just print multiples of 10 */
1286         if (0 == (cuda_event_ipc_most % 10)) {
1287             opal_output_verbose(20, mca_common_cuda_output,
1288                                 "Maximum ipc events used is now %d", cuda_event_ipc_most);
1289         }
1290     }
1291 
1292     /* This is the standard way to run.  Running with synchronous copies is available
1293      * to measure the advantages of asynchronous copies. */
1294     if (OPAL_LIKELY(mca_common_cuda_async)) {
1295         result = cuFunc.cuMemcpyAsync((CUdeviceptr)dst, (CUdeviceptr)src, amount, ipcStream);
1296         if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
1297             opal_show_help("help-mpi-common-cuda.txt", "cuMemcpyAsync failed",
1298                            true, dst, src, amount, result);
1299             OPAL_THREAD_UNLOCK(&common_cuda_ipc_lock);
1300             return OPAL_ERROR;
1301         } else {
1302             opal_output_verbose(20, mca_common_cuda_output,
1303                                 "CUDA: cuMemcpyAsync passed: dst=%p, src=%p, size=%d",
1304                                 dst, src, (int)amount);
1305         }
1306         result = cuFunc.cuEventRecord(cuda_event_ipc_array[cuda_event_ipc_first_avail], ipcStream);
1307         if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
1308             opal_show_help("help-mpi-common-cuda.txt", "cuEventRecord failed",
1309                            true, OPAL_PROC_MY_HOSTNAME, result);
1310             OPAL_THREAD_UNLOCK(&common_cuda_ipc_lock);
1311             return OPAL_ERROR;
1312         }
1313         cuda_event_ipc_frag_array[cuda_event_ipc_first_avail] = frag;
1314 
1315         /* Bump up the first available slot and number used by 1 */
1316         cuda_event_ipc_first_avail++;
1317         if (cuda_event_ipc_first_avail >= cuda_event_max) {
1318             cuda_event_ipc_first_avail = 0;
1319         }
1320         cuda_event_ipc_num_used++;
1321 
1322         *done = 0;
1323     } else {
1324         /* Mimic the async function so they use the same memcpy call. */
1325         result = cuFunc.cuMemcpyAsync((CUdeviceptr)dst, (CUdeviceptr)src, amount, ipcStream);
1326         if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
1327             opal_show_help("help-mpi-common-cuda.txt", "cuMemcpyAsync failed",
1328                            true, dst, src, amount, result);
1329             OPAL_THREAD_UNLOCK(&common_cuda_ipc_lock);
1330             return OPAL_ERROR;
1331         } else {
1332             opal_output_verbose(20, mca_common_cuda_output,
1333                                 "CUDA: cuMemcpyAsync passed: dst=%p, src=%p, size=%d",
1334                                 dst, src, (int)amount);
1335         }
1336 
1337         /* Record an event, then wait for it to complete with calls to cuEventQuery */
1338         result = cuFunc.cuEventRecord(cuda_event_ipc_array[cuda_event_ipc_first_avail], ipcStream);
1339         if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
1340             opal_show_help("help-mpi-common-cuda.txt", "cuEventRecord failed",
1341                            true, OPAL_PROC_MY_HOSTNAME, result);
1342             OPAL_THREAD_UNLOCK(&common_cuda_ipc_lock);
1343             return OPAL_ERROR;
1344         }
1345 
1346         cuda_event_ipc_frag_array[cuda_event_ipc_first_avail] = frag;
1347 
1348         /* Bump up the first available slot and number used by 1 */
1349         cuda_event_ipc_first_avail++;
1350         if (cuda_event_ipc_first_avail >= cuda_event_max) {
1351             cuda_event_ipc_first_avail = 0;
1352         }
1353         cuda_event_ipc_num_used++;
1354 
1355         result = cuFunc.cuEventQuery(cuda_event_ipc_array[cuda_event_ipc_first_used]);
1356         if ((CUDA_SUCCESS != result) && (CUDA_ERROR_NOT_READY != result)) {
1357             opal_show_help("help-mpi-common-cuda.txt", "cuEventQuery failed",
1358                            true, result);
1359             OPAL_THREAD_UNLOCK(&common_cuda_ipc_lock);
1360             return OPAL_ERROR;
1361         }
1362 
1363         iter = 0;
1364         while (CUDA_ERROR_NOT_READY == result) {
1365             if (0 == (iter % 10)) {
1366                 opal_output(-1, "EVENT NOT DONE (iter=%d)", iter);
1367             }
1368             result = cuFunc.cuEventQuery(cuda_event_ipc_array[cuda_event_ipc_first_used]);
1369             if ((CUDA_SUCCESS != result) && (CUDA_ERROR_NOT_READY != result)) {
1370                 opal_show_help("help-mpi-common-cuda.txt", "cuEventQuery failed",
1371                                true, result);
1372             OPAL_THREAD_UNLOCK(&common_cuda_ipc_lock);
1373                 return OPAL_ERROR;
1374             }
1375             iter++;
1376         }
1377 
1378         --cuda_event_ipc_num_used;
1379         ++cuda_event_ipc_first_used;
1380         if (cuda_event_ipc_first_used >= cuda_event_max) {
1381             cuda_event_ipc_first_used = 0;
1382         }
1383         *done = 1;
1384     }
1385     OPAL_THREAD_UNLOCK(&common_cuda_ipc_lock);
1386     return OPAL_SUCCESS;
1387 }
1388 
/*
 * Record an event and save the frag.  This is called by the sending side and
 * is used to queue an event when a dtoh copy has been initiated.
 */
mca_common_cuda_record_dtoh_event(char * msg,struct mca_btl_base_descriptor_t * frag)1393 int mca_common_cuda_record_dtoh_event(char *msg, struct mca_btl_base_descriptor_t *frag)
1394 {
1395     CUresult result;
1396 
1397     /* First make sure there is room to store the event.  If not, then
1398      * return an error.  The error message will tell the user to try and
1399      * run again, but with a larger array for storing events. */
1400     OPAL_THREAD_LOCK(&common_cuda_dtoh_lock);
1401     if (cuda_event_dtoh_num_used == cuda_event_max) {
1402         opal_show_help("help-mpi-common-cuda.txt", "Out of cuEvent handles",
1403                        true, cuda_event_max, cuda_event_max+100, cuda_event_max+100);
1404         return OPAL_ERR_OUT_OF_RESOURCE;
1405     }
1406 
1407     if (cuda_event_dtoh_num_used > cuda_event_dtoh_most) {
1408         cuda_event_dtoh_most = cuda_event_dtoh_num_used;
1409         /* Just print multiples of 10 */
1410         if (0 == (cuda_event_dtoh_most % 10)) {
1411             opal_output_verbose(20, mca_common_cuda_output,
1412                                 "Maximum DtoH events used is now %d", cuda_event_dtoh_most);
1413         }
1414     }
1415 
1416     result = cuFunc.cuEventRecord(cuda_event_dtoh_array[cuda_event_dtoh_first_avail], dtohStream);
1417     if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
1418         opal_show_help("help-mpi-common-cuda.txt", "cuEventRecord failed",
1419                        true, OPAL_PROC_MY_HOSTNAME, result);
1420         OPAL_THREAD_UNLOCK(&common_cuda_dtoh_lock);
1421         return OPAL_ERROR;
1422     }
1423     cuda_event_dtoh_frag_array[cuda_event_dtoh_first_avail] = frag;
1424 
1425     /* Bump up the first available slot and number used by 1 */
1426     cuda_event_dtoh_first_avail++;
1427     if (cuda_event_dtoh_first_avail >= cuda_event_max) {
1428         cuda_event_dtoh_first_avail = 0;
1429     }
1430     cuda_event_dtoh_num_used++;
1431 
1432     OPAL_THREAD_UNLOCK(&common_cuda_dtoh_lock);
1433     return OPAL_SUCCESS;
1434 }
1435 
/*
 * Record an event and save the frag.  This is called by the receiving side and
 * is used to queue an event when a htod copy has been initiated.
 */
mca_common_cuda_record_htod_event(char * msg,struct mca_btl_base_descriptor_t * frag)1440 int mca_common_cuda_record_htod_event(char *msg, struct mca_btl_base_descriptor_t *frag)
1441 {
1442     CUresult result;
1443 
1444     OPAL_THREAD_LOCK(&common_cuda_htod_lock);
1445     /* First make sure there is room to store the event.  If not, then
1446      * return an error.  The error message will tell the user to try and
1447      * run again, but with a larger array for storing events. */
1448     if (cuda_event_htod_num_used == cuda_event_max) {
1449         opal_show_help("help-mpi-common-cuda.txt", "Out of cuEvent handles",
1450                        true, cuda_event_max, cuda_event_max+100, cuda_event_max+100);
1451         OPAL_THREAD_UNLOCK(&common_cuda_htod_lock);
1452         return OPAL_ERR_OUT_OF_RESOURCE;
1453     }
1454 
1455     if (cuda_event_htod_num_used > cuda_event_htod_most) {
1456         cuda_event_htod_most = cuda_event_htod_num_used;
1457         /* Just print multiples of 10 */
1458         if (0 == (cuda_event_htod_most % 10)) {
1459             opal_output_verbose(20, mca_common_cuda_output,
1460                                 "Maximum HtoD events used is now %d", cuda_event_htod_most);
1461         }
1462     }
1463 
1464     result = cuFunc.cuEventRecord(cuda_event_htod_array[cuda_event_htod_first_avail], htodStream);
1465     if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
1466         opal_show_help("help-mpi-common-cuda.txt", "cuEventRecord failed",
1467                        true, OPAL_PROC_MY_HOSTNAME, result);
1468         OPAL_THREAD_UNLOCK(&common_cuda_htod_lock);
1469         return OPAL_ERROR;
1470     }
1471     cuda_event_htod_frag_array[cuda_event_htod_first_avail] = frag;
1472 
1473    /* Bump up the first available slot and number used by 1 */
1474     cuda_event_htod_first_avail++;
1475     if (cuda_event_htod_first_avail >= cuda_event_max) {
1476         cuda_event_htod_first_avail = 0;
1477     }
1478     cuda_event_htod_num_used++;
1479 
1480     OPAL_THREAD_UNLOCK(&common_cuda_htod_lock);
1481     return OPAL_SUCCESS;
1482 }
1483 
1484 /**
1485  * Used to get the dtoh stream for initiating asynchronous copies.
1486  */
mca_common_cuda_get_dtoh_stream(void)1487 void *mca_common_cuda_get_dtoh_stream(void) {
1488     return (void *)dtohStream;
1489 }
1490 
1491 /**
1492  * Used to get the htod stream for initiating asynchronous copies.
1493  */
mca_common_cuda_get_htod_stream(void)1494 void *mca_common_cuda_get_htod_stream(void) {
1495     return (void *)htodStream;
1496 }
1497 
1498 /*
1499  * Function is called every time progress is called with the sm BTL.  If there
1500  * are outstanding events, check to see if one has completed.  If so, hand
1501  * back the fragment for further processing.
1502  */
progress_one_cuda_ipc_event(struct mca_btl_base_descriptor_t ** frag)1503 int progress_one_cuda_ipc_event(struct mca_btl_base_descriptor_t **frag) {
1504     CUresult result;
1505 
1506     OPAL_THREAD_LOCK(&common_cuda_ipc_lock);
1507     if (cuda_event_ipc_num_used > 0) {
1508         opal_output_verbose(20, mca_common_cuda_output,
1509                            "CUDA: progress_one_cuda_ipc_event, outstanding_events=%d",
1510                             cuda_event_ipc_num_used);
1511 
1512         result = cuFunc.cuEventQuery(cuda_event_ipc_array[cuda_event_ipc_first_used]);
1513 
1514         /* We found an event that is not ready, so return. */
1515         if (CUDA_ERROR_NOT_READY == result) {
1516             opal_output_verbose(20, mca_common_cuda_output,
1517                                 "CUDA: cuEventQuery returned CUDA_ERROR_NOT_READY");
1518             *frag = NULL;
1519             OPAL_THREAD_UNLOCK(&common_cuda_ipc_lock);
1520             return 0;
1521         } else if (CUDA_SUCCESS != result) {
1522             opal_show_help("help-mpi-common-cuda.txt", "cuEventQuery failed",
1523                            true, result);
1524             *frag = NULL;
1525             OPAL_THREAD_UNLOCK(&common_cuda_ipc_lock);
1526             return OPAL_ERROR;
1527         }
1528 
1529         *frag = cuda_event_ipc_frag_array[cuda_event_ipc_first_used];
1530         opal_output_verbose(10, mca_common_cuda_output,
1531                             "CUDA: cuEventQuery returned %d", result);
1532 
1533         /* Bump counters, loop around the circular buffer if necessary */
1534         --cuda_event_ipc_num_used;
1535         ++cuda_event_ipc_first_used;
1536         if (cuda_event_ipc_first_used >= cuda_event_max) {
1537             cuda_event_ipc_first_used = 0;
1538         }
1539         /* A return value of 1 indicates an event completed and a frag was returned */
1540         OPAL_THREAD_UNLOCK(&common_cuda_ipc_lock);
1541         return 1;
1542     }
1543     OPAL_THREAD_UNLOCK(&common_cuda_ipc_lock);
1544     return 0;
1545 }
1546 
1547 /**
1548  * Progress any dtoh event completions.
1549  */
/*
 * Progress the oldest outstanding dtoh event, if any.
 * Returns 1 with *frag set when an event completed, 0 when no events are
 * outstanding or the oldest is still pending, OPAL_ERROR on query failure.
 */
int progress_one_cuda_dtoh_event(struct mca_btl_base_descriptor_t **frag) {
    CUresult result;

    OPAL_THREAD_LOCK(&common_cuda_dtoh_lock);
    if (cuda_event_dtoh_num_used > 0) {
        opal_output_verbose(30, mca_common_cuda_output,
                           "CUDA: progress_one_cuda_dtoh_event, outstanding_events=%d",
                            cuda_event_dtoh_num_used);

        /* Only the oldest event is queried; events complete in stream order. */
        result = cuFunc.cuEventQuery(cuda_event_dtoh_array[cuda_event_dtoh_first_used]);

        /* We found an event that is not ready, so return. */
        if (CUDA_ERROR_NOT_READY == result) {
            opal_output_verbose(30, mca_common_cuda_output,
                                "CUDA: cuEventQuery returned CUDA_ERROR_NOT_READY");
            *frag = NULL;
            OPAL_THREAD_UNLOCK(&common_cuda_dtoh_lock);
            return 0;
        } else if (CUDA_SUCCESS != result) {
            opal_show_help("help-mpi-common-cuda.txt", "cuEventQuery failed",
                           true, result);
            *frag = NULL;
            OPAL_THREAD_UNLOCK(&common_cuda_dtoh_lock);
            return OPAL_ERROR;
        }

        /* Event is done: hand back the frag recorded alongside it. */
        *frag = cuda_event_dtoh_frag_array[cuda_event_dtoh_first_used];
        opal_output_verbose(30, mca_common_cuda_output,
                            "CUDA: cuEventQuery returned %d", result);

        /* Bump counters, loop around the circular buffer if necessary */
        --cuda_event_dtoh_num_used;
        ++cuda_event_dtoh_first_used;
        if (cuda_event_dtoh_first_used >= cuda_event_max) {
            cuda_event_dtoh_first_used = 0;
        }
        /* A return value of 1 indicates an event completed and a frag was returned */
        OPAL_THREAD_UNLOCK(&common_cuda_dtoh_lock);
        return 1;
    }
    OPAL_THREAD_UNLOCK(&common_cuda_dtoh_lock);
    return 0;
}
1593 
/**
 * Progress any htod event completions.
 */
/*
 * Progress the oldest outstanding htod event, if any.
 * Returns 1 with *frag set when an event completed, 0 when the oldest event
 * is still pending, OPAL_ERROR on query failure.
 */
int progress_one_cuda_htod_event(struct mca_btl_base_descriptor_t **frag) {
    CUresult result;

    OPAL_THREAD_LOCK(&common_cuda_htod_lock);
    if (cuda_event_htod_num_used > 0) {
        opal_output_verbose(30, mca_common_cuda_output,
                           "CUDA: progress_one_cuda_htod_event, outstanding_events=%d",
                            cuda_event_htod_num_used);

        /* Only the oldest event is queried; events complete in stream order. */
        result = cuFunc.cuEventQuery(cuda_event_htod_array[cuda_event_htod_first_used]);

        /* We found an event that is not ready, so return. */
        if (CUDA_ERROR_NOT_READY == result) {
            opal_output_verbose(30, mca_common_cuda_output,
                                "CUDA: cuEventQuery returned CUDA_ERROR_NOT_READY");
            *frag = NULL;
            OPAL_THREAD_UNLOCK(&common_cuda_htod_lock);
            return 0;
        } else if (CUDA_SUCCESS != result) {
            opal_show_help("help-mpi-common-cuda.txt", "cuEventQuery failed",
                           true, result);
            *frag = NULL;
            OPAL_THREAD_UNLOCK(&common_cuda_htod_lock);
            return OPAL_ERROR;
        }

        /* Event is done: hand back the frag recorded alongside it. */
        *frag = cuda_event_htod_frag_array[cuda_event_htod_first_used];
        opal_output_verbose(30, mca_common_cuda_output,
                            "CUDA: cuEventQuery returned %d", result);

        /* Bump counters, loop around the circular buffer if necessary */
        --cuda_event_htod_num_used;
        ++cuda_event_htod_first_used;
        if (cuda_event_htod_first_used >= cuda_event_max) {
            cuda_event_htod_first_used = 0;
        }
        /* A return value of 1 indicates an event completed and a frag was returned */
        OPAL_THREAD_UNLOCK(&common_cuda_htod_lock);
        return 1;
    }
    OPAL_THREAD_UNLOCK(&common_cuda_htod_lock);
    /* NOTE(review): unlike the ipc/dtoh variants this returns
     * OPAL_ERR_RESOURCE_BUSY (not 0) when no events are outstanding;
     * callers appear to depend on this — confirm before changing. */
    return OPAL_ERR_RESOURCE_BUSY;
}
1640 
1641 
1642 /**
1643  * Need to make sure the handle we are retrieving from the cache is still
1644  * valid.  Compare the cached handle to the one received.
1645  */
/**
 * Validate a cached IPC memory handle against a freshly received one.
 * Returns 1 when every byte of the two handles is identical, 0 otherwise.
 */
int mca_common_cuda_memhandle_matches(mca_rcache_common_cuda_reg_t *new_reg,
                                      mca_rcache_common_cuda_reg_t *old_reg)
{
    int identical = (0 == memcmp(new_reg->data.memHandle, old_reg->data.memHandle,
                                 sizeof(new_reg->data.memHandle)));
    return identical ? 1 : 0;
}
1657 
1658 /*
1659  * Function to dump memory handle information.  This is based on
1660  * definitions from cuiinterprocess_private.h.
1661  */
cuda_dump_memhandle(int verbose,void * memHandle,char * str)1662 static void cuda_dump_memhandle(int verbose, void *memHandle, char *str) {
1663 
1664     struct InterprocessMemHandleInternal
1665     {
1666         /* The first two entries are the CUinterprocessCtxHandle */
1667         int64_t ctxId; /* unique (within a process) id of the sharing context */
1668         int     pid;   /* pid of sharing context */
1669 
1670         int64_t size;
1671         int64_t blocksize;
1672         int64_t offset;
1673         int     gpuId;
1674         int     subDeviceIndex;
1675         int64_t serial;
1676     } memH;
1677 
1678     if (NULL == str) {
1679         str = "CUDA";
1680     }
1681     memcpy(&memH, memHandle, sizeof(memH));
1682     opal_output_verbose(verbose, mca_common_cuda_output,
1683                         "%s:ctxId=0x%" PRIx64 ", pid=%d, size=%" PRIu64 ", blocksize=%" PRIu64 ", offset=%"
1684                         PRIu64 ", gpuId=%d, subDeviceIndex=%d, serial=%" PRIu64,
1685                         str, memH.ctxId, memH.pid, memH.size, memH.blocksize, memH.offset,
1686                         memH.gpuId, memH.subDeviceIndex, memH.serial);
1687 }
1688 
/*
 * Function to dump event handle information.  This is based on
 * definitions from cuiinterprocess_private.h.
 */
cuda_dump_evthandle(int verbose,void * evtHandle,char * str)1693 static void cuda_dump_evthandle(int verbose, void *evtHandle, char *str) {
1694 
1695     struct InterprocessEventHandleInternal
1696     {
1697         unsigned long pid;
1698         unsigned long serial;
1699         int index;
1700     } evtH;
1701 
1702     if (NULL == str) {
1703         str = "CUDA";
1704     }
1705     memcpy(&evtH, evtHandle, sizeof(evtH));
1706     opal_output_verbose(verbose, mca_common_cuda_output,
1707                         "CUDA: %s:pid=%lu, serial=%lu, index=%d",
1708                         str, evtH.pid, evtH.serial, evtH.index);
1709 }
1710 
1711 
1712 /* Return microseconds of elapsed time. Microseconds are relevant when
1713  * trying to understand the fixed overhead of the communication. Used
1714  * when trying to time various functions.
1715  *
1716  * Cut and past the following to get timings where wanted.
1717  *
1718  *   clock_gettime(CLOCK_MONOTONIC, &ts_start);
1719  *   FUNCTION OF INTEREST
1720  *   clock_gettime(CLOCK_MONOTONIC, &ts_end);
1721  *   accum = mydifftime(ts_start, ts_end);
1722  *   opal_output(0, "Function took   %7.2f usecs\n", accum);
1723  *
1724  */
1725 #if OPAL_ENABLE_DEBUG
/* Return the elapsed tick count between two timestamps as a float. */
static float mydifftime(opal_timer_t ts_start, opal_timer_t ts_end) {
    opal_timer_t delta = ts_end - ts_start;
    return (float)delta;
}
1729 #endif /* OPAL_ENABLE_DEBUG */
1730 
1731 /* Routines that get plugged into the opal datatype code */
/*
 * Determine whether pUserBuf is GPU (device) memory.
 * Returns 1 for device memory, 0 for host/unknown memory, OPAL_ERROR when a
 * required driver call fails.  Side effects: may mark the convertor with
 * CONVERTOR_CUDA_UNIFIED for managed memory, may bind a CUDA context to the
 * calling thread, and may trigger stage-three CUDA initialization.
 */
static int mca_common_cuda_is_gpu_buffer(const void *pUserBuf, opal_convertor_t *convertor)
{
    int res;
    CUmemorytype memType = 0;
    CUdeviceptr dbuf = (CUdeviceptr)pUserBuf;
    CUcontext ctx = NULL, memCtx = NULL;
#if OPAL_CUDA_GET_ATTRIBUTES
    uint32_t isManaged = 0;
    /* With CUDA 7.0, we can get multiple attributes with a single call */
    CUpointer_attribute attributes[3] = {CU_POINTER_ATTRIBUTE_MEMORY_TYPE,
                                         CU_POINTER_ATTRIBUTE_CONTEXT,
                                         CU_POINTER_ATTRIBUTE_IS_MANAGED};
    void *attrdata[] = {(void *)&memType, (void *)&memCtx, (void *)&isManaged};

    res = cuFunc.cuPointerGetAttributes(3, attributes, attrdata, dbuf);
    OPAL_OUTPUT_VERBOSE((101, mca_common_cuda_output,
                        "dbuf=%p, memType=%d, memCtx=%p, isManaged=%d, res=%d",
                         (void *)dbuf, (int)memType, (void *)memCtx, isManaged, res));

    /* Mark unified memory buffers with a flag.  This will allow all unified
     * memory to be forced through host buffers.  Note that this memory can
     * be either host or device so we need to set this flag prior to that check. */
    if (1 == isManaged) {
        if (NULL != convertor) {
            convertor->flags |= CONVERTOR_CUDA_UNIFIED;
        }
    }
    if (res != CUDA_SUCCESS) {
        /* If we cannot determine it is device pointer,
         * just assume it is not. */
        return 0;
    } else if (memType == CU_MEMORYTYPE_HOST) {
        /* Host memory, nothing to do here */
        return 0;
    } else if (memType == 0) {
        /* This can happen when CUDA is initialized but dbuf is not valid CUDA pointer */
        return 0;
    }
    /* Must be a device pointer */
    assert(memType == CU_MEMORYTYPE_DEVICE);
#else /* OPAL_CUDA_GET_ATTRIBUTES */
    /* Pre-CUDA-7.0 path: one attribute per driver call. */
    res = cuFunc.cuPointerGetAttribute(&memType,
                                       CU_POINTER_ATTRIBUTE_MEMORY_TYPE, dbuf);
    if (res != CUDA_SUCCESS) {
        /* If we cannot determine it is device pointer,
         * just assume it is not. */
        return 0;
    } else if (memType == CU_MEMORYTYPE_HOST) {
        /* Host memory, nothing to do here */
        return 0;
    }
    /* Must be a device pointer */
    assert(memType == CU_MEMORYTYPE_DEVICE);
#endif /* OPAL_CUDA_GET_ATTRIBUTES */

    /* This piece of code was added in to handle in a case involving
     * OMP threads.  The user had initialized CUDA and then spawned
     * two threads.  The first thread had the CUDA context, but the
     * second thread did not.  We therefore had no context to act upon
     * and future CUDA driver calls would fail.  Therefore, if we have
     * GPU memory, but no context, get the context from the GPU memory
     * and set the current context to that.  It is rare that we will not
     * have a context. */
    res = cuFunc.cuCtxGetCurrent(&ctx);
    if (OPAL_UNLIKELY(NULL == ctx)) {
        if (CUDA_SUCCESS == res) {
#if !OPAL_CUDA_GET_ATTRIBUTES
            /* memCtx was not fetched above on this path; query it now. */
            res = cuFunc.cuPointerGetAttribute(&memCtx,
                                               CU_POINTER_ATTRIBUTE_CONTEXT, dbuf);
            if (OPAL_UNLIKELY(res != CUDA_SUCCESS)) {
                opal_output(0, "CUDA: error calling cuPointerGetAttribute: "
                            "res=%d, ptr=%p aborting...", res, pUserBuf);
                return OPAL_ERROR;
            }
#endif /* OPAL_CUDA_GET_ATTRIBUTES */
            res = cuFunc.cuCtxSetCurrent(memCtx);
            if (OPAL_UNLIKELY(res != CUDA_SUCCESS)) {
                opal_output(0, "CUDA: error calling cuCtxSetCurrent: "
                            "res=%d, ptr=%p aborting...", res, pUserBuf);
                return OPAL_ERROR;
            } else {
                OPAL_OUTPUT_VERBOSE((10, mca_common_cuda_output,
                                     "CUDA: cuCtxSetCurrent passed: ptr=%p", pUserBuf));
            }
        } else {
            /* Print error and proceed */
            opal_output(0, "CUDA: error calling cuCtxGetCurrent: "
                        "res=%d, ptr=%p aborting...", res, pUserBuf);
            return OPAL_ERROR;
        }
    }

    /* WORKAROUND - There are times when the above code determines a piece of memory
     * is GPU memory, but it actually is not.  That has been seen on multi-GPU systems
     * with 6 or 8 GPUs on them. Therefore, we will do this extra check.  Note if we
     * made it this far, then the assumption at this point is we have GPU memory.
     * Unfortunately, this extra call is costing us another 100 ns almost doubling
     * the cost of this entire function. */
    if (OPAL_LIKELY(mca_common_cuda_gpu_mem_check_workaround)) {
        CUdeviceptr pbase;
        size_t psize;
        res = cuFunc.cuMemGetAddressRange(&pbase, &psize, dbuf);
        if (CUDA_SUCCESS != res) {
            opal_output_verbose(5, mca_common_cuda_output,
                                "CUDA: cuMemGetAddressRange failed on this pointer: res=%d, buf=%p "
                                "Overriding check and setting to host pointer. ",
                              res, (void *)dbuf);
            /* This cannot be GPU memory if the previous call failed */
            return 0;
        }
    }

    /* First access on a device pointer finalizes CUDA support initialization.
     * If initialization fails, disable support. */
    if (!stage_three_init_complete) {
        if (0 != mca_common_cuda_stage_three_init()) {
            opal_cuda_support = 0;
        }
    }

    return 1;
}
1854 
/* Launch an asynchronous copy on the convertor's stream; the raw CUresult
 * from cuMemcpyAsync is returned to the caller unchanged. */
static int mca_common_cuda_cu_memcpy_async(void *dest, const void *src, size_t size,
                                         opal_convertor_t* convertor)
{
    CUstream stream = (CUstream)convertor->stream;
    return cuFunc.cuMemcpyAsync((CUdeviceptr)dest, (CUdeviceptr)src, size, stream);
}
1861 
1862 /**
1863  * This function is plugged into various areas where a cuMemcpy would be called.
1864  * This is a synchronous operation that will not return until the copy is complete.
1865  */
/*
 * Synchronous copy of size bytes from src to dest: either cuMemcpyAsync on
 * memcpyStream followed by cuStreamSynchronize (when
 * mca_common_cuda_cumemcpy_async is set) or a plain cuMemcpy.  Does not
 * return until the copy is complete.  Returns OPAL_SUCCESS or OPAL_ERROR.
 */
static int mca_common_cuda_cu_memcpy(void *dest, const void *src, size_t size)
{
    CUresult result;
#if OPAL_ENABLE_DEBUG
    CUmemorytype memTypeSrc, memTypeDst;
    if (OPAL_UNLIKELY(mca_common_cuda_cumemcpy_timing)) {
        /* Nice to know type of source and destination for timing output. Do
         * not care about return code as memory type will just be set to 0 */
        result = cuFunc.cuPointerGetAttribute(&memTypeDst,
                                              CU_POINTER_ATTRIBUTE_MEMORY_TYPE, (CUdeviceptr)dest);
        result = cuFunc.cuPointerGetAttribute(&memTypeSrc,
                                              CU_POINTER_ATTRIBUTE_MEMORY_TYPE, (CUdeviceptr)src);
        /* ts_start/ts_end/accum are presumably file-scope timing variables
         * defined earlier in this file — not visible in this chunk. */
        ts_start = opal_timer_base_get_usec();
    }
#endif
    if (mca_common_cuda_cumemcpy_async) {
        /* Async copy + explicit synchronize: same blocking semantics as
         * cuMemcpy but on a dedicated stream. */
        result = cuFunc.cuMemcpyAsync((CUdeviceptr)dest, (CUdeviceptr)src, size, memcpyStream);
        if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
            opal_show_help("help-mpi-common-cuda.txt", "cuMemcpyAsync failed",
                           true, dest, src, size, result);
            return OPAL_ERROR;
        }
        result = cuFunc.cuStreamSynchronize(memcpyStream);
        if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
            opal_show_help("help-mpi-common-cuda.txt", "cuStreamSynchronize failed",
                           true, OPAL_PROC_MY_HOSTNAME, result);
            return OPAL_ERROR;
        }
    } else {
         result = cuFunc.cuMemcpy((CUdeviceptr)dest, (CUdeviceptr)src, size);
         if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
             opal_show_help("help-mpi-common-cuda.txt", "cuMemcpy failed",
                            true, OPAL_PROC_MY_HOSTNAME, result);
             return OPAL_ERROR;
         }
    }
#if OPAL_ENABLE_DEBUG
    if (OPAL_UNLIKELY(mca_common_cuda_cumemcpy_timing)) {
        ts_end = opal_timer_base_get_usec();
        accum = mydifftime(ts_start, ts_end);
        if (mca_common_cuda_cumemcpy_async) {
            opal_output(0, "cuMemcpyAsync took   %7.2f usecs, size=%d, (src=%p (%d), dst=%p (%d))\n",
                        accum, (int)size, src, memTypeSrc, dest, memTypeDst);
        } else {
            opal_output(0, "cuMemcpy took   %7.2f usecs, size=%d,  (src=%p (%d), dst=%p (%d))\n",
                        accum, (int)size, src, memTypeSrc, dest, memTypeDst);
        }
    }
#endif
    return OPAL_SUCCESS;
}
1917 
/*
 * memmove-style copy of size bytes from src to dest through a freshly
 * allocated device bounce buffer, so potentially overlapping regions are
 * handled safely (src is fully staged into tmp before dest is written).
 *
 * @return OPAL_SUCCESS on success, OPAL_ERROR on any driver failure.
 */
static int mca_common_cuda_memmove(void *dest, void *src, size_t size)
{
    CUdeviceptr tmp;
    int result;

    result = cuFunc.cuMemAlloc(&tmp, size);
    /* BUG FIX: the allocation result was previously ignored; a failure
     * would have led to copying through an invalid device pointer. */
    if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
        opal_output(0, "CUDA: memmove-Error in cuMemAlloc: res=%d, size=%d",
                    result, (int)size);
        return OPAL_ERROR;
    }
    if (mca_common_cuda_cumemcpy_async) {
        result = cuFunc.cuMemcpyAsync(tmp, (CUdeviceptr)src, size, memcpyStream);
        if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
            opal_show_help("help-mpi-common-cuda.txt", "cuMemcpyAsync failed",
                           true, tmp, src, size, result);
            cuFunc.cuMemFree(tmp);  /* BUG FIX: do not leak the bounce buffer */
            return OPAL_ERROR;
        }
        result = cuFunc.cuMemcpyAsync((CUdeviceptr)dest, tmp, size, memcpyStream);
        if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
            opal_show_help("help-mpi-common-cuda.txt", "cuMemcpyAsync failed",
                           true, dest, tmp, size, result);
            cuFunc.cuMemFree(tmp);  /* BUG FIX: do not leak the bounce buffer */
            return OPAL_ERROR;
        }
        result = cuFunc.cuStreamSynchronize(memcpyStream);
        if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
            opal_show_help("help-mpi-common-cuda.txt", "cuStreamSynchronize failed",
                           true, OPAL_PROC_MY_HOSTNAME, result);
            cuFunc.cuMemFree(tmp);  /* BUG FIX: do not leak the bounce buffer */
            return OPAL_ERROR;
        }
    } else {
        result = cuFunc.cuMemcpy(tmp, (CUdeviceptr)src, size);
        if (OPAL_UNLIKELY(result != CUDA_SUCCESS)) {
            opal_output(0, "CUDA: memmove-Error in cuMemcpy: res=%d, dest=%p, src=%p, size=%d",
                        result, (void *)tmp, src, (int)size);
            cuFunc.cuMemFree(tmp);  /* BUG FIX: do not leak the bounce buffer */
            return OPAL_ERROR;
        }
        result = cuFunc.cuMemcpy((CUdeviceptr)dest, tmp, size);
        if (OPAL_UNLIKELY(result != CUDA_SUCCESS)) {
            opal_output(0, "CUDA: memmove-Error in cuMemcpy: res=%d, dest=%p, src=%p, size=%d",
                        result, dest, (void *)tmp, (int)size);
            cuFunc.cuMemFree(tmp);  /* BUG FIX: do not leak the bounce buffer */
            return OPAL_ERROR;
        }
    }
    cuFunc.cuMemFree(tmp);
    return OPAL_SUCCESS;
}
1960 
mca_common_cuda_get_device(int * devicenum)1961 int mca_common_cuda_get_device(int *devicenum)
1962 {
1963     CUdevice cuDev;
1964     int res;
1965 
1966     res = cuFunc.cuCtxGetDevice(&cuDev);
1967     if (OPAL_UNLIKELY(res != CUDA_SUCCESS)) {
1968         opal_output(0, "CUDA: cuCtxGetDevice failed: res=%d",
1969                     res);
1970         return res;
1971     }
1972     *devicenum = cuDev;
1973     return 0;
1974 }
1975 
mca_common_cuda_device_can_access_peer(int * access,int dev1,int dev2)1976 int mca_common_cuda_device_can_access_peer(int *access, int dev1, int dev2)
1977 {
1978     int res;
1979     res = cuFunc.cuDeviceCanAccessPeer(access, (CUdevice)dev1, (CUdevice)dev2);
1980     if (OPAL_UNLIKELY(res != CUDA_SUCCESS)) {
1981         opal_output(0, "CUDA: cuDeviceCanAccessPeer failed: res=%d",
1982                     res);
1983         return res;
1984     }
1985     return 0;
1986 }
1987 
/* Fill *pbase/*psize with the base address and extent of the allocation
 * containing base.  Returns 0 on success, OPAL_ERROR on failure. */
int mca_common_cuda_get_address_range(void *pbase, size_t *psize, void *base)
{
    CUresult rc = cuFunc.cuMemGetAddressRange((CUdeviceptr *)pbase, psize, (CUdeviceptr)base);

    if (OPAL_UNLIKELY(CUDA_SUCCESS != rc)) {
        opal_show_help("help-mpi-common-cuda.txt", "cuMemGetAddressRange failed 2",
                       true, OPAL_PROC_MY_HOSTNAME, rc, base);
        return OPAL_ERROR;
    }
    opal_output_verbose(50, mca_common_cuda_output,
                        "CUDA: cuMemGetAddressRange passed: addr=%p, pbase=%p, psize=%lu ",
                        base, *(char **)pbase, *psize);
    return 0;
}
2003 
2004 #if OPAL_CUDA_GDR_SUPPORT
2005 /* Check to see if the memory was freed between the time it was stored in
2006  * the registration cache and now.  Return true if the memory was previously
2007  * freed.  This is indicated by the BUFFER_ID value in the registration cache
2008  * not matching the BUFFER_ID of the buffer we are checking.  Return false
2009  * if the registration is still good.
2010  */
mca_common_cuda_previously_freed_memory(mca_rcache_base_registration_t * reg)2011 bool mca_common_cuda_previously_freed_memory(mca_rcache_base_registration_t *reg)
2012 {
2013     int res;
2014     unsigned long long bufID;
2015     unsigned char *dbuf = reg->base;
2016 
2017     res = cuFunc.cuPointerGetAttribute(&bufID, CU_POINTER_ATTRIBUTE_BUFFER_ID,
2018                                        (CUdeviceptr)dbuf);
2019     /* If we cannot determine the BUFFER_ID, then print a message and default
2020      * to forcing the registration to be kicked out. */
2021     if (OPAL_UNLIKELY(res != CUDA_SUCCESS)) {
2022         opal_show_help("help-mpi-common-cuda.txt", "bufferID failed",
2023                        true, OPAL_PROC_MY_HOSTNAME, res);
2024         return true;
2025     }
2026     opal_output_verbose(50, mca_common_cuda_output,
2027                         "CUDA: base=%p, bufID=%llu, reg->gpu_bufID=%llu, %s", dbuf, bufID, reg->gpu_bufID,
2028                         (reg->gpu_bufID == bufID ? "BUFFER_ID match":"BUFFER_ID do not match"));
2029     if (bufID != reg->gpu_bufID) {
2030         return true;
2031     } else {
2032         return false;
2033     }
2034 }
2035 
2036 /*
2037  * Get the buffer ID from the memory and store it in the registration.
2038  * This is needed to ensure the cached registration is not stale.  If
2039  * we fail to get buffer ID, print an error and set buffer ID to 0.
2040  * Also set SYNC_MEMOPS on any GPU registration to ensure that
2041  * synchronous copies complete before the buffer is accessed.
2042  */
/*
 * Stamp the registration with the BUFFER_ID of the memory at reg->base so
 * later cache lookups can detect staleness, and enable SYNC_MEMOPS on the
 * pointer so synchronous copies complete before the buffer is accessed.
 * On query failure the ID defaults to 0 and a help message is shown.
 */
void mca_common_cuda_get_buffer_id(mca_rcache_base_registration_t *reg)
{
    unsigned long long bufID = 0;
    unsigned char *dbuf = reg->base;
    int enable = 1;
    int rc;

    rc = cuFunc.cuPointerGetAttribute(&bufID, CU_POINTER_ATTRIBUTE_BUFFER_ID,
                                      (CUdeviceptr)dbuf);
    if (OPAL_UNLIKELY(CUDA_SUCCESS != rc)) {
        /* Continue with bufID == 0; the cached entry will simply never
         * match and will be evicted. */
        opal_show_help("help-mpi-common-cuda.txt", "bufferID failed",
                       true, OPAL_PROC_MY_HOSTNAME, rc);
    }
    reg->gpu_bufID = bufID;

    rc = cuFunc.cuPointerSetAttribute(&enable, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,
                                      (CUdeviceptr)dbuf);
    if (OPAL_UNLIKELY(CUDA_SUCCESS != rc)) {
        opal_show_help("help-mpi-common-cuda.txt", "cuPointerSetAttribute failed",
                       true, OPAL_PROC_MY_HOSTNAME, rc, dbuf);
    }
}
2065 #endif /* OPAL_CUDA_GDR_SUPPORT */
2066