1 /*
2 * Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana
3 * University Research and Technology
4 * Corporation. All rights reserved.
5 * Copyright (c) 2004-2014 The University of Tennessee and The University
6 * of Tennessee Research Foundation. All rights
7 * reserved.
8 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
9 * University of Stuttgart. All rights reserved.
10 * Copyright (c) 2004-2006 The Regents of the University of California.
11 * All rights reserved.
12 * Copyright (c) 2011-2015 NVIDIA Corporation. All rights reserved.
13 * Copyright (c) 2015 Cisco Systems, Inc. All rights reserved.
14 * Copyright (c) 2015 Research Organization for Information Science
15 * and Technology (RIST). All rights reserved.
16 * $COPYRIGHT$
17 *
18 * Additional copyrights may follow
19 *
20 * $HEADER$
21 */
22
23 /**
24 * This file contains various support functions for doing CUDA
25 * operations.
26 */
27 #include "opal_config.h"
28
29 #include <errno.h>
30 #include <unistd.h>
31 #include <cuda.h>
32
33 #include "opal/align.h"
34 #include "opal/datatype/opal_convertor.h"
35 #include "opal/datatype/opal_datatype_cuda.h"
36 #include "opal/util/output.h"
37 #include "opal/util/show_help.h"
38 #include "opal/util/proc.h"
39 #include "opal/util/argv.h"
40
41 #include "opal/mca/rcache/base/base.h"
42 #include "opal/runtime/opal_params.h"
43 #include "opal/mca/timer/base/base.h"
44 #include "opal/mca/dl/base/base.h"
45
46 #include "common_cuda.h"
47
48 /**
49 * Since function names can get redefined in cuda.h file, we need to do this
50 * stringifying to get the latest function name from the header file. For
51 * example, cuda.h may have something like this:
52 * #define cuMemFree cuMemFree_v2
53 * We want to make sure we find cuMemFree_v2, not cuMemFree.
54 */
55 #define STRINGIFY2(x) #x
56 #define STRINGIFY(x) STRINGIFY2(x)
57
/*
 * Look up one CUDA driver entry point in libhandle and store it in the
 * member of cuFunc that carries the same name.  The symbol name is produced
 * with STRINGIFY(), i.e. after macro expansion of funcName.
 *
 * NOTE: on lookup failure this macro prints the "dlsym failed" help message
 * and executes "return 1" in the *calling* function, so it can only be used
 * inside a function that returns int.
 *
 * err_msg and ptr start out as NULL so that a lookup which fails without
 * filling in an error string never hands an indeterminate pointer to the
 * "%s" of the help message.
 */
#define OPAL_CUDA_DLSYM(libhandle, funcName)                                    \
    do {                                                                        \
        char *err_msg = NULL;                                                   \
        void *ptr = NULL;                                                       \
        if (OPAL_SUCCESS !=                                                     \
            opal_dl_lookup((libhandle), STRINGIFY(funcName), &ptr, &err_msg)) { \
            opal_show_help("help-mpi-common-cuda.txt", "dlsym failed", true,    \
                           STRINGIFY(funcName),                                 \
                           (NULL != err_msg)                                    \
                               ? err_msg                                        \
                               : "opal_dl_lookup() returned no error message.");\
            return 1;                                                           \
        } else {                                                                \
            *(void **)(&cuFunc.funcName) = ptr;                                 \
            opal_output_verbose(15, mca_common_cuda_output,                     \
                                "CUDA: successful dlsym of %s",                 \
                                STRINGIFY(funcName));                           \
        }                                                                       \
    } while (0)
74
75 /* Structure to hold CUDA function pointers that get dynamically loaded. */
76 struct cudaFunctionTable {
77 int (*cuPointerGetAttribute)(void *, CUpointer_attribute, CUdeviceptr);
78 int (*cuMemcpyAsync)(CUdeviceptr, CUdeviceptr, size_t, CUstream);
79 int (*cuMemcpy)(CUdeviceptr, CUdeviceptr, size_t);
80 int (*cuMemAlloc)(CUdeviceptr *, unsigned int);
81 int (*cuMemFree)(CUdeviceptr buf);
82 int (*cuCtxGetCurrent)(void *cuContext);
83 int (*cuStreamCreate)(CUstream *, int);
84 int (*cuEventCreate)(CUevent *, int);
85 int (*cuEventRecord)(CUevent, CUstream);
86 int (*cuMemHostRegister)(void *, size_t, unsigned int);
87 int (*cuMemHostUnregister)(void *);
88 int (*cuEventQuery)(CUevent);
89 int (*cuEventDestroy)(CUevent);
90 int (*cuStreamWaitEvent)(CUstream, CUevent, unsigned int);
91 int (*cuMemGetAddressRange)(CUdeviceptr*, size_t*, CUdeviceptr);
92 int (*cuIpcGetEventHandle)(CUipcEventHandle*, CUevent);
93 int (*cuIpcOpenEventHandle)(CUevent*, CUipcEventHandle);
94 int (*cuIpcOpenMemHandle)(CUdeviceptr*, CUipcMemHandle, unsigned int);
95 int (*cuIpcCloseMemHandle)(CUdeviceptr);
96 int (*cuIpcGetMemHandle)(CUipcMemHandle*, CUdeviceptr);
97 int (*cuCtxGetDevice)(CUdevice *);
98 int (*cuDeviceCanAccessPeer)(int *, CUdevice, CUdevice);
99 int (*cuDeviceGet)(CUdevice *, int);
100 #if OPAL_CUDA_GDR_SUPPORT
101 int (*cuPointerSetAttribute)(const void *, CUpointer_attribute, CUdeviceptr);
102 #endif /* OPAL_CUDA_GDR_SUPPORT */
103 int (*cuCtxSetCurrent)(CUcontext);
104 int (*cuEventSynchronize)(CUevent);
105 int (*cuStreamSynchronize)(CUstream);
106 int (*cuStreamDestroy)(CUstream);
107 #if OPAL_CUDA_GET_ATTRIBUTES
108 int (*cuPointerGetAttributes)(unsigned int, CUpointer_attribute *, void **, CUdeviceptr);
109 #endif /* OPAL_CUDA_GET_ATTRIBUTES */
110 };
111 typedef struct cudaFunctionTable cudaFunctionTable_t;
112 static cudaFunctionTable_t cuFunc;
113
114 static int stage_one_init_ref_count = 0;
115 static bool stage_three_init_complete = false;
116 static bool common_cuda_initialized = false;
117 static bool common_cuda_mca_parames_registered = false;
118 static int mca_common_cuda_verbose;
119 static int mca_common_cuda_output = 0;
120 bool mca_common_cuda_enabled = false;
121 static bool mca_common_cuda_register_memory = true;
122 static bool mca_common_cuda_warning = false;
123 static opal_list_t common_cuda_memory_registrations;
124 static CUstream ipcStream = NULL;
125 static CUstream dtohStream = NULL;
126 static CUstream htodStream = NULL;
127 static CUstream memcpyStream = NULL;
128 static int mca_common_cuda_gpu_mem_check_workaround = (CUDA_VERSION > 7000) ? 0 : 1;
129 static opal_mutex_t common_cuda_init_lock;
130 static opal_mutex_t common_cuda_htod_lock;
131 static opal_mutex_t common_cuda_dtoh_lock;
132 static opal_mutex_t common_cuda_ipc_lock;
133
134 /* Functions called by opal layer - plugged into opal function table */
135 static int mca_common_cuda_is_gpu_buffer(const void*, opal_convertor_t*);
136 static int mca_common_cuda_memmove(void*, void*, size_t);
137 static int mca_common_cuda_cu_memcpy_async(void*, const void*, size_t, opal_convertor_t*);
138 static int mca_common_cuda_cu_memcpy(void*, const void*, size_t);
139
140 /* Function that gets plugged into opal layer */
141 static int mca_common_cuda_stage_two_init(opal_common_cuda_function_table_t *);
142
143 /* Structure to hold memory registrations that are delayed until first
144 * call to send or receive a GPU pointer */
145 struct common_cuda_mem_regs_t {
146 opal_list_item_t super;
147 void *ptr;
148 size_t amount;
149 char *msg;
150 };
151 typedef struct common_cuda_mem_regs_t common_cuda_mem_regs_t;
152 OBJ_CLASS_DECLARATION(common_cuda_mem_regs_t);
153 OBJ_CLASS_INSTANCE(common_cuda_mem_regs_t,
154 opal_list_item_t,
155 NULL,
156 NULL);
157
158 static int mca_common_cuda_async = 1;
159 static int mca_common_cuda_cumemcpy_async;
160 #if OPAL_ENABLE_DEBUG
161 static int mca_common_cuda_cumemcpy_timing;
162 #endif /* OPAL_ENABLE_DEBUG */
163
164 /* Array of CUDA events to be queried for IPC stream, sending side and
165 * receiving side. */
166 CUevent *cuda_event_ipc_array = NULL;
167 CUevent *cuda_event_dtoh_array = NULL;
168 CUevent *cuda_event_htod_array = NULL;
169
170 /* Array of fragments currently being moved by cuda async non-blocking
171 * operations */
172 struct mca_btl_base_descriptor_t **cuda_event_ipc_frag_array = NULL;
173 struct mca_btl_base_descriptor_t **cuda_event_dtoh_frag_array = NULL;
174 struct mca_btl_base_descriptor_t **cuda_event_htod_frag_array = NULL;
175
176 /* First free/available location in cuda_event_status_array */
177 static int cuda_event_ipc_first_avail, cuda_event_dtoh_first_avail, cuda_event_htod_first_avail;
178
179 /* First currently-being used location in the cuda_event_status_array */
180 static int cuda_event_ipc_first_used, cuda_event_dtoh_first_used, cuda_event_htod_first_used;
181
182 /* Number of status items currently in use */
183 static int cuda_event_ipc_num_used, cuda_event_dtoh_num_used, cuda_event_htod_num_used;
184
185 /* Size of array holding events */
186 int cuda_event_max = 400;
187 static int cuda_event_ipc_most = 0;
188 static int cuda_event_dtoh_most = 0;
189 static int cuda_event_htod_most = 0;
190
191 /* Handle to libcuda.so */
192 opal_dl_handle_t *libcuda_handle = NULL;
193
194 /* Unused variable that we register at init time and unregister at fini time.
195 * This is used to detect if user has done a device reset prior to MPI_Finalize.
196 * This is a workaround to avoid SEGVs.
197 */
198 static int checkmem;
199 static int ctx_ok = 1;
200
201 #define CUDA_COMMON_TIMING 0
202 #if OPAL_ENABLE_DEBUG
203 /* Some timing support structures. Enable this to help analyze
204 * internal performance issues. */
205 static opal_timer_t ts_start;
206 static opal_timer_t ts_end;
207 static double accum;
208 #define THOUSAND 1000L
209 #define MILLION 1000000L
210 static float mydifftime(opal_timer_t ts_start, opal_timer_t ts_end);
211 #endif /* OPAL_ENABLE_DEBUG */
212
213 /* These functions are typically unused in the optimized builds. */
214 static void cuda_dump_evthandle(int, void *, char *) __opal_attribute_unused__ ;
215 static void cuda_dump_memhandle(int, void *, char *) __opal_attribute_unused__ ;
216 #if OPAL_ENABLE_DEBUG
217 #define CUDA_DUMP_MEMHANDLE(a) cuda_dump_memhandle a
218 #define CUDA_DUMP_EVTHANDLE(a) cuda_dump_evthandle a
219 #else
220 #define CUDA_DUMP_MEMHANDLE(a)
221 #define CUDA_DUMP_EVTHANDLE(a)
222 #endif /* OPAL_ENABLE_DEBUG */
223
224 /* This is a seperate function so we can see these variables with ompi_info and
225 * also set them with the tools interface */
mca_common_cuda_register_mca_variables(void)226 void mca_common_cuda_register_mca_variables(void)
227 {
228
229 if (false == common_cuda_mca_parames_registered) {
230 common_cuda_mca_parames_registered = true;
231 }
232 /* Set different levels of verbosity in the cuda related code. */
233 mca_common_cuda_verbose = 0;
234 (void) mca_base_var_register("ompi", "mpi", "common_cuda", "verbose",
235 "Set level of common cuda verbosity",
236 MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
237 OPAL_INFO_LVL_9,
238 MCA_BASE_VAR_SCOPE_READONLY,
239 &mca_common_cuda_verbose);
240
241 /* Control whether system buffers get CUDA pinned or not. Allows for
242 * performance analysis. */
243 mca_common_cuda_register_memory = true;
244 (void) mca_base_var_register("ompi", "mpi", "common_cuda", "register_memory",
245 "Whether to cuMemHostRegister preallocated BTL buffers",
246 MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
247 OPAL_INFO_LVL_9,
248 MCA_BASE_VAR_SCOPE_READONLY,
249 &mca_common_cuda_register_memory);
250
251 /* Control whether we see warnings when CUDA memory registration fails. This is
252 * useful when CUDA support is configured in, but we are running a regular MPI
253 * application without CUDA. */
254 mca_common_cuda_warning = true;
255 (void) mca_base_var_register("ompi", "mpi", "common_cuda", "warning",
256 "Whether to print warnings when CUDA registration fails",
257 MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
258 OPAL_INFO_LVL_9,
259 MCA_BASE_VAR_SCOPE_READONLY,
260 &mca_common_cuda_warning);
261
262 /* Use this flag to test async vs sync copies */
263 mca_common_cuda_async = 1;
264 (void) mca_base_var_register("ompi", "mpi", "common_cuda", "memcpy_async",
265 "Set to 0 to force CUDA sync copy instead of async",
266 MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
267 OPAL_INFO_LVL_9,
268 MCA_BASE_VAR_SCOPE_READONLY,
269 &mca_common_cuda_async);
270
271 /* Use this parameter to increase the number of outstanding events allows */
272 (void) mca_base_var_register("ompi", "mpi", "common_cuda", "event_max",
273 "Set number of oustanding CUDA events",
274 MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
275 OPAL_INFO_LVL_9,
276 MCA_BASE_VAR_SCOPE_READONLY,
277 &cuda_event_max);
278
279 /* Use this flag to test cuMemcpyAsync vs cuMemcpy */
280 mca_common_cuda_cumemcpy_async = 1;
281 (void) mca_base_var_register("ompi", "mpi", "common_cuda", "cumemcpy_async",
282 "Set to 0 to force CUDA cuMemcpy instead of cuMemcpyAsync/cuStreamSynchronize",
283 MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
284 OPAL_INFO_LVL_5,
285 MCA_BASE_VAR_SCOPE_READONLY,
286 &mca_common_cuda_cumemcpy_async);
287
288 #if OPAL_ENABLE_DEBUG
289 /* Use this flag to dump out timing of cumempcy sync and async */
290 mca_common_cuda_cumemcpy_timing = 0;
291 (void) mca_base_var_register("ompi", "mpi", "common_cuda", "cumemcpy_timing",
292 "Set to 1 to dump timing of eager copies",
293 MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
294 OPAL_INFO_LVL_5,
295 MCA_BASE_VAR_SCOPE_READONLY,
296 &mca_common_cuda_cumemcpy_timing);
297 #endif /* OPAL_ENABLE_DEBUG */
298
299 (void) mca_base_var_register("ompi", "mpi", "common_cuda", "gpu_mem_check_workaround",
300 "Set to 0 to disable GPU memory check workaround. A user would rarely have to do this.",
301 MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
302 OPAL_INFO_LVL_9,
303 MCA_BASE_VAR_SCOPE_READONLY,
304 &mca_common_cuda_gpu_mem_check_workaround);
305 }
306
307 /**
308 * This is the first stage of initialization. This function is called
309 * explicitly by any BTLs that can support CUDA-aware. It is called during
310 * the component open phase of initialization. This fuction will look for
311 * the SONAME of the library which is libcuda.so.1. In most cases, this will
312 * result in the library found. However, there are some setups that require
313 * the extra steps for searching. This function will then load the symbols
314 * needed from the CUDA driver library. Any failure will result in this
315 * initialization failing and status will be set showing that.
316 */
mca_common_cuda_stage_one_init(void)317 int mca_common_cuda_stage_one_init(void)
318 {
319 int retval, i, j;
320 char *cudalibs[] = {"libcuda.so.1", "libcuda.dylib", NULL};
321 char *searchpaths[] = {"", "/usr/lib64", NULL};
322 char **errmsgs = NULL;
323 char *errmsg = NULL;
324 int errsize;
325 bool stage_one_init_passed = false;
326
327 stage_one_init_ref_count++;
328 if (stage_one_init_ref_count > 1) {
329 opal_output_verbose(10, mca_common_cuda_output,
330 "CUDA: stage_one_init_ref_count is now %d, no need to init",
331 stage_one_init_ref_count);
332 return OPAL_SUCCESS;
333 }
334
335 /* This is a no-op in most cases as the parameters were registered earlier */
336 mca_common_cuda_register_mca_variables();
337
338 OBJ_CONSTRUCT(&common_cuda_init_lock, opal_mutex_t);
339 OBJ_CONSTRUCT(&common_cuda_htod_lock, opal_mutex_t);
340 OBJ_CONSTRUCT(&common_cuda_dtoh_lock, opal_mutex_t);
341 OBJ_CONSTRUCT(&common_cuda_ipc_lock, opal_mutex_t);
342
343 mca_common_cuda_output = opal_output_open(NULL);
344 opal_output_set_verbosity(mca_common_cuda_output, mca_common_cuda_verbose);
345
346 opal_output_verbose(10, mca_common_cuda_output,
347 "CUDA: stage_one_init_ref_count is now %d, initializing",
348 stage_one_init_ref_count);
349
350 /* First check if the support is enabled. In the case that the user has
351 * turned it off, we do not need to continue with any CUDA specific
352 * initialization. Do this after MCA parameter registration. */
353 if (!opal_cuda_support) {
354 return 1;
355 }
356
357 if (!OPAL_HAVE_DL_SUPPORT) {
358 opal_show_help("help-mpi-common-cuda.txt", "dlopen disabled", true);
359 return 1;
360 }
361
362 /* Now walk through all the potential names libcuda and find one
363 * that works. If it does, all is good. If not, print out all
364 * the messages about why things failed. This code was careful
365 * to try and save away all error messages if the loading ultimately
366 * failed to help with debugging.
367 *
368 * NOTE: On the first loop we just utilize the default loading
369 * paths from the system. For the second loop, set /usr/lib64 to
370 * the search path and try again. This is done to handle the case
371 * where we have both 32 and 64 bit libcuda.so libraries
372 * installed. Even when running in 64-bit mode, the /usr/lib
373 * directory is searched first and we may find a 32-bit
374 * libcuda.so.1 library. Loading of this library will fail as the
375 * OPAL DL framework does not handle having the wrong ABI in the
376 * search path (unlike ld or ld.so). Note that we only set this
377 * search path after the original search. This is so that
378 * LD_LIBRARY_PATH and run path settings are respected. Setting
379 * this search path overrides them (rather then being
380 * appended). */
381 j = 0;
382 while (searchpaths[j] != NULL) {
383 i = 0;
384 while (cudalibs[i] != NULL) {
385 char *filename = NULL;
386 char *str = NULL;
387
388 /* If there's a non-empty search path, prepend it
389 to the library filename */
390 if (strlen(searchpaths[j]) > 0) {
391 asprintf(&filename, "%s/%s", searchpaths[j], cudalibs[i]);
392 } else {
393 filename = strdup(cudalibs[i]);
394 }
395 if (NULL == filename) {
396 opal_show_help("help-mpi-common-cuda.txt", "No memory",
397 true, OPAL_PROC_MY_HOSTNAME);
398 return 1;
399 }
400
401 retval = opal_dl_open(filename, false, false,
402 &libcuda_handle, &str);
403 if (OPAL_SUCCESS != retval || NULL == libcuda_handle) {
404 if (NULL != str) {
405 opal_argv_append(&errsize, &errmsgs, str);
406 } else {
407 opal_argv_append(&errsize, &errmsgs,
408 "opal_dl_open() returned NULL.");
409 }
410 opal_output_verbose(10, mca_common_cuda_output,
411 "CUDA: Library open error: %s",
412 errmsgs[errsize-1]);
413 } else {
414 opal_output_verbose(10, mca_common_cuda_output,
415 "CUDA: Library successfully opened %s",
416 cudalibs[i]);
417 stage_one_init_passed = true;
418 break;
419 }
420 i++;
421
422 free(filename);
423 }
424 if (true == stage_one_init_passed) {
425 break; /* Break out of outer loop */
426 }
427 j++;
428 }
429
430 if (true != stage_one_init_passed) {
431 errmsg = opal_argv_join(errmsgs, '\n');
432 opal_show_help("help-mpi-common-cuda.txt", "dlopen failed", true,
433 errmsg);
434 opal_cuda_support = 0;
435 }
436 opal_argv_free(errmsgs);
437 free(errmsg);
438
439 if (true != stage_one_init_passed) {
440 return 1;
441 }
442 opal_cuda_add_initialization_function(&mca_common_cuda_stage_two_init);
443 OBJ_CONSTRUCT(&common_cuda_memory_registrations, opal_list_t);
444
445 /* Map in the functions that we need. Note that if there is an error
446 * the macro OPAL_CUDA_DLSYM will print an error and call return. */
447 OPAL_CUDA_DLSYM(libcuda_handle, cuStreamCreate);
448 OPAL_CUDA_DLSYM(libcuda_handle, cuCtxGetCurrent);
449 OPAL_CUDA_DLSYM(libcuda_handle, cuEventCreate);
450 OPAL_CUDA_DLSYM(libcuda_handle, cuEventRecord);
451 OPAL_CUDA_DLSYM(libcuda_handle, cuMemHostRegister);
452 OPAL_CUDA_DLSYM(libcuda_handle, cuMemHostUnregister);
453 OPAL_CUDA_DLSYM(libcuda_handle, cuPointerGetAttribute);
454 OPAL_CUDA_DLSYM(libcuda_handle, cuEventQuery);
455 OPAL_CUDA_DLSYM(libcuda_handle, cuEventDestroy);
456 OPAL_CUDA_DLSYM(libcuda_handle, cuStreamWaitEvent);
457 OPAL_CUDA_DLSYM(libcuda_handle, cuMemcpyAsync);
458 OPAL_CUDA_DLSYM(libcuda_handle, cuMemcpy);
459 OPAL_CUDA_DLSYM(libcuda_handle, cuMemFree);
460 OPAL_CUDA_DLSYM(libcuda_handle, cuMemAlloc);
461 OPAL_CUDA_DLSYM(libcuda_handle, cuMemGetAddressRange);
462 OPAL_CUDA_DLSYM(libcuda_handle, cuIpcGetEventHandle);
463 OPAL_CUDA_DLSYM(libcuda_handle, cuIpcOpenEventHandle);
464 OPAL_CUDA_DLSYM(libcuda_handle, cuIpcOpenMemHandle);
465 OPAL_CUDA_DLSYM(libcuda_handle, cuIpcCloseMemHandle);
466 OPAL_CUDA_DLSYM(libcuda_handle, cuIpcGetMemHandle);
467 OPAL_CUDA_DLSYM(libcuda_handle, cuCtxGetDevice);
468 OPAL_CUDA_DLSYM(libcuda_handle, cuDeviceCanAccessPeer);
469 OPAL_CUDA_DLSYM(libcuda_handle, cuDeviceGet);
470 #if OPAL_CUDA_GDR_SUPPORT
471 OPAL_CUDA_DLSYM(libcuda_handle, cuPointerSetAttribute);
472 #endif /* OPAL_CUDA_GDR_SUPPORT */
473 OPAL_CUDA_DLSYM(libcuda_handle, cuCtxSetCurrent);
474 OPAL_CUDA_DLSYM(libcuda_handle, cuEventSynchronize);
475 OPAL_CUDA_DLSYM(libcuda_handle, cuStreamSynchronize);
476 OPAL_CUDA_DLSYM(libcuda_handle, cuStreamDestroy);
477 #if OPAL_CUDA_GET_ATTRIBUTES
478 OPAL_CUDA_DLSYM(libcuda_handle, cuPointerGetAttributes);
479 #endif /* OPAL_CUDA_GET_ATTRIBUTES */
480 return 0;
481 }
482
483 /**
484 * This function is registered with the OPAL CUDA support. In that way,
485 * these function pointers will be loaded into the OPAL CUDA code when
486 * the first convertor is initialized. This does not trigger any CUDA
487 * specific initialization as this may just be a host buffer that is
488 * triggering this call.
489 */
mca_common_cuda_stage_two_init(opal_common_cuda_function_table_t * ftable)490 static int mca_common_cuda_stage_two_init(opal_common_cuda_function_table_t *ftable)
491 {
492 if (OPAL_UNLIKELY(!opal_cuda_support)) {
493 return OPAL_ERROR;
494 }
495
496 ftable->gpu_is_gpu_buffer = &mca_common_cuda_is_gpu_buffer;
497 ftable->gpu_cu_memcpy_async = &mca_common_cuda_cu_memcpy_async;
498 ftable->gpu_cu_memcpy = &mca_common_cuda_cu_memcpy;
499 ftable->gpu_memmove = &mca_common_cuda_memmove;
500
501 opal_output_verbose(30, mca_common_cuda_output,
502 "CUDA: support functions initialized");
503 return OPAL_SUCCESS;
504 }
505
506 /**
507 * This is the last phase of initialization. This is triggered when we examine
508 * a buffer pointer and determine it is a GPU buffer. We then assume the user
509 * has selected their GPU and we can go ahead with all the CUDA related
510 * initializations. If we get an error, just return. Cleanup of resources
511 * will happen when fini is called.
512 */
mca_common_cuda_stage_three_init(void)513 static int mca_common_cuda_stage_three_init(void)
514 {
515 int i, s, rc;
516 CUresult res;
517 CUcontext cuContext;
518 common_cuda_mem_regs_t *mem_reg;
519
520 OPAL_THREAD_LOCK(&common_cuda_init_lock);
521 opal_output_verbose(20, mca_common_cuda_output,
522 "CUDA: entering stage three init");
523
524 /* Compiled without support or user disabled support */
525 if (OPAL_UNLIKELY(!opal_cuda_support)) {
526 opal_output_verbose(20, mca_common_cuda_output,
527 "CUDA: No mpi cuda support, exiting stage three init");
528 stage_three_init_complete = true;
529 OPAL_THREAD_UNLOCK(&common_cuda_init_lock);
530 return OPAL_ERROR;
531 }
532
533 /* In case another thread snuck in and completed the initialization */
534 if (true == stage_three_init_complete) {
535 if (common_cuda_initialized) {
536 opal_output_verbose(20, mca_common_cuda_output,
537 "CUDA: Stage three already complete, exiting stage three init");
538 OPAL_THREAD_UNLOCK(&common_cuda_init_lock);
539 return OPAL_SUCCESS;
540 } else {
541 opal_output_verbose(20, mca_common_cuda_output,
542 "CUDA: Stage three already complete, failed during the init");
543 OPAL_THREAD_UNLOCK(&common_cuda_init_lock);
544 return OPAL_ERROR;
545 }
546 }
547
548 /* Check to see if this process is running in a CUDA context. If
549 * so, all is good. If not, then disable registration of memory. */
550 res = cuFunc.cuCtxGetCurrent(&cuContext);
551 if (CUDA_SUCCESS != res) {
552 if (mca_common_cuda_warning) {
553 /* Check for the not initialized error since we can make suggestions to
554 * user for this error. */
555 if (CUDA_ERROR_NOT_INITIALIZED == res) {
556 opal_show_help("help-mpi-common-cuda.txt", "cuCtxGetCurrent failed not initialized",
557 true);
558 } else {
559 opal_show_help("help-mpi-common-cuda.txt", "cuCtxGetCurrent failed",
560 true, res);
561 }
562 }
563 mca_common_cuda_enabled = false;
564 mca_common_cuda_register_memory = false;
565 } else if ((CUDA_SUCCESS == res) && (NULL == cuContext)) {
566 if (mca_common_cuda_warning) {
567 opal_show_help("help-mpi-common-cuda.txt", "cuCtxGetCurrent returned NULL",
568 true);
569 }
570 mca_common_cuda_enabled = false;
571 mca_common_cuda_register_memory = false;
572 } else {
573 /* All is good. mca_common_cuda_register_memory will retain its original
574 * value. Normally, that is 1, but the user can override it to disable
575 * registration of the internal buffers. */
576 mca_common_cuda_enabled = true;
577 opal_output_verbose(20, mca_common_cuda_output,
578 "CUDA: cuCtxGetCurrent succeeded");
579 }
580
581 /* No need to go on at this point. If we cannot create a context and we are at
582 * the point where we are making MPI calls, it is time to fully disable
583 * CUDA support.
584 */
585 if (false == mca_common_cuda_enabled) {
586 OPAL_THREAD_UNLOCK(&common_cuda_init_lock);
587 return OPAL_ERROR;
588 }
589
590 if (true == mca_common_cuda_enabled) {
591 /* Set up an array to store outstanding IPC async copy events */
592 cuda_event_ipc_num_used = 0;
593 cuda_event_ipc_first_avail = 0;
594 cuda_event_ipc_first_used = 0;
595
596 cuda_event_ipc_array = (CUevent *) calloc(cuda_event_max, sizeof(CUevent *));
597 if (NULL == cuda_event_ipc_array) {
598 opal_show_help("help-mpi-common-cuda.txt", "No memory",
599 true, OPAL_PROC_MY_HOSTNAME);
600 rc = OPAL_ERROR;
601 goto cleanup_and_error;
602 }
603
604 /* Create the events since they can be reused. */
605 for (i = 0; i < cuda_event_max; i++) {
606 res = cuFunc.cuEventCreate(&cuda_event_ipc_array[i], CU_EVENT_DISABLE_TIMING);
607 if (CUDA_SUCCESS != res) {
608 opal_show_help("help-mpi-common-cuda.txt", "cuEventCreate failed",
609 true, OPAL_PROC_MY_HOSTNAME, res);
610 rc = OPAL_ERROR;
611 goto cleanup_and_error;
612 }
613 }
614
615 /* The first available status index is 0. Make an empty frag
616 array. */
617 cuda_event_ipc_frag_array = (struct mca_btl_base_descriptor_t **)
618 malloc(sizeof(struct mca_btl_base_descriptor_t *) * cuda_event_max);
619 if (NULL == cuda_event_ipc_frag_array) {
620 opal_show_help("help-mpi-common-cuda.txt", "No memory",
621 true, OPAL_PROC_MY_HOSTNAME);
622 rc = OPAL_ERROR;
623 goto cleanup_and_error;
624 }
625 }
626
627 if (true == mca_common_cuda_enabled) {
628 /* Set up an array to store outstanding async dtoh events. Used on the
629 * sending side for asynchronous copies. */
630 cuda_event_dtoh_num_used = 0;
631 cuda_event_dtoh_first_avail = 0;
632 cuda_event_dtoh_first_used = 0;
633
634 cuda_event_dtoh_array = (CUevent *) calloc(cuda_event_max, sizeof(CUevent *));
635 if (NULL == cuda_event_dtoh_array) {
636 opal_show_help("help-mpi-common-cuda.txt", "No memory",
637 true, OPAL_PROC_MY_HOSTNAME);
638 rc = OPAL_ERROR;
639 goto cleanup_and_error;
640 }
641
642 /* Create the events since they can be reused. */
643 for (i = 0; i < cuda_event_max; i++) {
644 res = cuFunc.cuEventCreate(&cuda_event_dtoh_array[i], CU_EVENT_DISABLE_TIMING);
645 if (CUDA_SUCCESS != res) {
646 opal_show_help("help-mpi-common-cuda.txt", "cuEventCreate failed",
647 true, OPAL_PROC_MY_HOSTNAME, res);
648 rc = OPAL_ERROR;
649 goto cleanup_and_error;
650 }
651 }
652
653 /* The first available status index is 0. Make an empty frag
654 array. */
655 cuda_event_dtoh_frag_array = (struct mca_btl_base_descriptor_t **)
656 malloc(sizeof(struct mca_btl_base_descriptor_t *) * cuda_event_max);
657 if (NULL == cuda_event_dtoh_frag_array) {
658 opal_show_help("help-mpi-common-cuda.txt", "No memory",
659 true, OPAL_PROC_MY_HOSTNAME);
660 rc = OPAL_ERROR;
661 goto cleanup_and_error;
662 }
663
664 /* Set up an array to store outstanding async htod events. Used on the
665 * receiving side for asynchronous copies. */
666 cuda_event_htod_num_used = 0;
667 cuda_event_htod_first_avail = 0;
668 cuda_event_htod_first_used = 0;
669
670 cuda_event_htod_array = (CUevent *) calloc(cuda_event_max, sizeof(CUevent *));
671 if (NULL == cuda_event_htod_array) {
672 opal_show_help("help-mpi-common-cuda.txt", "No memory",
673 true, OPAL_PROC_MY_HOSTNAME);
674 rc = OPAL_ERROR;
675 goto cleanup_and_error;
676 }
677
678 /* Create the events since they can be reused. */
679 for (i = 0; i < cuda_event_max; i++) {
680 res = cuFunc.cuEventCreate(&cuda_event_htod_array[i], CU_EVENT_DISABLE_TIMING);
681 if (CUDA_SUCCESS != res) {
682 opal_show_help("help-mpi-common-cuda.txt", "cuEventCreate failed",
683 true, OPAL_PROC_MY_HOSTNAME, res);
684 rc = OPAL_ERROR;
685 goto cleanup_and_error;
686 }
687 }
688
689 /* The first available status index is 0. Make an empty frag
690 array. */
691 cuda_event_htod_frag_array = (struct mca_btl_base_descriptor_t **)
692 malloc(sizeof(struct mca_btl_base_descriptor_t *) * cuda_event_max);
693 if (NULL == cuda_event_htod_frag_array) {
694 opal_show_help("help-mpi-common-cuda.txt", "No memory",
695 true, OPAL_PROC_MY_HOSTNAME);
696 rc = OPAL_ERROR;
697 goto cleanup_and_error;
698 }
699 }
700
701 s = opal_list_get_size(&common_cuda_memory_registrations);
702 for(i = 0; i < s; i++) {
703 mem_reg = (common_cuda_mem_regs_t *)
704 opal_list_remove_first(&common_cuda_memory_registrations);
705 if (mca_common_cuda_enabled && mca_common_cuda_register_memory) {
706 res = cuFunc.cuMemHostRegister(mem_reg->ptr, mem_reg->amount, 0);
707 if (res != CUDA_SUCCESS) {
708 /* If registering the memory fails, print a message and continue.
709 * This is not a fatal error. */
710 opal_show_help("help-mpi-common-cuda.txt", "cuMemHostRegister during init failed",
711 true, mem_reg->ptr, mem_reg->amount,
712 OPAL_PROC_MY_HOSTNAME, res, mem_reg->msg);
713 } else {
714 opal_output_verbose(20, mca_common_cuda_output,
715 "CUDA: cuMemHostRegister OK on rcache %s: "
716 "address=%p, bufsize=%d",
717 mem_reg->msg, mem_reg->ptr, (int)mem_reg->amount);
718 }
719 }
720 free(mem_reg->msg);
721 OBJ_RELEASE(mem_reg);
722 }
723
724 /* Create stream for use in ipc asynchronous copies */
725 res = cuFunc.cuStreamCreate(&ipcStream, 0);
726 if (OPAL_UNLIKELY(res != CUDA_SUCCESS)) {
727 opal_show_help("help-mpi-common-cuda.txt", "cuStreamCreate failed",
728 true, OPAL_PROC_MY_HOSTNAME, res);
729 rc = OPAL_ERROR;
730 goto cleanup_and_error;
731 }
732
733 /* Create stream for use in dtoh asynchronous copies */
734 res = cuFunc.cuStreamCreate(&dtohStream, 0);
735 if (OPAL_UNLIKELY(res != CUDA_SUCCESS)) {
736 opal_show_help("help-mpi-common-cuda.txt", "cuStreamCreate failed",
737 true, OPAL_PROC_MY_HOSTNAME, res);
738 rc = OPAL_ERROR;
739 goto cleanup_and_error;
740 }
741
742 /* Create stream for use in htod asynchronous copies */
743 res = cuFunc.cuStreamCreate(&htodStream, 0);
744 if (OPAL_UNLIKELY(res != CUDA_SUCCESS)) {
745 opal_show_help("help-mpi-common-cuda.txt", "cuStreamCreate failed",
746 true, OPAL_PROC_MY_HOSTNAME, res);
747 rc = OPAL_ERROR;
748 goto cleanup_and_error;
749 }
750
751 if (mca_common_cuda_cumemcpy_async) {
752 /* Create stream for use in cuMemcpyAsync synchronous copies */
753 res = cuFunc.cuStreamCreate(&memcpyStream, 0);
754 if (OPAL_UNLIKELY(res != CUDA_SUCCESS)) {
755 opal_show_help("help-mpi-common-cuda.txt", "cuStreamCreate failed",
756 true, OPAL_PROC_MY_HOSTNAME, res);
757 rc = OPAL_ERROR;
758 goto cleanup_and_error;
759 }
760 }
761
762 res = cuFunc.cuMemHostRegister(&checkmem, sizeof(int), 0);
763 if (res != CUDA_SUCCESS) {
764 /* If registering the memory fails, print a message and continue.
765 * This is not a fatal error. */
766 opal_show_help("help-mpi-common-cuda.txt", "cuMemHostRegister during init failed",
767 true, &checkmem, sizeof(int),
768 OPAL_PROC_MY_HOSTNAME, res, "checkmem");
769
770 } else {
771 opal_output_verbose(20, mca_common_cuda_output,
772 "CUDA: cuMemHostRegister OK on test region");
773 }
774
775 opal_output_verbose(20, mca_common_cuda_output,
776 "CUDA: the extra gpu memory check is %s", (mca_common_cuda_gpu_mem_check_workaround == 1) ? "on":"off");
777
778 opal_output_verbose(30, mca_common_cuda_output,
779 "CUDA: initialized");
780 opal_atomic_mb(); /* Make sure next statement does not get reordered */
781 common_cuda_initialized = true;
782 stage_three_init_complete = true;
783 OPAL_THREAD_UNLOCK(&common_cuda_init_lock);
784 return OPAL_SUCCESS;
785
786 /* If we are here, something went wrong. Cleanup and return an error. */
787 cleanup_and_error:
788 opal_atomic_mb(); /* Make sure next statement does not get reordered */
789 stage_three_init_complete = true;
790 OPAL_THREAD_UNLOCK(&common_cuda_init_lock);
791 return rc;
792 }
793
794 /**
795 * Cleanup all CUDA resources.
796 *
797 * Note: Still figuring out how to get cuMemHostUnregister called from the smcuda sm
798 * rcache. Looks like with the memory pool from openib (grdma), the unregistering is
799 * called as the free list is destructed. Not true for the sm mpool. This means we
800 * are currently still leaking some host memory we registered with CUDA.
801 */
mca_common_cuda_fini(void)802 void mca_common_cuda_fini(void)
803 {
804 int i;
805 CUresult res;
806
807 if (false == common_cuda_initialized) {
808 stage_one_init_ref_count--;
809 opal_output_verbose(20, mca_common_cuda_output,
810 "CUDA: mca_common_cuda_fini, never completed initialization so "
811 "skipping fini, ref_count is now %d", stage_one_init_ref_count);
812 return;
813 }
814
815 if (0 == stage_one_init_ref_count) {
816 opal_output_verbose(20, mca_common_cuda_output,
817 "CUDA: mca_common_cuda_fini, ref_count=%d, fini is already complete",
818 stage_one_init_ref_count);
819 return;
820 }
821
822 if (1 == stage_one_init_ref_count) {
823 opal_output_verbose(20, mca_common_cuda_output,
824 "CUDA: mca_common_cuda_fini, ref_count=%d, cleaning up started",
825 stage_one_init_ref_count);
826
827 /* This call is in here to make sure the context is still valid.
828 * This was the one way of checking which did not cause problems
829 * while calling into the CUDA library. This check will detect if
830 * a user has called cudaDeviceReset prior to MPI_Finalize. If so,
831 * then this call will fail and we skip cleaning up CUDA resources. */
832 res = cuFunc.cuMemHostUnregister(&checkmem);
833 if (CUDA_SUCCESS != res) {
834 ctx_ok = 0;
835 }
836 opal_output_verbose(20, mca_common_cuda_output,
837 "CUDA: mca_common_cuda_fini, cuMemHostUnregister returned %d, ctx_ok=%d",
838 res, ctx_ok);
839
840 if (NULL != cuda_event_ipc_array) {
841 if (ctx_ok) {
842 for (i = 0; i < cuda_event_max; i++) {
843 if (NULL != cuda_event_ipc_array[i]) {
844 cuFunc.cuEventDestroy(cuda_event_ipc_array[i]);
845 }
846 }
847 }
848 free(cuda_event_ipc_array);
849 }
850 if (NULL != cuda_event_htod_array) {
851 if (ctx_ok) {
852 for (i = 0; i < cuda_event_max; i++) {
853 if (NULL != cuda_event_htod_array[i]) {
854 cuFunc.cuEventDestroy(cuda_event_htod_array[i]);
855 }
856 }
857 }
858 free(cuda_event_htod_array);
859 }
860
861 if (NULL != cuda_event_dtoh_array) {
862 if (ctx_ok) {
863 for (i = 0; i < cuda_event_max; i++) {
864 if (NULL != cuda_event_dtoh_array[i]) {
865 cuFunc.cuEventDestroy(cuda_event_dtoh_array[i]);
866 }
867 }
868 }
869 free(cuda_event_dtoh_array);
870 }
871
872 if (NULL != cuda_event_ipc_frag_array) {
873 free(cuda_event_ipc_frag_array);
874 }
875 if (NULL != cuda_event_htod_frag_array) {
876 free(cuda_event_htod_frag_array);
877 }
878 if (NULL != cuda_event_dtoh_frag_array) {
879 free(cuda_event_dtoh_frag_array);
880 }
881 if ((NULL != ipcStream) && ctx_ok) {
882 cuFunc.cuStreamDestroy(ipcStream);
883 }
884 if ((NULL != dtohStream) && ctx_ok) {
885 cuFunc.cuStreamDestroy(dtohStream);
886 }
887 if ((NULL != htodStream) && ctx_ok) {
888 cuFunc.cuStreamDestroy(htodStream);
889 }
890 if ((NULL != memcpyStream) && ctx_ok) {
891 cuFunc.cuStreamDestroy(memcpyStream);
892 }
893 OBJ_DESTRUCT(&common_cuda_init_lock);
894 OBJ_DESTRUCT(&common_cuda_htod_lock);
895 OBJ_DESTRUCT(&common_cuda_dtoh_lock);
896 OBJ_DESTRUCT(&common_cuda_ipc_lock);
897 if (NULL != libcuda_handle) {
898 opal_dl_close(libcuda_handle);
899 }
900
901 opal_output_verbose(20, mca_common_cuda_output,
902 "CUDA: mca_common_cuda_fini, ref_count=%d, cleaning up all done",
903 stage_one_init_ref_count);
904
905 opal_output_close(mca_common_cuda_output);
906
907 } else {
908 opal_output_verbose(20, mca_common_cuda_output,
909 "CUDA: mca_common_cuda_fini, ref_count=%d, cuda still in use",
910 stage_one_init_ref_count);
911 }
912 stage_one_init_ref_count--;
913 }
914
915 /**
916 * Call the CUDA register function so we pin the memory in the CUDA
917 * space.
918 */
mca_common_cuda_register(void * ptr,size_t amount,char * msg)919 void mca_common_cuda_register(void *ptr, size_t amount, char *msg) {
920 int res;
921
922 /* Always first check if the support is enabled. If not, just return */
923 if (!opal_cuda_support)
924 return;
925
926 if (!common_cuda_initialized) {
927 OPAL_THREAD_LOCK(&common_cuda_init_lock);
928 if (!common_cuda_initialized) {
929 common_cuda_mem_regs_t *regptr;
930 regptr = OBJ_NEW(common_cuda_mem_regs_t);
931 regptr->ptr = ptr;
932 regptr->amount = amount;
933 regptr->msg = strdup(msg);
934 opal_list_append(&common_cuda_memory_registrations,
935 (opal_list_item_t*)regptr);
936 OPAL_THREAD_UNLOCK(&common_cuda_init_lock);
937 return;
938 }
939 OPAL_THREAD_UNLOCK(&common_cuda_init_lock);
940 }
941
942 if (mca_common_cuda_enabled && mca_common_cuda_register_memory) {
943 res = cuFunc.cuMemHostRegister(ptr, amount, 0);
944 if (OPAL_UNLIKELY(res != CUDA_SUCCESS)) {
945 /* If registering the memory fails, print a message and continue.
946 * This is not a fatal error. */
947 opal_show_help("help-mpi-common-cuda.txt", "cuMemHostRegister failed",
948 true, ptr, amount,
949 OPAL_PROC_MY_HOSTNAME, res, msg);
950 } else {
951 opal_output_verbose(20, mca_common_cuda_output,
952 "CUDA: cuMemHostRegister OK on rcache %s: "
953 "address=%p, bufsize=%d",
954 msg, ptr, (int)amount);
955 }
956 }
957 }
958
959 /**
960 * Call the CUDA unregister function so we unpin the memory in the CUDA
961 * space.
962 */
mca_common_cuda_unregister(void * ptr,char * msg)963 void mca_common_cuda_unregister(void *ptr, char *msg) {
964 int res, i, s;
965 common_cuda_mem_regs_t *mem_reg;
966
967 /* This can happen if memory was queued up to be registered, but
968 * no CUDA operations happened, so it never was registered.
969 * Therefore, just release any of the resources. */
970 if (!common_cuda_initialized) {
971 s = opal_list_get_size(&common_cuda_memory_registrations);
972 for(i = 0; i < s; i++) {
973 mem_reg = (common_cuda_mem_regs_t *)
974 opal_list_remove_first(&common_cuda_memory_registrations);
975 free(mem_reg->msg);
976 OBJ_RELEASE(mem_reg);
977 }
978 return;
979 }
980
981 if (mca_common_cuda_enabled && mca_common_cuda_register_memory) {
982 res = cuFunc.cuMemHostUnregister(ptr);
983 if (OPAL_UNLIKELY(res != CUDA_SUCCESS)) {
984 /* If unregistering the memory fails, just continue. This is during
985 * shutdown. Only print when running in verbose mode. */
986 opal_output_verbose(20, mca_common_cuda_output,
987 "CUDA: cuMemHostUnregister failed: ptr=%p, res=%d, rcache=%s",
988 ptr, res, msg);
989
990 } else {
991 opal_output_verbose(20, mca_common_cuda_output,
992 "CUDA: cuMemHostUnregister OK on rcache %s: "
993 "address=%p",
994 msg, ptr);
995 }
996 }
997 }
998
999 /*
1000 * Get the memory handle of a local section of memory that can be sent
1001 * to the remote size so it can access the memory. This is the
1002 * registration function for the sending side of a message transfer.
1003 */
cuda_getmemhandle(void * base,size_t size,mca_rcache_base_registration_t * newreg,mca_rcache_base_registration_t * hdrreg)1004 int cuda_getmemhandle(void *base, size_t size, mca_rcache_base_registration_t *newreg,
1005 mca_rcache_base_registration_t *hdrreg)
1006
1007 {
1008 CUmemorytype memType;
1009 CUresult result;
1010 CUipcMemHandle *memHandle;
1011 CUdeviceptr pbase;
1012 size_t psize;
1013
1014 mca_rcache_common_cuda_reg_t *cuda_reg = (mca_rcache_common_cuda_reg_t*)newreg;
1015 memHandle = (CUipcMemHandle *)cuda_reg->data.memHandle;
1016
1017 /* We should only be there if this is a CUDA device pointer */
1018 result = cuFunc.cuPointerGetAttribute(&memType,
1019 CU_POINTER_ATTRIBUTE_MEMORY_TYPE, (CUdeviceptr)base);
1020 assert(CUDA_SUCCESS == result);
1021 assert(CU_MEMORYTYPE_DEVICE == memType);
1022
1023 /* Get the memory handle so we can send it to the remote process. */
1024 result = cuFunc.cuIpcGetMemHandle(memHandle, (CUdeviceptr)base);
1025 CUDA_DUMP_MEMHANDLE((100, memHandle, "GetMemHandle-After"));
1026
1027 if (CUDA_SUCCESS != result) {
1028 opal_show_help("help-mpi-common-cuda.txt", "cuIpcGetMemHandle failed",
1029 true, result, base);
1030 return OPAL_ERROR;
1031 } else {
1032 opal_output_verbose(20, mca_common_cuda_output,
1033 "CUDA: cuIpcGetMemHandle passed: base=%p size=%d",
1034 base, (int)size);
1035 }
1036
1037 /* Need to get the real base and size of the memory handle. This is
1038 * how the remote side saves the handles in a cache. */
1039 result = cuFunc.cuMemGetAddressRange(&pbase, &psize, (CUdeviceptr)base);
1040 if (CUDA_SUCCESS != result) {
1041 opal_show_help("help-mpi-common-cuda.txt", "cuMemGetAddressRange failed",
1042 true, result, base);
1043 return OPAL_ERROR;
1044 } else {
1045 opal_output_verbose(10, mca_common_cuda_output,
1046 "CUDA: cuMemGetAddressRange passed: addr=%p, size=%d, pbase=%p, psize=%d ",
1047 base, (int)size, (void *)pbase, (int)psize);
1048 }
1049
1050 /* Store all the information in the registration */
1051 cuda_reg->base.base = (void *)pbase;
1052 cuda_reg->base.bound = (unsigned char *)pbase + psize - 1;
1053 cuda_reg->data.memh_seg_addr.pval = (void *) pbase;
1054 cuda_reg->data.memh_seg_len = psize;
1055
1056 #if OPAL_CUDA_SYNC_MEMOPS
1057 /* With CUDA 6.0, we can set an attribute on the memory pointer that will
1058 * ensure any synchronous copies are completed prior to any other access
1059 * of the memory region. This means we do not need to record an event
1060 * and send to the remote side.
1061 */
1062 memType = 1; /* Just use this variable since we already have it */
1063 result = cuFunc.cuPointerSetAttribute(&memType, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,
1064 (CUdeviceptr)base);
1065 if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
1066 opal_show_help("help-mpi-common-cuda.txt", "cuPointerSetAttribute failed",
1067 true, OPAL_PROC_MY_HOSTNAME, result, base);
1068 return OPAL_ERROR;
1069 }
1070 #else
1071 /* Need to record the event to ensure that any memcopies into the
1072 * device memory have completed. The event handle associated with
1073 * this event is sent to the remote process so that it will wait
1074 * on this event prior to copying data out of the device memory.
1075 * Note that this needs to be the NULL stream to make since it is
1076 * unknown what stream any copies into the device memory were done
1077 * with. */
1078 result = cuFunc.cuEventRecord((CUevent)cuda_reg->data.event, 0);
1079 if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
1080 opal_show_help("help-mpi-common-cuda.txt", "cuEventRecord failed",
1081 true, result, base);
1082 return OPAL_ERROR;
1083 }
1084 #endif /* OPAL_CUDA_SYNC_MEMOPS */
1085
1086 return OPAL_SUCCESS;
1087 }
1088
1089 /*
1090 * This function is called by the local side that called the cuda_getmemhandle.
1091 * There is nothing to be done so just return.
1092 */
cuda_ungetmemhandle(void * reg_data,mca_rcache_base_registration_t * reg)1093 int cuda_ungetmemhandle(void *reg_data, mca_rcache_base_registration_t *reg)
1094 {
1095 opal_output_verbose(10, mca_common_cuda_output,
1096 "CUDA: cuda_ungetmemhandle (no-op): base=%p", reg->base);
1097 CUDA_DUMP_MEMHANDLE((100, ((mca_rcache_common_cuda_reg_t *)reg)->data.memHandle, "cuda_ungetmemhandle"));
1098
1099 return OPAL_SUCCESS;
1100 }
1101
1102 /*
1103 * Open a memory handle that refers to remote memory so we can get an address
1104 * that works on the local side. This is the registration function for the
1105 * remote side of a transfer. newreg contains the new handle. hddrreg contains
1106 * the memory handle that was received from the remote side.
1107 */
cuda_openmemhandle(void * base,size_t size,mca_rcache_base_registration_t * newreg,mca_rcache_base_registration_t * hdrreg)1108 int cuda_openmemhandle(void *base, size_t size, mca_rcache_base_registration_t *newreg,
1109 mca_rcache_base_registration_t *hdrreg)
1110 {
1111 CUresult result;
1112 CUipcMemHandle *memHandle;
1113 mca_rcache_common_cuda_reg_t *cuda_newreg = (mca_rcache_common_cuda_reg_t*)newreg;
1114
1115 /* Save in local variable to avoid ugly casting */
1116 memHandle = (CUipcMemHandle *)cuda_newreg->data.memHandle;
1117 CUDA_DUMP_MEMHANDLE((100, memHandle, "Before call to cuIpcOpenMemHandle"));
1118
1119 /* Open the memory handle and store it into the registration structure. */
1120 result = cuFunc.cuIpcOpenMemHandle((CUdeviceptr *)&newreg->alloc_base, *memHandle,
1121 CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS);
1122
1123 /* If there are some stale entries in the cache, they can cause other
1124 * registrations to fail. Let the caller know that so that can attempt
1125 * to clear them out. */
1126 if (CUDA_ERROR_ALREADY_MAPPED == result) {
1127 opal_output_verbose(10, mca_common_cuda_output,
1128 "CUDA: cuIpcOpenMemHandle returned CUDA_ERROR_ALREADY_MAPPED for "
1129 "p=%p,size=%d: notify memory pool\n", base, (int)size);
1130 return OPAL_ERR_WOULD_BLOCK;
1131 }
1132 if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
1133 opal_show_help("help-mpi-common-cuda.txt", "cuIpcOpenMemHandle failed",
1134 true, OPAL_PROC_MY_HOSTNAME, result, base);
1135 /* Currently, this is a non-recoverable error */
1136 return OPAL_ERROR;
1137 } else {
1138 opal_output_verbose(10, mca_common_cuda_output,
1139 "CUDA: cuIpcOpenMemHandle passed: base=%p (remote base=%p,size=%d)",
1140 newreg->alloc_base, base, (int)size);
1141 CUDA_DUMP_MEMHANDLE((200, memHandle, "cuIpcOpenMemHandle"));
1142 }
1143
1144 return OPAL_SUCCESS;
1145 }
1146
1147 /*
1148 * Close a memory handle that refers to remote memory.
1149 */
cuda_closememhandle(void * reg_data,mca_rcache_base_registration_t * reg)1150 int cuda_closememhandle(void *reg_data, mca_rcache_base_registration_t *reg)
1151 {
1152 CUresult result;
1153 mca_rcache_common_cuda_reg_t *cuda_reg = (mca_rcache_common_cuda_reg_t*)reg;
1154
1155 /* Only attempt to close if we have valid context. This can change if a call
1156 * to the fini function is made and we discover context is gone. */
1157 if (ctx_ok) {
1158 result = cuFunc.cuIpcCloseMemHandle((CUdeviceptr)cuda_reg->base.alloc_base);
1159 if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
1160 if (CUDA_ERROR_DEINITIALIZED != result) {
1161 opal_show_help("help-mpi-common-cuda.txt", "cuIpcCloseMemHandle failed",
1162 true, result, cuda_reg->base.alloc_base);
1163 }
1164 /* We will just continue on and hope things continue to work. */
1165 } else {
1166 opal_output_verbose(10, mca_common_cuda_output,
1167 "CUDA: cuIpcCloseMemHandle passed: base=%p",
1168 cuda_reg->base.alloc_base);
1169 CUDA_DUMP_MEMHANDLE((100, cuda_reg->data.memHandle, "cuIpcCloseMemHandle"));
1170 }
1171 }
1172
1173 return OPAL_SUCCESS;
1174 }
1175
mca_common_cuda_construct_event_and_handle(uintptr_t * event,void * handle)1176 void mca_common_cuda_construct_event_and_handle(uintptr_t *event, void *handle)
1177 {
1178 CUresult result;
1179
1180 result = cuFunc.cuEventCreate((CUevent *)event, CU_EVENT_INTERPROCESS | CU_EVENT_DISABLE_TIMING);
1181 if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
1182 opal_show_help("help-mpi-common-cuda.txt", "cuEventCreate failed",
1183 true, OPAL_PROC_MY_HOSTNAME, result);
1184 }
1185
1186 result = cuFunc.cuIpcGetEventHandle((CUipcEventHandle *)handle, (CUevent)*event);
1187 if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
1188 opal_show_help("help-mpi-common-cuda.txt", "cuIpcGetEventHandle failed",
1189 true, result);
1190 }
1191
1192 CUDA_DUMP_EVTHANDLE((10, handle, "construct_event_and_handle"));
1193
1194 }
1195
mca_common_cuda_destruct_event(uintptr_t event)1196 void mca_common_cuda_destruct_event(uintptr_t event)
1197 {
1198 CUresult result;
1199
1200 /* Only attempt to destroy if we have valid context. This can change if a call
1201 * to the fini function is made and we discover context is gone. */
1202 if (ctx_ok) {
1203 result = cuFunc.cuEventDestroy((CUevent)event);
1204 if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
1205 opal_show_help("help-mpi-common-cuda.txt", "cuEventDestroy failed",
1206 true, result);
1207 }
1208 }
1209 }
1210
1211
1212 /*
1213 * Put remote event on stream to ensure that the the start of the
1214 * copy does not start until the completion of the event.
1215 */
mca_common_wait_stream_synchronize(mca_rcache_common_cuda_reg_t * rget_reg)1216 void mca_common_wait_stream_synchronize(mca_rcache_common_cuda_reg_t *rget_reg)
1217 {
1218 #if OPAL_CUDA_SYNC_MEMOPS
1219 /* No need for any of this with SYNC_MEMOPS feature */
1220 return;
1221 #else /* OPAL_CUDA_SYNC_MEMOPS */
1222 CUipcEventHandle evtHandle;
1223 CUevent event;
1224 CUresult result;
1225
1226 memcpy(&evtHandle, rget_reg->data.evtHandle, sizeof(evtHandle));
1227 CUDA_DUMP_EVTHANDLE((100, &evtHandle, "stream_synchronize"));
1228
1229 result = cuFunc.cuIpcOpenEventHandle(&event, evtHandle);
1230 if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
1231 opal_show_help("help-mpi-common-cuda.txt", "cuIpcOpenEventHandle failed",
1232 true, result);
1233 }
1234
1235 /* BEGIN of Workaround - There is a bug in CUDA 4.1 RC2 and earlier
1236 * versions. Need to record an event on the stream, even though
1237 * it is not used, to make sure we do not short circuit our way
1238 * out of the cuStreamWaitEvent test.
1239 */
1240 result = cuFunc.cuEventRecord(event, 0);
1241 if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
1242 opal_show_help("help-mpi-common-cuda.txt", "cuEventRecord failed",
1243 true, OPAL_PROC_MY_HOSTNAME, result);
1244 }
1245 /* END of Workaround */
1246
1247 result = cuFunc.cuStreamWaitEvent(0, event, 0);
1248 if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
1249 opal_show_help("help-mpi-common-cuda.txt", "cuStreamWaitEvent failed",
1250 true, result);
1251 }
1252
1253 /* All done with this event. */
1254 result = cuFunc.cuEventDestroy(event);
1255 if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
1256 opal_show_help("help-mpi-common-cuda.txt", "cuEventDestroy failed",
1257 true, result);
1258 }
1259 #endif /* OPAL_CUDA_SYNC_MEMOPS */
1260 }
1261
1262 /*
1263 * Start the asynchronous copy. Then record and save away an event that will
1264 * be queried to indicate the copy has completed.
1265 */
mca_common_cuda_memcpy(void * dst,void * src,size_t amount,char * msg,struct mca_btl_base_descriptor_t * frag,int * done)1266 int mca_common_cuda_memcpy(void *dst, void *src, size_t amount, char *msg,
1267 struct mca_btl_base_descriptor_t *frag, int *done)
1268 {
1269 CUresult result;
1270 int iter;
1271
1272 OPAL_THREAD_LOCK(&common_cuda_ipc_lock);
1273 /* First make sure there is room to store the event. If not, then
1274 * return an error. The error message will tell the user to try and
1275 * run again, but with a larger array for storing events. */
1276 if (cuda_event_ipc_num_used == cuda_event_max) {
1277 opal_show_help("help-mpi-common-cuda.txt", "Out of cuEvent handles",
1278 true, cuda_event_max, cuda_event_max+100, cuda_event_max+100);
1279 OPAL_THREAD_UNLOCK(&common_cuda_ipc_lock);
1280 return OPAL_ERR_OUT_OF_RESOURCE;
1281 }
1282
1283 if (cuda_event_ipc_num_used > cuda_event_ipc_most) {
1284 cuda_event_ipc_most = cuda_event_ipc_num_used;
1285 /* Just print multiples of 10 */
1286 if (0 == (cuda_event_ipc_most % 10)) {
1287 opal_output_verbose(20, mca_common_cuda_output,
1288 "Maximum ipc events used is now %d", cuda_event_ipc_most);
1289 }
1290 }
1291
1292 /* This is the standard way to run. Running with synchronous copies is available
1293 * to measure the advantages of asynchronous copies. */
1294 if (OPAL_LIKELY(mca_common_cuda_async)) {
1295 result = cuFunc.cuMemcpyAsync((CUdeviceptr)dst, (CUdeviceptr)src, amount, ipcStream);
1296 if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
1297 opal_show_help("help-mpi-common-cuda.txt", "cuMemcpyAsync failed",
1298 true, dst, src, amount, result);
1299 OPAL_THREAD_UNLOCK(&common_cuda_ipc_lock);
1300 return OPAL_ERROR;
1301 } else {
1302 opal_output_verbose(20, mca_common_cuda_output,
1303 "CUDA: cuMemcpyAsync passed: dst=%p, src=%p, size=%d",
1304 dst, src, (int)amount);
1305 }
1306 result = cuFunc.cuEventRecord(cuda_event_ipc_array[cuda_event_ipc_first_avail], ipcStream);
1307 if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
1308 opal_show_help("help-mpi-common-cuda.txt", "cuEventRecord failed",
1309 true, OPAL_PROC_MY_HOSTNAME, result);
1310 OPAL_THREAD_UNLOCK(&common_cuda_ipc_lock);
1311 return OPAL_ERROR;
1312 }
1313 cuda_event_ipc_frag_array[cuda_event_ipc_first_avail] = frag;
1314
1315 /* Bump up the first available slot and number used by 1 */
1316 cuda_event_ipc_first_avail++;
1317 if (cuda_event_ipc_first_avail >= cuda_event_max) {
1318 cuda_event_ipc_first_avail = 0;
1319 }
1320 cuda_event_ipc_num_used++;
1321
1322 *done = 0;
1323 } else {
1324 /* Mimic the async function so they use the same memcpy call. */
1325 result = cuFunc.cuMemcpyAsync((CUdeviceptr)dst, (CUdeviceptr)src, amount, ipcStream);
1326 if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
1327 opal_show_help("help-mpi-common-cuda.txt", "cuMemcpyAsync failed",
1328 true, dst, src, amount, result);
1329 OPAL_THREAD_UNLOCK(&common_cuda_ipc_lock);
1330 return OPAL_ERROR;
1331 } else {
1332 opal_output_verbose(20, mca_common_cuda_output,
1333 "CUDA: cuMemcpyAsync passed: dst=%p, src=%p, size=%d",
1334 dst, src, (int)amount);
1335 }
1336
1337 /* Record an event, then wait for it to complete with calls to cuEventQuery */
1338 result = cuFunc.cuEventRecord(cuda_event_ipc_array[cuda_event_ipc_first_avail], ipcStream);
1339 if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
1340 opal_show_help("help-mpi-common-cuda.txt", "cuEventRecord failed",
1341 true, OPAL_PROC_MY_HOSTNAME, result);
1342 OPAL_THREAD_UNLOCK(&common_cuda_ipc_lock);
1343 return OPAL_ERROR;
1344 }
1345
1346 cuda_event_ipc_frag_array[cuda_event_ipc_first_avail] = frag;
1347
1348 /* Bump up the first available slot and number used by 1 */
1349 cuda_event_ipc_first_avail++;
1350 if (cuda_event_ipc_first_avail >= cuda_event_max) {
1351 cuda_event_ipc_first_avail = 0;
1352 }
1353 cuda_event_ipc_num_used++;
1354
1355 result = cuFunc.cuEventQuery(cuda_event_ipc_array[cuda_event_ipc_first_used]);
1356 if ((CUDA_SUCCESS != result) && (CUDA_ERROR_NOT_READY != result)) {
1357 opal_show_help("help-mpi-common-cuda.txt", "cuEventQuery failed",
1358 true, result);
1359 OPAL_THREAD_UNLOCK(&common_cuda_ipc_lock);
1360 return OPAL_ERROR;
1361 }
1362
1363 iter = 0;
1364 while (CUDA_ERROR_NOT_READY == result) {
1365 if (0 == (iter % 10)) {
1366 opal_output(-1, "EVENT NOT DONE (iter=%d)", iter);
1367 }
1368 result = cuFunc.cuEventQuery(cuda_event_ipc_array[cuda_event_ipc_first_used]);
1369 if ((CUDA_SUCCESS != result) && (CUDA_ERROR_NOT_READY != result)) {
1370 opal_show_help("help-mpi-common-cuda.txt", "cuEventQuery failed",
1371 true, result);
1372 OPAL_THREAD_UNLOCK(&common_cuda_ipc_lock);
1373 return OPAL_ERROR;
1374 }
1375 iter++;
1376 }
1377
1378 --cuda_event_ipc_num_used;
1379 ++cuda_event_ipc_first_used;
1380 if (cuda_event_ipc_first_used >= cuda_event_max) {
1381 cuda_event_ipc_first_used = 0;
1382 }
1383 *done = 1;
1384 }
1385 OPAL_THREAD_UNLOCK(&common_cuda_ipc_lock);
1386 return OPAL_SUCCESS;
1387 }
1388
1389 /*
1390 * Record an event and save the frag. This is called by the sending side and
1391 * is used to queue an event when a htod copy has been initiated.
1392 */
mca_common_cuda_record_dtoh_event(char * msg,struct mca_btl_base_descriptor_t * frag)1393 int mca_common_cuda_record_dtoh_event(char *msg, struct mca_btl_base_descriptor_t *frag)
1394 {
1395 CUresult result;
1396
1397 /* First make sure there is room to store the event. If not, then
1398 * return an error. The error message will tell the user to try and
1399 * run again, but with a larger array for storing events. */
1400 OPAL_THREAD_LOCK(&common_cuda_dtoh_lock);
1401 if (cuda_event_dtoh_num_used == cuda_event_max) {
1402 opal_show_help("help-mpi-common-cuda.txt", "Out of cuEvent handles",
1403 true, cuda_event_max, cuda_event_max+100, cuda_event_max+100);
1404 return OPAL_ERR_OUT_OF_RESOURCE;
1405 }
1406
1407 if (cuda_event_dtoh_num_used > cuda_event_dtoh_most) {
1408 cuda_event_dtoh_most = cuda_event_dtoh_num_used;
1409 /* Just print multiples of 10 */
1410 if (0 == (cuda_event_dtoh_most % 10)) {
1411 opal_output_verbose(20, mca_common_cuda_output,
1412 "Maximum DtoH events used is now %d", cuda_event_dtoh_most);
1413 }
1414 }
1415
1416 result = cuFunc.cuEventRecord(cuda_event_dtoh_array[cuda_event_dtoh_first_avail], dtohStream);
1417 if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
1418 opal_show_help("help-mpi-common-cuda.txt", "cuEventRecord failed",
1419 true, OPAL_PROC_MY_HOSTNAME, result);
1420 OPAL_THREAD_UNLOCK(&common_cuda_dtoh_lock);
1421 return OPAL_ERROR;
1422 }
1423 cuda_event_dtoh_frag_array[cuda_event_dtoh_first_avail] = frag;
1424
1425 /* Bump up the first available slot and number used by 1 */
1426 cuda_event_dtoh_first_avail++;
1427 if (cuda_event_dtoh_first_avail >= cuda_event_max) {
1428 cuda_event_dtoh_first_avail = 0;
1429 }
1430 cuda_event_dtoh_num_used++;
1431
1432 OPAL_THREAD_UNLOCK(&common_cuda_dtoh_lock);
1433 return OPAL_SUCCESS;
1434 }
1435
1436 /*
1437 * Record an event and save the frag. This is called by the receiving side and
1438 * is used to queue an event when a dtoh copy has been initiated.
1439 */
mca_common_cuda_record_htod_event(char * msg,struct mca_btl_base_descriptor_t * frag)1440 int mca_common_cuda_record_htod_event(char *msg, struct mca_btl_base_descriptor_t *frag)
1441 {
1442 CUresult result;
1443
1444 OPAL_THREAD_LOCK(&common_cuda_htod_lock);
1445 /* First make sure there is room to store the event. If not, then
1446 * return an error. The error message will tell the user to try and
1447 * run again, but with a larger array for storing events. */
1448 if (cuda_event_htod_num_used == cuda_event_max) {
1449 opal_show_help("help-mpi-common-cuda.txt", "Out of cuEvent handles",
1450 true, cuda_event_max, cuda_event_max+100, cuda_event_max+100);
1451 OPAL_THREAD_UNLOCK(&common_cuda_htod_lock);
1452 return OPAL_ERR_OUT_OF_RESOURCE;
1453 }
1454
1455 if (cuda_event_htod_num_used > cuda_event_htod_most) {
1456 cuda_event_htod_most = cuda_event_htod_num_used;
1457 /* Just print multiples of 10 */
1458 if (0 == (cuda_event_htod_most % 10)) {
1459 opal_output_verbose(20, mca_common_cuda_output,
1460 "Maximum HtoD events used is now %d", cuda_event_htod_most);
1461 }
1462 }
1463
1464 result = cuFunc.cuEventRecord(cuda_event_htod_array[cuda_event_htod_first_avail], htodStream);
1465 if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
1466 opal_show_help("help-mpi-common-cuda.txt", "cuEventRecord failed",
1467 true, OPAL_PROC_MY_HOSTNAME, result);
1468 OPAL_THREAD_UNLOCK(&common_cuda_htod_lock);
1469 return OPAL_ERROR;
1470 }
1471 cuda_event_htod_frag_array[cuda_event_htod_first_avail] = frag;
1472
1473 /* Bump up the first available slot and number used by 1 */
1474 cuda_event_htod_first_avail++;
1475 if (cuda_event_htod_first_avail >= cuda_event_max) {
1476 cuda_event_htod_first_avail = 0;
1477 }
1478 cuda_event_htod_num_used++;
1479
1480 OPAL_THREAD_UNLOCK(&common_cuda_htod_lock);
1481 return OPAL_SUCCESS;
1482 }
1483
1484 /**
1485 * Used to get the dtoh stream for initiating asynchronous copies.
1486 */
mca_common_cuda_get_dtoh_stream(void)1487 void *mca_common_cuda_get_dtoh_stream(void) {
1488 return (void *)dtohStream;
1489 }
1490
1491 /**
1492 * Used to get the htod stream for initiating asynchronous copies.
1493 */
mca_common_cuda_get_htod_stream(void)1494 void *mca_common_cuda_get_htod_stream(void) {
1495 return (void *)htodStream;
1496 }
1497
1498 /*
1499 * Function is called every time progress is called with the sm BTL. If there
1500 * are outstanding events, check to see if one has completed. If so, hand
1501 * back the fragment for further processing.
1502 */
progress_one_cuda_ipc_event(struct mca_btl_base_descriptor_t ** frag)1503 int progress_one_cuda_ipc_event(struct mca_btl_base_descriptor_t **frag) {
1504 CUresult result;
1505
1506 OPAL_THREAD_LOCK(&common_cuda_ipc_lock);
1507 if (cuda_event_ipc_num_used > 0) {
1508 opal_output_verbose(20, mca_common_cuda_output,
1509 "CUDA: progress_one_cuda_ipc_event, outstanding_events=%d",
1510 cuda_event_ipc_num_used);
1511
1512 result = cuFunc.cuEventQuery(cuda_event_ipc_array[cuda_event_ipc_first_used]);
1513
1514 /* We found an event that is not ready, so return. */
1515 if (CUDA_ERROR_NOT_READY == result) {
1516 opal_output_verbose(20, mca_common_cuda_output,
1517 "CUDA: cuEventQuery returned CUDA_ERROR_NOT_READY");
1518 *frag = NULL;
1519 OPAL_THREAD_UNLOCK(&common_cuda_ipc_lock);
1520 return 0;
1521 } else if (CUDA_SUCCESS != result) {
1522 opal_show_help("help-mpi-common-cuda.txt", "cuEventQuery failed",
1523 true, result);
1524 *frag = NULL;
1525 OPAL_THREAD_UNLOCK(&common_cuda_ipc_lock);
1526 return OPAL_ERROR;
1527 }
1528
1529 *frag = cuda_event_ipc_frag_array[cuda_event_ipc_first_used];
1530 opal_output_verbose(10, mca_common_cuda_output,
1531 "CUDA: cuEventQuery returned %d", result);
1532
1533 /* Bump counters, loop around the circular buffer if necessary */
1534 --cuda_event_ipc_num_used;
1535 ++cuda_event_ipc_first_used;
1536 if (cuda_event_ipc_first_used >= cuda_event_max) {
1537 cuda_event_ipc_first_used = 0;
1538 }
1539 /* A return value of 1 indicates an event completed and a frag was returned */
1540 OPAL_THREAD_UNLOCK(&common_cuda_ipc_lock);
1541 return 1;
1542 }
1543 OPAL_THREAD_UNLOCK(&common_cuda_ipc_lock);
1544 return 0;
1545 }
1546
1547 /**
1548 * Progress any dtoh event completions.
1549 */
progress_one_cuda_dtoh_event(struct mca_btl_base_descriptor_t ** frag)1550 int progress_one_cuda_dtoh_event(struct mca_btl_base_descriptor_t **frag) {
1551 CUresult result;
1552
1553 OPAL_THREAD_LOCK(&common_cuda_dtoh_lock);
1554 if (cuda_event_dtoh_num_used > 0) {
1555 opal_output_verbose(30, mca_common_cuda_output,
1556 "CUDA: progress_one_cuda_dtoh_event, outstanding_events=%d",
1557 cuda_event_dtoh_num_used);
1558
1559 result = cuFunc.cuEventQuery(cuda_event_dtoh_array[cuda_event_dtoh_first_used]);
1560
1561 /* We found an event that is not ready, so return. */
1562 if (CUDA_ERROR_NOT_READY == result) {
1563 opal_output_verbose(30, mca_common_cuda_output,
1564 "CUDA: cuEventQuery returned CUDA_ERROR_NOT_READY");
1565 *frag = NULL;
1566 OPAL_THREAD_UNLOCK(&common_cuda_dtoh_lock);
1567 return 0;
1568 } else if (CUDA_SUCCESS != result) {
1569 opal_show_help("help-mpi-common-cuda.txt", "cuEventQuery failed",
1570 true, result);
1571 *frag = NULL;
1572 OPAL_THREAD_UNLOCK(&common_cuda_dtoh_lock);
1573 return OPAL_ERROR;
1574 }
1575
1576 *frag = cuda_event_dtoh_frag_array[cuda_event_dtoh_first_used];
1577 opal_output_verbose(30, mca_common_cuda_output,
1578 "CUDA: cuEventQuery returned %d", result);
1579
1580 /* Bump counters, loop around the circular buffer if necessary */
1581 --cuda_event_dtoh_num_used;
1582 ++cuda_event_dtoh_first_used;
1583 if (cuda_event_dtoh_first_used >= cuda_event_max) {
1584 cuda_event_dtoh_first_used = 0;
1585 }
1586 /* A return value of 1 indicates an event completed and a frag was returned */
1587 OPAL_THREAD_UNLOCK(&common_cuda_dtoh_lock);
1588 return 1;
1589 }
1590 OPAL_THREAD_UNLOCK(&common_cuda_dtoh_lock);
1591 return 0;
1592 }
1593
1594 /**
1595 * Progress any dtoh event completions.
1596 */
progress_one_cuda_htod_event(struct mca_btl_base_descriptor_t ** frag)1597 int progress_one_cuda_htod_event(struct mca_btl_base_descriptor_t **frag) {
1598 CUresult result;
1599
1600 OPAL_THREAD_LOCK(&common_cuda_htod_lock);
1601 if (cuda_event_htod_num_used > 0) {
1602 opal_output_verbose(30, mca_common_cuda_output,
1603 "CUDA: progress_one_cuda_htod_event, outstanding_events=%d",
1604 cuda_event_htod_num_used);
1605
1606 result = cuFunc.cuEventQuery(cuda_event_htod_array[cuda_event_htod_first_used]);
1607
1608 /* We found an event that is not ready, so return. */
1609 if (CUDA_ERROR_NOT_READY == result) {
1610 opal_output_verbose(30, mca_common_cuda_output,
1611 "CUDA: cuEventQuery returned CUDA_ERROR_NOT_READY");
1612 *frag = NULL;
1613 OPAL_THREAD_UNLOCK(&common_cuda_htod_lock);
1614 return 0;
1615 } else if (CUDA_SUCCESS != result) {
1616 opal_show_help("help-mpi-common-cuda.txt", "cuEventQuery failed",
1617 true, result);
1618 *frag = NULL;
1619 OPAL_THREAD_UNLOCK(&common_cuda_htod_lock);
1620 return OPAL_ERROR;
1621 }
1622
1623 *frag = cuda_event_htod_frag_array[cuda_event_htod_first_used];
1624 opal_output_verbose(30, mca_common_cuda_output,
1625 "CUDA: cuEventQuery returned %d", result);
1626
1627 /* Bump counters, loop around the circular buffer if necessary */
1628 --cuda_event_htod_num_used;
1629 ++cuda_event_htod_first_used;
1630 if (cuda_event_htod_first_used >= cuda_event_max) {
1631 cuda_event_htod_first_used = 0;
1632 }
1633 /* A return value of 1 indicates an event completed and a frag was returned */
1634 OPAL_THREAD_UNLOCK(&common_cuda_htod_lock);
1635 return 1;
1636 }
1637 OPAL_THREAD_UNLOCK(&common_cuda_htod_lock);
1638 return OPAL_ERR_RESOURCE_BUSY;
1639 }
1640
1641
1642 /**
1643 * Need to make sure the handle we are retrieving from the cache is still
1644 * valid. Compare the cached handle to the one received.
1645 */
mca_common_cuda_memhandle_matches(mca_rcache_common_cuda_reg_t * new_reg,mca_rcache_common_cuda_reg_t * old_reg)1646 int mca_common_cuda_memhandle_matches(mca_rcache_common_cuda_reg_t *new_reg,
1647 mca_rcache_common_cuda_reg_t *old_reg)
1648 {
1649
1650 if (0 == memcmp(new_reg->data.memHandle, old_reg->data.memHandle, sizeof(new_reg->data.memHandle))) {
1651 return 1;
1652 } else {
1653 return 0;
1654 }
1655
1656 }
1657
1658 /*
1659 * Function to dump memory handle information. This is based on
1660 * definitions from cuiinterprocess_private.h.
1661 */
cuda_dump_memhandle(int verbose,void * memHandle,char * str)1662 static void cuda_dump_memhandle(int verbose, void *memHandle, char *str) {
1663
1664 struct InterprocessMemHandleInternal
1665 {
1666 /* The first two entries are the CUinterprocessCtxHandle */
1667 int64_t ctxId; /* unique (within a process) id of the sharing context */
1668 int pid; /* pid of sharing context */
1669
1670 int64_t size;
1671 int64_t blocksize;
1672 int64_t offset;
1673 int gpuId;
1674 int subDeviceIndex;
1675 int64_t serial;
1676 } memH;
1677
1678 if (NULL == str) {
1679 str = "CUDA";
1680 }
1681 memcpy(&memH, memHandle, sizeof(memH));
1682 opal_output_verbose(verbose, mca_common_cuda_output,
1683 "%s:ctxId=0x%" PRIx64 ", pid=%d, size=%" PRIu64 ", blocksize=%" PRIu64 ", offset=%"
1684 PRIu64 ", gpuId=%d, subDeviceIndex=%d, serial=%" PRIu64,
1685 str, memH.ctxId, memH.pid, memH.size, memH.blocksize, memH.offset,
1686 memH.gpuId, memH.subDeviceIndex, memH.serial);
1687 }
1688
1689 /*
1690 * Function to dump memory handle information. This is based on
1691 * definitions from cuiinterprocess_private.h.
1692 */
cuda_dump_evthandle(int verbose,void * evtHandle,char * str)1693 static void cuda_dump_evthandle(int verbose, void *evtHandle, char *str) {
1694
1695 struct InterprocessEventHandleInternal
1696 {
1697 unsigned long pid;
1698 unsigned long serial;
1699 int index;
1700 } evtH;
1701
1702 if (NULL == str) {
1703 str = "CUDA";
1704 }
1705 memcpy(&evtH, evtHandle, sizeof(evtH));
1706 opal_output_verbose(verbose, mca_common_cuda_output,
1707 "CUDA: %s:pid=%lu, serial=%lu, index=%d",
1708 str, evtH.pid, evtH.serial, evtH.index);
1709 }
1710
1711
/* Return microseconds of elapsed time. Microseconds are relevant when
 * trying to understand the fixed overhead of the communication. Used
 * when trying to time various functions.
 *
 * Cut and paste the following to get timings where wanted.
 *
 *   ts_start = opal_timer_base_get_usec();
 *   FUNCTION OF INTEREST
 *   ts_end = opal_timer_base_get_usec();
 *   accum = mydifftime(ts_start, ts_end);
 *   opal_output(0, "Function took %7.2f usecs\n", accum);
 *
 */
1725 #if OPAL_ENABLE_DEBUG
static float mydifftime(opal_timer_t ts_start, opal_timer_t ts_end)
{
    /* Difference of the two timer samples; converted to float on return. */
    const opal_timer_t elapsed = ts_end - ts_start;

    return elapsed;
}
1729 #endif /* OPAL_ENABLE_DEBUG */
1730
1731 /* Routines that get plugged into the opal datatype code */
/**
 * Decide whether pUserBuf should be treated as GPU memory.
 *
 * @param pUserBuf   address to classify
 * @param convertor  may be NULL; when non-NULL and the driver reports the
 *                   buffer as managed (OPAL_CUDA_GET_ATTRIBUTES builds only),
 *                   CONVERTOR_CUDA_UNIFIED is OR-ed into its flags
 * @return 1 if the buffer is treated as device memory, 0 if it is host
 *         memory or its type cannot be determined, OPAL_ERROR if the
 *         current context cannot be queried or set
 */
static int mca_common_cuda_is_gpu_buffer(const void *pUserBuf, opal_convertor_t *convertor)
{
    int rc;
    CUmemorytype mem_type = 0;
    CUdeviceptr dev_ptr = (CUdeviceptr)pUserBuf;
    CUcontext cur_ctx = NULL, owner_ctx = NULL;
#if OPAL_CUDA_GET_ATTRIBUTES
    uint32_t managed = 0;
    /* With CUDA 7.0, we can get multiple attributes with a single call */
    CUpointer_attribute wanted[3] = {CU_POINTER_ATTRIBUTE_MEMORY_TYPE,
                                     CU_POINTER_ATTRIBUTE_CONTEXT,
                                     CU_POINTER_ATTRIBUTE_IS_MANAGED};
    void *results[] = {(void *)&mem_type, (void *)&owner_ctx, (void *)&managed};

    rc = cuFunc.cuPointerGetAttributes(3, wanted, results, dev_ptr);
    OPAL_OUTPUT_VERBOSE((101, mca_common_cuda_output,
                         "dbuf=%p, memType=%d, memCtx=%p, isManaged=%d, res=%d",
                         (void *)dev_ptr, (int)mem_type, (void *)owner_ctx, managed, rc));

    /* Mark unified memory buffers with a flag.  This will allow all unified
     * memory to be forced through host buffers.  Note that this memory can
     * be either host or device so the flag is set prior to that check. */
    if (1 == managed && NULL != convertor) {
        convertor->flags |= CONVERTOR_CUDA_UNIFIED;
    }

    if (CUDA_SUCCESS != rc) {
        /* If we cannot determine it is a device pointer, just assume it is not. */
        return 0;
    }
    if (CU_MEMORYTYPE_HOST == mem_type) {
        /* Host memory, nothing to do here */
        return 0;
    }
    if (0 == mem_type) {
        /* This can happen when CUDA is initialized but the address is not
         * a valid CUDA pointer */
        return 0;
    }
    /* Must be a device pointer */
    assert(mem_type == CU_MEMORYTYPE_DEVICE);
#else /* OPAL_CUDA_GET_ATTRIBUTES */
    rc = cuFunc.cuPointerGetAttribute(&mem_type,
                                      CU_POINTER_ATTRIBUTE_MEMORY_TYPE, dev_ptr);
    if (CUDA_SUCCESS != rc) {
        /* If we cannot determine it is a device pointer, just assume it is not. */
        return 0;
    }
    if (CU_MEMORYTYPE_HOST == mem_type) {
        /* Host memory, nothing to do here */
        return 0;
    }
    /* Must be a device pointer */
    assert(mem_type == CU_MEMORYTYPE_DEVICE);
#endif /* OPAL_CUDA_GET_ATTRIBUTES */

    /* This piece of code was added to handle a case involving OMP threads.
     * The user had initialized CUDA and then spawned two threads.  The
     * first thread had the CUDA context, but the second thread did not.
     * We therefore had no context to act upon and future CUDA driver calls
     * would fail.  Therefore, if we have GPU memory but no context, get the
     * context from the GPU memory and make it the current context.  It is
     * rare that we will not have a context. */
    rc = cuFunc.cuCtxGetCurrent(&cur_ctx);
    if (OPAL_UNLIKELY(NULL == cur_ctx)) {
        if (CUDA_SUCCESS != rc) {
            /* The query itself failed: report it and give up */
            opal_output(0, "CUDA: error calling cuCtxGetCurrent: "
                        "res=%d, ptr=%p aborting...", rc, pUserBuf);
            return OPAL_ERROR;
        }
#if !OPAL_CUDA_GET_ATTRIBUTES
        /* The owning context was not fetched above in this configuration */
        rc = cuFunc.cuPointerGetAttribute(&owner_ctx,
                                          CU_POINTER_ATTRIBUTE_CONTEXT, dev_ptr);
        if (OPAL_UNLIKELY(rc != CUDA_SUCCESS)) {
            opal_output(0, "CUDA: error calling cuPointerGetAttribute: "
                        "res=%d, ptr=%p aborting...", rc, pUserBuf);
            return OPAL_ERROR;
        }
#endif /* OPAL_CUDA_GET_ATTRIBUTES */
        rc = cuFunc.cuCtxSetCurrent(owner_ctx);
        if (OPAL_UNLIKELY(rc != CUDA_SUCCESS)) {
            opal_output(0, "CUDA: error calling cuCtxSetCurrent: "
                        "res=%d, ptr=%p aborting...", rc, pUserBuf);
            return OPAL_ERROR;
        }
        OPAL_OUTPUT_VERBOSE((10, mca_common_cuda_output,
                             "CUDA: cuCtxSetCurrent passed: ptr=%p", pUserBuf));
    }

    /* WORKAROUND - There are times when the above code determines a piece of
     * memory is GPU memory, but it actually is not.  That has been seen on
     * multi-GPU systems with 6 or 8 GPUs on them.  Therefore, we will do this
     * extra check.  Note if we made it this far, then the assumption at this
     * point is we have GPU memory.  Unfortunately, this extra call is costing
     * us another 100 ns, almost doubling the cost of this entire function. */
    if (OPAL_LIKELY(mca_common_cuda_gpu_mem_check_workaround)) {
        CUdeviceptr range_base;
        size_t range_size;

        rc = cuFunc.cuMemGetAddressRange(&range_base, &range_size, dev_ptr);
        if (CUDA_SUCCESS != rc) {
            opal_output_verbose(5, mca_common_cuda_output,
                                "CUDA: cuMemGetAddressRange failed on this pointer: res=%d, buf=%p "
                                "Overriding check and setting to host pointer. ",
                                rc, (void *)dev_ptr);
            /* This cannot be GPU memory if the previous call failed */
            return 0;
        }
    }

    /* First access on a device pointer finalizes CUDA support initialization.
     * If initialization fails, disable support. */
    if (!stage_three_init_complete && 0 != mca_common_cuda_stage_three_init()) {
        opal_cuda_support = 0;
    }

    return 1;
}
1854
/**
 * Issue a cuMemcpyAsync of size bytes from src to dest on the stream
 * stored in the convertor.  The driver's return value is passed back
 * unchanged; no synchronization is done here.
 */
static int mca_common_cuda_cu_memcpy_async(void *dest, const void *src, size_t size,
                                           opal_convertor_t *convertor)
{
    CUdeviceptr dst_ptr = (CUdeviceptr)dest;
    CUdeviceptr src_ptr = (CUdeviceptr)src;
    CUstream copy_stream = (CUstream)convertor->stream;

    return cuFunc.cuMemcpyAsync(dst_ptr, src_ptr, size, copy_stream);
}
1861
1862 /**
1863 * This function is plugged into various areas where a cuMemcpy would be called.
1864 * This is a synchronous operation that will not return until the copy is complete.
1865 */
mca_common_cuda_cu_memcpy(void * dest,const void * src,size_t size)1866 static int mca_common_cuda_cu_memcpy(void *dest, const void *src, size_t size)
1867 {
1868 CUresult result;
1869 #if OPAL_ENABLE_DEBUG
1870 CUmemorytype memTypeSrc, memTypeDst;
1871 if (OPAL_UNLIKELY(mca_common_cuda_cumemcpy_timing)) {
1872 /* Nice to know type of source and destination for timing output. Do
1873 * not care about return code as memory type will just be set to 0 */
1874 result = cuFunc.cuPointerGetAttribute(&memTypeDst,
1875 CU_POINTER_ATTRIBUTE_MEMORY_TYPE, (CUdeviceptr)dest);
1876 result = cuFunc.cuPointerGetAttribute(&memTypeSrc,
1877 CU_POINTER_ATTRIBUTE_MEMORY_TYPE, (CUdeviceptr)src);
1878 ts_start = opal_timer_base_get_usec();
1879 }
1880 #endif
1881 if (mca_common_cuda_cumemcpy_async) {
1882 result = cuFunc.cuMemcpyAsync((CUdeviceptr)dest, (CUdeviceptr)src, size, memcpyStream);
1883 if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
1884 opal_show_help("help-mpi-common-cuda.txt", "cuMemcpyAsync failed",
1885 true, dest, src, size, result);
1886 return OPAL_ERROR;
1887 }
1888 result = cuFunc.cuStreamSynchronize(memcpyStream);
1889 if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
1890 opal_show_help("help-mpi-common-cuda.txt", "cuStreamSynchronize failed",
1891 true, OPAL_PROC_MY_HOSTNAME, result);
1892 return OPAL_ERROR;
1893 }
1894 } else {
1895 result = cuFunc.cuMemcpy((CUdeviceptr)dest, (CUdeviceptr)src, size);
1896 if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
1897 opal_show_help("help-mpi-common-cuda.txt", "cuMemcpy failed",
1898 true, OPAL_PROC_MY_HOSTNAME, result);
1899 return OPAL_ERROR;
1900 }
1901 }
1902 #if OPAL_ENABLE_DEBUG
1903 if (OPAL_UNLIKELY(mca_common_cuda_cumemcpy_timing)) {
1904 ts_end = opal_timer_base_get_usec();
1905 accum = mydifftime(ts_start, ts_end);
1906 if (mca_common_cuda_cumemcpy_async) {
1907 opal_output(0, "cuMemcpyAsync took %7.2f usecs, size=%d, (src=%p (%d), dst=%p (%d))\n",
1908 accum, (int)size, src, memTypeSrc, dest, memTypeDst);
1909 } else {
1910 opal_output(0, "cuMemcpy took %7.2f usecs, size=%d, (src=%p (%d), dst=%p (%d))\n",
1911 accum, (int)size, src, memTypeSrc, dest, memTypeDst);
1912 }
1913 }
1914 #endif
1915 return OPAL_SUCCESS;
1916 }
1917
mca_common_cuda_memmove(void * dest,void * src,size_t size)1918 static int mca_common_cuda_memmove(void *dest, void *src, size_t size)
1919 {
1920 CUdeviceptr tmp;
1921 int result;
1922
1923 result = cuFunc.cuMemAlloc(&tmp,size);
1924 if (mca_common_cuda_cumemcpy_async) {
1925 result = cuFunc.cuMemcpyAsync(tmp, (CUdeviceptr)src, size, memcpyStream);
1926 if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
1927 opal_show_help("help-mpi-common-cuda.txt", "cuMemcpyAsync failed",
1928 true, tmp, src, size, result);
1929 return OPAL_ERROR;
1930 }
1931 result = cuFunc.cuMemcpyAsync((CUdeviceptr)dest, tmp, size, memcpyStream);
1932 if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
1933 opal_show_help("help-mpi-common-cuda.txt", "cuMemcpyAsync failed",
1934 true, dest, tmp, size, result);
1935 return OPAL_ERROR;
1936 }
1937 result = cuFunc.cuStreamSynchronize(memcpyStream);
1938 if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
1939 opal_show_help("help-mpi-common-cuda.txt", "cuStreamSynchronize failed",
1940 true, OPAL_PROC_MY_HOSTNAME, result);
1941 return OPAL_ERROR;
1942 }
1943 } else {
1944 result = cuFunc.cuMemcpy(tmp, (CUdeviceptr)src, size);
1945 if (OPAL_UNLIKELY(result != CUDA_SUCCESS)) {
1946 opal_output(0, "CUDA: memmove-Error in cuMemcpy: res=%d, dest=%p, src=%p, size=%d",
1947 result, (void *)tmp, src, (int)size);
1948 return OPAL_ERROR;
1949 }
1950 result = cuFunc.cuMemcpy((CUdeviceptr)dest, tmp, size);
1951 if (OPAL_UNLIKELY(result != CUDA_SUCCESS)) {
1952 opal_output(0, "CUDA: memmove-Error in cuMemcpy: res=%d, dest=%p, src=%p, size=%d",
1953 result, dest, (void *)tmp, (int)size);
1954 return OPAL_ERROR;
1955 }
1956 }
1957 cuFunc.cuMemFree(tmp);
1958 return OPAL_SUCCESS;
1959 }
1960
mca_common_cuda_get_device(int * devicenum)1961 int mca_common_cuda_get_device(int *devicenum)
1962 {
1963 CUdevice cuDev;
1964 int res;
1965
1966 res = cuFunc.cuCtxGetDevice(&cuDev);
1967 if (OPAL_UNLIKELY(res != CUDA_SUCCESS)) {
1968 opal_output(0, "CUDA: cuCtxGetDevice failed: res=%d",
1969 res);
1970 return res;
1971 }
1972 *devicenum = cuDev;
1973 return 0;
1974 }
1975
mca_common_cuda_device_can_access_peer(int * access,int dev1,int dev2)1976 int mca_common_cuda_device_can_access_peer(int *access, int dev1, int dev2)
1977 {
1978 int res;
1979 res = cuFunc.cuDeviceCanAccessPeer(access, (CUdevice)dev1, (CUdevice)dev2);
1980 if (OPAL_UNLIKELY(res != CUDA_SUCCESS)) {
1981 opal_output(0, "CUDA: cuDeviceCanAccessPeer failed: res=%d",
1982 res);
1983 return res;
1984 }
1985 return 0;
1986 }
1987
mca_common_cuda_get_address_range(void * pbase,size_t * psize,void * base)1988 int mca_common_cuda_get_address_range(void *pbase, size_t *psize, void *base)
1989 {
1990 CUresult result;
1991 result = cuFunc.cuMemGetAddressRange((CUdeviceptr *)pbase, psize, (CUdeviceptr)base);
1992 if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
1993 opal_show_help("help-mpi-common-cuda.txt", "cuMemGetAddressRange failed 2",
1994 true, OPAL_PROC_MY_HOSTNAME, result, base);
1995 return OPAL_ERROR;
1996 } else {
1997 opal_output_verbose(50, mca_common_cuda_output,
1998 "CUDA: cuMemGetAddressRange passed: addr=%p, pbase=%p, psize=%lu ",
1999 base, *(char **)pbase, *psize);
2000 }
2001 return 0;
2002 }
2003
2004 #if OPAL_CUDA_GDR_SUPPORT
2005 /* Check to see if the memory was freed between the time it was stored in
2006 * the registration cache and now. Return true if the memory was previously
2007 * freed. This is indicated by the BUFFER_ID value in the registration cache
2008 * not matching the BUFFER_ID of the buffer we are checking. Return false
2009 * if the registration is still good.
2010 */
mca_common_cuda_previously_freed_memory(mca_rcache_base_registration_t * reg)2011 bool mca_common_cuda_previously_freed_memory(mca_rcache_base_registration_t *reg)
2012 {
2013 int res;
2014 unsigned long long bufID;
2015 unsigned char *dbuf = reg->base;
2016
2017 res = cuFunc.cuPointerGetAttribute(&bufID, CU_POINTER_ATTRIBUTE_BUFFER_ID,
2018 (CUdeviceptr)dbuf);
2019 /* If we cannot determine the BUFFER_ID, then print a message and default
2020 * to forcing the registration to be kicked out. */
2021 if (OPAL_UNLIKELY(res != CUDA_SUCCESS)) {
2022 opal_show_help("help-mpi-common-cuda.txt", "bufferID failed",
2023 true, OPAL_PROC_MY_HOSTNAME, res);
2024 return true;
2025 }
2026 opal_output_verbose(50, mca_common_cuda_output,
2027 "CUDA: base=%p, bufID=%llu, reg->gpu_bufID=%llu, %s", dbuf, bufID, reg->gpu_bufID,
2028 (reg->gpu_bufID == bufID ? "BUFFER_ID match":"BUFFER_ID do not match"));
2029 if (bufID != reg->gpu_bufID) {
2030 return true;
2031 } else {
2032 return false;
2033 }
2034 }
2035
2036 /*
2037 * Get the buffer ID from the memory and store it in the registration.
2038 * This is needed to ensure the cached registration is not stale. If
2039 * we fail to get buffer ID, print an error and set buffer ID to 0.
2040 * Also set SYNC_MEMOPS on any GPU registration to ensure that
2041 * synchronous copies complete before the buffer is accessed.
2042 */
mca_common_cuda_get_buffer_id(mca_rcache_base_registration_t * reg)2043 void mca_common_cuda_get_buffer_id(mca_rcache_base_registration_t *reg)
2044 {
2045 int res;
2046 unsigned long long bufID = 0;
2047 unsigned char *dbuf = reg->base;
2048 int enable = 1;
2049
2050 res = cuFunc.cuPointerGetAttribute(&bufID, CU_POINTER_ATTRIBUTE_BUFFER_ID,
2051 (CUdeviceptr)dbuf);
2052 if (OPAL_UNLIKELY(res != CUDA_SUCCESS)) {
2053 opal_show_help("help-mpi-common-cuda.txt", "bufferID failed",
2054 true, OPAL_PROC_MY_HOSTNAME, res);
2055 }
2056 reg->gpu_bufID = bufID;
2057
2058 res = cuFunc.cuPointerSetAttribute(&enable, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,
2059 (CUdeviceptr)dbuf);
2060 if (OPAL_UNLIKELY(CUDA_SUCCESS != res)) {
2061 opal_show_help("help-mpi-common-cuda.txt", "cuPointerSetAttribute failed",
2062 true, OPAL_PROC_MY_HOSTNAME, res, dbuf);
2063 }
2064 }
2065 #endif /* OPAL_CUDA_GDR_SUPPORT */
2066