1 /**
2 * Copyright (C) Mellanox Technologies Ltd. 2001-2017. ALL RIGHTS RESERVED.
3 * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
4 *
5 * See file LICENSE for terms.
6 */
7
8 #ifdef HAVE_CONFIG_H
9 # include "config.h"
10 #endif
11
12 #include <ucm/cuda/cudamem.h>
13
14 #include <ucm/event/event.h>
15 #include <ucm/util/log.h>
16 #include <ucm/util/reloc.h>
17 #include <ucm/util/replace.h>
18 #include <ucm/util/sys.h>
19 #include <ucs/debug/assert.h>
20 #include <ucs/sys/compiler.h>
21 #include <ucs/sys/preprocessor.h>
22 #include <ucs/sys/topo.h>
23 #include <ucs/memory/memory_type.h>
24
25 #include <sys/mman.h>
26 #include <pthread.h>
27 #include <string.h>
28 #include <unistd.h>
29
30
/*
 * Define replacement entry points for the CUDA driver (cu*) and runtime
 * (cuda*) allocation/free APIs that UCM intercepts. For each symbol the macro
 * (declared in ucm/util/replace.h) presumably generates an ucm_orig_<name>
 * trampoline that resolves the real implementation via dlsym, used by the
 * ucm_<name> wrappers below; -1 is the value returned on resolution failure.
 * (See replace.h for the exact expansion — TODO confirm.)
 */
UCM_DEFINE_REPLACE_DLSYM_FUNC(cuMemFree, CUresult, -1, CUdeviceptr)
UCM_DEFINE_REPLACE_DLSYM_FUNC(cuMemFreeHost, CUresult, -1, void *)
UCM_DEFINE_REPLACE_DLSYM_FUNC(cuMemAlloc, CUresult, -1, CUdeviceptr *, size_t)
UCM_DEFINE_REPLACE_DLSYM_FUNC(cuMemAllocManaged, CUresult, -1, CUdeviceptr *,
                              size_t, unsigned int)
UCM_DEFINE_REPLACE_DLSYM_FUNC(cuMemAllocPitch, CUresult, -1, CUdeviceptr *, size_t *,
                              size_t, size_t, unsigned int)
UCM_DEFINE_REPLACE_DLSYM_FUNC(cuMemHostGetDevicePointer, CUresult, -1, CUdeviceptr *,
                              void *, unsigned int)
UCM_DEFINE_REPLACE_DLSYM_FUNC(cuMemHostUnregister, CUresult, -1, void *)
UCM_DEFINE_REPLACE_DLSYM_FUNC(cudaFree, cudaError_t, -1, void*)
UCM_DEFINE_REPLACE_DLSYM_FUNC(cudaFreeHost, cudaError_t, -1, void*)
UCM_DEFINE_REPLACE_DLSYM_FUNC(cudaMalloc, cudaError_t, -1, void**, size_t)
UCM_DEFINE_REPLACE_DLSYM_FUNC(cudaMallocManaged, cudaError_t, -1, void**, size_t, unsigned int)
UCM_DEFINE_REPLACE_DLSYM_FUNC(cudaMallocPitch, cudaError_t, -1, void**, size_t *,
                              size_t, size_t)
UCM_DEFINE_REPLACE_DLSYM_FUNC(cudaHostGetDevicePointer, cudaError_t, -1, void**,
                              void *, unsigned int)
UCM_DEFINE_REPLACE_DLSYM_FUNC(cudaHostUnregister, cudaError_t, -1, void*)
50
#if ENABLE_SYMBOL_OVERRIDE
/* When symbol override is enabled, also declare the ucm_override_<name>
 * entry points referenced by the relocation table at the bottom of this
 * file (macro defined in ucm/util/replace.h). */
UCM_OVERRIDE_FUNC(cuMemFree, CUresult)
UCM_OVERRIDE_FUNC(cuMemFreeHost, CUresult)
UCM_OVERRIDE_FUNC(cuMemAlloc, CUresult)
UCM_OVERRIDE_FUNC(cuMemAllocManaged, CUresult)
UCM_OVERRIDE_FUNC(cuMemAllocPitch, CUresult)
UCM_OVERRIDE_FUNC(cuMemHostGetDevicePointer, CUresult)
UCM_OVERRIDE_FUNC(cuMemHostUnregister, CUresult)
UCM_OVERRIDE_FUNC(cudaFree, cudaError_t)
UCM_OVERRIDE_FUNC(cudaFreeHost, cudaError_t)
UCM_OVERRIDE_FUNC(cudaMalloc, cudaError_t)
UCM_OVERRIDE_FUNC(cudaMallocManaged, cudaError_t)
UCM_OVERRIDE_FUNC(cudaMallocPitch, cudaError_t)
UCM_OVERRIDE_FUNC(cudaHostGetDevicePointer, cudaError_t)
UCM_OVERRIDE_FUNC(cudaHostUnregister, cudaError_t)
#endif
67
68
69 static void ucm_cuda_set_ptr_attr(CUdeviceptr dptr)
70 {
71 if ((void*)dptr == NULL) {
72 ucm_trace("skipping cuPointerSetAttribute for null pointer");
73 return;
74 }
75
76 unsigned int value = 1;
77 CUresult ret;
78 const char *cu_err_str;
79
80 ret = cuPointerSetAttribute(&value, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, dptr);
81 if (ret != CUDA_SUCCESS) {
82 cuGetErrorString(ret, &cu_err_str);
83 ucm_warn("cuPointerSetAttribute(%p) failed: %s", (void *) dptr, cu_err_str);
84 }
85 }
86
87 static UCS_F_ALWAYS_INLINE void
ucm_dispatch_mem_type_alloc(void * addr,size_t length,ucs_memory_type_t mem_type)88 ucm_dispatch_mem_type_alloc(void *addr, size_t length, ucs_memory_type_t mem_type)
89 {
90 ucm_event_t event;
91
92 event.mem_type.address = addr;
93 event.mem_type.size = length;
94 event.mem_type.mem_type = mem_type;
95 ucm_event_dispatch(UCM_EVENT_MEM_TYPE_ALLOC, &event);
96 }
97
98 static UCS_F_ALWAYS_INLINE void
ucm_dispatch_mem_type_free(void * addr,size_t length,ucs_memory_type_t mem_type)99 ucm_dispatch_mem_type_free(void *addr, size_t length, ucs_memory_type_t mem_type)
100 {
101 ucm_event_t event;
102
103 event.mem_type.address = addr;
104 event.mem_type.size = length;
105 event.mem_type.mem_type = mem_type;
106 ucm_event_dispatch(UCM_EVENT_MEM_TYPE_FREE, &event);
107 }
108
/*
 * Dispatch a memory-type free event for a device pointer that is about to be
 * released. The full allocation range is queried from the driver so that the
 * event covers the entire region; when the range cannot be resolved, a 1-byte
 * length is reported as a minimum. func_name is used only for diagnostics.
 */
static void ucm_cudafree_dispatch_events(CUdeviceptr dptr, const char *func_name)
{
    CUdeviceptr base_addr;
    size_t alloc_length;

    if (dptr == 0) {
        /* freeing a NULL device pointer - nothing to report */
        return;
    }

    if (cuMemGetAddressRange(&base_addr, &alloc_length, dptr) != CUDA_SUCCESS) {
        ucm_debug("cuMemGetAddressRange(devPtr=%p) failed", (void*)dptr);
        alloc_length = 1; /* set minimum length */
    } else if (dptr != base_addr) {
        ucm_warn("%s(%p) called with unexpected pointer (expected: %p)",
                 func_name, (void*)dptr, (void*)base_addr);
    }

    ucm_dispatch_mem_type_free((void *)dptr, alloc_length, UCS_MEMORY_TYPE_CUDA);
}
132
ucm_cuMemFree(CUdeviceptr dptr)133 CUresult ucm_cuMemFree(CUdeviceptr dptr)
134 {
135 CUresult ret;
136
137 ucm_event_enter();
138
139 ucm_trace("ucm_cuMemFree(dptr=%p)",(void*)dptr);
140
141 ucm_cudafree_dispatch_events(dptr, "cuMemFree");
142
143 ret = ucm_orig_cuMemFree(dptr);
144
145 ucm_event_leave();
146 return ret;
147 }
148
ucm_cuMemFreeHost(void * p)149 CUresult ucm_cuMemFreeHost(void *p)
150 {
151 CUresult ret;
152
153 ucm_event_enter();
154
155 ucm_trace("ucm_cuMemFreeHost(ptr=%p)", p);
156
157 ucm_dispatch_vm_munmap(p, 0);
158
159 ret = ucm_orig_cuMemFreeHost(p);
160
161 ucm_event_leave();
162 return ret;
163 }
164
ucm_cuMemAlloc(CUdeviceptr * dptr,size_t size)165 CUresult ucm_cuMemAlloc(CUdeviceptr *dptr, size_t size)
166 {
167 CUresult ret;
168
169 ucm_event_enter();
170
171 ret = ucm_orig_cuMemAlloc(dptr, size);
172 if (ret == CUDA_SUCCESS) {
173 ucm_trace("ucm_cuMemAlloc(dptr=%p size:%lu)",(void *)*dptr, size);
174 ucm_dispatch_mem_type_alloc((void *)*dptr, size, UCS_MEMORY_TYPE_CUDA);
175 ucm_cuda_set_ptr_attr(*dptr);
176 }
177
178 ucm_event_leave();
179 return ret;
180 }
181
ucm_cuMemAllocManaged(CUdeviceptr * dptr,size_t size,unsigned int flags)182 CUresult ucm_cuMemAllocManaged(CUdeviceptr *dptr, size_t size, unsigned int flags)
183 {
184 CUresult ret;
185
186 ucm_event_enter();
187
188 ret = ucm_orig_cuMemAllocManaged(dptr, size, flags);
189 if (ret == CUDA_SUCCESS) {
190 ucm_trace("ucm_cuMemAllocManaged(dptr=%p size:%lu, flags:%d)",
191 (void *)*dptr, size, flags);
192 ucm_dispatch_mem_type_alloc((void *)*dptr, size,
193 UCS_MEMORY_TYPE_CUDA_MANAGED);
194 }
195
196 ucm_event_leave();
197 return ret;
198 }
199
ucm_cuMemAllocPitch(CUdeviceptr * dptr,size_t * pPitch,size_t WidthInBytes,size_t Height,unsigned int ElementSizeBytes)200 CUresult ucm_cuMemAllocPitch(CUdeviceptr *dptr, size_t *pPitch,
201 size_t WidthInBytes, size_t Height,
202 unsigned int ElementSizeBytes)
203 {
204 CUresult ret;
205
206 ucm_event_enter();
207
208 ret = ucm_orig_cuMemAllocPitch(dptr, pPitch, WidthInBytes, Height, ElementSizeBytes);
209 if (ret == CUDA_SUCCESS) {
210 ucm_trace("ucm_cuMemAllocPitch(dptr=%p size:%lu)",(void *)*dptr,
211 (WidthInBytes * Height));
212 ucm_dispatch_mem_type_alloc((void *)*dptr, WidthInBytes * Height,
213 UCS_MEMORY_TYPE_CUDA);
214 ucm_cuda_set_ptr_attr(*dptr);
215 }
216
217 ucm_event_leave();
218 return ret;
219 }
220
ucm_cuMemHostGetDevicePointer(CUdeviceptr * pdptr,void * p,unsigned int Flags)221 CUresult ucm_cuMemHostGetDevicePointer(CUdeviceptr *pdptr, void *p, unsigned int Flags)
222 {
223 CUresult ret;
224
225 ucm_event_enter();
226
227 ret = ucm_orig_cuMemHostGetDevicePointer(pdptr, p, Flags);
228 if (ret == CUDA_SUCCESS) {
229 ucm_trace("ucm_cuMemHostGetDevicePointer(pdptr=%p p=%p)",(void *)*pdptr, p);
230 }
231
232 ucm_event_leave();
233 return ret;
234 }
235
ucm_cuMemHostUnregister(void * p)236 CUresult ucm_cuMemHostUnregister(void *p)
237 {
238 CUresult ret;
239
240 ucm_event_enter();
241
242 ucm_trace("ucm_cuMemHostUnregister(ptr=%p)", p);
243
244 ret = ucm_orig_cuMemHostUnregister(p);
245
246 ucm_event_leave();
247 return ret;
248 }
249
ucm_cudaFree(void * devPtr)250 cudaError_t ucm_cudaFree(void *devPtr)
251 {
252 cudaError_t ret;
253
254 ucm_event_enter();
255
256 ucm_trace("ucm_cudaFree(devPtr=%p)", devPtr);
257
258 ucm_cudafree_dispatch_events((CUdeviceptr)devPtr, "cudaFree");
259
260 ret = ucm_orig_cudaFree(devPtr);
261
262 ucm_event_leave();
263
264 return ret;
265 }
266
ucm_cudaFreeHost(void * ptr)267 cudaError_t ucm_cudaFreeHost(void *ptr)
268 {
269 cudaError_t ret;
270
271 ucm_event_enter();
272
273 ucm_trace("ucm_cudaFreeHost(ptr=%p)", ptr);
274
275 ucm_dispatch_vm_munmap(ptr, 0);
276
277 ret = ucm_orig_cudaFreeHost(ptr);
278
279 ucm_event_leave();
280 return ret;
281 }
282
ucm_cudaMalloc(void ** devPtr,size_t size)283 cudaError_t ucm_cudaMalloc(void **devPtr, size_t size)
284 {
285 cudaError_t ret;
286
287 ucm_event_enter();
288
289 ret = ucm_orig_cudaMalloc(devPtr, size);
290 if (ret == cudaSuccess) {
291 ucm_trace("ucm_cudaMalloc(devPtr=%p size:%lu)", *devPtr, size);
292 ucm_dispatch_mem_type_alloc(*devPtr, size, UCS_MEMORY_TYPE_CUDA);
293 ucm_cuda_set_ptr_attr((CUdeviceptr) *devPtr);
294 }
295
296 ucm_event_leave();
297
298 return ret;
299 }
300
ucm_cudaMallocManaged(void ** devPtr,size_t size,unsigned int flags)301 cudaError_t ucm_cudaMallocManaged(void **devPtr, size_t size, unsigned int flags)
302 {
303 cudaError_t ret;
304
305 ucm_event_enter();
306
307 ret = ucm_orig_cudaMallocManaged(devPtr, size, flags);
308 if (ret == cudaSuccess) {
309 ucm_trace("ucm_cudaMallocManaged(devPtr=%p size:%lu flags:%d)",
310 *devPtr, size, flags);
311 ucm_dispatch_mem_type_alloc(*devPtr, size, UCS_MEMORY_TYPE_CUDA_MANAGED);
312 }
313
314 ucm_event_leave();
315
316 return ret;
317 }
318
ucm_cudaMallocPitch(void ** devPtr,size_t * pitch,size_t width,size_t height)319 cudaError_t ucm_cudaMallocPitch(void **devPtr, size_t *pitch,
320 size_t width, size_t height)
321 {
322 cudaError_t ret;
323
324 ucm_event_enter();
325
326 ret = ucm_orig_cudaMallocPitch(devPtr, pitch, width, height);
327 if (ret == cudaSuccess) {
328 ucm_trace("ucm_cudaMallocPitch(devPtr=%p size:%lu)",*devPtr, (width * height));
329 ucm_dispatch_mem_type_alloc(*devPtr, (width * height), UCS_MEMORY_TYPE_CUDA);
330 ucm_cuda_set_ptr_attr((CUdeviceptr) *devPtr);
331 }
332
333 ucm_event_leave();
334 return ret;
335 }
336
ucm_cudaHostGetDevicePointer(void ** pDevice,void * pHost,unsigned int flags)337 cudaError_t ucm_cudaHostGetDevicePointer(void **pDevice, void *pHost, unsigned int flags)
338 {
339 cudaError_t ret;
340
341 ucm_event_enter();
342
343 ret = ucm_orig_cudaHostGetDevicePointer(pDevice, pHost, flags);
344 if (ret == cudaSuccess) {
345 ucm_trace("ucm_cuMemHostGetDevicePointer(pDevice=%p pHost=%p)", pDevice, pHost);
346 }
347
348 ucm_event_leave();
349 return ret;
350 }
351
ucm_cudaHostUnregister(void * ptr)352 cudaError_t ucm_cudaHostUnregister(void *ptr)
353 {
354 cudaError_t ret;
355
356 ucm_event_enter();
357
358 ucm_trace("ucm_cudaHostUnregister(ptr=%p)", ptr);
359
360 ret = ucm_orig_cudaHostUnregister(ptr);
361
362 ucm_event_leave();
363 return ret;
364 }
365
/* Relocation table: maps each intercepted CUDA API symbol name to its
 * ucm_override_<name> replacement. Installed by ucm_cudamem_install() and
 * terminated by a NULL sentinel entry. */
static ucm_reloc_patch_t patches[] = {
    {UCS_PP_MAKE_STRING(cuMemFree), ucm_override_cuMemFree},
    {UCS_PP_MAKE_STRING(cuMemFreeHost), ucm_override_cuMemFreeHost},
    {UCS_PP_MAKE_STRING(cuMemAlloc), ucm_override_cuMemAlloc},
    {UCS_PP_MAKE_STRING(cuMemAllocManaged), ucm_override_cuMemAllocManaged},
    {UCS_PP_MAKE_STRING(cuMemAllocPitch), ucm_override_cuMemAllocPitch},
    {UCS_PP_MAKE_STRING(cuMemHostGetDevicePointer), ucm_override_cuMemHostGetDevicePointer},
    {UCS_PP_MAKE_STRING(cuMemHostUnregister), ucm_override_cuMemHostUnregister},
    {UCS_PP_MAKE_STRING(cudaFree), ucm_override_cudaFree},
    {UCS_PP_MAKE_STRING(cudaFreeHost), ucm_override_cudaFreeHost},
    {UCS_PP_MAKE_STRING(cudaMalloc), ucm_override_cudaMalloc},
    {UCS_PP_MAKE_STRING(cudaMallocManaged), ucm_override_cudaMallocManaged},
    {UCS_PP_MAKE_STRING(cudaMallocPitch), ucm_override_cudaMallocPitch},
    {UCS_PP_MAKE_STRING(cudaHostGetDevicePointer), ucm_override_cudaHostGetDevicePointer},
    {UCS_PP_MAKE_STRING(cudaHostUnregister), ucm_override_cudaHostUnregister},
    {NULL, NULL}
};
383
ucm_cudamem_install(int events)384 static ucs_status_t ucm_cudamem_install(int events)
385 {
386 static int ucm_cudamem_installed = 0;
387 static pthread_mutex_t install_mutex = PTHREAD_MUTEX_INITIALIZER;
388 ucm_reloc_patch_t *patch;
389 ucs_status_t status = UCS_OK;
390
391 if (!(events & (UCM_EVENT_MEM_TYPE_ALLOC | UCM_EVENT_MEM_TYPE_FREE))) {
392 goto out;
393 }
394
395 if (!ucm_global_opts.enable_cuda_reloc) {
396 ucm_debug("installing cudamem relocations is disabled by configuration");
397 status = UCS_ERR_UNSUPPORTED;
398 goto out;
399 }
400
401 pthread_mutex_lock(&install_mutex);
402
403 if (ucm_cudamem_installed) {
404 goto out_unlock;
405 }
406
407 for (patch = patches; patch->symbol != NULL; ++patch) {
408 status = ucm_reloc_modify(patch);
409 if (status != UCS_OK) {
410 ucm_warn("failed to install relocation table entry for '%s'", patch->symbol);
411 goto out_unlock;
412 }
413 }
414
415 ucm_debug("cudaFree hooks are ready");
416 ucm_cudamem_installed = 1;
417
418 out_unlock:
419 pthread_mutex_unlock(&install_mutex);
420 out:
421 return status;
422 }
423
/*
 * Callback for parsing /proc/self/maps: report pre-existing mappings that may
 * belong to CUDA as allocation events with an unknown memory type, directly
 * to the given handler. Regions are reported when they either have no access
 * permissions at all, or are backed by an nvidia device file; everything else
 * is skipped. Always returns 0 to continue the scan.
 */
static int ucm_cudamem_scan_regions_cb(void *arg, void *addr, size_t length,
                                       int prot, const char *path)
{
    static const char *cuda_path_pattern = "/dev/nvidia";
    ucm_event_handler_t *handler         = arg;
    int is_accessible                    = prot & (PROT_READ|PROT_WRITE|PROT_EXEC);
    ucm_event_t event;

    if (is_accessible &&
        (strncmp(path, cuda_path_pattern, strlen(cuda_path_pattern)) != 0)) {
        return 0; /* regular accessible mapping - not interesting */
    }

    ucm_debug("dispatching initial memtype allocation for %p..%p %s",
              addr, UCS_PTR_BYTE_OFFSET(addr, length), path);

    event.mem_type.address  = addr;
    event.mem_type.size     = length;
    event.mem_type.mem_type = UCS_MEMORY_TYPE_LAST; /* unknown memory type */

    ucm_event_enter();
    handler->cb(UCM_EVENT_MEM_TYPE_ALLOC, &event, handler->arg);
    ucm_event_leave();

    return 0;
}
452
/* Report pre-existing (possibly CUDA) allocations to a newly registered
 * handler by scanning /proc/self/maps, if it asked for alloc events. */
static void ucm_cudamem_get_existing_alloc(ucm_event_handler_t *handler)
{
    if (!(handler->events & UCM_EVENT_MEM_TYPE_ALLOC)) {
        return;
    }

    ucm_parse_proc_self_maps(ucm_cudamem_scan_regions_cb, handler);
}
459
/*
 * Fill *bus_id with the PCI bus id (domain/bus, slot and function fixed to 0)
 * of the CUDA device bound to the calling thread's current CUDA context.
 *
 * Returns UCS_ERR_UNSUPPORTED for non-CUDA memory types, UCS_ERR_NO_RESOURCE
 * when no CUDA context is current, UCS_ERR_IO_ERROR on attribute query
 * failure, and UCS_OK on success.
 */
ucs_status_t ucm_cuda_get_current_device_info(ucs_sys_bus_id_t *bus_id,
                                              ucs_memory_type_t mem_type)
{
    /* slot == 0xff marks the cache as unpopulated; filled on first success.
     * NOTE(review): cache is read/written without locking and keyed on
     * nothing - presumably assumes a single device per process; racing
     * initializers would write the same value - confirm. */
    static ucs_sys_bus_id_t cached_bus_id = {0xffff, 0xff, 0xff, 0xff};
    CUresult cu_err;
    CUdevice cuda_device;
    CUdevice_attribute attribute;
    int attr_result;

    ucm_trace("ucm_cuda_get_current_device_info");

    /* only plain CUDA device memory is supported here */
    if (mem_type != UCS_MEMORY_TYPE_CUDA) {
        return UCS_ERR_UNSUPPORTED;
    }

    /* fast path: return the cached bus id if already resolved */
    if (cached_bus_id.slot != 0xff) {
        memcpy(bus_id, &cached_bus_id, sizeof(cached_bus_id));
        return UCS_OK;
    }

    /* Find cuda dev that the current ctx is using and find it's path*/
    cu_err = cuCtxGetDevice(&cuda_device);
    if (CUDA_SUCCESS != cu_err) {
        ucm_debug("no cuda device context found");
        return UCS_ERR_NO_RESOURCE;
    }

    attribute = CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID;
    cu_err = cuDeviceGetAttribute(&attr_result, attribute, cuda_device);
    if (CUDA_SUCCESS != cu_err) {
        ucm_error("unable to get cuda device domain");
        return UCS_ERR_IO_ERROR;
    }

    bus_id->domain = (uint16_t)attr_result;

    attribute = CU_DEVICE_ATTRIBUTE_PCI_BUS_ID;
    cu_err = cuDeviceGetAttribute(&attr_result, attribute, cuda_device);
    if (CUDA_SUCCESS != cu_err) {
        ucm_error("unable to get cuda device bus id");
        return UCS_ERR_IO_ERROR;
    }

    bus_id->bus      = (uint8_t)attr_result;
    /* slot/function are not queried - setting slot != 0xff also marks the
     * cache as valid for subsequent calls */
    bus_id->slot     = 0;
    bus_id->function = 0;
    cached_bus_id    = *bus_id;

    ucm_trace("found bus_id %x:%x:%x:%x for device %d", bus_id->domain,
                                                        bus_id->bus,
                                                        bus_id->slot,
                                                        bus_id->function,
                                                        cuda_device);

    return UCS_OK;
}
516
/* Registration record hooking this module's install / existing-allocation
 * scan / device-info callbacks into the generic UCM event framework. */
static ucm_event_installer_t ucm_cuda_initializer = {
    .install            = ucm_cudamem_install,
    .get_existing_alloc = ucm_cudamem_get_existing_alloc,
    .get_mem_type_current_device_info = ucm_cuda_get_current_device_info
};
522
/* On library load: register the CUDA event installer with UCM */
UCS_STATIC_INIT {
    ucs_list_add_tail(&ucm_event_installer_list, &ucm_cuda_initializer.list);
}
526
/* On library unload: remove the CUDA event installer from the global list */
UCS_STATIC_CLEANUP {
    ucs_list_del(&ucm_cuda_initializer.list);
}
530