1 /**
2  * Copyright (C) Mellanox Technologies Ltd. 2001-2017.  ALL RIGHTS RESERVED.
3  * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
4  *
5  * See file LICENSE for terms.
6  */
7 
8 #ifdef HAVE_CONFIG_H
9 #  include "config.h"
10 #endif
11 
12 #include <ucm/cuda/cudamem.h>
13 
14 #include <ucm/event/event.h>
15 #include <ucm/util/log.h>
16 #include <ucm/util/reloc.h>
17 #include <ucm/util/replace.h>
18 #include <ucm/util/sys.h>
19 #include <ucs/debug/assert.h>
20 #include <ucs/sys/compiler.h>
21 #include <ucs/sys/preprocessor.h>
22 #include <ucs/sys/topo.h>
23 #include <ucs/memory/memory_type.h>
24 
25 #include <sys/mman.h>
26 #include <pthread.h>
27 #include <string.h>
28 #include <unistd.h>
29 
30 
/* Generate dlsym-based replacement entry points for the intercepted CUDA
 * driver- and runtime-API allocation/deallocation functions. Each macro
 * expands to a `ucm_orig_<name>` trampoline which resolves and calls the
 * real implementation; the third argument is the value returned if the
 * original symbol cannot be resolved. */
UCM_DEFINE_REPLACE_DLSYM_FUNC(cuMemFree, CUresult, -1, CUdeviceptr)
UCM_DEFINE_REPLACE_DLSYM_FUNC(cuMemFreeHost, CUresult, -1, void *)
UCM_DEFINE_REPLACE_DLSYM_FUNC(cuMemAlloc, CUresult, -1, CUdeviceptr *, size_t)
UCM_DEFINE_REPLACE_DLSYM_FUNC(cuMemAllocManaged, CUresult, -1, CUdeviceptr *,
                              size_t, unsigned int)
UCM_DEFINE_REPLACE_DLSYM_FUNC(cuMemAllocPitch, CUresult, -1, CUdeviceptr *, size_t *,
                              size_t, size_t, unsigned int)
UCM_DEFINE_REPLACE_DLSYM_FUNC(cuMemHostGetDevicePointer, CUresult, -1, CUdeviceptr *,
                              void *, unsigned int)
UCM_DEFINE_REPLACE_DLSYM_FUNC(cuMemHostUnregister, CUresult, -1, void *)
UCM_DEFINE_REPLACE_DLSYM_FUNC(cudaFree, cudaError_t, -1, void*)
UCM_DEFINE_REPLACE_DLSYM_FUNC(cudaFreeHost, cudaError_t, -1, void*)
UCM_DEFINE_REPLACE_DLSYM_FUNC(cudaMalloc, cudaError_t, -1, void**, size_t)
UCM_DEFINE_REPLACE_DLSYM_FUNC(cudaMallocManaged, cudaError_t, -1, void**, size_t, unsigned int)
UCM_DEFINE_REPLACE_DLSYM_FUNC(cudaMallocPitch, cudaError_t, -1, void**, size_t *,
                              size_t, size_t)
UCM_DEFINE_REPLACE_DLSYM_FUNC(cudaHostGetDevicePointer, cudaError_t, -1, void**,
                              void *, unsigned int)
UCM_DEFINE_REPLACE_DLSYM_FUNC(cudaHostUnregister, cudaError_t, -1, void*)
50 
#if ENABLE_SYMBOL_OVERRIDE
/* When symbol override is enabled, also export `ucm_override_<name>` strong
 * aliases so the interceptors take effect through direct symbol binding (in
 * addition to the relocation-table patching done in ucm_cudamem_install). */
UCM_OVERRIDE_FUNC(cuMemFree,                 CUresult)
UCM_OVERRIDE_FUNC(cuMemFreeHost,             CUresult)
UCM_OVERRIDE_FUNC(cuMemAlloc,                CUresult)
UCM_OVERRIDE_FUNC(cuMemAllocManaged,         CUresult)
UCM_OVERRIDE_FUNC(cuMemAllocPitch,           CUresult)
UCM_OVERRIDE_FUNC(cuMemHostGetDevicePointer, CUresult)
UCM_OVERRIDE_FUNC(cuMemHostUnregister,       CUresult)
UCM_OVERRIDE_FUNC(cudaFree,                  cudaError_t)
UCM_OVERRIDE_FUNC(cudaFreeHost,              cudaError_t)
UCM_OVERRIDE_FUNC(cudaMalloc,                cudaError_t)
UCM_OVERRIDE_FUNC(cudaMallocManaged,         cudaError_t)
UCM_OVERRIDE_FUNC(cudaMallocPitch,           cudaError_t)
UCM_OVERRIDE_FUNC(cudaHostGetDevicePointer,  cudaError_t)
UCM_OVERRIDE_FUNC(cudaHostUnregister,        cudaError_t)
#endif
67 
68 
69 static void ucm_cuda_set_ptr_attr(CUdeviceptr dptr)
70 {
71     if ((void*)dptr == NULL) {
72         ucm_trace("skipping cuPointerSetAttribute for null pointer");
73         return;
74     }
75 
76     unsigned int value = 1;
77     CUresult ret;
78     const char *cu_err_str;
79 
80     ret = cuPointerSetAttribute(&value, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, dptr);
81     if (ret != CUDA_SUCCESS) {
82         cuGetErrorString(ret, &cu_err_str);
83         ucm_warn("cuPointerSetAttribute(%p) failed: %s", (void *) dptr, cu_err_str);
84     }
85 }
86 
87 static UCS_F_ALWAYS_INLINE void
ucm_dispatch_mem_type_alloc(void * addr,size_t length,ucs_memory_type_t mem_type)88 ucm_dispatch_mem_type_alloc(void *addr, size_t length, ucs_memory_type_t mem_type)
89 {
90     ucm_event_t event;
91 
92     event.mem_type.address  = addr;
93     event.mem_type.size     = length;
94     event.mem_type.mem_type = mem_type;
95     ucm_event_dispatch(UCM_EVENT_MEM_TYPE_ALLOC, &event);
96 }
97 
98 static UCS_F_ALWAYS_INLINE void
ucm_dispatch_mem_type_free(void * addr,size_t length,ucs_memory_type_t mem_type)99 ucm_dispatch_mem_type_free(void *addr, size_t length, ucs_memory_type_t mem_type)
100 {
101     ucm_event_t event;
102 
103     event.mem_type.address  = addr;
104     event.mem_type.size     = length;
105     event.mem_type.mem_type = mem_type;
106     ucm_event_dispatch(UCM_EVENT_MEM_TYPE_FREE, &event);
107 }
108 
/* Dispatch a MEM_TYPE_FREE event for a device pointer about to be released.
 * Queries the full allocation range so the event carries the real size;
 * falls back to a 1-byte length if the range cannot be determined. */
static void ucm_cudafree_dispatch_events(CUdeviceptr dptr, const char *func_name)
{
    CUdeviceptr base_addr;
    size_t alloc_length;
    CUresult status;

    if (dptr == 0) {
        return; /* freeing NULL is a no-op - nothing to report */
    }

    status = cuMemGetAddressRange(&base_addr, &alloc_length, dptr);
    if (status != CUDA_SUCCESS) {
        ucm_debug("cuMemGetAddressRange(devPtr=%p) failed", (void*)dptr);
        alloc_length = 1; /* set minimum length */
    } else if (dptr != base_addr) {
        /* freeing a pointer which is not the allocation base is unexpected */
        ucm_warn("%s(%p) called with unexpected pointer (expected: %p)",
                 func_name, (void*)dptr, (void*)base_addr);
    }

    ucm_dispatch_mem_type_free((void *)dptr, alloc_length, UCS_MEMORY_TYPE_CUDA);
}
132 
ucm_cuMemFree(CUdeviceptr dptr)133 CUresult ucm_cuMemFree(CUdeviceptr dptr)
134 {
135     CUresult ret;
136 
137     ucm_event_enter();
138 
139     ucm_trace("ucm_cuMemFree(dptr=%p)",(void*)dptr);
140 
141     ucm_cudafree_dispatch_events(dptr, "cuMemFree");
142 
143     ret = ucm_orig_cuMemFree(dptr);
144 
145     ucm_event_leave();
146     return ret;
147 }
148 
ucm_cuMemFreeHost(void * p)149 CUresult ucm_cuMemFreeHost(void *p)
150 {
151     CUresult ret;
152 
153     ucm_event_enter();
154 
155     ucm_trace("ucm_cuMemFreeHost(ptr=%p)", p);
156 
157     ucm_dispatch_vm_munmap(p, 0);
158 
159     ret = ucm_orig_cuMemFreeHost(p);
160 
161     ucm_event_leave();
162     return ret;
163 }
164 
ucm_cuMemAlloc(CUdeviceptr * dptr,size_t size)165 CUresult ucm_cuMemAlloc(CUdeviceptr *dptr, size_t size)
166 {
167     CUresult ret;
168 
169     ucm_event_enter();
170 
171     ret = ucm_orig_cuMemAlloc(dptr, size);
172     if (ret == CUDA_SUCCESS) {
173         ucm_trace("ucm_cuMemAlloc(dptr=%p size:%lu)",(void *)*dptr, size);
174         ucm_dispatch_mem_type_alloc((void *)*dptr, size, UCS_MEMORY_TYPE_CUDA);
175         ucm_cuda_set_ptr_attr(*dptr);
176     }
177 
178     ucm_event_leave();
179     return ret;
180 }
181 
ucm_cuMemAllocManaged(CUdeviceptr * dptr,size_t size,unsigned int flags)182 CUresult ucm_cuMemAllocManaged(CUdeviceptr *dptr, size_t size, unsigned int flags)
183 {
184     CUresult ret;
185 
186     ucm_event_enter();
187 
188     ret = ucm_orig_cuMemAllocManaged(dptr, size, flags);
189     if (ret == CUDA_SUCCESS) {
190         ucm_trace("ucm_cuMemAllocManaged(dptr=%p size:%lu, flags:%d)",
191                   (void *)*dptr, size, flags);
192         ucm_dispatch_mem_type_alloc((void *)*dptr, size,
193                                     UCS_MEMORY_TYPE_CUDA_MANAGED);
194     }
195 
196     ucm_event_leave();
197     return ret;
198 }
199 
ucm_cuMemAllocPitch(CUdeviceptr * dptr,size_t * pPitch,size_t WidthInBytes,size_t Height,unsigned int ElementSizeBytes)200 CUresult ucm_cuMemAllocPitch(CUdeviceptr *dptr, size_t *pPitch,
201                              size_t WidthInBytes, size_t Height,
202                              unsigned int ElementSizeBytes)
203 {
204     CUresult ret;
205 
206     ucm_event_enter();
207 
208     ret = ucm_orig_cuMemAllocPitch(dptr, pPitch, WidthInBytes, Height, ElementSizeBytes);
209     if (ret == CUDA_SUCCESS) {
210         ucm_trace("ucm_cuMemAllocPitch(dptr=%p size:%lu)",(void *)*dptr,
211                   (WidthInBytes * Height));
212         ucm_dispatch_mem_type_alloc((void *)*dptr, WidthInBytes * Height,
213                                     UCS_MEMORY_TYPE_CUDA);
214         ucm_cuda_set_ptr_attr(*dptr);
215     }
216 
217     ucm_event_leave();
218     return ret;
219 }
220 
ucm_cuMemHostGetDevicePointer(CUdeviceptr * pdptr,void * p,unsigned int Flags)221 CUresult ucm_cuMemHostGetDevicePointer(CUdeviceptr *pdptr, void *p, unsigned int Flags)
222 {
223     CUresult ret;
224 
225     ucm_event_enter();
226 
227     ret = ucm_orig_cuMemHostGetDevicePointer(pdptr, p, Flags);
228     if (ret == CUDA_SUCCESS) {
229         ucm_trace("ucm_cuMemHostGetDevicePointer(pdptr=%p p=%p)",(void *)*pdptr, p);
230     }
231 
232     ucm_event_leave();
233     return ret;
234 }
235 
ucm_cuMemHostUnregister(void * p)236 CUresult ucm_cuMemHostUnregister(void *p)
237 {
238     CUresult ret;
239 
240     ucm_event_enter();
241 
242     ucm_trace("ucm_cuMemHostUnregister(ptr=%p)", p);
243 
244     ret = ucm_orig_cuMemHostUnregister(p);
245 
246     ucm_event_leave();
247     return ret;
248 }
249 
ucm_cudaFree(void * devPtr)250 cudaError_t ucm_cudaFree(void *devPtr)
251 {
252     cudaError_t ret;
253 
254     ucm_event_enter();
255 
256     ucm_trace("ucm_cudaFree(devPtr=%p)", devPtr);
257 
258     ucm_cudafree_dispatch_events((CUdeviceptr)devPtr, "cudaFree");
259 
260     ret = ucm_orig_cudaFree(devPtr);
261 
262     ucm_event_leave();
263 
264     return ret;
265 }
266 
ucm_cudaFreeHost(void * ptr)267 cudaError_t ucm_cudaFreeHost(void *ptr)
268 {
269     cudaError_t ret;
270 
271     ucm_event_enter();
272 
273     ucm_trace("ucm_cudaFreeHost(ptr=%p)", ptr);
274 
275     ucm_dispatch_vm_munmap(ptr, 0);
276 
277     ret = ucm_orig_cudaFreeHost(ptr);
278 
279     ucm_event_leave();
280     return ret;
281 }
282 
ucm_cudaMalloc(void ** devPtr,size_t size)283 cudaError_t ucm_cudaMalloc(void **devPtr, size_t size)
284 {
285     cudaError_t ret;
286 
287     ucm_event_enter();
288 
289     ret = ucm_orig_cudaMalloc(devPtr, size);
290     if (ret == cudaSuccess) {
291         ucm_trace("ucm_cudaMalloc(devPtr=%p size:%lu)", *devPtr, size);
292         ucm_dispatch_mem_type_alloc(*devPtr, size, UCS_MEMORY_TYPE_CUDA);
293         ucm_cuda_set_ptr_attr((CUdeviceptr) *devPtr);
294     }
295 
296     ucm_event_leave();
297 
298     return ret;
299 }
300 
ucm_cudaMallocManaged(void ** devPtr,size_t size,unsigned int flags)301 cudaError_t ucm_cudaMallocManaged(void **devPtr, size_t size, unsigned int flags)
302 {
303     cudaError_t ret;
304 
305     ucm_event_enter();
306 
307     ret = ucm_orig_cudaMallocManaged(devPtr, size, flags);
308     if (ret == cudaSuccess) {
309         ucm_trace("ucm_cudaMallocManaged(devPtr=%p size:%lu flags:%d)",
310                   *devPtr, size, flags);
311         ucm_dispatch_mem_type_alloc(*devPtr, size, UCS_MEMORY_TYPE_CUDA_MANAGED);
312     }
313 
314     ucm_event_leave();
315 
316     return ret;
317 }
318 
ucm_cudaMallocPitch(void ** devPtr,size_t * pitch,size_t width,size_t height)319 cudaError_t ucm_cudaMallocPitch(void **devPtr, size_t *pitch,
320                                 size_t width, size_t height)
321 {
322     cudaError_t ret;
323 
324     ucm_event_enter();
325 
326     ret = ucm_orig_cudaMallocPitch(devPtr, pitch, width, height);
327     if (ret == cudaSuccess) {
328         ucm_trace("ucm_cudaMallocPitch(devPtr=%p size:%lu)",*devPtr, (width * height));
329         ucm_dispatch_mem_type_alloc(*devPtr, (width * height), UCS_MEMORY_TYPE_CUDA);
330         ucm_cuda_set_ptr_attr((CUdeviceptr) *devPtr);
331     }
332 
333     ucm_event_leave();
334     return ret;
335 }
336 
ucm_cudaHostGetDevicePointer(void ** pDevice,void * pHost,unsigned int flags)337 cudaError_t ucm_cudaHostGetDevicePointer(void **pDevice, void *pHost, unsigned int flags)
338 {
339     cudaError_t ret;
340 
341     ucm_event_enter();
342 
343     ret = ucm_orig_cudaHostGetDevicePointer(pDevice, pHost, flags);
344     if (ret == cudaSuccess) {
345         ucm_trace("ucm_cuMemHostGetDevicePointer(pDevice=%p pHost=%p)", pDevice, pHost);
346     }
347 
348     ucm_event_leave();
349     return ret;
350 }
351 
ucm_cudaHostUnregister(void * ptr)352 cudaError_t ucm_cudaHostUnregister(void *ptr)
353 {
354     cudaError_t ret;
355 
356     ucm_event_enter();
357 
358     ucm_trace("ucm_cudaHostUnregister(ptr=%p)", ptr);
359 
360     ret = ucm_orig_cudaHostUnregister(ptr);
361 
362     ucm_event_leave();
363     return ret;
364 }
365 
/* Relocation-table patches applied by ucm_cudamem_install(): each entry maps
 * a CUDA symbol name to its ucm_override_* interceptor (generated by the
 * macros above). The list is NULL-terminated. */
static ucm_reloc_patch_t patches[] = {
    {UCS_PP_MAKE_STRING(cuMemFree),                 ucm_override_cuMemFree},
    {UCS_PP_MAKE_STRING(cuMemFreeHost),             ucm_override_cuMemFreeHost},
    {UCS_PP_MAKE_STRING(cuMemAlloc),                ucm_override_cuMemAlloc},
    {UCS_PP_MAKE_STRING(cuMemAllocManaged),         ucm_override_cuMemAllocManaged},
    {UCS_PP_MAKE_STRING(cuMemAllocPitch),           ucm_override_cuMemAllocPitch},
    {UCS_PP_MAKE_STRING(cuMemHostGetDevicePointer), ucm_override_cuMemHostGetDevicePointer},
    {UCS_PP_MAKE_STRING(cuMemHostUnregister),       ucm_override_cuMemHostUnregister},
    {UCS_PP_MAKE_STRING(cudaFree),                  ucm_override_cudaFree},
    {UCS_PP_MAKE_STRING(cudaFreeHost),              ucm_override_cudaFreeHost},
    {UCS_PP_MAKE_STRING(cudaMalloc),                ucm_override_cudaMalloc},
    {UCS_PP_MAKE_STRING(cudaMallocManaged),         ucm_override_cudaMallocManaged},
    {UCS_PP_MAKE_STRING(cudaMallocPitch),           ucm_override_cudaMallocPitch},
    {UCS_PP_MAKE_STRING(cudaHostGetDevicePointer),  ucm_override_cudaHostGetDevicePointer},
    {UCS_PP_MAKE_STRING(cudaHostUnregister),        ucm_override_cudaHostUnregister},
    {NULL,                                          NULL}
};
383 
/* Install the relocation patches for the CUDA allocation hooks.
 *
 * Idempotent and thread-safe: the patches are applied at most once per
 * process, guarded by a function-local mutex.
 *
 * @param events  Bitmap of requested UCM events.
 * @return UCS_OK if the hooks were installed (or no memtype events were
 *         requested), UCS_ERR_UNSUPPORTED if disabled by configuration, or
 *         the error from ucm_reloc_modify() on failure.
 */
static ucs_status_t ucm_cudamem_install(int events)
{
    static int ucm_cudamem_installed = 0;
    static pthread_mutex_t install_mutex = PTHREAD_MUTEX_INITIALIZER;
    ucm_reloc_patch_t *patch;
    ucs_status_t status = UCS_OK;

    /* Nothing to do unless memory-type alloc/free events were requested */
    if (!(events & (UCM_EVENT_MEM_TYPE_ALLOC | UCM_EVENT_MEM_TYPE_FREE))) {
        goto out;
    }

    if (!ucm_global_opts.enable_cuda_reloc) {
        ucm_debug("installing cudamem relocations is disabled by configuration");
        status = UCS_ERR_UNSUPPORTED;
        goto out;
    }

    pthread_mutex_lock(&install_mutex);

    /* Already installed by a previous (possibly concurrent) caller */
    if (ucm_cudamem_installed) {
        goto out_unlock;
    }

    /* Apply every patch; bail out on the first failure. NOTE(review): a
     * partial install is not rolled back here, so a later retry re-applies
     * the earlier patches - presumably ucm_reloc_modify tolerates that. */
    for (patch = patches; patch->symbol != NULL; ++patch) {
        status = ucm_reloc_modify(patch);
        if (status != UCS_OK) {
            ucm_warn("failed to install relocation table entry for '%s'", patch->symbol);
            goto out_unlock;
        }
    }

    ucm_debug("cudaFree hooks are ready");
    ucm_cudamem_installed = 1;

out_unlock:
    pthread_mutex_unlock(&install_mutex);
out:
    return status;
}
423 
/* Callback for ucm_parse_proc_self_maps(): report pre-existing mappings that
 * could be cuda memory as MEM_TYPE_ALLOC events with an unknown memory type.
 * Always returns 0 so the scan continues over all regions. */
static int ucm_cudamem_scan_regions_cb(void *arg, void *addr, size_t length,
                                       int prot, const char *path)
{
    static const char *cuda_path_pattern = "/dev/nvidia";
    ucm_event_handler_t *handler         = arg;
    ucm_event_t event;
    int accessible;
    int nvidia_backed;

    /* we are interested in blocks which don't have any access permissions, or
     * mapped to nvidia device.
     */
    accessible    = (prot & (PROT_READ|PROT_WRITE|PROT_EXEC)) != 0;
    nvidia_backed = !strncmp(path, cuda_path_pattern, strlen(cuda_path_pattern));
    if (accessible && !nvidia_backed) {
        return 0;
    }

    ucm_debug("dispatching initial memtype allocation for %p..%p %s",
              addr, UCS_PTR_BYTE_OFFSET(addr, length), path);

    event.mem_type.address  = addr;
    event.mem_type.size     = length;
    event.mem_type.mem_type = UCS_MEMORY_TYPE_LAST; /* unknown memory type */

    /* Deliver directly to the single handler rather than broadcasting */
    ucm_event_enter();
    handler->cb(UCM_EVENT_MEM_TYPE_ALLOC, &event, handler->arg);
    ucm_event_leave();

    return 0;
}
452 
/* Replay already-existing mappings to a newly registered handler, so it
 * learns about allocations made before it subscribed to ALLOC events */
static void ucm_cudamem_get_existing_alloc(ucm_event_handler_t *handler)
{
    if (!(handler->events & UCM_EVENT_MEM_TYPE_ALLOC)) {
        return;
    }

    ucm_parse_proc_self_maps(ucm_cudamem_scan_regions_cb, handler);
}
459 
/* Return the PCI bus id of the cuda device used by the current cuda context.
 *
 * Only UCS_MEMORY_TYPE_CUDA is supported. The result is cached in a static
 * variable after the first successful lookup; the cache entry is considered
 * valid once its 'slot' field differs from the 0xff sentinel. NOTE(review):
 * the cache is read/written without locking - concurrent callers may race,
 * presumably benignly since they would compute the same value; the cache also
 * assumes a single cuda device per process - confirm with callers.
 *
 * @param bus_id    Filled with the device's PCI domain/bus (slot and function
 *                  are always reported as 0).
 * @param mem_type  Requested memory type; must be UCS_MEMORY_TYPE_CUDA.
 * @return UCS_OK on success, UCS_ERR_UNSUPPORTED for other memory types,
 *         UCS_ERR_NO_RESOURCE if no cuda context is current, or
 *         UCS_ERR_IO_ERROR if a device attribute query fails.
 */
ucs_status_t ucm_cuda_get_current_device_info(ucs_sys_bus_id_t *bus_id,
                                              ucs_memory_type_t mem_type)
{
    static ucs_sys_bus_id_t cached_bus_id = {0xffff, 0xff, 0xff, 0xff};
    CUresult cu_err;
    CUdevice cuda_device;
    CUdevice_attribute attribute;
    int attr_result;

    ucm_trace("ucm_cuda_get_current_device_info");

    if (mem_type != UCS_MEMORY_TYPE_CUDA) {
        return UCS_ERR_UNSUPPORTED;
    }

    /* Fast path: return the cached bus id from a previous successful lookup */
    if (cached_bus_id.slot != 0xff) {
        memcpy(bus_id, &cached_bus_id, sizeof(cached_bus_id));
        return UCS_OK;
    }

    /* Find cuda dev that the current ctx is using and find it's path*/
    cu_err = cuCtxGetDevice(&cuda_device);
    if (CUDA_SUCCESS != cu_err) {
        ucm_debug("no cuda device context found");
        return UCS_ERR_NO_RESOURCE;
    }

    attribute = CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID;
    cu_err = cuDeviceGetAttribute(&attr_result, attribute, cuda_device);
    if (CUDA_SUCCESS != cu_err) {
        ucm_error("unable to get cuda device domain");
        return UCS_ERR_IO_ERROR;
    }

    bus_id->domain = (uint16_t)attr_result;

    attribute = CU_DEVICE_ATTRIBUTE_PCI_BUS_ID;
    cu_err = cuDeviceGetAttribute(&attr_result, attribute, cuda_device);
    if (CUDA_SUCCESS != cu_err) {
        ucm_error("unable to get cuda device bus id");
        return UCS_ERR_IO_ERROR;
    }

    /* slot=0 also marks the cache entry as valid (see sentinel check above) */
    bus_id->bus      = (uint8_t)attr_result;
    bus_id->slot     = 0;
    bus_id->function = 0;
    cached_bus_id    = *bus_id;

    ucm_trace("found bus_id %x:%x:%x:%x for device %d", bus_id->domain,
                                                        bus_id->bus,
                                                        bus_id->slot,
                                                        bus_id->function,
                                                        cuda_device);

    return UCS_OK;
}
516 
/* Event-installer descriptor for the cuda memory hooks; registered with the
 * global UCM installer list at library load time below. */
static ucm_event_installer_t ucm_cuda_initializer = {
    .install                          = ucm_cudamem_install,
    .get_existing_alloc               = ucm_cudamem_get_existing_alloc,
    .get_mem_type_current_device_info = ucm_cuda_get_current_device_info
};

/* Register the cuda installer when the library is loaded */
UCS_STATIC_INIT {
    ucs_list_add_tail(&ucm_event_installer_list, &ucm_cuda_initializer.list);
}

/* Unregister it on library unload to avoid a dangling list entry */
UCS_STATIC_CLEANUP {
    ucs_list_del(&ucm_cuda_initializer.list);
}
530