1 /*
2  * Copyright (C) by Argonne National Laboratory
3  *     See COPYRIGHT in top-level directory
4  */
5 
6 #include "yaksi.h"
7 #include "yaksuri_cudai.h"
8 #include <assert.h>
9 #include <string.h>
10 #include <cuda.h>
11 #include <cuda_runtime_api.h>
12 
/* Allocate 'size' bytes of pinned (page-locked) host memory with
 * cudaMallocHost.  A failure is reported through
 * YAKSURI_CUDAI_CUDA_ERR_CHECK; since 'buf' starts out NULL, the caller
 * presumably receives NULL in that case -- cudaMallocHost is not
 * guaranteed to write the pointer on error. */
static void *cuda_host_malloc(uintptr_t size)
{
    void *buf = NULL;
    cudaError_t cerr;

    cerr = cudaMallocHost(&buf, size);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);

    return buf;
}
22 
/* Allocate 'size' bytes of device memory on the given CUDA device.
 * Temporarily switches the calling thread's current device when it
 * differs from 'device', and switches back before returning, so the
 * caller's device selection is preserved. */
static void *cuda_gpu_malloc(uintptr_t size, int device)
{
    void *buf = NULL;
    cudaError_t cerr;
    int prev_device;

    cerr = cudaGetDevice(&prev_device);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);

    bool switched = (prev_device != device);

    if (switched) {
        cerr = cudaSetDevice(device);
        YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
    }

    cerr = cudaMalloc(&buf, size);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);

    if (switched) {
        /* restore the caller's current device */
        cerr = cudaSetDevice(prev_device);
        YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
    }

    return buf;
}
47 
cuda_host_free(void * ptr)48 static void cuda_host_free(void *ptr)
49 {
50     cudaError_t cerr = cudaFreeHost(ptr);
51     YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
52 }
53 
cuda_gpu_free(void * ptr)54 static void cuda_gpu_free(void *ptr)
55 {
56     cudaError_t cerr = cudaFree(ptr);
57     YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
58 }
59 
/* Backend-wide state: device count, one stream per device, and the
 * per-device-pair p2p reachability table.  Populated in
 * yaksuri_cuda_init_hook and released in finalize_hook. */
yaksuri_cudai_global_s yaksuri_cudai_global;
61 
finalize_hook(void)62 static int finalize_hook(void)
63 {
64     int rc = YAKSA_SUCCESS;
65     cudaError_t cerr;
66 
67     for (int i = 0; i < yaksuri_cudai_global.ndevices; i++) {
68         cerr = cudaSetDevice(i);
69         YAKSURI_CUDAI_CUDA_ERR_CHKANDJUMP(cerr, rc, fn_fail);
70 
71         cerr = cudaStreamDestroy(yaksuri_cudai_global.stream[i]);
72         YAKSURI_CUDAI_CUDA_ERR_CHKANDJUMP(cerr, rc, fn_fail);
73 
74         free(yaksuri_cudai_global.p2p[i]);
75     }
76     free(yaksuri_cudai_global.stream);
77     free(yaksuri_cudai_global.p2p);
78 
79   fn_exit:
80     return rc;
81   fn_fail:
82     goto fn_exit;
83 }
84 
get_num_devices(int * ndevices)85 static int get_num_devices(int *ndevices)
86 {
87     *ndevices = yaksuri_cudai_global.ndevices;
88 
89     return YAKSA_SUCCESS;
90 }
91 
/* Report whether peer-to-peer communication between source device 'sdev'
 * and destination device 'ddev' is enabled, based on the compile-time
 * CUDA_P2P policy and the p2p table probed at init time.
 * Always returns YAKSA_SUCCESS. */
static int check_p2p_comm(int sdev, int ddev, bool * is_enabled)
{
    bool enabled;

#if CUDA_P2P == CUDA_P2P_ENABLED
    enabled = yaksuri_cudai_global.p2p[sdev][ddev];
#elif CUDA_P2P == CUDA_P2P_CLIQUES
    /* odd-parity device pairs are treated as cross-clique and disabled */
    if ((sdev + ddev) % 2 == 0)
        enabled = yaksuri_cudai_global.p2p[sdev][ddev];
    else
        enabled = 0;
#else
    enabled = 0;
#endif

    *is_enabled = enabled;
    return YAKSA_SUCCESS;
}
107 
/* Initialize the CUDA backend and hand back its hook table.
 *
 * Queries the device count, creates one non-blocking stream per device,
 * and probes (and where possible enables) peer access between every
 * device pair, recording the result in the p2p table.  If any device is
 * in exclusive compute mode while CUDA_VISIBLE_DEVICES is unset, CUDA
 * support is disabled (*hooks = NULL) to avoid incorrect device sharing.
 *
 * Output: *hooks -- freshly malloc'ed hook table, or NULL when CUDA
 * support is disabled.  Returns YAKSA_SUCCESS or an error code.
 *
 * Fixes vs. original: every malloc result is checked (the original
 * dereferenced them unconditionally), and the benign
 * cudaErrorPeerAccessAlreadyEnabled is cleared from the runtime's
 * last-error state so it cannot surface in a later, unrelated check. */
int yaksuri_cuda_init_hook(yaksur_gpudriver_hooks_s ** hooks)
{
    int rc = YAKSA_SUCCESS;
    cudaError_t cerr;

    cerr = cudaGetDeviceCount(&yaksuri_cudai_global.ndevices);
    YAKSURI_CUDAI_CUDA_ERR_CHKANDJUMP(cerr, rc, fn_fail);

    if (getenv("CUDA_VISIBLE_DEVICES") == NULL) {
        /* user did not do any filtering for us; if any of the devices
         * is in exclusive mode, disable GPU support to avoid
         * incorrect device sharing */
        bool excl = false;
        for (int i = 0; i < yaksuri_cudai_global.ndevices; i++) {
            struct cudaDeviceProp prop;

            cerr = cudaGetDeviceProperties(&prop, i);
            YAKSURI_CUDAI_CUDA_ERR_CHKANDJUMP(cerr, rc, fn_fail);

            if (prop.computeMode != cudaComputeModeDefault) {
                excl = true;
                break;
            }
        }

        if (excl == true) {
            fprintf(stderr, "[yaksa] ====> Disabling CUDA support <====\n");
            fprintf(stderr,
                    "[yaksa] CUDA is setup in exclusive compute mode, but CUDA_VISIBLE_DEVICES is not set\n");
            fprintf(stderr,
                    "[yaksa] You can silence this warning by setting CUDA_VISIBLE_DEVICES\n");
            fflush(stderr);
            *hooks = NULL;
            goto fn_exit;
        }
    }

    yaksuri_cudai_global.stream = (cudaStream_t *)
        malloc(yaksuri_cudai_global.ndevices * sizeof(cudaStream_t));
    if (yaksuri_cudai_global.stream == NULL) {
        rc = YAKSA_ERR__OUT_OF_MEM;
        goto fn_fail;
    }

    yaksuri_cudai_global.p2p = (bool **) malloc(yaksuri_cudai_global.ndevices * sizeof(bool *));
    if (yaksuri_cudai_global.p2p == NULL) {
        rc = YAKSA_ERR__OUT_OF_MEM;
        goto fn_fail;
    }
    for (int i = 0; i < yaksuri_cudai_global.ndevices; i++) {
        yaksuri_cudai_global.p2p[i] = (bool *) malloc(yaksuri_cudai_global.ndevices * sizeof(bool));
        if (yaksuri_cudai_global.p2p[i] == NULL) {
            rc = YAKSA_ERR__OUT_OF_MEM;
            goto fn_fail;
        }
    }

    /* remember the caller's current device so we can restore it after
     * the per-device setup loop */
    int cur_device;
    cerr = cudaGetDevice(&cur_device);
    YAKSURI_CUDAI_CUDA_ERR_CHKANDJUMP(cerr, rc, fn_fail);

    for (int i = 0; i < yaksuri_cudai_global.ndevices; i++) {
        cerr = cudaSetDevice(i);
        YAKSURI_CUDAI_CUDA_ERR_CHKANDJUMP(cerr, rc, fn_fail);

        cerr = cudaStreamCreateWithFlags(&yaksuri_cudai_global.stream[i], cudaStreamNonBlocking);
        YAKSURI_CUDAI_CUDA_ERR_CHKANDJUMP(cerr, rc, fn_fail);

        for (int j = 0; j < yaksuri_cudai_global.ndevices; j++) {
            if (i == j) {
                yaksuri_cudai_global.p2p[i][j] = 1;
            } else {
                int val;
                cerr = cudaDeviceCanAccessPeer(&val, i, j);
                YAKSURI_CUDAI_CUDA_ERR_CHKANDJUMP(cerr, rc, fn_fail);

                if (val) {
                    cerr = cudaDeviceEnablePeerAccess(j, 0);
                    if (cerr == cudaErrorPeerAccessAlreadyEnabled) {
                        /* benign: peer access was already on; clear the
                         * runtime's last-error state so this does not
                         * show up in a later error check */
                        (void) cudaGetLastError();
                    } else {
                        YAKSURI_CUDAI_CUDA_ERR_CHKANDJUMP(cerr, rc, fn_fail);
                    }
                    yaksuri_cudai_global.p2p[i][j] = 1;
                } else {
                    yaksuri_cudai_global.p2p[i][j] = 0;
                }
            }
        }
    }

    cerr = cudaSetDevice(cur_device);
    YAKSURI_CUDAI_CUDA_ERR_CHKANDJUMP(cerr, rc, fn_fail);

    *hooks = (yaksur_gpudriver_hooks_s *) malloc(sizeof(yaksur_gpudriver_hooks_s));
    if (*hooks == NULL) {
        rc = YAKSA_ERR__OUT_OF_MEM;
        goto fn_fail;
    }
    (*hooks)->get_num_devices = get_num_devices;
    (*hooks)->check_p2p_comm = check_p2p_comm;
    (*hooks)->finalize = finalize_hook;
    (*hooks)->get_iov_pack_threshold = yaksuri_cudai_get_iov_pack_threshold;
    (*hooks)->get_iov_unpack_threshold = yaksuri_cudai_get_iov_unpack_threshold;
    (*hooks)->ipack = yaksuri_cudai_ipack;
    (*hooks)->iunpack = yaksuri_cudai_iunpack;
    (*hooks)->pup_is_supported = yaksuri_cudai_pup_is_supported;
    (*hooks)->host_malloc = cuda_host_malloc;
    (*hooks)->host_free = cuda_host_free;
    (*hooks)->gpu_malloc = cuda_gpu_malloc;
    (*hooks)->gpu_free = cuda_gpu_free;
    (*hooks)->get_ptr_attr = yaksuri_cudai_get_ptr_attr;
    (*hooks)->event_record = yaksuri_cudai_event_record;
    (*hooks)->event_query = yaksuri_cudai_event_query;
    (*hooks)->add_dependency = yaksuri_cudai_add_dependency;
    (*hooks)->type_create = yaksuri_cudai_type_create_hook;
    (*hooks)->type_free = yaksuri_cudai_type_free_hook;
    (*hooks)->info_create = yaksuri_cudai_info_create_hook;
    (*hooks)->info_free = yaksuri_cudai_info_free_hook;
    (*hooks)->info_keyval_append = yaksuri_cudai_info_keyval_append;

  fn_exit:
    return rc;
  fn_fail:
    goto fn_exit;
}
216