#ifdef __cplusplus
extern "C" {
#endif
// When non-zero, error-check helpers also call cudaDeviceSynchronize() so
// asynchronous failures surface at the call site (see check_error_extended).
int cuda_debug_sync = 0;
// Index of the GPU selected by cuda_set_device(); read throughout the project.
int gpu_index = 0;
#ifdef __cplusplus
}
#endif // __cplusplus
9 
10 #ifdef GPU
11 
12 #include "dark_cuda.h"
13 #include "utils.h"
14 #include "blas.h"
15 #include "assert.h"
16 #include <stdlib.h>
17 #include <time.h>
18 #include <cuda.h>
19 #include <stdio.h>
20 
21 #pragma comment(lib, "cuda.lib")
22 
23 
24 #ifdef CUDNN
25 #ifndef USE_CMAKE_LIBS
26 #pragma comment(lib, "cudnn.lib")
27 #endif  // USE_CMAKE_LIBS
28 #endif  // CUDNN
29 
30 #if defined(CUDNN_HALF) && !defined(CUDNN)
31 #error "If you set CUDNN_HALF=1 then you must set CUDNN=1"
32 #endif
33 
34 
cuda_set_device(int n)35 void cuda_set_device(int n)
36 {
37     gpu_index = n;
38     cudaError_t status = cudaSetDevice(n);
39     if(status != cudaSuccess) CHECK_CUDA(status);
40 }
41 
cuda_get_device()42 int cuda_get_device()
43 {
44     int n = 0;
45     cudaError_t status = cudaGetDevice(&n);
46     CHECK_CUDA(status);
47     return n;
48 }
49 
cuda_get_context()50 void *cuda_get_context()
51 {
52     CUcontext pctx;
53     CUresult status = cuCtxGetCurrent(&pctx);
54     if(status != CUDA_SUCCESS) fprintf(stderr, " Error: cuCtxGetCurrent() is failed \n");
55     return (void *)pctx;
56 }
57 
// Abort via error() with a readable message if `status` is not cudaSuccess,
// or if an earlier asynchronous CUDA call left a pending error.
// cudaGetLastError() is read FIRST so the sticky per-thread error state is
// snapshotted (and cleared) before any reporting happens.
void check_error(cudaError_t status)
{
    cudaError_t status2 = cudaGetLastError();
    if (status != cudaSuccess)
    {
        const char *s = cudaGetErrorString(status);
        char buffer[256];
        printf("\n CUDA Error: %s\n", s);
        snprintf(buffer, 256, "CUDA Error: %s", s);
#ifdef WIN32
        getchar();   // keep the console open so the message can be read before exit
#endif
        error(buffer);
    }
    if (status2 != cudaSuccess)
    {
        // A previous asynchronous call failed; report that too.
        const char *s = cudaGetErrorString(status2);
        char buffer[256];
        printf("\n CUDA Error Prev: %s\n", s);
        snprintf(buffer, 256, "CUDA Error Prev: %s", s);
#ifdef WIN32
        getchar();
#endif
        error(buffer);
    }
}
84 
// Like check_error(), but prints the call site (file/line/build time) first.
// In DEBUG/CUDA_DEBUG builds — or whenever cuda_debug_sync is set at runtime —
// the device is synchronized so asynchronous kernel errors are attributed to
// this call instead of surfacing at some later, unrelated one.
void check_error_extended(cudaError_t status, const char *file, int line, const char *date_time)
{
    if (status != cudaSuccess) {
        printf("CUDA status Error: file: %s() : line: %d : build time: %s \n", file, line, date_time);
        check_error(status);
    }
#if defined(DEBUG) || defined(CUDA_DEBUG)
    cuda_debug_sync = 1;
#endif
    if (cuda_debug_sync) {
        // `status` is deliberately reused: the final check_error() below then
        // reports the synchronize result instead of the original status.
        status = cudaDeviceSynchronize();
        if (status != cudaSuccess)
            printf("CUDA status = cudaDeviceSynchronize() Error: file: %s() : line: %d : build time: %s \n", file, line, date_time);
    }
    check_error(status);
}
101 
cuda_gridsize(size_t n)102 dim3 cuda_gridsize(size_t n){
103     size_t k = (n-1) / BLOCK + 1;
104     size_t x = k;
105     size_t y = 1;
106     if(x > 65535){
107         x = ceil(sqrt(k));
108         y = (n-1)/(x*BLOCK) + 1;
109     }
110     //dim3 d = { (unsigned int)x, (unsigned int)y, 1 };
111     dim3 d;
112     d.x = x;
113     d.y = y;
114     d.z = 1;
115     //printf("%ld %ld %ld %ld\n", n, x, y, x*y*BLOCK);
116     return d;
117 }
118 
// One lazily-created stream per device (up to 16 GPUs); see get_cuda_stream().
static cudaStream_t streamsArray[16];    // cudaStreamSynchronize( get_cuda_stream() );
static int streamInit[16] = { 0 };
121 
// Return the per-device CUDA stream used for the project's async work,
// creating it on first use. Indexed by the current device id (max 16 GPUs).
// If plain cudaStreamCreate fails, one retry is made with explicit default
// flags before giving up via CHECK_CUDA.
// NOTE(review): the lazy init is not locked — assumes the first call per
// device is not raced by another thread; confirm against the threading model.
cudaStream_t get_cuda_stream() {
    int i = cuda_get_device();
    if (!streamInit[i]) {
        //printf("Create CUDA-stream \n");
        cudaError_t status = cudaStreamCreate(&streamsArray[i]);
        //cudaError_t status = cudaStreamCreateWithFlags(&streamsArray[i], cudaStreamNonBlocking);
        if (status != cudaSuccess) {
            printf(" cudaStreamCreate error: %d \n", status);
            const char *s = cudaGetErrorString(status);
            printf("CUDA Error: %s\n", s);
            // Retry once with explicit default flags before aborting.
            status = cudaStreamCreateWithFlags(&streamsArray[i], cudaStreamDefault);
            CHECK_CUDA(status);
        }
        streamInit[i] = 1;
    }
    return streamsArray[i];
}
139 
// Second per-device stream set, dedicated to memcpy traffic; see get_cuda_memcpy_stream().
static cudaStream_t streamsArray2[16];    // cudaStreamSynchronize( get_cuda_memcpy_stream() );
static int streamInit2[16] = { 0 };
142 
get_cuda_memcpy_stream()143 cudaStream_t get_cuda_memcpy_stream() {
144     int i = cuda_get_device();
145     if (!streamInit2[i]) {
146         cudaError_t status = cudaStreamCreate(&streamsArray2[i]);
147         //cudaError_t status = cudaStreamCreateWithFlags(&streamsArray2[i], cudaStreamNonBlocking);
148         if (status != cudaSuccess) {
149             printf(" cudaStreamCreate-Memcpy error: %d \n", status);
150             const char *s = cudaGetErrorString(status);
151             printf("CUDA Error: %s\n", s);
152             status = cudaStreamCreateWithFlags(&streamsArray2[i], cudaStreamDefault);
153             CHECK_CUDA(status);
154         }
155         streamInit2[i] = 1;
156     }
157     return streamsArray2[i];
158 }
159 
160 
161 #ifdef CUDNN
cudnn_handle()162 cudnnHandle_t cudnn_handle()
163 {
164     static int init[16] = {0};
165     static cudnnHandle_t handle[16];
166     int i = cuda_get_device();
167     if(!init[i]) {
168         cudnnCreate(&handle[i]);
169         init[i] = 1;
170         cudnnStatus_t status = cudnnSetStream(handle[i], get_cuda_stream());
171         CHECK_CUDNN(status);
172     }
173     return handle[i];
174 }
175 
176 
// Abort via error() with a readable message if a cuDNN call failed.
// In debug/sync mode the device is synchronized first so asynchronous
// failures are attributed here. When the toolkit provides it, the cuDNN
// runtime-error query is also polled for errors raised by earlier async work.
void cudnn_check_error(cudnnStatus_t status)
{
#if defined(DEBUG) || defined(CUDA_DEBUG)
    cudaDeviceSynchronize();
#endif
    if (cuda_debug_sync) {
        cudaDeviceSynchronize();
    }
    cudnnStatus_t status2 = CUDNN_STATUS_SUCCESS;
#ifdef CUDNN_ERRQUERY_RAWCODE
    // status_tmp (the query call's own result) is intentionally ignored;
    // only the reported runtime error in status2 is acted upon.
    cudnnStatus_t status_tmp = cudnnQueryRuntimeError(cudnn_handle(), &status2, CUDNN_ERRQUERY_RAWCODE, NULL);
#endif
    if (status != CUDNN_STATUS_SUCCESS)
    {
        const char *s = cudnnGetErrorString(status);
        char buffer[256];
        printf("\n cuDNN Error: %s\n", s);
        snprintf(buffer, 256, "cuDNN Error: %s", s);
#ifdef WIN32
        getchar();   // keep the console open so the message can be read before exit
#endif
        error(buffer);
    }
    if (status2 != CUDNN_STATUS_SUCCESS)
    {
        // An earlier asynchronous cuDNN kernel failed; report that too.
        const char *s = cudnnGetErrorString(status2);
        char buffer[256];
        printf("\n cuDNN Error Prev: %s\n", s);
        snprintf(buffer, 256, "cuDNN Error Prev: %s", s);
#ifdef WIN32
        getchar();
#endif
        error(buffer);
    }
}
212 
// Like cudnn_check_error(), but prints the call site (file/line/build time)
// first, and in debug/sync mode synchronizes the device to catch async errors.
void cudnn_check_error_extended(cudnnStatus_t status, const char *file, int line, const char *date_time)
{
    if (status != CUDNN_STATUS_SUCCESS) {
        printf("\n cuDNN status Error in: file: %s() : line: %d : build time: %s \n", file, line, date_time);
        cudnn_check_error(status);
    }
#if defined(DEBUG) || defined(CUDA_DEBUG)
    cuda_debug_sync = 1;
#endif
    if (cuda_debug_sync) {
        // Fix: use a distinct name and the correct success constant. The
        // original shadowed `status` and compared a cudaError_t against
        // CUDNN_STATUS_SUCCESS — correct only because both enums are 0.
        cudaError_t cuda_status = cudaDeviceSynchronize();
        if (cuda_status != cudaSuccess)
            printf("\n cudaError_t status = cudaDeviceSynchronize() Error in: file: %s() : line: %d : build time: %s \n", file, line, date_time);
    }
    cudnn_check_error(status);
}
229 #endif
230 
blas_handle()231 cublasHandle_t blas_handle()
232 {
233     static int init[16] = {0};
234     static cublasHandle_t handle[16];
235     int i = cuda_get_device();
236     if(!init[i]) {
237         cublasCreate(&handle[i]);
238         cublasStatus_t status = cublasSetStream(handle[i], get_cuda_stream());
239         CHECK_CUDA((cudaError_t)status);
240         init[i] = 1;
241     }
242     return handle[i];
243 }
244 
// State of the custom pinned-memory pool (guarded by mutex_pinned):
static float **pinned_ptr = NULL;          // array of pointers to pinned blocks
static size_t pinned_num_of_blocks = 0;    // number of allocated blocks in pinned_ptr
static size_t pinned_index = 0;            // byte offset of the next free slot in the current block
static size_t pinned_block_id = 0;         // index of the block currently being carved up
static const size_t pinned_block_size = (size_t)1024 * 1024 * 1024 * 1;   // 1 GB block size
static pthread_mutex_t mutex_pinned = PTHREAD_MUTEX_INITIALIZER;
251 
252 // free CPU-pinned memory
free_pinned_memory()253 void free_pinned_memory()
254 {
255     if (pinned_ptr) {
256         int k;
257         for (k = 0; k < pinned_num_of_blocks; ++k) {
258             cuda_free_host(pinned_ptr[k]);
259         }
260         free(pinned_ptr);
261         pinned_ptr = NULL;
262     }
263 }
264 
265 // custom CPU-pinned memory allocation
pre_allocate_pinned_memory(const size_t size)266 void pre_allocate_pinned_memory(const size_t size)
267 {
268     const size_t num_of_blocks = size / pinned_block_size + ((size % pinned_block_size) ? 1 : 0);
269     printf("pre_allocate... pinned_ptr = %p \n", pinned_ptr);
270 
271     pthread_mutex_lock(&mutex_pinned);
272     if (!pinned_ptr) {
273         pinned_ptr = (float **)calloc(num_of_blocks, sizeof(float *));
274         if(!pinned_ptr) error("calloc failed in pre_allocate() \n");
275 
276         printf("pre_allocate: size = %Iu MB, num_of_blocks = %Iu, block_size = %Iu MB \n",
277             size / (1024*1024), num_of_blocks, pinned_block_size / (1024 * 1024));
278 
279         int k;
280         for (k = 0; k < num_of_blocks; ++k) {
281             cudaError_t status = cudaHostAlloc((void **)&pinned_ptr[k], pinned_block_size, cudaHostRegisterMapped);
282             if (status != cudaSuccess) fprintf(stderr, " Can't pre-allocate CUDA-pinned buffer on CPU-RAM \n");
283             CHECK_CUDA(status);
284             if (!pinned_ptr[k]) error("cudaHostAlloc failed\n");
285             else {
286                 printf(" Allocated %d pinned block \n", pinned_block_size);
287             }
288         }
289         pinned_num_of_blocks = num_of_blocks;
290     }
291     pthread_mutex_unlock(&mutex_pinned);
292 }
293 
294 // simple - get pre-allocated pinned memory
cuda_make_array_pinned_preallocated(float * x,size_t n)295 float *cuda_make_array_pinned_preallocated(float *x, size_t n)
296 {
297     pthread_mutex_lock(&mutex_pinned);
298     float *x_cpu = NULL;
299     const size_t memory_step = 512;// 4096;
300     const size_t size = sizeof(float)*n;
301     const size_t allocation_size = ((size / memory_step) + 1) * memory_step;
302 
303     if (pinned_ptr && pinned_block_id < pinned_num_of_blocks && (allocation_size < pinned_block_size/2))
304     {
305         if ((allocation_size + pinned_index) > pinned_block_size) {
306             const float filled = (float)100 * pinned_index / pinned_block_size;
307             printf("\n Pinned block_id = %d, filled = %f %% \n", pinned_block_id, filled);
308             pinned_block_id++;
309             pinned_index = 0;
310         }
311         if ((allocation_size + pinned_index) < pinned_block_size && pinned_block_id < pinned_num_of_blocks) {
312             x_cpu = (float *)((char *)pinned_ptr[pinned_block_id] + pinned_index);
313             pinned_index += allocation_size;
314         }
315         else {
316             //printf("Pre-allocated pinned memory is over! \n");
317         }
318     }
319 
320     if(!x_cpu) {
321         if (allocation_size > pinned_block_size / 2) {
322             printf("Try to allocate new pinned memory, size = %d MB \n", size / (1024 * 1024));
323             cudaError_t status = cudaHostAlloc((void **)&x_cpu, size, cudaHostRegisterMapped);
324             if (status != cudaSuccess) fprintf(stderr, " Can't allocate CUDA-pinned memory on CPU-RAM (pre-allocated memory is over too) \n");
325             CHECK_CUDA(status);
326         }
327         else {
328             printf("Try to allocate new pinned BLOCK, size = %d MB \n", size / (1024 * 1024));
329             pinned_num_of_blocks++;
330             pinned_block_id = pinned_num_of_blocks - 1;
331             pinned_index = 0;
332             pinned_ptr = (float **)realloc(pinned_ptr, pinned_num_of_blocks * sizeof(float *));
333             cudaError_t status = cudaHostAlloc((void **)&pinned_ptr[pinned_block_id], pinned_block_size, cudaHostRegisterMapped);
334             if (status != cudaSuccess) fprintf(stderr, " Can't pre-allocate CUDA-pinned buffer on CPU-RAM \n");
335             CHECK_CUDA(status);
336             x_cpu = pinned_ptr[pinned_block_id];
337         }
338     }
339 
340     if (x) {
341         cudaError_t status = cudaMemcpyAsync(x_cpu, x, size, cudaMemcpyDefault, get_cuda_stream());
342         CHECK_CUDA(status);
343     }
344 
345     pthread_mutex_unlock(&mutex_pinned);
346     return x_cpu;
347 }
348 
cuda_make_array_pinned(float * x,size_t n)349 float *cuda_make_array_pinned(float *x, size_t n)
350 {
351     float *x_gpu;
352     size_t size = sizeof(float)*n;
353     //cudaError_t status = cudaMalloc((void **)&x_gpu, size);
354     cudaError_t status = cudaHostAlloc((void **)&x_gpu, size, cudaHostRegisterMapped);
355     if (status != cudaSuccess) fprintf(stderr, " Can't allocate CUDA-pinned memory on CPU-RAM \n");
356     CHECK_CUDA(status);
357     if (x) {
358         status = cudaMemcpyAsync(x_gpu, x, size, cudaMemcpyDefault, get_cuda_stream());
359         CHECK_CUDA(status);
360     }
361     if (!x_gpu) error("cudaHostAlloc failed\n");
362     return x_gpu;
363 }
364 
cuda_make_array(float * x,size_t n)365 float *cuda_make_array(float *x, size_t n)
366 {
367     float *x_gpu;
368     size_t size = sizeof(float)*n;
369     cudaError_t status = cudaMalloc((void **)&x_gpu, size);
370     //cudaError_t status = cudaMallocManaged((void **)&x_gpu, size, cudaMemAttachGlobal);
371     //status = cudaMemAdvise(x_gpu, size, cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId);
372     if (status != cudaSuccess) fprintf(stderr, " Try to set subdivisions=64 in your cfg-file. \n");
373     CHECK_CUDA(status);
374     if(x){
375         //status = cudaMemcpy(x_gpu, x, size, cudaMemcpyHostToDevice);
376         status = cudaMemcpyAsync(x_gpu, x, size, cudaMemcpyDefault, get_cuda_stream());
377         CHECK_CUDA(status);
378     }
379     if(!x_gpu) error("Cuda malloc failed\n");
380     return x_gpu;
381 }
382 
cuda_make_array_pointers(void ** x,size_t n)383 void **cuda_make_array_pointers(void **x, size_t n)
384 {
385     void **x_gpu;
386     size_t size = sizeof(void*) * n;
387     cudaError_t status = cudaMalloc((void **)&x_gpu, size);
388     if (status != cudaSuccess) fprintf(stderr, " Try to set subdivisions=64 in your cfg-file. \n");
389     CHECK_CUDA(status);
390     if (x) {
391         status = cudaMemcpyAsync(x_gpu, x, size, cudaMemcpyDefault, get_cuda_stream());
392         CHECK_CUDA(status);
393     }
394     if (!x_gpu) error("Cuda malloc failed\n");
395     return x_gpu;
396 }
397 
// Fill x_gpu with n uniformly-distributed random floats using a lazily
// created per-device cuRAND generator seeded from the wall clock.
// NOTE(review): the curand* return codes are not checked — only the CUDA
// error state is inspected afterwards; confirm whether that is sufficient.
void cuda_random(float *x_gpu, size_t n)
{
    static curandGenerator_t gen[16];
    static int init[16] = {0};
    int i = cuda_get_device();
    if(!init[i]){
        curandCreateGenerator(&gen[i], CURAND_RNG_PSEUDO_DEFAULT);
        curandSetPseudoRandomGeneratorSeed(gen[i], time(0));
        init[i] = 1;
    }
    curandGenerateUniform(gen[i], x_gpu, n);
    CHECK_CUDA(cudaPeekAtLastError());
}
411 
// Pull x_gpu back to the host, print the RMS difference against x (labelled
// with `s`), and return the squared-error sum.
float cuda_compare(float *x_gpu, float *x, size_t n, char *s)
{
    float *host_copy = (float *)xcalloc(n, sizeof(float));
    cuda_pull_array(x_gpu, host_copy, n);
    axpy_cpu(n, -1, x, 1, host_copy, 1);            // host_copy -= x
    const float err = dot_cpu(n, host_copy, 1, host_copy, 1);
    printf("Error %s: %f\n", s, sqrt(err/n));
    free(host_copy);
    return err;
}
424 
cuda_make_int_array(size_t n)425 int *cuda_make_int_array(size_t n)
426 {
427     int *x_gpu;
428     size_t size = sizeof(int)*n;
429     cudaError_t status = cudaMalloc((void **)&x_gpu, size);
430     if(status != cudaSuccess) fprintf(stderr, " Try to set subdivisions=64 in your cfg-file. \n");
431     CHECK_CUDA(status);
432     return x_gpu;
433 }
434 
// Allocate n ints on the device; if x is non-NULL, upload its contents
// asynchronously on the shared stream.
int *cuda_make_int_array_new_api(int *x, size_t n)
{
    const size_t bytes = sizeof(int) * n;
    int *device_ptr;
    cudaError_t status = cudaMalloc((void **)&device_ptr, bytes);
    CHECK_CUDA(status);
    if (x) {
        status = cudaMemcpyAsync(device_ptr, x, bytes, cudaMemcpyHostToDevice, get_cuda_stream());
        CHECK_CUDA(status);
    }
    if (!device_ptr) error("Cuda malloc failed\n");
    return device_ptr;
}
449 
cuda_free(float * x_gpu)450 void cuda_free(float *x_gpu)
451 {
452     //cudaStreamSynchronize(get_cuda_stream());
453     cudaError_t status = cudaFree(x_gpu);
454     CHECK_CUDA(status);
455 }
456 
cuda_free_host(float * x_cpu)457 void cuda_free_host(float *x_cpu)
458 {
459     //cudaStreamSynchronize(get_cuda_stream());
460     cudaError_t status = cudaFreeHost(x_cpu);
461     CHECK_CUDA(status);
462 }
463 
// Asynchronously upload n floats host->device on the shared stream.
// The caller must keep x valid until the stream has drained.
void cuda_push_array(float *x_gpu, float *x, size_t n)
{
    const size_t bytes = sizeof(float) * n;
    cudaError_t status = cudaMemcpyAsync(x_gpu, x, bytes, cudaMemcpyHostToDevice, get_cuda_stream());
    CHECK_CUDA(status);
}
471 
// Copy n floats device->host and block until the copy has completed, so x is
// safe to read on return.
void cuda_pull_array(float *x_gpu, float *x, size_t n)
{
    const size_t bytes = sizeof(float) * n;
    cudaError_t status = cudaMemcpyAsync(x, x_gpu, bytes, cudaMemcpyDeviceToHost, get_cuda_stream());
    CHECK_CUDA(status);
    cudaStreamSynchronize(get_cuda_stream());
}
480 
// Copy n floats device->host on the shared stream WITHOUT waiting for
// completion; the caller must synchronize the stream before reading x.
void cuda_pull_array_async(float *x_gpu, float *x, size_t n)
{
    size_t size = sizeof(float)*n;
    cudaError_t status = cudaMemcpyAsync(x, x_gpu, size, cudaMemcpyDefault, get_cuda_stream());
    // Was a bare check_error(); CHECK_CUDA matches every other call in this
    // file and also reports the file/line of the failure.
    CHECK_CUDA(status);
    //cudaStreamSynchronize(get_cuda_stream());
}
488 
// Ceiling division: number of blocks of `block_size` elements needed to
// cover `array_size` elements (0 for an empty array).
int get_number_of_blocks(int array_size, int block_size)
{
    int blocks = array_size / block_size;
    if (array_size % block_size != 0) {
        ++blocks;
    }
    return blocks;
}
493 
// Return device i's compute capability encoded like __CUDA_ARCH__
// (e.g. 7.5 -> 750). If device_name is non-NULL the device's name string is
// copied into it. NOTE(review): strcpy assumes device_name is large enough
// for prop.name — confirm callers' buffer sizes.
int get_gpu_compute_capability(int i, char *device_name)
{
    typedef struct cudaDeviceProp cudaDeviceProp;   // lets C code say 'cudaDeviceProp' without 'struct'
    cudaDeviceProp prop;
    cudaError_t status = cudaGetDeviceProperties(&prop, i);
    CHECK_CUDA(status);
    if (device_name) strcpy(device_name, prop.name);
    int cc = prop.major * 100 + prop.minor * 10;    // __CUDA_ARCH__ format
    return cc;
}
504 
// Print CUDA runtime/driver versions, the compiled-in cuDNN version (when
// built with CUDNN), and the number of visible GPUs to stderr.
void show_cuda_cudnn_info()
{
    int cuda_version = 0, cuda_driver_version = 0, device_count = 0;
    CHECK_CUDA(cudaRuntimeGetVersion(&cuda_version));
    CHECK_CUDA(cudaDriverGetVersion(&cuda_driver_version));
    fprintf(stderr, " CUDA-version: %d (%d)", cuda_version, cuda_driver_version);
    // A runtime newer than the installed driver typically means CUDA calls will fail.
    if(cuda_version > cuda_driver_version) fprintf(stderr, "\n Warning: CUDA-version is higher than Driver-version! \n");
#ifdef CUDNN
    fprintf(stderr, ", cuDNN: %d.%d.%d", CUDNN_MAJOR, CUDNN_MINOR, CUDNN_PATCHLEVEL);
#endif  // CUDNN
#ifdef CUDNN_HALF
    fprintf(stderr, ", CUDNN_HALF=1");
#endif  // CUDNN_HALF
    CHECK_CUDA(cudaGetDeviceCount(&device_count));
    fprintf(stderr, ", GPU count: %d ", device_count);
    fprintf(stderr, " \n");
}
522 
523 #else // GPU
524 #include "darknet.h"
// CPU-only build: selecting a device is a no-op.
void cuda_set_device(int n) { (void)n; }
526 #endif // GPU
527