// Globals shared between C and C++ translation units (unmangled linkage).
#ifdef __cplusplus
extern "C" {
#endif
int cuda_debug_sync = 0;  // when non-zero, error checks also cudaDeviceSynchronize()
int gpu_index = 0;        // index of the currently selected GPU
#ifdef __cplusplus
}
#endif // __cplusplus
9
10 #ifdef GPU
11
12 #include "dark_cuda.h"
13 #include "utils.h"
14 #include "blas.h"
15 #include "assert.h"
16 #include <stdlib.h>
17 #include <time.h>
18 #include <cuda.h>
19 #include <stdio.h>
20
21 #pragma comment(lib, "cuda.lib")
22
23
24 #ifdef CUDNN
25 #ifndef USE_CMAKE_LIBS
26 #pragma comment(lib, "cudnn.lib")
27 #endif // USE_CMAKE_LIBS
28 #endif // CUDNN
29
30 #if defined(CUDNN_HALF) && !defined(CUDNN)
31 #error "If you set CUDNN_HALF=1 then you must set CUDNN=1"
32 #endif
33
34
cuda_set_device(int n)35 void cuda_set_device(int n)
36 {
37 gpu_index = n;
38 cudaError_t status = cudaSetDevice(n);
39 if(status != cudaSuccess) CHECK_CUDA(status);
40 }
41
cuda_get_device()42 int cuda_get_device()
43 {
44 int n = 0;
45 cudaError_t status = cudaGetDevice(&n);
46 CHECK_CUDA(status);
47 return n;
48 }
49
cuda_get_context()50 void *cuda_get_context()
51 {
52 CUcontext pctx;
53 CUresult status = cuCtxGetCurrent(&pctx);
54 if(status != CUDA_SUCCESS) fprintf(stderr, " Error: cuCtxGetCurrent() is failed \n");
55 return (void *)pctx;
56 }
57
// Print a CUDA error with the given prefix and abort via error().
static void report_cuda_failure(const char *prefix, cudaError_t err)
{
    const char *s = cudaGetErrorString(err);
    char buffer[256];
    printf("\n %s: %s\n", prefix, s);
    snprintf(buffer, 256, "%s: %s", prefix, s);
#ifdef WIN32
    getchar();  // keep the console window open on Windows before exiting
#endif
    error(buffer);
}

// Abort on a CUDA error: either the status passed by the caller, or any
// earlier asynchronous error still pending in the runtime (read-and-cleared
// here via cudaGetLastError()).
void check_error(cudaError_t status)
{
    cudaError_t prev = cudaGetLastError();
    if (status != cudaSuccess) report_cuda_failure("CUDA Error", status);
    if (prev != cudaSuccess) report_cuda_failure("CUDA Error Prev", prev);
}
84
// Like check_error(), but prefixed with the caller's file/line/build-time
// context. In debug builds (or when cuda_debug_sync is set) also performs a
// full device synchronization so asynchronous kernel errors surface here.
void check_error_extended(cudaError_t status, const char *file, int line, const char *date_time)
{
    if (cudaSuccess != status) {
        printf("CUDA status Error: file: %s() : line: %d : build time: %s \n", file, line, date_time);
        check_error(status);
    }
#if defined(DEBUG) || defined(CUDA_DEBUG)
    cuda_debug_sync = 1;  // debug builds force the sync path below
#endif
    if (cuda_debug_sync) {
        status = cudaDeviceSynchronize();
        if (cudaSuccess != status)
            printf("CUDA status = cudaDeviceSynchronize() Error: file: %s() : line: %d : build time: %s \n", file, line, date_time);
    }
    check_error(status);
}
101
cuda_gridsize(size_t n)102 dim3 cuda_gridsize(size_t n){
103 size_t k = (n-1) / BLOCK + 1;
104 size_t x = k;
105 size_t y = 1;
106 if(x > 65535){
107 x = ceil(sqrt(k));
108 y = (n-1)/(x*BLOCK) + 1;
109 }
110 //dim3 d = { (unsigned int)x, (unsigned int)y, 1 };
111 dim3 d;
112 d.x = x;
113 d.y = y;
114 d.z = 1;
115 //printf("%ld %ld %ld %ld\n", n, x, y, x*y*BLOCK);
116 return d;
117 }
118
// Per-device compute streams, lazily created by get_cuda_stream().
static cudaStream_t streamsArray[16]; // cudaStreamSynchronize( get_cuda_stream() );
static int streamInit[16] = { 0 };    // 1 once streamsArray[i] has been created
121
get_cuda_stream()122 cudaStream_t get_cuda_stream() {
123 int i = cuda_get_device();
124 if (!streamInit[i]) {
125 //printf("Create CUDA-stream \n");
126 cudaError_t status = cudaStreamCreate(&streamsArray[i]);
127 //cudaError_t status = cudaStreamCreateWithFlags(&streamsArray[i], cudaStreamNonBlocking);
128 if (status != cudaSuccess) {
129 printf(" cudaStreamCreate error: %d \n", status);
130 const char *s = cudaGetErrorString(status);
131 printf("CUDA Error: %s\n", s);
132 status = cudaStreamCreateWithFlags(&streamsArray[i], cudaStreamDefault);
133 CHECK_CUDA(status);
134 }
135 streamInit[i] = 1;
136 }
137 return streamsArray[i];
138 }
139
// Per-device memcpy streams, lazily created by get_cuda_memcpy_stream().
static cudaStream_t streamsArray2[16]; // cudaStreamSynchronize( get_cuda_memcpy_stream() );
static int streamInit2[16] = { 0 };    // 1 once streamsArray2[i] has been created
142
get_cuda_memcpy_stream()143 cudaStream_t get_cuda_memcpy_stream() {
144 int i = cuda_get_device();
145 if (!streamInit2[i]) {
146 cudaError_t status = cudaStreamCreate(&streamsArray2[i]);
147 //cudaError_t status = cudaStreamCreateWithFlags(&streamsArray2[i], cudaStreamNonBlocking);
148 if (status != cudaSuccess) {
149 printf(" cudaStreamCreate-Memcpy error: %d \n", status);
150 const char *s = cudaGetErrorString(status);
151 printf("CUDA Error: %s\n", s);
152 status = cudaStreamCreateWithFlags(&streamsArray2[i], cudaStreamDefault);
153 CHECK_CUDA(status);
154 }
155 streamInit2[i] = 1;
156 }
157 return streamsArray2[i];
158 }
159
160
161 #ifdef CUDNN
cudnn_handle()162 cudnnHandle_t cudnn_handle()
163 {
164 static int init[16] = {0};
165 static cudnnHandle_t handle[16];
166 int i = cuda_get_device();
167 if(!init[i]) {
168 cudnnCreate(&handle[i]);
169 init[i] = 1;
170 cudnnStatus_t status = cudnnSetStream(handle[i], get_cuda_stream());
171 CHECK_CUDNN(status);
172 }
173 return handle[i];
174 }
175
176
// Abort on a cuDNN error: either the status passed by the caller, or an
// asynchronous runtime error queried from cuDNN (when the API supports it).
// Fix: the cudnnQueryRuntimeError() own return value was stored but never
// used, producing an unused-variable warning; it is now explicitly
// discarded — status2 carries the error of interest.
void cudnn_check_error(cudnnStatus_t status)
{
#if defined(DEBUG) || defined(CUDA_DEBUG)
    cudaDeviceSynchronize();
#endif
    if (cuda_debug_sync) {
        cudaDeviceSynchronize();  // surface async kernel errors before querying
    }
    cudnnStatus_t status2 = CUDNN_STATUS_SUCCESS;
#ifdef CUDNN_ERRQUERY_RAWCODE
    cudnnStatus_t status_tmp = cudnnQueryRuntimeError(cudnn_handle(), &status2, CUDNN_ERRQUERY_RAWCODE, NULL);
    (void)status_tmp;  // query's own status intentionally ignored
#endif
    if (status != CUDNN_STATUS_SUCCESS)
    {
        const char *s = cudnnGetErrorString(status);
        char buffer[256];
        printf("\n cuDNN Error: %s\n", s);
        snprintf(buffer, 256, "cuDNN Error: %s", s);
#ifdef WIN32
        getchar();  // keep the console window open on Windows before exiting
#endif
        error(buffer);
    }
    if (status2 != CUDNN_STATUS_SUCCESS)
    {
        const char *s = cudnnGetErrorString(status2);
        char buffer[256];
        printf("\n cuDNN Error Prev: %s\n", s);
        snprintf(buffer, 256, "cuDNN Error Prev: %s", s);
#ifdef WIN32
        getchar();
#endif
        error(buffer);
    }
}
212
// Like cudnn_check_error(), but prefixed with the caller's file/line/build
// context; optionally synchronizes the device in debug mode.
// Fix: the inner cudaError_t variable shadowed the cudnnStatus_t parameter
// and was compared against CUDNN_STATUS_SUCCESS (the wrong enum — it only
// worked because both success codes are 0). The sync result now has its own
// name and is compared against cudaSuccess.
void cudnn_check_error_extended(cudnnStatus_t status, const char *file, int line, const char *date_time)
{
    if (status != CUDNN_STATUS_SUCCESS) {
        printf("\n cuDNN status Error in: file: %s() : line: %d : build time: %s \n", file, line, date_time);
        cudnn_check_error(status);
    }
#if defined(DEBUG) || defined(CUDA_DEBUG)
    cuda_debug_sync = 1;  // debug builds force the sync path below
#endif
    if (cuda_debug_sync) {
        cudaError_t sync_status = cudaDeviceSynchronize();
        if (sync_status != cudaSuccess)
            printf("\n cudaError_t status = cudaDeviceSynchronize() Error in: file: %s() : line: %d : build time: %s \n", file, line, date_time);
    }
    cudnn_check_error(status);
}
229 #endif
230
blas_handle()231 cublasHandle_t blas_handle()
232 {
233 static int init[16] = {0};
234 static cublasHandle_t handle[16];
235 int i = cuda_get_device();
236 if(!init[i]) {
237 cublasCreate(&handle[i]);
238 cublasStatus_t status = cublasSetStream(handle[i], get_cuda_stream());
239 CHECK_CUDA((cudaError_t)status);
240 init[i] = 1;
241 }
242 return handle[i];
243 }
244
// CPU-pinned memory pool: an array of 1 GB cudaHostAlloc'd blocks carved
// into sub-allocations by cuda_make_array_pinned_preallocated().
static float **pinned_ptr = NULL;            // array of pinned block base pointers
static size_t pinned_num_of_blocks = 0;      // number of entries in pinned_ptr
static size_t pinned_index = 0;              // byte offset of next sub-allocation in current block
static size_t pinned_block_id = 0;           // index of the block currently being filled
static const size_t pinned_block_size = (size_t)1024 * 1024 * 1024 * 1; // 1 GB block size
static pthread_mutex_t mutex_pinned = PTHREAD_MUTEX_INITIALIZER;        // guards all of the above
251
252 // free CPU-pinned memory
free_pinned_memory()253 void free_pinned_memory()
254 {
255 if (pinned_ptr) {
256 int k;
257 for (k = 0; k < pinned_num_of_blocks; ++k) {
258 cuda_free_host(pinned_ptr[k]);
259 }
260 free(pinned_ptr);
261 pinned_ptr = NULL;
262 }
263 }
264
265 // custom CPU-pinned memory allocation
pre_allocate_pinned_memory(const size_t size)266 void pre_allocate_pinned_memory(const size_t size)
267 {
268 const size_t num_of_blocks = size / pinned_block_size + ((size % pinned_block_size) ? 1 : 0);
269 printf("pre_allocate... pinned_ptr = %p \n", pinned_ptr);
270
271 pthread_mutex_lock(&mutex_pinned);
272 if (!pinned_ptr) {
273 pinned_ptr = (float **)calloc(num_of_blocks, sizeof(float *));
274 if(!pinned_ptr) error("calloc failed in pre_allocate() \n");
275
276 printf("pre_allocate: size = %Iu MB, num_of_blocks = %Iu, block_size = %Iu MB \n",
277 size / (1024*1024), num_of_blocks, pinned_block_size / (1024 * 1024));
278
279 int k;
280 for (k = 0; k < num_of_blocks; ++k) {
281 cudaError_t status = cudaHostAlloc((void **)&pinned_ptr[k], pinned_block_size, cudaHostRegisterMapped);
282 if (status != cudaSuccess) fprintf(stderr, " Can't pre-allocate CUDA-pinned buffer on CPU-RAM \n");
283 CHECK_CUDA(status);
284 if (!pinned_ptr[k]) error("cudaHostAlloc failed\n");
285 else {
286 printf(" Allocated %d pinned block \n", pinned_block_size);
287 }
288 }
289 pinned_num_of_blocks = num_of_blocks;
290 }
291 pthread_mutex_unlock(&mutex_pinned);
292 }
293
294 // simple - get pre-allocated pinned memory
cuda_make_array_pinned_preallocated(float * x,size_t n)295 float *cuda_make_array_pinned_preallocated(float *x, size_t n)
296 {
297 pthread_mutex_lock(&mutex_pinned);
298 float *x_cpu = NULL;
299 const size_t memory_step = 512;// 4096;
300 const size_t size = sizeof(float)*n;
301 const size_t allocation_size = ((size / memory_step) + 1) * memory_step;
302
303 if (pinned_ptr && pinned_block_id < pinned_num_of_blocks && (allocation_size < pinned_block_size/2))
304 {
305 if ((allocation_size + pinned_index) > pinned_block_size) {
306 const float filled = (float)100 * pinned_index / pinned_block_size;
307 printf("\n Pinned block_id = %d, filled = %f %% \n", pinned_block_id, filled);
308 pinned_block_id++;
309 pinned_index = 0;
310 }
311 if ((allocation_size + pinned_index) < pinned_block_size && pinned_block_id < pinned_num_of_blocks) {
312 x_cpu = (float *)((char *)pinned_ptr[pinned_block_id] + pinned_index);
313 pinned_index += allocation_size;
314 }
315 else {
316 //printf("Pre-allocated pinned memory is over! \n");
317 }
318 }
319
320 if(!x_cpu) {
321 if (allocation_size > pinned_block_size / 2) {
322 printf("Try to allocate new pinned memory, size = %d MB \n", size / (1024 * 1024));
323 cudaError_t status = cudaHostAlloc((void **)&x_cpu, size, cudaHostRegisterMapped);
324 if (status != cudaSuccess) fprintf(stderr, " Can't allocate CUDA-pinned memory on CPU-RAM (pre-allocated memory is over too) \n");
325 CHECK_CUDA(status);
326 }
327 else {
328 printf("Try to allocate new pinned BLOCK, size = %d MB \n", size / (1024 * 1024));
329 pinned_num_of_blocks++;
330 pinned_block_id = pinned_num_of_blocks - 1;
331 pinned_index = 0;
332 pinned_ptr = (float **)realloc(pinned_ptr, pinned_num_of_blocks * sizeof(float *));
333 cudaError_t status = cudaHostAlloc((void **)&pinned_ptr[pinned_block_id], pinned_block_size, cudaHostRegisterMapped);
334 if (status != cudaSuccess) fprintf(stderr, " Can't pre-allocate CUDA-pinned buffer on CPU-RAM \n");
335 CHECK_CUDA(status);
336 x_cpu = pinned_ptr[pinned_block_id];
337 }
338 }
339
340 if (x) {
341 cudaError_t status = cudaMemcpyAsync(x_cpu, x, size, cudaMemcpyDefault, get_cuda_stream());
342 CHECK_CUDA(status);
343 }
344
345 pthread_mutex_unlock(&mutex_pinned);
346 return x_cpu;
347 }
348
cuda_make_array_pinned(float * x,size_t n)349 float *cuda_make_array_pinned(float *x, size_t n)
350 {
351 float *x_gpu;
352 size_t size = sizeof(float)*n;
353 //cudaError_t status = cudaMalloc((void **)&x_gpu, size);
354 cudaError_t status = cudaHostAlloc((void **)&x_gpu, size, cudaHostRegisterMapped);
355 if (status != cudaSuccess) fprintf(stderr, " Can't allocate CUDA-pinned memory on CPU-RAM \n");
356 CHECK_CUDA(status);
357 if (x) {
358 status = cudaMemcpyAsync(x_gpu, x, size, cudaMemcpyDefault, get_cuda_stream());
359 CHECK_CUDA(status);
360 }
361 if (!x_gpu) error("cudaHostAlloc failed\n");
362 return x_gpu;
363 }
364
cuda_make_array(float * x,size_t n)365 float *cuda_make_array(float *x, size_t n)
366 {
367 float *x_gpu;
368 size_t size = sizeof(float)*n;
369 cudaError_t status = cudaMalloc((void **)&x_gpu, size);
370 //cudaError_t status = cudaMallocManaged((void **)&x_gpu, size, cudaMemAttachGlobal);
371 //status = cudaMemAdvise(x_gpu, size, cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId);
372 if (status != cudaSuccess) fprintf(stderr, " Try to set subdivisions=64 in your cfg-file. \n");
373 CHECK_CUDA(status);
374 if(x){
375 //status = cudaMemcpy(x_gpu, x, size, cudaMemcpyHostToDevice);
376 status = cudaMemcpyAsync(x_gpu, x, size, cudaMemcpyDefault, get_cuda_stream());
377 CHECK_CUDA(status);
378 }
379 if(!x_gpu) error("Cuda malloc failed\n");
380 return x_gpu;
381 }
382
cuda_make_array_pointers(void ** x,size_t n)383 void **cuda_make_array_pointers(void **x, size_t n)
384 {
385 void **x_gpu;
386 size_t size = sizeof(void*) * n;
387 cudaError_t status = cudaMalloc((void **)&x_gpu, size);
388 if (status != cudaSuccess) fprintf(stderr, " Try to set subdivisions=64 in your cfg-file. \n");
389 CHECK_CUDA(status);
390 if (x) {
391 status = cudaMemcpyAsync(x_gpu, x, size, cudaMemcpyDefault, get_cuda_stream());
392 CHECK_CUDA(status);
393 }
394 if (!x_gpu) error("Cuda malloc failed\n");
395 return x_gpu;
396 }
397
cuda_random(float * x_gpu,size_t n)398 void cuda_random(float *x_gpu, size_t n)
399 {
400 static curandGenerator_t gen[16];
401 static int init[16] = {0};
402 int i = cuda_get_device();
403 if(!init[i]){
404 curandCreateGenerator(&gen[i], CURAND_RNG_PSEUDO_DEFAULT);
405 curandSetPseudoRandomGeneratorSeed(gen[i], time(0));
406 init[i] = 1;
407 }
408 curandGenerateUniform(gen[i], x_gpu, n);
409 CHECK_CUDA(cudaPeekAtLastError());
410 }
411
// Debug helper: pull x_gpu to the host, compare against the reference x,
// print the RMS error tagged with label s, and return the squared L2 error.
float cuda_compare(float *x_gpu, float *x, size_t n, char *s)
{
    float *host_copy = (float *)xcalloc(n, sizeof(float));
    cuda_pull_array(x_gpu, host_copy, n);
    axpy_cpu(n, -1, x, 1, host_copy, 1);               // host_copy -= x
    const float err = dot_cpu(n, host_copy, 1, host_copy, 1);  // sum of squared diffs
    printf("Error %s: %f\n", s, sqrt(err/n));
    free(host_copy);
    return err;
}
424
cuda_make_int_array(size_t n)425 int *cuda_make_int_array(size_t n)
426 {
427 int *x_gpu;
428 size_t size = sizeof(int)*n;
429 cudaError_t status = cudaMalloc((void **)&x_gpu, size);
430 if(status != cudaSuccess) fprintf(stderr, " Try to set subdivisions=64 in your cfg-file. \n");
431 CHECK_CUDA(status);
432 return x_gpu;
433 }
434
// Allocate a device buffer of n ints; if x is non-NULL, asynchronously
// copy the host data in on the compute stream.
int *cuda_make_int_array_new_api(int *x, size_t n)
{
    int *x_gpu = NULL;
    const size_t nbytes = sizeof(int) * n;
    cudaError_t rc = cudaMalloc((void **)&x_gpu, nbytes);
    CHECK_CUDA(rc);
    if (x) {
        cudaError_t copy_rc = cudaMemcpyAsync(x_gpu, x, nbytes, cudaMemcpyHostToDevice, get_cuda_stream());
        CHECK_CUDA(copy_rc);
    }
    if (!x_gpu) error("Cuda malloc failed\n");
    return x_gpu;
}
449
// Release a device buffer allocated by cuda_make_array()/cudaMalloc.
void cuda_free(float *x_gpu)
{
    CHECK_CUDA(cudaFree(x_gpu));
}
456
// Release a pinned host buffer allocated by cudaHostAlloc.
void cuda_free_host(float *x_cpu)
{
    CHECK_CUDA(cudaFreeHost(x_cpu));
}
463
// Asynchronously copy n floats host -> device on the compute stream.
// Caller must synchronize the stream before reusing/freeing x.
void cuda_push_array(float *x_gpu, float *x, size_t n)
{
    const size_t nbytes = sizeof(float) * n;
    cudaError_t rc = cudaMemcpyAsync(x_gpu, x, nbytes, cudaMemcpyHostToDevice, get_cuda_stream());
    CHECK_CUDA(rc);
}
471
// Copy n floats device -> host; blocks until the copy (and all prior work
// on the compute stream) has finished, so x is valid on return.
void cuda_pull_array(float *x_gpu, float *x, size_t n)
{
    const size_t nbytes = sizeof(float) * n;
    cudaError_t rc = cudaMemcpyAsync(x, x_gpu, nbytes, cudaMemcpyDeviceToHost, get_cuda_stream());
    CHECK_CUDA(rc);
    cudaStreamSynchronize(get_cuda_stream());
}
480
// Asynchronously copy n floats device -> host on the compute stream.
// Unlike cuda_pull_array(), does NOT synchronize: the caller must sync the
// stream before reading x.
// Fix: error check made consistent with the rest of the file — bare
// check_error() dropped the file/line/build context that CHECK_CUDA adds.
void cuda_pull_array_async(float *x_gpu, float *x, size_t n)
{
    size_t size = sizeof(float)*n;
    cudaError_t status = cudaMemcpyAsync(x, x_gpu, size, cudaMemcpyDefault, get_cuda_stream());
    CHECK_CUDA(status);
}
488
// Ceiling division: number of blocks of block_size needed to cover
// array_size elements (0 when array_size is 0).
int get_number_of_blocks(int array_size, int block_size)
{
    int blocks = array_size / block_size;
    if (array_size % block_size > 0) blocks += 1;  // partial tail block
    return blocks;
}
493
get_gpu_compute_capability(int i,char * device_name)494 int get_gpu_compute_capability(int i, char *device_name)
495 {
496 typedef struct cudaDeviceProp cudaDeviceProp;
497 cudaDeviceProp prop;
498 cudaError_t status = cudaGetDeviceProperties(&prop, i);
499 CHECK_CUDA(status);
500 if (device_name) strcpy(device_name, prop.name);
501 int cc = prop.major * 100 + prop.minor * 10; // __CUDA_ARCH__ format
502 return cc;
503 }
504
// Print CUDA runtime/driver versions, cuDNN build info, and GPU count to
// stderr; warns when the runtime is newer than the installed driver.
void show_cuda_cudnn_info()
{
    int cuda_version = 0;
    int cuda_driver_version = 0;
    int device_count = 0;
    CHECK_CUDA(cudaRuntimeGetVersion(&cuda_version));
    CHECK_CUDA(cudaDriverGetVersion(&cuda_driver_version));
    fprintf(stderr, " CUDA-version: %d (%d)", cuda_version, cuda_driver_version);
    if (cuda_version > cuda_driver_version)
        fprintf(stderr, "\n Warning: CUDA-version is higher than Driver-version! \n");
#ifdef CUDNN
    fprintf(stderr, ", cuDNN: %d.%d.%d", CUDNN_MAJOR, CUDNN_MINOR, CUDNN_PATCHLEVEL);
#endif // CUDNN
#ifdef CUDNN_HALF
    fprintf(stderr, ", CUDNN_HALF=1");
#endif // CUDNN_HALF
    CHECK_CUDA(cudaGetDeviceCount(&device_count));
    fprintf(stderr, ", GPU count: %d ", device_count);
    fprintf(stderr, " \n");
}
522
523 #else // GPU
524 #include "darknet.h"
void cuda_set_device(int n) {}  // no-op stub for CPU-only builds (GPU undefined)
526 #endif // GPU
527