#include "HalideRuntimeCuda.h"
#include "device_buffer_utils.h"
#include "device_interface.h"
#include "mini_cuda.h"
#include "printer.h"
#include "scoped_mutex_lock.h"
#include "scoped_spin_lock.h"

namespace Halide {
namespace Runtime {
namespace Internal {
namespace Cuda {

// Define the function pointers for the CUDA API.
#define CUDA_FN(ret, fn, args) WEAK ret(CUDAAPI *fn) args;
#define CUDA_FN_OPTIONAL(ret, fn, args) WEAK ret(CUDAAPI *fn) args;
#define CUDA_FN_3020(ret, fn, fn_3020, args) WEAK ret(CUDAAPI *fn) args;
#define CUDA_FN_4000(ret, fn, fn_4000, args) WEAK ret(CUDAAPI *fn) args;
#include "cuda_functions.h"
#undef CUDA_FN
#undef CUDA_FN_OPTIONAL
#undef CUDA_FN_3020
#undef CUDA_FN_4000

// The default implementation of halide_cuda_get_symbol attempts to load
// the CUDA shared library/DLL, and then get the symbol from it.
WEAK void *lib_cuda = NULL;
volatile ScopedSpinLock::AtomicFlag WEAK lib_cuda_lock = 0;

extern "C" WEAK void *halide_cuda_get_symbol(void *user_context, const char *name) {
    // Only try to load the library if we can't already get the
    // symbol. Even if lib_cuda is NULL, the symbol may already be
    // available in the process.
    void *symbol = halide_get_library_symbol(lib_cuda, name);
    if (symbol) {
        return symbol;
    }

    const char *lib_names[] = {
#ifdef WINDOWS
        "nvcuda.dll",
#else
        "libcuda.so",
        "libcuda.dylib",
        "/Library/Frameworks/CUDA.framework/CUDA",
#endif
    };
    for (size_t i = 0; i < sizeof(lib_names) / sizeof(lib_names[0]); i++) {
        lib_cuda = halide_load_library(lib_names[i]);
        if (lib_cuda) {
            debug(user_context) << "    Loaded CUDA runtime library: " << lib_names[i] << "\n";
            break;
        }
    }

    return halide_get_library_symbol(lib_cuda, name);
}

template<typename T>
ALWAYS_INLINE T get_cuda_symbol(void *user_context, const char *name, bool optional = false) {
    T s = (T)halide_cuda_get_symbol(user_context, name);
    if (!optional && !s) {
        error(user_context) << "CUDA API not found: " << name << "\n";
    }
    return s;
}

// Load a CUDA shared object/dll and get the CUDA API function pointers from it.
WEAK void load_libcuda(void *user_context) {
    debug(user_context) << "    load_libcuda (user_context: " << user_context << ")\n";
    halide_assert(user_context, cuInit == NULL);

#define CUDA_FN(ret, fn, args) fn = get_cuda_symbol<ret(CUDAAPI *) args>(user_context, #fn);
#define CUDA_FN_OPTIONAL(ret, fn, args) fn = get_cuda_symbol<ret(CUDAAPI *) args>(user_context, #fn, true);
#define CUDA_FN_3020(ret, fn, fn_3020, args) fn = get_cuda_symbol<ret(CUDAAPI *) args>(user_context, #fn_3020);
#define CUDA_FN_4000(ret, fn, fn_4000, args) fn = get_cuda_symbol<ret(CUDAAPI *) args>(user_context, #fn_4000);
#include "cuda_functions.h"
#undef CUDA_FN
#undef CUDA_FN_OPTIONAL
#undef CUDA_FN_3020
#undef CUDA_FN_4000
}

// Call load_libcuda() if the CUDA library has not been loaded.
// This function is thread safe.
// Note that initialization might fail. The caller can detect such failure
// by checking whether cuInit is still NULL afterwards.
WEAK void ensure_libcuda_init(void *user_context) {
    ScopedSpinLock spinlock(&lib_cuda_lock);
    if (!cuInit) {
        load_libcuda(user_context);
    }
}

extern WEAK halide_device_interface_t cuda_device_interface;

WEAK const char *get_error_name(CUresult error);
WEAK CUresult create_cuda_context(void *user_context, CUcontext *ctx);

// A cuda context defined in this module with weak linkage.
CUcontext WEAK context = 0;
// This lock protects the above context variable.
WEAK halide_mutex context_lock;

// A free list, used when allocations are being cached.
WEAK struct FreeListItem {
    CUdeviceptr ptr;
    CUcontext ctx;
    CUstream stream;
    size_t size;
    FreeListItem *next;
} *free_list = 0;
WEAK halide_mutex free_list_lock;

}  // namespace Cuda
}  // namespace Internal
}  // namespace Runtime
}  // namespace Halide

using namespace Halide::Runtime::Internal;
using namespace Halide::Runtime::Internal::Cuda;

extern "C" {

// The default implementation of halide_cuda_acquire_context uses the global
// pointers above, and serializes access with a mutex.
// Overriding implementations of acquire/release must implement the following
// behavior:
// - halide_cuda_acquire_context should always store a valid context/command
//   queue in ctx/q, or return an error code.
// - A call to halide_cuda_acquire_context is followed by a matching call to
//   halide_cuda_release_context. halide_cuda_acquire_context should block while a
//   previous call (if any) has not yet been released via halide_cuda_release_context.
WEAK int halide_cuda_acquire_context(void *user_context, CUcontext *ctx, bool create = true) {
    // TODO: Should we use a more "assertive" assert? These asserts do
    // not block execution on failure.
    halide_assert(user_context, ctx != NULL);

    // If the context has not been initialized, initialize it now.
    halide_assert(user_context, &context != NULL);

    // Note that this null-check of the context is *not* locked with
    // respect to device_release, so we may get a non-null context
    // that's in the process of being destroyed. Things will go badly
    // in general if you call device_release while other Halide code
    // is running though.
    CUcontext local_val = context;
    if (local_val == NULL) {
        if (!create) {
            *ctx = NULL;
            return 0;
        }

        {
            ScopedMutexLock lock(&context_lock);
            local_val = context;
            if (local_val == NULL) {
                CUresult error = create_cuda_context(user_context, &local_val);
                if (error != CUDA_SUCCESS) {
                    return error;
                }
            }
            // Normally in double-checked locking you need a release
            // fence here that synchronizes with an acquire fence
            // above to ensure context is fully constructed before
            // assigning to the global, but there's no way that
            // create_cuda_context can access the "context" global, so
            // we should be OK just storing to it here.
            context = local_val;
        }  // lock
    }

    *ctx = local_val;
    return 0;
}

WEAK int halide_cuda_release_context(void *user_context) {
    return 0;
}
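
// A hedged sketch of how an application that manages its own CUDA context
// might override this pair (my_app_ctx is a hypothetical, application-owned
// CUcontext that outlives all Halide calls):
//
//   extern "C" int halide_cuda_acquire_context(void *user_context,
//                                              CUcontext *ctx, bool create) {
//       *ctx = my_app_ctx;  // always store a valid context
//       return 0;
//   }
//   extern "C" int halide_cuda_release_context(void *user_context) {
//       return 0;  // nothing to do; the application owns the context
//   }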

// Return the stream to use for executing kernels and synchronization. Only
// called when the loaded CUDA driver supports streams. The default is to use
// the main stream for the context (the NULL stream). The context is passed in
// for convenience, but any scoping must be handled by the enclosing
// halide_cuda_acquire_context/halide_cuda_release_context pair, not by this call.
WEAK int halide_cuda_get_stream(void *user_context, CUcontext ctx, CUstream *stream) {
    // There are two special default stream handles we could use here: the
    // handle 0 (the legacy default stream) is fully synchronous, while the
    // handle 2 (CU_STREAM_PER_THREAD) gives a separate non-blocking stream
    // per thread.
    *stream = 0;
    return 0;
}
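
// A hedged sketch of an override that opts into the per-thread default
// stream instead (the CUDA driver API defines CU_STREAM_PER_THREAD as the
// stream handle 2):
//
//   extern "C" int halide_cuda_get_stream(void *user_context, CUcontext ctx,
//                                         CUstream *stream) {
//       *stream = (CUstream)0x2;  // CU_STREAM_PER_THREAD
//       return 0;
//   }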

}  // extern "C"

namespace Halide {
namespace Runtime {
namespace Internal {
namespace Cuda {

// Helper object to acquire and release the cuda context.
class Context {
    void *user_context;

public:
    CUcontext context;
    int error;

    // Constructor sets 'error' if any occurs.
    ALWAYS_INLINE Context(void *user_context)
        : user_context(user_context),
          context(NULL),
          error(CUDA_SUCCESS) {
#ifdef DEBUG_RUNTIME
        halide_start_clock(user_context);
#endif
        error = halide_cuda_acquire_context(user_context, &context);
        if (error != 0) {
            return;
        }

        // The default acquire_context loads libcuda as a
        // side-effect. However, if acquire_context has been
        // overridden, we may still need to load libcuda.
        ensure_libcuda_init(user_context);

        halide_assert(user_context, context != NULL);
        halide_assert(user_context, cuInit != NULL);

        error = cuCtxPushCurrent(context);
    }

    ALWAYS_INLINE ~Context() {
        if (error == 0) {
            CUcontext old;
            cuCtxPopCurrent(&old);
        }

        halide_cuda_release_context(user_context);
    }
};
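
// Typical usage within this runtime (a sketch of the pattern used by the
// entry points below): construct a Context on the stack, bail out on error,
// and let the destructor pop the context and release it.
//
//   Context ctx(user_context);
//   if (ctx.error != CUDA_SUCCESS) {
//       return ctx.error;
//   }
//   // ... CUDA driver API calls against ctx.context ...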

// Halide allocates a device API controlled pointer slot as part of
// each compiled module. The slot is used to store information to
// avoid having to reload/recompile kernel code on each call into a
// Halide filter. The cuda runtime uses this pointer to maintain a
// linked list of contexts into which the module has been loaded.
//
// A global list of all registered filters is also kept so all modules
// loaded on a given context can be unloaded and removed from the list
// when halide_device_release is called on a specific context.
//
// The registered_filters struct is not freed as it is pointed to by the
// Halide generated code. The module_state structs are freed.

struct module_state {
    CUcontext context;
    CUmodule module;
    module_state *next;
};

struct registered_filters {
    module_state *modules;
    registered_filters *next;
};
WEAK registered_filters *filters_list = NULL;
// This lock protects the above filters_list.
WEAK halide_mutex filters_list_lock;
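
// The resulting bookkeeping looks like this (a sketch):
//
//   filters_list --> registered_filters --> registered_filters --> ...
//                        |
//                        v
//                    module_state (one per CUcontext) --> module_state --> ...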

WEAK module_state *find_module_for_context(const registered_filters *filters, CUcontext ctx) {
    module_state *modules = filters->modules;
    while (modules != NULL) {
        if (modules->context == ctx) {
            return modules;
        }
        modules = modules->next;
    }
    return NULL;
}

WEAK CUresult create_cuda_context(void *user_context, CUcontext *ctx) {
    // Initialize CUDA
    ensure_libcuda_init(user_context);
    if (!cuInit) {
        error(user_context) << "Could not find cuda system libraries";
        return CUDA_ERROR_FILE_NOT_FOUND;
    }

    CUresult err = cuInit(0);
    if (err != CUDA_SUCCESS) {
        error(user_context) << "CUDA: cuInit failed: "
                            << get_error_name(err);
        return err;
    }

    // Make sure we have a device
    int deviceCount = 0;
    err = cuDeviceGetCount(&deviceCount);
    if (err != CUDA_SUCCESS) {
        error(user_context) << "CUDA: cuDeviceGetCount failed: "
                            << get_error_name(err);
        return err;
    }

    if (deviceCount <= 0) {
        halide_error(user_context, "CUDA: No devices available");
        return CUDA_ERROR_NO_DEVICE;
    }

    int device = halide_get_gpu_device(user_context);
    if (device == -1 && deviceCount == 1) {
        device = 0;
    } else if (device == -1) {
        debug(user_context) << "CUDA: Multiple CUDA devices detected. Selecting the one with the most cores.\n";
        int best_core_count = 0;
        for (int i = 0; i < deviceCount; i++) {
            CUdevice dev;
            CUresult status = cuDeviceGet(&dev, i);
            if (status != CUDA_SUCCESS) {
                debug(user_context) << "      Failed to get device " << i << "\n";
                continue;
            }
            int core_count = 0;
            status = cuDeviceGetAttribute(&core_count, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev);
            if (status != CUDA_SUCCESS) {
                continue;
            }
            debug(user_context) << "      Device " << i << " has " << core_count << " cores\n";
            if (core_count >= best_core_count) {
                device = i;
                best_core_count = core_count;
            }
        }
    }

    // Get device
    CUdevice dev;
    CUresult status = cuDeviceGet(&dev, device);
    if (status != CUDA_SUCCESS) {
        halide_error(user_context, "CUDA: Failed to get device\n");
        return status;
    }

    debug(user_context) << "    Got device " << dev << "\n";

// Dump device attributes
#ifdef DEBUG_RUNTIME
    {
        char name[256];
        name[0] = 0;
        err = cuDeviceGetName(name, 256, dev);
        debug(user_context) << "      " << name << "\n";

        if (err != CUDA_SUCCESS) {
            error(user_context) << "CUDA: cuDeviceGetName failed: "
                                << get_error_name(err);
            return err;
        }

        size_t memory = 0;
        err = cuDeviceTotalMem(&memory, dev);
        debug(user_context) << "      total memory: " << (int)(memory >> 20) << " MB\n";

        if (err != CUDA_SUCCESS) {
            error(user_context) << "CUDA: cuDeviceTotalMem failed: "
                                << get_error_name(err);
            return err;
        }

        // Declare variables for other state we want to query.
        int max_threads_per_block = 0, warp_size = 0, num_cores = 0;
        int max_block_size[] = {0, 0, 0};
        int max_grid_size[] = {0, 0, 0};
        int max_shared_mem = 0, max_constant_mem = 0;
        int cc_major = 0, cc_minor = 0;

        struct {
            int *dst;
            CUdevice_attribute attr;
        } attrs[] = {
            {&max_threads_per_block, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK},
            {&warp_size, CU_DEVICE_ATTRIBUTE_WARP_SIZE},
            {&num_cores, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT},
            {&max_block_size[0], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X},
            {&max_block_size[1], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y},
            {&max_block_size[2], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z},
            {&max_grid_size[0], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X},
            {&max_grid_size[1], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y},
            {&max_grid_size[2], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z},
            {&max_shared_mem, CU_DEVICE_ATTRIBUTE_SHARED_MEMORY_PER_BLOCK},
            {&max_constant_mem, CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY},
            {&cc_major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR},
            {&cc_minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR},
            {NULL, CU_DEVICE_ATTRIBUTE_MAX}};

        // Do all the queries.
        for (int i = 0; attrs[i].dst; i++) {
            err = cuDeviceGetAttribute(attrs[i].dst, attrs[i].attr, dev);
            if (err != CUDA_SUCCESS) {
                error(user_context)
                    << "CUDA: cuDeviceGetAttribute failed ("
                    << get_error_name(err)
                    << ") for attribute " << (int)attrs[i].attr;
                return err;
            }
        }

        // Threads per multiprocessor is a function of the compute
        // capability. (There is no compute capability 4.x, hence no
        // case 4 below.)
        int threads_per_core;
        switch (cc_major) {
        case 1:
            threads_per_core = 8;
            break;
        case 2:
            threads_per_core = (cc_minor == 0 ? 32 : 48);
            break;
        case 3:
            threads_per_core = 192;
            break;
        case 5:
            threads_per_core = 128;
            break;
        case 6:
            threads_per_core = (cc_minor == 0 ? 64 : 128);
            break;
        case 7:
            threads_per_core = 64;
            break;
        default:
            threads_per_core = 0;
            break;
        }

        debug(user_context)
            << "      max threads per block: " << max_threads_per_block << "\n"
            << "      warp size: " << warp_size << "\n"
            << "      max block size: " << max_block_size[0]
            << " " << max_block_size[1] << " " << max_block_size[2] << "\n"
            << "      max grid size: " << max_grid_size[0]
            << " " << max_grid_size[1] << " " << max_grid_size[2] << "\n"
            << "      max shared memory per block: " << max_shared_mem << "\n"
            << "      max constant memory per block: " << max_constant_mem << "\n"
            << "      compute capability " << cc_major << "." << cc_minor << "\n"
            << "      cuda cores: " << num_cores << " x " << threads_per_core
            << " = " << num_cores * threads_per_core << "\n";
    }
#endif

    // Create context
    debug(user_context) << "    cuCtxCreate " << dev << " -> ";
    err = cuCtxCreate(ctx, 0, dev);
    if (err != CUDA_SUCCESS) {
        debug(user_context) << get_error_name(err) << "\n";
        error(user_context) << "CUDA: cuCtxCreate failed: "
                            << get_error_name(err);
        return err;
    } else {
        unsigned int version = 0;
        cuCtxGetApiVersion(*ctx, &version);
        debug(user_context) << *ctx << "(" << version << ")\n";
    }
    // Creation automatically pushes the context, but we'll pop to allow the caller
    // to decide when to push.
    CUcontext dummy;
    err = cuCtxPopCurrent(&dummy);
    if (err != CUDA_SUCCESS) {
        error(user_context) << "CUDA: cuCtxPopCurrent failed: "
                            << get_error_name(err);
        return err;
    }

    return CUDA_SUCCESS;
}

// This feature may be useful during CUDA backend or runtime
// development. It does not seem to find many errors in general Halide
// use and causes false positives in at least one environment, where
// it prevents using debug mode with cuda.
#define ENABLE_POINTER_VALIDATION 0

WEAK bool validate_device_pointer(void *user_context, halide_buffer_t *buf, size_t size = 0) {
// The technique using cuPointerGetAttribute and CU_POINTER_ATTRIBUTE_CONTEXT
// requires that unified virtual addressing be enabled, and that is not the case
// for 32-bit processes on Mac OS X. So for now, as a total hack, just return true
// in 32-bit. This could of course be wrong the other way for cards that only
// support 32-bit addressing in 64-bit processes, but I expect those cards do not
// support unified addressing at all.
// TODO: figure out a way to validate pointers in all cases if strictly necessary.
#if defined(BITS_32) || !ENABLE_POINTER_VALIDATION
    return true;
#else
    if (buf->device == 0) {
        return true;
    }

    CUdeviceptr dev_ptr = (CUdeviceptr)buf->device;

    CUcontext ctx;
    CUresult result = cuPointerGetAttribute(&ctx, CU_POINTER_ATTRIBUTE_CONTEXT, dev_ptr);
    if (result != CUDA_SUCCESS) {
        error(user_context) << "Bad device pointer " << (void *)dev_ptr
                            << ": cuPointerGetAttribute returned "
                            << get_error_name(result);
        return false;
    }
    return true;
#endif
}

}  // namespace Cuda
}  // namespace Internal
}  // namespace Runtime
}  // namespace Halide

extern "C" {
WEAK int halide_cuda_initialize_kernels(void *user_context, void **state_ptr, const char *ptx_src, int size) {
    debug(user_context) << "CUDA: halide_cuda_initialize_kernels (user_context: " << user_context
                        << ", state_ptr: " << state_ptr
                        << ", ptx_src: " << (void *)ptx_src
                        << ", size: " << size << ")\n";

    Context ctx(user_context);
    if (ctx.error != 0) {
        return ctx.error;
    }

#ifdef DEBUG_RUNTIME
    uint64_t t_before = halide_current_time_ns(user_context);
#endif

    halide_assert(user_context, &filters_list_lock != NULL);
    {
        ScopedMutexLock lock(&filters_list_lock);

        // Create the state object if necessary. This only happens once, regardless
        // of how many times halide_initialize_kernels/halide_release is called.
        // halide_release traverses this list and releases the module objects, but
        // it does not modify the list nodes created/inserted here.
        registered_filters **filters = (registered_filters **)state_ptr;
        if (!(*filters)) {
            *filters = (registered_filters *)malloc(sizeof(registered_filters));
            (*filters)->modules = NULL;
            (*filters)->next = filters_list;
            filters_list = *filters;
        }

        // Create the module itself if necessary.
        module_state *loaded_module = find_module_for_context(*filters, ctx.context);
        if (loaded_module == NULL) {
            loaded_module = (module_state *)malloc(sizeof(module_state));
            debug(user_context) << "    cuModuleLoadDataEx " << (void *)ptx_src << ", " << size << " -> ";

            CUjit_option options[] = {CU_JIT_MAX_REGISTERS};
            unsigned int max_regs_per_thread = 64;

            // A hack to enable control over the max register count for
            // testing. This should be surfaced in the schedule somehow
            // instead.
            char *regs = getenv("HL_CUDA_JIT_MAX_REGISTERS");
            if (regs) {
                max_regs_per_thread = atoi(regs);
            }
            void *optionValues[] = {(void *)(uintptr_t)max_regs_per_thread};
            CUresult err = cuModuleLoadDataEx(&loaded_module->module, ptx_src, 1, options, optionValues);

            if (err != CUDA_SUCCESS) {
                free(loaded_module);
                error(user_context) << "CUDA: cuModuleLoadDataEx failed: "
                                    << get_error_name(err);
                return err;
            } else {
                debug(user_context) << (void *)(loaded_module->module) << "\n";
            }
            loaded_module->context = ctx.context;
            loaded_module->next = (*filters)->modules;
            (*filters)->modules = loaded_module;
        }
    }  // lock

#ifdef DEBUG_RUNTIME
    uint64_t t_after = halide_current_time_ns(user_context);
    debug(user_context) << "    Time: " << (t_after - t_before) / 1.0e6 << " ms\n";
#endif

    return 0;
}

WEAK int halide_cuda_release_unused_device_allocations(void *user_context) {
    FreeListItem *to_free;
    {
        ScopedMutexLock lock(&free_list_lock);
        to_free = free_list;
        free_list = NULL;
    }
    while (to_free) {
        debug(user_context) << "    cuMemFree " << (void *)(to_free->ptr) << "\n";
        cuMemFree(to_free->ptr);
        FreeListItem *next = to_free->next;
        free(to_free);
        to_free = next;
    }
    return 0;
}
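
// This drain path is registered with the global allocation pool just below,
// and is also invoked directly when cuMemAlloc reports
// CUDA_ERROR_OUT_OF_MEMORY (see halide_cuda_device_malloc).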

namespace Halide {
namespace Runtime {
namespace Internal {

WEAK halide_device_allocation_pool cuda_allocation_pool;

WEAK __attribute__((constructor)) void register_cuda_allocation_pool() {
    cuda_allocation_pool.release_unused = &halide_cuda_release_unused_device_allocations;
    halide_register_device_allocation_pool(&cuda_allocation_pool);
}

ALWAYS_INLINE uint64_t quantize_allocation_size(uint64_t sz) {
    int z = __builtin_clzll(sz);
    if (z < 60) {
        sz--;
        sz = sz >> (60 - z);
        sz++;
        sz = sz << (60 - z);
    }
    return sz;
}
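
// A worked example of quantize_allocation_size (a sketch): sz = 1000 has its
// most significant bit at position 9 (512 <= 1000 < 1024), so
// __builtin_clzll(1000) == 54 and the size is rounded up to the next
// multiple of 2^(60 - 54) = 64, i.e. 1024. In general a size keeps its top
// four significant bits, so the rounding granularity is between 1/16 and 1/8
// of the size, which bounds the per-allocation waste while making free-list
// reuse much more likely.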

}  // namespace Internal
}  // namespace Runtime
}  // namespace Halide

WEAK int halide_cuda_device_free(void *user_context, halide_buffer_t *buf) {
    // halide_device_free, at present, can be exposed to clients and they
    // should be allowed to call halide_device_free on any halide_buffer_t,
    // including ones that have never been used with a GPU.
    if (buf->device == 0) {
        return 0;
    }

    CUdeviceptr dev_ptr = (CUdeviceptr)buf->device;

    debug(user_context)
        << "CUDA: halide_cuda_device_free (user_context: " << user_context
        << ", buf: " << buf << ")\n";

    Context ctx(user_context);
    if (ctx.error != CUDA_SUCCESS) {
        return ctx.error;
    }

#ifdef DEBUG_RUNTIME
    uint64_t t_before = halide_current_time_ns(user_context);
#endif

    halide_assert(user_context, validate_device_pointer(user_context, buf));

    CUresult err = CUDA_SUCCESS;
    if (halide_can_reuse_device_allocations(user_context)) {
        debug(user_context) << "    caching allocation for later use: " << (void *)(dev_ptr) << "\n";

        FreeListItem *item = (FreeListItem *)malloc(sizeof(FreeListItem));
        item->ctx = ctx.context;
        item->size = quantize_allocation_size(buf->size_in_bytes());
        item->ptr = dev_ptr;

        if (cuStreamSynchronize) {
            // We don't want to re-use a buffer freed on one stream on
            // another, as there are no synchronization guarantees and
            // everything is async.
            int result = halide_cuda_get_stream(user_context, ctx.context, &item->stream);
            if (result != 0) {
                error(user_context) << "CUDA: In halide_cuda_device_free, halide_cuda_get_stream returned " << result << "\n";
            }
        } else {
            item->stream = NULL;
        }

        {
            ScopedMutexLock lock(&free_list_lock);
            item->next = free_list;
            free_list = item;
        }
    } else {
        debug(user_context) << "    cuMemFree " << (void *)(dev_ptr) << "\n";
        err = cuMemFree(dev_ptr);
        // If cuMemFree fails, it isn't likely to succeed later, so just drop
        // the reference.
    }
    buf->device_interface->impl->release_module();
    buf->device_interface = NULL;
    buf->device = 0;
    if (err != CUDA_SUCCESS) {
        // We may be called as a destructor, so don't raise an error here.
        return err;
    }

#ifdef DEBUG_RUNTIME
    uint64_t t_after = halide_current_time_ns(user_context);
    debug(user_context) << "    Time: " << (t_after - t_before) / 1.0e6 << " ms\n";
#endif

    return 0;
}

WEAK int halide_cuda_device_release(void *user_context) {
    debug(user_context)
        << "CUDA: halide_cuda_device_release (user_context: " << user_context << ")\n";

    // If we haven't even loaded libcuda, don't load it just to quit.
    if (!lib_cuda) {
        return 0;
    }

    int err;
    CUcontext ctx;
    err = halide_cuda_acquire_context(user_context, &ctx, false);
    if (err != CUDA_SUCCESS) {
        return err;
    }

    if (ctx) {
        // It's possible that this is being called from the destructor of
        // a static variable, in which case the driver may already be
        // shutting down. Only synchronize if the push succeeded.
        err = cuCtxPushCurrent(ctx);
        if (err == CUDA_SUCCESS) {
            err = cuCtxSynchronize();
        }
        halide_assert(user_context, err == CUDA_SUCCESS || err == CUDA_ERROR_DEINITIALIZED);

        // Dump the contents of the free list, ignoring errors.
        halide_cuda_release_unused_device_allocations(user_context);

        {
            ScopedMutexLock lock(&filters_list_lock);

            // Unload the modules attached to this context. Note that the list
            // nodes themselves are not freed, only the module objects are
            // released. Subsequent calls to halide_init_kernels might re-create
            // the program object using the same list node to store the module
            // object.
            registered_filters *filters = filters_list;
            while (filters) {
                module_state **prev_ptr = &filters->modules;
                module_state *loaded_module = filters->modules;
                while (loaded_module != NULL) {
                    if (loaded_module->context == ctx) {
                        debug(user_context) << "    cuModuleUnload " << loaded_module->module << "\n";
                        err = cuModuleUnload(loaded_module->module);
                        halide_assert(user_context, err == CUDA_SUCCESS || err == CUDA_ERROR_DEINITIALIZED);
                        *prev_ptr = loaded_module->next;
                        free(loaded_module);
                        loaded_module = *prev_ptr;
                    } else {
                        // Advance prev_ptr before loaded_module so it keeps
                        // pointing at the link that points to the node under
                        // inspection; otherwise unlinking the next node
                        // would corrupt the list.
                        prev_ptr = &loaded_module->next;
                        loaded_module = loaded_module->next;
                    }
                }
                filters = filters->next;
            }
        }  // lock

        CUcontext old_ctx;
        cuCtxPopCurrent(&old_ctx);

        // Only destroy the context if we own it.
        {
            ScopedMutexLock lock(&context_lock);

            if (ctx == context) {
                debug(user_context) << "    cuCtxDestroy " << context << "\n";
                err = cuProfilerStop();
                err = cuCtxDestroy(context);
                halide_assert(user_context, err == CUDA_SUCCESS || err == CUDA_ERROR_DEINITIALIZED);
                context = NULL;
            }
        }  // lock
    }

    halide_cuda_release_context(user_context);

    return 0;
}

WEAK int halide_cuda_device_malloc(void *user_context, halide_buffer_t *buf) {
    debug(user_context)
        << "CUDA: halide_cuda_device_malloc (user_context: " << user_context
        << ", buf: " << buf << ")\n";

    Context ctx(user_context);
    if (ctx.error != CUDA_SUCCESS) {
        return ctx.error;
    }

    size_t size = buf->size_in_bytes();
    if (halide_can_reuse_device_allocations(user_context)) {
        // Quantize the size to its top 4 significant bits to make free-list
        // reuse likelier (see quantize_allocation_size). This wastes on
        // average about 4% of memory per allocation.
        size = quantize_allocation_size(size);
    }
    halide_assert(user_context, size != 0);
    if (buf->device) {
        // This buffer already has a device allocation
        halide_assert(user_context, validate_device_pointer(user_context, buf, size));
        return 0;
    }

    // Check that all strides are non-negative.
    for (int i = 0; i < buf->dimensions; i++) {
        halide_assert(user_context, buf->dim[i].stride >= 0);
    }

    debug(user_context) << "    allocating " << *buf << "\n";

#ifdef DEBUG_RUNTIME
    uint64_t t_before = halide_current_time_ns(user_context);
#endif

    CUdeviceptr p = 0;
    FreeListItem *to_free = NULL;
    if (halide_can_reuse_device_allocations(user_context)) {
        CUstream stream = NULL;
        if (cuStreamSynchronize != NULL) {
            int result = halide_cuda_get_stream(user_context, ctx.context, &stream);
            if (result != 0) {
                error(user_context) << "CUDA: In halide_cuda_device_malloc, halide_cuda_get_stream returned " << result << "\n";
            }
        }

        ScopedMutexLock lock(&free_list_lock);
        // Best-fit allocation. There are three tunable constants
        // here. A bucket is claimed if the size requested is at least
        // 7/8 of the size of the bucket. We keep at most 32 unused
        // allocations. We round up each allocation size to its top 4
        // most significant bits (see quantize_allocation_size).
        FreeListItem *best = NULL, *item = free_list;
        FreeListItem **best_prev = NULL, **prev_ptr = &free_list;
        int depth = 0;
        while (item) {
            if ((size <= item->size) &&                           // Fits
                (size >= (item->size / 8) * 7) &&                 // Not too much slop
                (ctx.context == item->ctx) &&                     // Same cuda context
                (stream == item->stream) &&                       // Can only safely re-use on the same stream on which it was freed
                ((best == NULL) || (best->size > item->size))) {  // Better than previous best fit
                best = item;
                best_prev = prev_ptr;
                prev_ptr = &item->next;
                item = item->next;
            } else if (depth > 32) {
                // Allocations after here have not been used for a
                // long time. Just detach the rest of the free list
                // and defer the actual cuMemFree calls until after we
                // release the free_list_lock.
                to_free = item;
                *prev_ptr = NULL;
                item = NULL;
                break;
            } else {
                prev_ptr = &item->next;
                item = item->next;
            }
            depth++;
        }

        if (best) {
            p = best->ptr;
            *best_prev = best->next;
            free(best);
        }
    }

    while (to_free) {
        FreeListItem *next = to_free->next;
        cuMemFree(to_free->ptr);
        free(to_free);
        to_free = next;
    }

    if (!p) {
        debug(user_context) << "    cuMemAlloc " << (uint64_t)size << " -> ";

        CUresult err = cuMemAlloc(&p, size);
        if (err == CUDA_ERROR_OUT_OF_MEMORY) {
            // Drain the free list and try once more.
            halide_cuda_release_unused_device_allocations(user_context);
            err = cuMemAlloc(&p, size);
        }
        if (err != CUDA_SUCCESS) {
            debug(user_context) << get_error_name(err) << "\n";
            error(user_context) << "CUDA: cuMemAlloc failed: "
                                << get_error_name(err);
            return err;
        } else {
            debug(user_context) << (void *)p << "\n";
        }
    }
    halide_assert(user_context, p);
    buf->device = p;
    buf->device_interface = &cuda_device_interface;
    buf->device_interface->impl->use_module();

#ifdef DEBUG_RUNTIME
    uint64_t t_after = halide_current_time_ns(user_context);
    debug(user_context) << "    Time: " << (t_after - t_before) / 1.0e6 << " ms\n";
#endif

    return 0;
}

namespace {
WEAK int cuda_do_multidimensional_copy(void *user_context, const device_copy &c,
                                       uint64_t src, uint64_t dst, int d, bool from_host, bool to_host) {
    if (d > MAX_COPY_DIMS) {
        error(user_context) << "Buffer has too many dimensions to copy to/from GPU\n";
        return -1;
    } else if (d == 0) {
        CUresult err = CUDA_SUCCESS;
        // Initialized so the error message below is well-defined even if no
        // branch assigns a more specific name.
        const char *copy_name = "copy";
        debug(user_context) << "    from " << (from_host ? "host" : "device")
                            << " to " << (to_host ? "host" : "device") << ", "
                            << (void *)src << " -> " << (void *)dst << ", " << c.chunk_size << " bytes\n";
        if (!from_host && to_host) {
            copy_name = "cuMemcpyDtoH";
            debug(user_context) << "cuMemcpyDtoH(" << (void *)dst << ", " << (void *)src << ", " << c.chunk_size << ")\n";
            err = cuMemcpyDtoH((void *)dst, (CUdeviceptr)src, c.chunk_size);
        } else if (from_host && !to_host) {
            copy_name = "cuMemcpyHtoD";
            debug(user_context) << "cuMemcpyHtoD(" << (void *)dst << ", " << (void *)src << ", " << c.chunk_size << ")\n";
            err = cuMemcpyHtoD((CUdeviceptr)dst, (void *)src, c.chunk_size);
        } else if (!from_host && !to_host) {
            copy_name = "cuMemcpyDtoD";
            debug(user_context) << "cuMemcpyDtoD(" << (void *)dst << ", " << (void *)src << ", " << c.chunk_size << ")\n";
            err = cuMemcpyDtoD((CUdeviceptr)dst, (CUdeviceptr)src, c.chunk_size);
        } else if (dst != src) {
            copy_name = "memcpy";
            debug(user_context) << "memcpy(" << (void *)dst << ", " << (void *)src << ", " << c.chunk_size << ")\n";
            // Could reach here if a user called directly into the
            // cuda API for a device->host copy on a source buffer
            // with device_dirty = false.
            memcpy((void *)dst, (void *)src, c.chunk_size);
        }
        if (err != CUDA_SUCCESS) {
            error(user_context) << "CUDA: " << copy_name << " failed: " << get_error_name(err);
            return (int)err;
        }
    } else {
        ssize_t src_off = 0, dst_off = 0;
        for (int i = 0; i < (int)c.extent[d - 1]; i++) {
            int err = cuda_do_multidimensional_copy(user_context, c, src + src_off, dst + dst_off, d - 1, from_host, to_host);
            dst_off += c.dst_stride_bytes[d - 1];
            src_off += c.src_stride_bytes[d - 1];
            if (err) {
                return err;
            }
        }
    }
    return 0;
}
}  // namespace

WEAK int halide_cuda_buffer_copy(void *user_context, struct halide_buffer_t *src,
                                 const struct halide_device_interface_t *dst_device_interface,
                                 struct halide_buffer_t *dst) {
    // We only handle copies to cuda or to host
    halide_assert(user_context, dst_device_interface == NULL ||
                                    dst_device_interface == &cuda_device_interface);

    if ((src->device_dirty() || src->host == NULL) &&
        src->device_interface != &cuda_device_interface) {
        halide_assert(user_context, dst_device_interface == &cuda_device_interface);
        // This is handled at the higher level.
        return halide_error_code_incompatible_device_interface;
    }

    bool from_host = (src->device_interface != &cuda_device_interface) ||
                     (src->device == 0) ||
                     (src->host_dirty() && src->host != NULL);
    bool to_host = !dst_device_interface;

    halide_assert(user_context, from_host || src->device);
    halide_assert(user_context, to_host || dst->device);

    device_copy c = make_buffer_copy(src, from_host, dst, to_host);

    int err = 0;
    {
        Context ctx(user_context);
        if (ctx.error != CUDA_SUCCESS) {
            return ctx.error;
        }

        debug(user_context)
            << "CUDA: halide_cuda_buffer_copy (user_context: " << user_context
            << ", src: " << src << ", dst: " << dst << ")\n";

#ifdef DEBUG_RUNTIME
        uint64_t t_before = halide_current_time_ns(user_context);
        if (!from_host) {
            halide_assert(user_context, validate_device_pointer(user_context, src));
        }
        if (!to_host) {
            halide_assert(user_context, validate_device_pointer(user_context, dst));
        }
#endif

        err = cuda_do_multidimensional_copy(user_context, c, c.src + c.src_begin, c.dst, dst->dimensions, from_host, to_host);

#ifdef DEBUG_RUNTIME
        uint64_t t_after = halide_current_time_ns(user_context);
        debug(user_context) << "    Time: " << (t_after - t_before) / 1.0e6 << " ms\n";
#endif
    }

    return err;
}

namespace {

WEAK int cuda_device_crop_from_offset(const struct halide_buffer_t *src,
                                      int64_t offset,
                                      struct halide_buffer_t *dst) {
    dst->device = src->device + offset;
    dst->device_interface = src->device_interface;
    dst->set_device_dirty(src->device_dirty());
    return 0;
}

}  // namespace

WEAK int halide_cuda_device_crop(void *user_context, const struct halide_buffer_t *src,
                                 struct halide_buffer_t *dst) {
    debug(user_context)
        << "CUDA: halide_cuda_device_crop (user_context: " << user_context
        << ", src: " << src << ", dst: " << dst << ")\n";

    // Pointer arithmetic works fine.
    const int64_t offset = calc_device_crop_byte_offset(src, dst);
    return cuda_device_crop_from_offset(src, offset, dst);
}

WEAK int halide_cuda_device_slice(void *user_context, const struct halide_buffer_t *src,
                                  int slice_dim, int slice_pos,
                                  struct halide_buffer_t *dst) {
    debug(user_context)
        << "CUDA: halide_cuda_device_slice (user_context: " << user_context
        << ", src: " << src << ", slice_dim " << slice_dim << ", slice_pos "
        << slice_pos << ", dst: " << dst << ")\n";

    // Pointer arithmetic works fine.
    const int64_t offset = calc_device_slice_byte_offset(src, slice_dim, slice_pos);
    return cuda_device_crop_from_offset(src, offset, dst);
}

WEAK int halide_cuda_device_release_crop(void *user_context, struct halide_buffer_t *dst) {
    debug(user_context)
        << "CUDA: halide_cuda_release_crop (user_context: " << user_context
        << ", dst: " << dst << ")\n";
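    // Crops and slices alias their parent buffer's allocation (the device
    // field is just the parent's pointer plus a byte offset, see
    // cuda_device_crop_from_offset above), so there is nothing to free here.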
    return 0;
}

WEAK int halide_cuda_copy_to_device(void *user_context, halide_buffer_t *buf) {
    return halide_cuda_buffer_copy(user_context, buf, &cuda_device_interface, buf);
}

WEAK int halide_cuda_copy_to_host(void *user_context, halide_buffer_t *buf) {
    return halide_cuda_buffer_copy(user_context, buf, NULL, buf);
}

// Used to generate correct timings when tracing
WEAK int halide_cuda_device_sync(void *user_context, struct halide_buffer_t *) {
    debug(user_context)
        << "CUDA: halide_cuda_device_sync (user_context: " << user_context << ")\n";

    Context ctx(user_context);
    if (ctx.error != CUDA_SUCCESS) {
        return ctx.error;
    }

#ifdef DEBUG_RUNTIME
    uint64_t t_before = halide_current_time_ns(user_context);
#endif

    CUresult err;
    if (cuStreamSynchronize != NULL) {
        CUstream stream;
        int result = halide_cuda_get_stream(user_context, ctx.context, &stream);
        if (result != 0) {
            error(user_context) << "CUDA: In halide_cuda_device_sync, halide_cuda_get_stream returned " << result << "\n";
        }
        err = cuStreamSynchronize(stream);
    } else {
        err = cuCtxSynchronize();
    }
    if (err != CUDA_SUCCESS) {
        error(user_context) << "CUDA: cuCtxSynchronize failed: "
                            << get_error_name(err);
        return err;
    }

#ifdef DEBUG_RUNTIME
    uint64_t t_after = halide_current_time_ns(user_context);
    debug(user_context) << "    Time: " << (t_after - t_before) / 1.0e6 << " ms\n";
#endif

    return 0;
}

WEAK int halide_cuda_run(void *user_context,
                         void *state_ptr,
                         const char *entry_name,
                         int blocksX, int blocksY, int blocksZ,
                         int threadsX, int threadsY, int threadsZ,
                         int shared_mem_bytes,
                         size_t arg_sizes[],
                         void *args[],
                         int8_t arg_is_buffer[],
                         int num_attributes,
                         float *vertex_buffer,
                         int num_coords_dim0,
                         int num_coords_dim1) {

    debug(user_context) << "CUDA: halide_cuda_run ("
                        << "user_context: " << user_context << ", "
                        << "entry: " << entry_name << ", "
                        << "blocks: " << blocksX << "x" << blocksY << "x" << blocksZ << ", "
                        << "threads: " << threadsX << "x" << threadsY << "x" << threadsZ << ", "
                        << "shmem: " << shared_mem_bytes << ")\n";

    CUresult err;
    Context ctx(user_context);
    if (ctx.error != CUDA_SUCCESS) {
        return ctx.error;
    }

    debug(user_context) << "Got context.\n";

#ifdef DEBUG_RUNTIME
    uint64_t t_before = halide_current_time_ns(user_context);
#endif

    halide_assert(user_context, state_ptr);
    module_state *loaded_module = find_module_for_context((registered_filters *)state_ptr, ctx.context);
    halide_assert(user_context, loaded_module != NULL);
    CUmodule mod = loaded_module->module;
    debug(user_context) << "Got module " << mod << "\n";
    halide_assert(user_context, mod);
    CUfunction f;
    err = cuModuleGetFunction(&f, mod, entry_name);
    debug(user_context) << "Got function " << f << "\n";
    if (err != CUDA_SUCCESS) {
        error(user_context) << "CUDA: cuModuleGetFunction failed: "
                            << get_error_name(err);
        return err;
    }

    size_t num_args = 0;
    while (arg_sizes[num_args] != 0) {
        debug(user_context) << "    halide_cuda_run " << (int)num_args
                            << " " << (int)arg_sizes[num_args]
                            << " [" << (*((void **)args[num_args])) << " ...] "
                            << arg_is_buffer[num_args] << "\n";
        num_args++;
    }
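
    // arg_sizes/args/arg_is_buffer are parallel arrays produced by the Halide
    // compiler; arg_sizes is terminated by a zero entry (and, presumably, the
    // other two arrays carry matching terminators). Buffer arguments arrive
    // as halide_buffer_t* and are translated below into pointers to their
    // device handles before launch.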

    // We need storage for both the arg and the pointer to it if it
    // has to be translated.
    void **translated_args = (void **)malloc((num_args + 1) * sizeof(void *));
    uint64_t *dev_handles = (uint64_t *)malloc(num_args * sizeof(uint64_t));
    for (size_t i = 0; i <= num_args; i++) {  // Get NULL at end.
        if (arg_is_buffer[i]) {
            halide_assert(user_context, arg_sizes[i] == sizeof(uint64_t));
            dev_handles[i] = ((halide_buffer_t *)args[i])->device;
            translated_args[i] = &(dev_handles[i]);
            debug(user_context) << "    halide_cuda_run translated arg" << (int)i
                                << " [" << (*((void **)translated_args[i])) << " ...]\n";
        } else {
            translated_args[i] = args[i];
        }
    }

    CUstream stream = NULL;
    // We use whether this routine was defined in the cuda driver library
    // as a test for streams support in the cuda implementation.
    if (cuStreamSynchronize != NULL) {
        int result = halide_cuda_get_stream(user_context, ctx.context, &stream);
        if (result != 0) {
            error(user_context) << "CUDA: In halide_cuda_run, halide_cuda_get_stream returned " << result << "\n";
            free(dev_handles);
            free(translated_args);
            return result;
        }
    }

    err = cuLaunchKernel(f,
                         blocksX, blocksY, blocksZ,
                         threadsX, threadsY, threadsZ,
                         shared_mem_bytes,
                         stream,
                         translated_args,
                         NULL);
    free(dev_handles);
    free(translated_args);
    if (err != CUDA_SUCCESS) {
        error(user_context) << "CUDA: cuLaunchKernel failed: "
                            << get_error_name(err);
        return err;
    }

#ifdef DEBUG_RUNTIME
    err = cuCtxSynchronize();
    if (err != CUDA_SUCCESS) {
        error(user_context) << "CUDA: cuCtxSynchronize failed: "
                            << get_error_name(err);
        return err;
    }
    uint64_t t_after = halide_current_time_ns(user_context);
    debug(user_context) << "    Time: " << (t_after - t_before) / 1.0e6 << " ms\n";
#endif
    return 0;
}

WEAK int halide_cuda_device_and_host_malloc(void *user_context, struct halide_buffer_t *buf) {
    return halide_default_device_and_host_malloc(user_context, buf, &cuda_device_interface);
}

WEAK int halide_cuda_device_and_host_free(void *user_context, struct halide_buffer_t *buf) {
    return halide_default_device_and_host_free(user_context, buf, &cuda_device_interface);
}

WEAK int halide_cuda_wrap_device_ptr(void *user_context, struct halide_buffer_t *buf, uint64_t device_ptr) {
    halide_assert(user_context, buf->device == 0);
    if (buf->device != 0) {
        return -2;
    }
    buf->device = device_ptr;
    buf->device_interface = &cuda_device_interface;
    buf->device_interface->impl->use_module();
#ifdef DEBUG_RUNTIME
    if (!validate_device_pointer(user_context, buf)) {
        buf->device_interface->impl->release_module();
        buf->device = 0;
        buf->device_interface = NULL;
        return -3;
    }
#endif
    return 0;
}

WEAK int halide_cuda_detach_device_ptr(void *user_context, struct halide_buffer_t *buf) {
    if (buf->device == 0) {
        return 0;
    }
    halide_assert(user_context, buf->device_interface == &cuda_device_interface);
    buf->device_interface->impl->release_module();
    buf->device = 0;
    buf->device_interface = NULL;
    return 0;
}
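
// A hedged usage sketch for the wrap/detach pair above (dev_ptr is a
// hypothetical device allocation obtained directly from cuMemAlloc, and buf
// is a halide_buffer_t describing the same extent):
//
//   halide_cuda_wrap_device_ptr(user_context, buf, (uint64_t)dev_ptr);
//   // ... run Halide filters against buf ...
//   halide_cuda_detach_device_ptr(user_context, buf);
//   // dev_ptr remains owned by the caller and must be freed with cuMemFree.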

WEAK uintptr_t halide_cuda_get_device_ptr(void *user_context, struct halide_buffer_t *buf) {
    if (buf->device == 0) {
        return 0;
    }
    halide_assert(user_context, buf->device_interface == &cuda_device_interface);
    return (uintptr_t)buf->device;
}

WEAK const halide_device_interface_t *halide_cuda_device_interface() {
    return &cuda_device_interface;
}

WEAK int halide_cuda_compute_capability(void *user_context, int *major, int *minor) {
    if (!lib_cuda) {
        // If cuda can't be found, we want to return 0, 0 and it's not
        // considered an error. So we should be very careful about
        // looking for libcuda without tripping any errors in the rest
        // of this runtime.
        void *sym = halide_cuda_get_symbol(user_context, "cuInit");
        if (!sym) {
            *major = *minor = 0;
            return 0;
        }
    }

    {
        Context ctx(user_context);
        if (ctx.error != 0) {
            return ctx.error;
        }

        CUresult err;

        CUdevice dev;
        err = cuCtxGetDevice(&dev);
        if (err != CUDA_SUCCESS) {
            error(user_context)
                << "CUDA: cuCtxGetDevice failed ("
                << Halide::Runtime::Internal::Cuda::get_error_name(err)
                << ")";
            return err;
        }

        err = cuDeviceGetAttribute(major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, dev);
        if (err == CUDA_SUCCESS) {
            err = cuDeviceGetAttribute(minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, dev);
        }

        if (err != CUDA_SUCCESS) {
            error(user_context)
                << "CUDA: cuDeviceGetAttribute failed ("
                << Halide::Runtime::Internal::Cuda::get_error_name(err)
                << ")";
            return err;
        }
    }

    return 0;
}

namespace {
WEAK __attribute__((destructor)) void halide_cuda_cleanup() {
    halide_cuda_device_release(NULL);
}
}  // namespace

}  // extern "C" linkage

namespace Halide {
namespace Runtime {
namespace Internal {
namespace Cuda {

WEAK const char *get_error_name(CUresult err) {
    switch (err) {
    case CUDA_SUCCESS:
        return "CUDA_SUCCESS";
    case CUDA_ERROR_INVALID_VALUE:
        return "CUDA_ERROR_INVALID_VALUE";
    case CUDA_ERROR_OUT_OF_MEMORY:
        return "CUDA_ERROR_OUT_OF_MEMORY";
    case CUDA_ERROR_NOT_INITIALIZED:
        return "CUDA_ERROR_NOT_INITIALIZED";
    case CUDA_ERROR_DEINITIALIZED:
        return "CUDA_ERROR_DEINITIALIZED";
    case CUDA_ERROR_PROFILER_DISABLED:
        return "CUDA_ERROR_PROFILER_DISABLED";
    case CUDA_ERROR_PROFILER_NOT_INITIALIZED:
        return "CUDA_ERROR_PROFILER_NOT_INITIALIZED";
    case CUDA_ERROR_PROFILER_ALREADY_STARTED:
        return "CUDA_ERROR_PROFILER_ALREADY_STARTED";
    case CUDA_ERROR_PROFILER_ALREADY_STOPPED:
        return "CUDA_ERROR_PROFILER_ALREADY_STOPPED";
    case CUDA_ERROR_NO_DEVICE:
        return "CUDA_ERROR_NO_DEVICE";
    case CUDA_ERROR_INVALID_DEVICE:
        return "CUDA_ERROR_INVALID_DEVICE";
    case CUDA_ERROR_INVALID_IMAGE:
        return "CUDA_ERROR_INVALID_IMAGE";
    case CUDA_ERROR_INVALID_CONTEXT:
        return "CUDA_ERROR_INVALID_CONTEXT";
    case CUDA_ERROR_CONTEXT_ALREADY_CURRENT:
        return "CUDA_ERROR_CONTEXT_ALREADY_CURRENT";
    case CUDA_ERROR_MAP_FAILED:
        return "CUDA_ERROR_MAP_FAILED";
    case CUDA_ERROR_UNMAP_FAILED:
        return "CUDA_ERROR_UNMAP_FAILED";
    case CUDA_ERROR_ARRAY_IS_MAPPED:
        return "CUDA_ERROR_ARRAY_IS_MAPPED";
    case CUDA_ERROR_ALREADY_MAPPED:
        return "CUDA_ERROR_ALREADY_MAPPED";
    case CUDA_ERROR_NO_BINARY_FOR_GPU:
        return "CUDA_ERROR_NO_BINARY_FOR_GPU";
    case CUDA_ERROR_ALREADY_ACQUIRED:
        return "CUDA_ERROR_ALREADY_ACQUIRED";
    case CUDA_ERROR_NOT_MAPPED:
        return "CUDA_ERROR_NOT_MAPPED";
    case CUDA_ERROR_NOT_MAPPED_AS_ARRAY:
        return "CUDA_ERROR_NOT_MAPPED_AS_ARRAY";
    case CUDA_ERROR_NOT_MAPPED_AS_POINTER:
        return "CUDA_ERROR_NOT_MAPPED_AS_POINTER";
    case CUDA_ERROR_ECC_UNCORRECTABLE:
        return "CUDA_ERROR_ECC_UNCORRECTABLE";
    case CUDA_ERROR_UNSUPPORTED_LIMIT:
        return "CUDA_ERROR_UNSUPPORTED_LIMIT";
    case CUDA_ERROR_CONTEXT_ALREADY_IN_USE:
        return "CUDA_ERROR_CONTEXT_ALREADY_IN_USE";
    case CUDA_ERROR_PEER_ACCESS_UNSUPPORTED:
        return "CUDA_ERROR_PEER_ACCESS_UNSUPPORTED";
    case CUDA_ERROR_INVALID_PTX:
        return "CUDA_ERROR_INVALID_PTX";
    case CUDA_ERROR_INVALID_GRAPHICS_CONTEXT:
        return "CUDA_ERROR_INVALID_GRAPHICS_CONTEXT";
    case CUDA_ERROR_NVLINK_UNCORRECTABLE:
        return "CUDA_ERROR_NVLINK_UNCORRECTABLE";
    case CUDA_ERROR_JIT_COMPILER_NOT_FOUND:
        return "CUDA_ERROR_JIT_COMPILER_NOT_FOUND";
    case CUDA_ERROR_INVALID_SOURCE:
        return "CUDA_ERROR_INVALID_SOURCE";
    case CUDA_ERROR_FILE_NOT_FOUND:
        return "CUDA_ERROR_FILE_NOT_FOUND";
    case CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND:
        return "CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND";
    case CUDA_ERROR_SHARED_OBJECT_INIT_FAILED:
        return "CUDA_ERROR_SHARED_OBJECT_INIT_FAILED";
    case CUDA_ERROR_OPERATING_SYSTEM:
        return "CUDA_ERROR_OPERATING_SYSTEM";
    case CUDA_ERROR_INVALID_HANDLE:
        return "CUDA_ERROR_INVALID_HANDLE";
    case CUDA_ERROR_NOT_FOUND:
        return "CUDA_ERROR_NOT_FOUND";
    case CUDA_ERROR_NOT_READY:
        return "CUDA_ERROR_NOT_READY";
    case CUDA_ERROR_ILLEGAL_ADDRESS:
        return "CUDA_ERROR_ILLEGAL_ADDRESS";
    case CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES:
        return "CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES";
    case CUDA_ERROR_LAUNCH_TIMEOUT:
        return "CUDA_ERROR_LAUNCH_TIMEOUT";
    case CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING:
        return "CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING";
    case CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED:
        return "CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED";
    case CUDA_ERROR_PEER_ACCESS_NOT_ENABLED:
        return "CUDA_ERROR_PEER_ACCESS_NOT_ENABLED";
    case CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE:
        return "CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE";
    case CUDA_ERROR_CONTEXT_IS_DESTROYED:
        return "CUDA_ERROR_CONTEXT_IS_DESTROYED";
    // A trap instruction produces the below error, which is how we codegen asserts on GPU
    case CUDA_ERROR_ILLEGAL_INSTRUCTION:
        return "Illegal instruction or Halide assertion failure inside kernel";
    case CUDA_ERROR_MISALIGNED_ADDRESS:
        return "CUDA_ERROR_MISALIGNED_ADDRESS";
    case CUDA_ERROR_INVALID_ADDRESS_SPACE:
        return "CUDA_ERROR_INVALID_ADDRESS_SPACE";
    case CUDA_ERROR_INVALID_PC:
        return "CUDA_ERROR_INVALID_PC";
    case CUDA_ERROR_LAUNCH_FAILED:
        return "CUDA_ERROR_LAUNCH_FAILED";
    case CUDA_ERROR_NOT_PERMITTED:
        return "CUDA_ERROR_NOT_PERMITTED";
    case CUDA_ERROR_NOT_SUPPORTED:
        return "CUDA_ERROR_NOT_SUPPORTED";
    case CUDA_ERROR_UNKNOWN:
        return "CUDA_ERROR_UNKNOWN";
    default:
        // This is unfortunate, as get_error_name is usually called in the
        // middle of an error print, but dropping the number on the floor
        // is worse.
        error(NULL) << "Unknown cuda error " << err << "\n";
        return "<Unknown error>";
    }
}

WEAK halide_device_interface_impl_t cuda_device_interface_impl = {
    halide_use_jit_module,
    halide_release_jit_module,
    halide_cuda_device_malloc,
    halide_cuda_device_free,
    halide_cuda_device_sync,
    halide_cuda_device_release,
    halide_cuda_copy_to_host,
    halide_cuda_copy_to_device,
    halide_cuda_device_and_host_malloc,
    halide_cuda_device_and_host_free,
    halide_cuda_buffer_copy,
    halide_cuda_device_crop,
    halide_cuda_device_slice,
    halide_cuda_device_release_crop,
    halide_cuda_wrap_device_ptr,
    halide_cuda_detach_device_ptr,
};

WEAK halide_device_interface_t cuda_device_interface = {
    halide_device_malloc,
    halide_device_free,
    halide_device_sync,
    halide_device_release,
    halide_copy_to_host,
    halide_copy_to_device,
    halide_device_and_host_malloc,
    halide_device_and_host_free,
    halide_buffer_copy,
    halide_device_crop,
    halide_device_slice,
    halide_device_release_crop,
    halide_device_wrap_native,
    halide_device_detach_native,
    halide_cuda_compute_capability,
    &cuda_device_interface_impl};

}  // namespace Cuda
}  // namespace Internal
}  // namespace Runtime
}  // namespace Halide