#include "HalideRuntimeCuda.h"
#include "device_buffer_utils.h"
#include "device_interface.h"
#include "mini_cuda.h"
#include "printer.h"
#include "scoped_mutex_lock.h"
#include "scoped_spin_lock.h"

namespace Halide {
namespace Runtime {
namespace Internal {
namespace Cuda {

// Define the function pointers for the CUDA API.
#define CUDA_FN(ret, fn, args) WEAK ret(CUDAAPI *fn) args;
#define CUDA_FN_OPTIONAL(ret, fn, args) WEAK ret(CUDAAPI *fn) args;
#define CUDA_FN_3020(ret, fn, fn_3020, args) WEAK ret(CUDAAPI *fn) args;
#define CUDA_FN_4000(ret, fn, fn_4000, args) WEAK ret(CUDAAPI *fn) args;
#include "cuda_functions.h"
#undef CUDA_FN
#undef CUDA_FN_OPTIONAL
#undef CUDA_FN_3020
#undef CUDA_FN_4000

// The default implementation of halide_cuda_get_symbol attempts to load
// the CUDA shared library/DLL, and then get the symbol from it.
WEAK void *lib_cuda = NULL;
volatile ScopedSpinLock::AtomicFlag WEAK lib_cuda_lock = 0;

extern "C" WEAK void *halide_cuda_get_symbol(void *user_context, const char *name) {
    // Only try to load the library if we can't already get the symbol:
    // even if lib_cuda is NULL, the symbol may already be available in
    // the process.
    void *symbol = halide_get_library_symbol(lib_cuda, name);
    if (symbol) {
        return symbol;
    }

    const char *lib_names[] = {
#ifdef WINDOWS
        "nvcuda.dll",
#else
        "libcuda.so",
        "libcuda.dylib",
        "/Library/Frameworks/CUDA.framework/CUDA",
#endif
    };
    for (size_t i = 0; i < sizeof(lib_names) / sizeof(lib_names[0]); i++) {
        lib_cuda = halide_load_library(lib_names[i]);
        if (lib_cuda) {
            debug(user_context) << "    Loaded CUDA runtime library: " << lib_names[i] << "\n";
            break;
        }
    }

    return halide_get_library_symbol(lib_cuda, name);
}
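
// A hedged sketch (not part of this runtime) of how an application could
// replace the weak definition above to resolve driver symbols from a library
// handle it already owns. "my_libcuda_handle" is a hypothetical application
// global; error handling is elided.
//
//   extern "C" void *halide_cuda_get_symbol(void *user_context, const char *name) {
//       extern void *my_libcuda_handle;  // hypothetical, opened by the application
//       return halide_get_library_symbol(my_libcuda_handle, name);
//   }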

template<typename T>
ALWAYS_INLINE T get_cuda_symbol(void *user_context, const char *name, bool optional = false) {
    T s = (T)halide_cuda_get_symbol(user_context, name);
    if (!optional && !s) {
        error(user_context) << "CUDA API not found: " << name << "\n";
    }
    return s;
}

// Load a CUDA shared object/dll and get the CUDA API function pointers from it.
WEAK void load_libcuda(void *user_context) {
    debug(user_context) << "    load_libcuda (user_context: " << user_context << ")\n";
    halide_assert(user_context, cuInit == NULL);

#define CUDA_FN(ret, fn, args) fn = get_cuda_symbol<ret(CUDAAPI *) args>(user_context, #fn);
#define CUDA_FN_OPTIONAL(ret, fn, args) fn = get_cuda_symbol<ret(CUDAAPI *) args>(user_context, #fn, true);
#define CUDA_FN_3020(ret, fn, fn_3020, args) fn = get_cuda_symbol<ret(CUDAAPI *) args>(user_context, #fn_3020);
#define CUDA_FN_4000(ret, fn, fn_4000, args) fn = get_cuda_symbol<ret(CUDAAPI *) args>(user_context, #fn_4000);
#include "cuda_functions.h"
#undef CUDA_FN
#undef CUDA_FN_OPTIONAL
#undef CUDA_FN_3020
#undef CUDA_FN_4000
}

// Call load_libcuda() if the CUDA library has not been loaded yet.
// This function is thread safe.
// Note that initialization might fail. The caller can detect such failure by checking whether cuInit is NULL.
WEAK void ensure_libcuda_init(void *user_context) {
    ScopedSpinLock spinlock(&lib_cuda_lock);
    if (!cuInit) {
        load_libcuda(user_context);
    }
}

extern WEAK halide_device_interface_t cuda_device_interface;

WEAK const char *get_error_name(CUresult error);
WEAK CUresult create_cuda_context(void *user_context, CUcontext *ctx);

// A cuda context defined in this module with weak linkage
CUcontext WEAK context = 0;
// This lock protects the above context variable.
WEAK halide_mutex context_lock;

// A free list, used when allocations are being cached.
WEAK struct FreeListItem {
    CUdeviceptr ptr;
    CUcontext ctx;
    CUstream stream;
    size_t size;
    FreeListItem *next;
} *free_list = 0;
WEAK halide_mutex free_list_lock;

}  // namespace Cuda
}  // namespace Internal
}  // namespace Runtime
}  // namespace Halide

using namespace Halide::Runtime::Internal;
using namespace Halide::Runtime::Internal::Cuda;

extern "C" {

// The default implementation of halide_cuda_acquire_context uses the global
// pointers above, and serializes access with a mutex.
// Overriding implementations of acquire/release must implement the following
// behavior:
// - halide_cuda_acquire_context should always store a valid context/command
//   queue in ctx/q, or return an error code.
// - A call to halide_cuda_acquire_context is followed by a matching call to
//   halide_cuda_release_context. halide_cuda_acquire_context should block while a
//   previous call (if any) has not yet been released via halide_cuda_release_context.
WEAK int halide_cuda_acquire_context(void *user_context, CUcontext *ctx, bool create = true) {
    // TODO: Should we use a more "assertive" assert? These asserts do
    // not block execution on failure.
    halide_assert(user_context, ctx != NULL);

    // If the context has not been initialized, initialize it now.
    halide_assert(user_context, &context != NULL);

    // Note that this null-check of the context is *not* locked with
    // respect to device_release, so we may get a non-null context
    // that's in the process of being destroyed. Things will go badly
    // in general if you call device_release while other Halide code
    // is running though.
    CUcontext local_val = context;
    if (local_val == NULL) {
        if (!create) {
            *ctx = NULL;
            return 0;
        }

        {
            ScopedMutexLock lock(&context_lock);
            local_val = context;
            if (local_val == NULL) {
                CUresult error = create_cuda_context(user_context, &local_val);
                if (error != CUDA_SUCCESS) {
                    return error;
                }
            }
            // Normally in double-checked locking you need a release
            // fence here that synchronizes with an acquire fence
            // above to ensure context is fully constructed before
            // assigning to the global, but there's no way that
            // create_cuda_context can access the "context" global, so
            // we should be OK just storing to it here.
            context = local_val;
        }  // lock
    }

    *ctx = local_val;
    return 0;
}

WEAK int halide_cuda_release_context(void *user_context) {
    return 0;
}

// Return the stream to use for executing kernels and synchronization. Only called
// for versions of cuda which support streams. Default is to use the main stream
// for the context (NULL stream). The context is passed in for convenience, but
// any sort of scoping must be handled by the
// halide_cuda_acquire_context/halide_cuda_release_context pair, not by this call.
WEAK int halide_cuda_get_stream(void *user_context, CUcontext ctx, CUstream *stream) {
    // There are two default streams we could use. stream 0 is fully
    // synchronous. stream 2 gives a separate non-blocking stream per
    // thread.
    *stream = 0;
    return 0;
}
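
// A hedged sketch of overriding the three hooks above (not part of this
// runtime): an application that owns its own CUcontext and CUstream can hand
// them to Halide by providing strong definitions of these weak symbols.
// "my_cuda_ctx" and "my_cuda_stream" are hypothetical application globals,
// assumed to be created before any Halide pipeline runs and kept alive after.
//
//   extern "C" int halide_cuda_acquire_context(void *user_context, CUcontext *ctx, bool create) {
//       *ctx = my_cuda_ctx;
//       return 0;
//   }
//   extern "C" int halide_cuda_release_context(void *user_context) {
//       return 0;  // nothing to release; the application owns the context
//   }
//   extern "C" int halide_cuda_get_stream(void *user_context, CUcontext ctx, CUstream *stream) {
//       *stream = my_cuda_stream;
//       return 0;
//   }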

}  // extern "C"

namespace Halide {
namespace Runtime {
namespace Internal {
namespace Cuda {

// Helper object to acquire and release the cuda context.
class Context {
    void *user_context;

public:
    CUcontext context;
    int error;

    // Constructor sets 'error' if any occurs.
    ALWAYS_INLINE Context(void *user_context)
        : user_context(user_context),
          context(NULL),
          error(CUDA_SUCCESS) {
#ifdef DEBUG_RUNTIME
        halide_start_clock(user_context);
#endif
        error = halide_cuda_acquire_context(user_context, &context);
        if (error != 0) {
            return;
        }

        // The default acquire_context loads libcuda as a
        // side-effect. However, if acquire_context has been
        // overridden, we may still need to load libcuda.
        ensure_libcuda_init(user_context);

        halide_assert(user_context, context != NULL);
        halide_assert(user_context, cuInit != NULL);

        error = cuCtxPushCurrent(context);
    }

    ALWAYS_INLINE ~Context() {
        if (error == 0) {
            CUcontext old;
            cuCtxPopCurrent(&old);
        }

        halide_cuda_release_context(user_context);
    }
};
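
// Typical usage of the RAII helper above, as it appears throughout this file:
//
//   Context ctx(user_context);
//   if (ctx.error != CUDA_SUCCESS) {
//       return ctx.error;  // could not acquire and push a context
//   }
//   // ... driver API calls here run with ctx.context current ...
//   // The destructor pops the context and calls halide_cuda_release_context.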

// Halide allocates a device API controlled pointer slot as part of
// each compiled module. The slot is used to store information to
// avoid having to reload/recompile kernel code on each call into a
// Halide filter. The cuda runtime uses this pointer to maintain a
// linked list of contexts into which the module has been loaded.
//
// A global list of all registered filters is also kept so all modules
// loaded on a given context can be unloaded and removed from the list
// when halide_device_release is called on a specific context.
//
// The registered_filters struct is not freed as it is pointed to by the
// Halide generated code. The module_state structs are freed.

struct module_state {
    CUcontext context;
    CUmodule module;
    module_state *next;
};

struct registered_filters {
    module_state *modules;
    registered_filters *next;
};
WEAK registered_filters *filters_list = NULL;
// This lock protects the above filters_list.
WEAK halide_mutex filters_list_lock;

WEAK module_state *find_module_for_context(const registered_filters *filters, CUcontext ctx) {
    module_state *modules = filters->modules;
    while (modules != NULL) {
        if (modules->context == ctx) {
            return modules;
        }
        modules = modules->next;
    }
    return NULL;
}

WEAK CUresult create_cuda_context(void *user_context, CUcontext *ctx) {
    // Initialize CUDA
    ensure_libcuda_init(user_context);
    if (!cuInit) {
        error(user_context) << "Could not find cuda system libraries";
        return CUDA_ERROR_FILE_NOT_FOUND;
    }

    CUresult err = cuInit(0);
    if (err != CUDA_SUCCESS) {
        error(user_context) << "CUDA: cuInit failed: "
                            << get_error_name(err);
        return err;
    }

    // Make sure we have a device
    int deviceCount = 0;
    err = cuDeviceGetCount(&deviceCount);
    if (err != CUDA_SUCCESS) {
        error(user_context) << "CUDA: cuDeviceGetCount failed: "
                            << get_error_name(err);
        return err;
    }

    if (deviceCount <= 0) {
        halide_error(user_context, "CUDA: No devices available");
        return CUDA_ERROR_NO_DEVICE;
    }

    int device = halide_get_gpu_device(user_context);
    if (device == -1 && deviceCount == 1) {
        device = 0;
    } else if (device == -1) {
        debug(user_context) << "CUDA: Multiple CUDA devices detected. Selecting the one with the most cores.\n";
        int best_core_count = 0;
        for (int i = 0; i < deviceCount; i++) {
            CUdevice dev;
            CUresult status = cuDeviceGet(&dev, i);
            if (status != CUDA_SUCCESS) {
                debug(user_context) << "      Failed to get device " << i << "\n";
                continue;
            }
            int core_count = 0;
            status = cuDeviceGetAttribute(&core_count, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev);
            debug(user_context) << "      Device " << i << " has " << core_count << " cores\n";
            if (status != CUDA_SUCCESS) {
                continue;
            }
            if (core_count >= best_core_count) {
                device = i;
                best_core_count = core_count;
            }
        }
    }

    // Get device
    CUdevice dev;
    CUresult status = cuDeviceGet(&dev, device);
    if (status != CUDA_SUCCESS) {
        halide_error(user_context, "CUDA: Failed to get device\n");
        return status;
    }

    debug(user_context) << "    Got device " << dev << "\n";

// Dump device attributes
#ifdef DEBUG_RUNTIME
    {
        char name[256];
        name[0] = 0;
        err = cuDeviceGetName(name, 256, dev);
        debug(user_context) << "      " << name << "\n";

        if (err != CUDA_SUCCESS) {
            error(user_context) << "CUDA: cuDeviceGetName failed: "
                                << get_error_name(err);
            return err;
        }

        size_t memory = 0;
        err = cuDeviceTotalMem(&memory, dev);
        debug(user_context) << "      total memory: " << (int)(memory >> 20) << " MB\n";

        if (err != CUDA_SUCCESS) {
            error(user_context) << "CUDA: cuDeviceTotalMem failed: "
                                << get_error_name(err);
            return err;
        }

        // Declare variables for other state we want to query.
        int max_threads_per_block = 0, warp_size = 0, num_cores = 0;
        int max_block_size[] = {0, 0, 0};
        int max_grid_size[] = {0, 0, 0};
        int max_shared_mem = 0, max_constant_mem = 0;
        int cc_major = 0, cc_minor = 0;

        struct {
            int *dst;
            CUdevice_attribute attr;
        } attrs[] = {
            {&max_threads_per_block, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK},
            {&warp_size, CU_DEVICE_ATTRIBUTE_WARP_SIZE},
            {&num_cores, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT},
            {&max_block_size[0], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X},
            {&max_block_size[1], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y},
            {&max_block_size[2], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z},
            {&max_grid_size[0], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X},
            {&max_grid_size[1], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y},
            {&max_grid_size[2], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z},
            {&max_shared_mem, CU_DEVICE_ATTRIBUTE_SHARED_MEMORY_PER_BLOCK},
            {&max_constant_mem, CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY},
            {&cc_major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR},
            {&cc_minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR},
            {NULL, CU_DEVICE_ATTRIBUTE_MAX}};

        // Do all the queries.
        for (int i = 0; attrs[i].dst; i++) {
            err = cuDeviceGetAttribute(attrs[i].dst, attrs[i].attr, dev);
            if (err != CUDA_SUCCESS) {
                error(user_context)
                    << "CUDA: cuDeviceGetAttribute failed ("
                    << get_error_name(err)
                    << ") for attribute " << (int)attrs[i].attr;
                return err;
            }
        }

        // Threads per core is a function of the compute capability.
        int threads_per_core;
        switch (cc_major) {
        case 1:
            threads_per_core = 8;
            break;
        case 2:
            threads_per_core = (cc_minor == 0 ? 32 : 48);
            break;
        case 3:
            threads_per_core = 192;
            break;
        case 5:
            threads_per_core = 128;
            break;
        case 6:
            threads_per_core = (cc_minor == 0 ? 64 : 128);
            break;
        case 7:
            threads_per_core = 64;
            break;
        default:
            threads_per_core = 0;
            break;
        }

        debug(user_context)
            << "      max threads per block: " << max_threads_per_block << "\n"
            << "      warp size: " << warp_size << "\n"
            << "      max block size: " << max_block_size[0]
            << " " << max_block_size[1] << " " << max_block_size[2] << "\n"
            << "      max grid size: " << max_grid_size[0]
            << " " << max_grid_size[1] << " " << max_grid_size[2] << "\n"
            << "      max shared memory per block: " << max_shared_mem << "\n"
            << "      total constant memory: " << max_constant_mem << "\n"
            << "      compute capability " << cc_major << "." << cc_minor << "\n"
            << "      cuda cores: " << num_cores << " x " << threads_per_core
            << " = " << num_cores * threads_per_core << "\n";
    }
#endif

    // Create context
    debug(user_context) << "    cuCtxCreate " << dev << " -> ";
    err = cuCtxCreate(ctx, 0, dev);
    if (err != CUDA_SUCCESS) {
        debug(user_context) << get_error_name(err) << "\n";
        error(user_context) << "CUDA: cuCtxCreate failed: "
                            << get_error_name(err);
        return err;
    } else {
        unsigned int version = 0;
        cuCtxGetApiVersion(*ctx, &version);
        debug(user_context) << *ctx << "(" << version << ")\n";
    }
    // Creation automatically pushes the context, but we'll pop to allow the caller
    // to decide when to push.
    CUcontext dummy;
    err = cuCtxPopCurrent(&dummy);
    if (err != CUDA_SUCCESS) {
        error(user_context) << "CUDA: cuCtxPopCurrent failed: "
                            << get_error_name(err);
        return err;
    }

    return CUDA_SUCCESS;
}

// This feature may be useful during CUDA backend or runtime
// development. It does not seem to find many errors in general Halide
// use and causes false positives in at least one environment, where
// it prevents using debug mode with cuda.
#define ENABLE_POINTER_VALIDATION 0

WEAK bool validate_device_pointer(void *user_context, halide_buffer_t *buf, size_t size = 0) {
// The technique using cuPointerGetAttribute and CU_POINTER_ATTRIBUTE_CONTEXT
// requires that unified virtual addressing be enabled, and that is not the case
// for 32-bit processes on Mac OS X. So for now, as a total hack, just return true
// in 32-bit. This could of course be wrong the other way for cards that only
// support 32-bit addressing in 64-bit processes, but I expect those cards do not
// support unified addressing at all.
// TODO: figure out a way to validate pointers in all cases if strictly necessary.
#if defined(BITS_32) || !ENABLE_POINTER_VALIDATION
    return true;
#else
    if (buf->device == 0)
        return true;

    CUdeviceptr dev_ptr = (CUdeviceptr)buf->device;

    CUcontext ctx;
    CUresult result = cuPointerGetAttribute(&ctx, CU_POINTER_ATTRIBUTE_CONTEXT, dev_ptr);
    if (result != CUDA_SUCCESS) {
        error(user_context) << "Bad device pointer " << (void *)dev_ptr
                            << ": cuPointerGetAttribute returned "
                            << get_error_name(result);
        return false;
    }
    return true;
#endif
}

}  // namespace Cuda
}  // namespace Internal
}  // namespace Runtime
}  // namespace Halide

extern "C" {
WEAK int halide_cuda_initialize_kernels(void *user_context, void **state_ptr, const char *ptx_src, int size) {
    debug(user_context) << "CUDA: halide_cuda_initialize_kernels (user_context: " << user_context
                        << ", state_ptr: " << state_ptr
                        << ", ptx_src: " << (void *)ptx_src
                        << ", size: " << size << ")\n";

    Context ctx(user_context);
    if (ctx.error != 0) {
        return ctx.error;
    }

#ifdef DEBUG_RUNTIME
    uint64_t t_before = halide_current_time_ns(user_context);
#endif

    halide_assert(user_context, &filters_list_lock != NULL);
    {
        ScopedMutexLock lock(&filters_list_lock);

        // Create the state object if necessary. This only happens once, regardless
        // of how many times halide_initialize_kernels/halide_release is called.
        // halide_release traverses this list and releases the module objects, but
        // it does not modify the list nodes created/inserted here.
        registered_filters **filters = (registered_filters **)state_ptr;
        if (!(*filters)) {
            *filters = (registered_filters *)malloc(sizeof(registered_filters));
            (*filters)->modules = NULL;
            (*filters)->next = filters_list;
            filters_list = *filters;
        }

        // Create the module itself if necessary.
        module_state *loaded_module = find_module_for_context(*filters, ctx.context);
        if (loaded_module == NULL) {
            loaded_module = (module_state *)malloc(sizeof(module_state));
            debug(user_context) << "    cuModuleLoadData " << (void *)ptx_src << ", " << size << " -> ";

            CUjit_option options[] = {CU_JIT_MAX_REGISTERS};
            unsigned int max_regs_per_thread = 64;

            // A hack to enable control over max register count for
            // testing. This should be surfaced in the schedule somehow
            // instead.
            char *regs = getenv("HL_CUDA_JIT_MAX_REGISTERS");
            if (regs) {
                max_regs_per_thread = atoi(regs);
            }
            void *optionValues[] = {(void *)(uintptr_t)max_regs_per_thread};
            CUresult err = cuModuleLoadDataEx(&loaded_module->module, ptx_src, 1, options, optionValues);

            if (err != CUDA_SUCCESS) {
                free(loaded_module);
                error(user_context) << "CUDA: cuModuleLoadData failed: "
                                    << get_error_name(err);
                return err;
            } else {
                debug(user_context) << (void *)(loaded_module->module) << "\n";
            }
            loaded_module->context = ctx.context;
            loaded_module->next = (*filters)->modules;
            (*filters)->modules = loaded_module;
        }
    }  // lock

#ifdef DEBUG_RUNTIME
    uint64_t t_after = halide_current_time_ns(user_context);
    debug(user_context) << "    Time: " << (t_after - t_before) / 1.0e6 << " ms\n";
#endif

    return 0;
}

WEAK int halide_cuda_release_unused_device_allocations(void *user_context) {
    FreeListItem *to_free;
    {
        ScopedMutexLock lock(&free_list_lock);
        to_free = free_list;
        free_list = NULL;
    }
    while (to_free) {
        debug(user_context) << "    cuMemFree " << (void *)(to_free->ptr) << "\n";
        cuMemFree(to_free->ptr);
        FreeListItem *next = to_free->next;
        free(to_free);
        to_free = next;
    }
    return 0;
}
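
// The function above is registered below as the "release_unused" hook of a
// halide_device_allocation_pool, so the core runtime can ask this backend to
// drop its cached allocations (halide_cuda_device_malloc also retries a failed
// cuMemAlloc after calling it). It can also be called directly by user code to
// return the cached memory to the driver.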

namespace Halide {
namespace Runtime {
namespace Internal {

WEAK halide_device_allocation_pool cuda_allocation_pool;

WEAK __attribute__((constructor)) void register_cuda_allocation_pool() {
    cuda_allocation_pool.release_unused = &halide_cuda_release_unused_device_allocations;
    halide_register_device_allocation_pool(&cuda_allocation_pool);
}

ALWAYS_INLINE uint64_t quantize_allocation_size(uint64_t sz) {
    int z = __builtin_clzll(sz);
    if (z < 60) {
        sz--;
        sz = sz >> (60 - z);
        sz++;
        sz = sz << (60 - z);
    }
    return sz;
}
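
// Worked example of the rounding above: quantize_allocation_size keeps only
// the four most significant bits of the request, rounding up. For sz = 100000
// the highest set bit is bit 16, so the granularity is 2^13 = 8192 and the
// result is 13 * 8192 = 106496. The padding is always less than 1/8 of the
// request, and averages out to the ~4% mentioned where allocations are made.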

}  // namespace Internal
}  // namespace Runtime
}  // namespace Halide

WEAK int halide_cuda_device_free(void *user_context, halide_buffer_t *buf) {
    // halide_device_free, at present, can be exposed to clients and they
    // should be allowed to call halide_device_free on any halide_buffer_t
    // including ones that have never been used with a GPU.
    if (buf->device == 0) {
        return 0;
    }

    CUdeviceptr dev_ptr = (CUdeviceptr)buf->device;

    debug(user_context)
        << "CUDA: halide_cuda_device_free (user_context: " << user_context
        << ", buf: " << buf << ")\n";

    Context ctx(user_context);
    if (ctx.error != CUDA_SUCCESS) {
        return ctx.error;
    }

#ifdef DEBUG_RUNTIME
    uint64_t t_before = halide_current_time_ns(user_context);
#endif

    halide_assert(user_context, validate_device_pointer(user_context, buf));

    CUresult err = CUDA_SUCCESS;
    if (halide_can_reuse_device_allocations(user_context)) {
        debug(user_context) << "    caching allocation for later use: " << (void *)(dev_ptr) << "\n";

        FreeListItem *item = (FreeListItem *)malloc(sizeof(FreeListItem));
        item->ctx = ctx.context;
        item->size = quantize_allocation_size(buf->size_in_bytes());
        item->ptr = dev_ptr;

        if (cuStreamSynchronize) {
            // We don't want to use a buffer freed on one stream on
            // another, as there are no synchronization guarantees and
            // everything is async.
            int result = halide_cuda_get_stream(user_context, ctx.context, &item->stream);
            if (result != 0) {
                error(user_context) << "CUDA: In halide_cuda_device_free, halide_cuda_get_stream returned " << result << "\n";
            }
        } else {
            item->stream = NULL;
        }

        {
            ScopedMutexLock lock(&free_list_lock);
            item->next = free_list;
            free_list = item;
        }
    } else {
        debug(user_context) << "    cuMemFree " << (void *)(dev_ptr) << "\n";
        err = cuMemFree(dev_ptr);
        // If cuMemFree fails, it isn't likely to succeed later, so just drop
        // the reference.
    }
    buf->device_interface->impl->release_module();
    buf->device_interface = NULL;
    buf->device = 0;
    if (err != CUDA_SUCCESS) {
        // We may be called as a destructor, so don't raise an error here.
        return err;
    }

#ifdef DEBUG_RUNTIME
    uint64_t t_after = halide_current_time_ns(user_context);
    debug(user_context) << "    Time: " << (t_after - t_before) / 1.0e6 << " ms\n";
#endif

    return 0;
}

WEAK int halide_cuda_device_release(void *user_context) {
    debug(user_context)
        << "CUDA: halide_cuda_device_release (user_context: " << user_context << ")\n";

    // If we haven't even loaded libcuda, don't load it just to quit.
    if (!lib_cuda) {
        return 0;
    }

    int err;
    CUcontext ctx;
    err = halide_cuda_acquire_context(user_context, &ctx, false);
    if (err != CUDA_SUCCESS) {
        return err;
    }

    if (ctx) {
        // It's possible that this is being called from the destructor of
        // a static variable, in which case the driver may already be
        // shutting down.
        err = cuCtxPushCurrent(ctx);
        if (err != CUDA_SUCCESS) {
            err = cuCtxSynchronize();
        }
        halide_assert(user_context, err == CUDA_SUCCESS || err == CUDA_ERROR_DEINITIALIZED);

        // Dump the contents of the free list, ignoring errors.
        halide_cuda_release_unused_device_allocations(user_context);

        {
            ScopedMutexLock lock(&filters_list_lock);

            // Unload the modules attached to this context. Note that the list
            // nodes themselves are not freed, only the module objects are
            // released. Subsequent calls to halide_init_kernels might re-create
            // the program object using the same list node to store the module
            // object.
            registered_filters *filters = filters_list;
            while (filters) {
                module_state **prev_ptr = &filters->modules;
                module_state *loaded_module = filters->modules;
                while (loaded_module != NULL) {
                    if (loaded_module->context == ctx) {
                        debug(user_context) << "    cuModuleUnload " << loaded_module->module << "\n";
                        err = cuModuleUnload(loaded_module->module);
                        halide_assert(user_context, err == CUDA_SUCCESS || err == CUDA_ERROR_DEINITIALIZED);
                        *prev_ptr = loaded_module->next;
                        free(loaded_module);
                        loaded_module = *prev_ptr;
                    } else {
                        // Take the address of this node's next field before
                        // advancing, so an unlink on the next iteration patches
                        // the correct pointer.
                        prev_ptr = &loaded_module->next;
                        loaded_module = loaded_module->next;
                    }
                }
                filters = filters->next;
            }
        }  // lock

        CUcontext old_ctx;
        cuCtxPopCurrent(&old_ctx);

        // Only destroy the context if we own it.

        {
            ScopedMutexLock lock(&context_lock);

            if (ctx == context) {
                debug(user_context) << "    cuCtxDestroy " << context << "\n";
                err = cuProfilerStop();
                err = cuCtxDestroy(context);
                halide_assert(user_context, err == CUDA_SUCCESS || err == CUDA_ERROR_DEINITIALIZED);
                context = NULL;
            }
        }  // lock
    }

    halide_cuda_release_context(user_context);

    return 0;
}

WEAK int halide_cuda_device_malloc(void *user_context, halide_buffer_t *buf) {
    debug(user_context)
        << "CUDA: halide_cuda_device_malloc (user_context: " << user_context
        << ", buf: " << buf << ")\n";

    Context ctx(user_context);
    if (ctx.error != CUDA_SUCCESS) {
        return ctx.error;
    }

    size_t size = buf->size_in_bytes();
    if (halide_can_reuse_device_allocations(user_context)) {
        size = quantize_allocation_size(size);
    }
    halide_assert(user_context, size != 0);
    if (buf->device) {
        // This buffer already has a device allocation
        halide_assert(user_context, validate_device_pointer(user_context, buf, size));
        return 0;
    }

    // Check all strides positive.
    for (int i = 0; i < buf->dimensions; i++) {
        halide_assert(user_context, buf->dim[i].stride >= 0);
    }

    debug(user_context) << "    allocating " << *buf << "\n";

#ifdef DEBUG_RUNTIME
    uint64_t t_before = halide_current_time_ns(user_context);
#endif

    CUdeviceptr p = 0;
    FreeListItem *to_free = NULL;
    if (halide_can_reuse_device_allocations(user_context)) {
        CUstream stream = NULL;
        if (cuStreamSynchronize != NULL) {
            int result = halide_cuda_get_stream(user_context, ctx.context, &stream);
            if (result != 0) {
                error(user_context) << "CUDA: In halide_cuda_device_malloc, halide_cuda_get_stream returned " << result << "\n";
            }
        }

        ScopedMutexLock lock(&free_list_lock);
        // Best-fit allocation. There are three tunable constants
        // here. A bucket is claimed if the size requested is at least
        // 7/8 of the size of the bucket. We keep at most 32 unused
        // allocations. We round up each allocation size to its top 4
        // most significant bits (see quantize_allocation_size).
        FreeListItem *best = NULL, *item = free_list;
        FreeListItem **best_prev = NULL, **prev_ptr = &free_list;
        int depth = 0;
        while (item) {
            if ((size <= item->size) &&                           // Fits
                (size >= (item->size / 8) * 7) &&                 // Not too much slop
                (ctx.context == item->ctx) &&                     // Same cuda context
                (stream == item->stream) &&                       // Can only safely re-use on the same stream on which it was freed
                ((best == NULL) || (best->size > item->size))) {  // Better than previous best fit
                best = item;
                best_prev = prev_ptr;
                prev_ptr = &item->next;
                item = item->next;
            } else if (depth > 32) {
                // Allocations after here have not been used for a
                // long time. Just detach the rest of the free list
                // and defer the actual cuMemFree calls until after we
                // release the free_list_lock.
                to_free = item;
                *prev_ptr = NULL;
                item = NULL;
                break;
            } else {
                prev_ptr = &item->next;
                item = item->next;
            }
            depth++;
        }

        if (best) {
            p = best->ptr;
            *best_prev = best->next;
            free(best);
        }
    }

    while (to_free) {
        FreeListItem *next = to_free->next;
        cuMemFree(to_free->ptr);
        free(to_free);
        to_free = next;
    }

    if (!p) {
        debug(user_context) << "    cuMemAlloc " << (uint64_t)size << " -> ";

        // Quantize all allocation sizes to the top 4 bits, to make
        // reuse likelier. Wastes on average 4% memory per allocation.

        CUresult err = cuMemAlloc(&p, size);
        if (err == CUDA_ERROR_OUT_OF_MEMORY) {
            halide_cuda_release_unused_device_allocations(user_context);
            err = cuMemAlloc(&p, size);
        }
        if (err != CUDA_SUCCESS) {
            debug(user_context) << get_error_name(err) << "\n";
            error(user_context) << "CUDA: cuMemAlloc failed: "
                                << get_error_name(err);
            return err;
        } else {
            debug(user_context) << (void *)p << "\n";
        }
    }
    halide_assert(user_context, p);
    buf->device = p;
    buf->device_interface = &cuda_device_interface;
    buf->device_interface->impl->use_module();

#ifdef DEBUG_RUNTIME
    uint64_t t_after = halide_current_time_ns(user_context);
    debug(user_context) << "    Time: " << (t_after - t_before) / 1.0e6 << " ms\n";
#endif

    return 0;
}

namespace {
WEAK int cuda_do_multidimensional_copy(void *user_context, const device_copy &c,
                                       uint64_t src, uint64_t dst, int d, bool from_host, bool to_host) {
    if (d > MAX_COPY_DIMS) {
        error(user_context) << "Buffer has too many dimensions to copy to/from GPU\n";
        return -1;
    } else if (d == 0) {
        CUresult err = CUDA_SUCCESS;
        // Names the copy routine used, for the error message below.
        const char *copy_name = "memcpy";
        debug(user_context) << "    from " << (from_host ? "host" : "device")
                            << " to " << (to_host ? "host" : "device") << ", "
                            << (void *)src << " -> " << (void *)dst << ", " << c.chunk_size << " bytes\n";
        if (!from_host && to_host) {
            debug(user_context) << "cuMemcpyDtoH(" << (void *)dst << ", " << (void *)src << ", " << c.chunk_size << ")\n";
            copy_name = "cuMemcpyDtoH";
            err = cuMemcpyDtoH((void *)dst, (CUdeviceptr)src, c.chunk_size);
        } else if (from_host && !to_host) {
            debug(user_context) << "cuMemcpyHtoD(" << (void *)dst << ", " << (void *)src << ", " << c.chunk_size << ")\n";
            copy_name = "cuMemcpyHtoD";
            err = cuMemcpyHtoD((CUdeviceptr)dst, (void *)src, c.chunk_size);
        } else if (!from_host && !to_host) {
            debug(user_context) << "cuMemcpyDtoD(" << (void *)dst << ", " << (void *)src << ", " << c.chunk_size << ")\n";
            copy_name = "cuMemcpyDtoD";
            err = cuMemcpyDtoD((CUdeviceptr)dst, (CUdeviceptr)src, c.chunk_size);
        } else if (dst != src) {
            debug(user_context) << "memcpy(" << (void *)dst << ", " << (void *)src << ", " << c.chunk_size << ")\n";
            // Could reach here if a user called directly into the
            // cuda API for a device->host copy on a source buffer
            // with device_dirty = false.
            memcpy((void *)dst, (void *)src, c.chunk_size);
        }
        if (err != CUDA_SUCCESS) {
            error(user_context) << "CUDA: " << copy_name << " failed: " << get_error_name(err);
            return (int)err;
        }
    } else {
        ssize_t src_off = 0, dst_off = 0;
        for (int i = 0; i < (int)c.extent[d - 1]; i++) {
            int err = cuda_do_multidimensional_copy(user_context, c, src + src_off, dst + dst_off, d - 1, from_host, to_host);
            dst_off += c.dst_stride_bytes[d - 1];
            src_off += c.src_stride_bytes[d - 1];
            if (err) {
                return err;
            }
        }
    }
    return 0;
}
}  // namespace

WEAK int halide_cuda_buffer_copy(void *user_context, struct halide_buffer_t *src,
                                 const struct halide_device_interface_t *dst_device_interface,
                                 struct halide_buffer_t *dst) {
    // We only handle copies to cuda or to host
    halide_assert(user_context, dst_device_interface == NULL ||
                                    dst_device_interface == &cuda_device_interface);

    if ((src->device_dirty() || src->host == NULL) &&
        src->device_interface != &cuda_device_interface) {
        halide_assert(user_context, dst_device_interface == &cuda_device_interface);
        // This is handled at the higher level.
        return halide_error_code_incompatible_device_interface;
    }

    bool from_host = (src->device_interface != &cuda_device_interface) ||
                     (src->device == 0) ||
                     (src->host_dirty() && src->host != NULL);
    bool to_host = !dst_device_interface;

    halide_assert(user_context, from_host || src->device);
    halide_assert(user_context, to_host || dst->device);

    device_copy c = make_buffer_copy(src, from_host, dst, to_host);

    int err = 0;
    {
        Context ctx(user_context);
        if (ctx.error != CUDA_SUCCESS) {
            return ctx.error;
        }

        debug(user_context)
            << "CUDA: halide_cuda_buffer_copy (user_context: " << user_context
            << ", src: " << src << ", dst: " << dst << ")\n";

#ifdef DEBUG_RUNTIME
        uint64_t t_before = halide_current_time_ns(user_context);
        if (!from_host) {
            halide_assert(user_context, validate_device_pointer(user_context, src));
        }
        if (!to_host) {
            halide_assert(user_context, validate_device_pointer(user_context, dst));
        }
#endif

        err = cuda_do_multidimensional_copy(user_context, c, c.src + c.src_begin, c.dst, dst->dimensions, from_host, to_host);

#ifdef DEBUG_RUNTIME
        uint64_t t_after = halide_current_time_ns(user_context);
        debug(user_context) << "    Time: " << (t_after - t_before) / 1.0e6 << " ms\n";
#endif
    }

    return err;
}

namespace {

WEAK int cuda_device_crop_from_offset(const struct halide_buffer_t *src,
                                      int64_t offset,
                                      struct halide_buffer_t *dst) {
    dst->device = src->device + offset;
    dst->device_interface = src->device_interface;
    dst->set_device_dirty(src->device_dirty());
    return 0;
}
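
// The offset passed to the helper above comes from calc_device_crop_byte_offset
// or calc_device_slice_byte_offset in device_buffer_utils.h: the byte distance
// from the parent buffer's min coordinate to the crop's min coordinate. Since
// CUDA device pointers are plain addresses, a crop can alias directly into the
// parent allocation, which is why halide_cuda_device_release_crop below has
// nothing to free.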

}  // namespace

WEAK int halide_cuda_device_crop(void *user_context, const struct halide_buffer_t *src,
                                 struct halide_buffer_t *dst) {
    debug(user_context)
        << "CUDA: halide_cuda_device_crop (user_context: " << user_context
        << ", src: " << src << ", dst: " << dst << ")\n";

    // Pointer arithmetic works fine.
    const int64_t offset = calc_device_crop_byte_offset(src, dst);
    return cuda_device_crop_from_offset(src, offset, dst);
}

WEAK int halide_cuda_device_slice(void *user_context, const struct halide_buffer_t *src,
                                  int slice_dim, int slice_pos,
                                  struct halide_buffer_t *dst) {
    debug(user_context)
        << "CUDA: halide_cuda_device_slice (user_context: " << user_context
        << ", src: " << src << ", slice_dim " << slice_dim << ", slice_pos "
        << slice_pos << ", dst: " << dst << ")\n";

    // Pointer arithmetic works fine.
    const int64_t offset = calc_device_slice_byte_offset(src, slice_dim, slice_pos);
    return cuda_device_crop_from_offset(src, offset, dst);
}

WEAK int halide_cuda_device_release_crop(void *user_context, struct halide_buffer_t *dst) {
    debug(user_context)
        << "CUDA: halide_cuda_release_crop (user_context: " << user_context
        << ", dst: " << dst << ")\n";
    return 0;
}

WEAK int halide_cuda_copy_to_device(void *user_context, halide_buffer_t *buf) {
    return halide_cuda_buffer_copy(user_context, buf, &cuda_device_interface, buf);
}

WEAK int halide_cuda_copy_to_host(void *user_context, halide_buffer_t *buf) {
    return halide_cuda_buffer_copy(user_context, buf, NULL, buf);
}

// Used to generate correct timings when tracing
WEAK int halide_cuda_device_sync(void *user_context, struct halide_buffer_t *) {
    debug(user_context)
        << "CUDA: halide_cuda_device_sync (user_context: " << user_context << ")\n";

    Context ctx(user_context);
    if (ctx.error != CUDA_SUCCESS) {
        return ctx.error;
    }

#ifdef DEBUG_RUNTIME
    uint64_t t_before = halide_current_time_ns(user_context);
#endif

    CUresult err;
    if (cuStreamSynchronize != NULL) {
        CUstream stream;
        int result = halide_cuda_get_stream(user_context, ctx.context, &stream);
        if (result != 0) {
            error(user_context) << "CUDA: In halide_cuda_device_sync, halide_cuda_get_stream returned " << result << "\n";
        }
        err = cuStreamSynchronize(stream);
    } else {
        err = cuCtxSynchronize();
    }
    if (err != CUDA_SUCCESS) {
        error(user_context) << "CUDA: cuCtxSynchronize failed: "
                            << get_error_name(err);
        return err;
    }

#ifdef DEBUG_RUNTIME
    uint64_t t_after = halide_current_time_ns(user_context);
    debug(user_context) << "    Time: " << (t_after - t_before) / 1.0e6 << " ms\n";
#endif

    return 0;
}

WEAK int halide_cuda_run(void *user_context,
                         void *state_ptr,
                         const char *entry_name,
                         int blocksX, int blocksY, int blocksZ,
                         int threadsX, int threadsY, int threadsZ,
                         int shared_mem_bytes,
                         size_t arg_sizes[],
                         void *args[],
                         int8_t arg_is_buffer[],
                         int num_attributes,
                         float *vertex_buffer,
                         int num_coords_dim0,
                         int num_coords_dim1) {

    debug(user_context) << "CUDA: halide_cuda_run ("
                        << "user_context: " << user_context << ", "
                        << "entry: " << entry_name << ", "
                        << "blocks: " << blocksX << "x" << blocksY << "x" << blocksZ << ", "
                        << "threads: " << threadsX << "x" << threadsY << "x" << threadsZ << ", "
                        << "shmem: " << shared_mem_bytes << ")\n";

    CUresult err;
    Context ctx(user_context);
    if (ctx.error != CUDA_SUCCESS) {
        return ctx.error;
    }

    debug(user_context) << "Got context.\n";

#ifdef DEBUG_RUNTIME
    uint64_t t_before = halide_current_time_ns(user_context);
#endif

    halide_assert(user_context, state_ptr);
    module_state *loaded_module = find_module_for_context((registered_filters *)state_ptr, ctx.context);
    halide_assert(user_context, loaded_module != NULL);
    CUmodule mod = loaded_module->module;
    debug(user_context) << "Got module " << mod << "\n";
    halide_assert(user_context, mod);
    CUfunction f;
    err = cuModuleGetFunction(&f, mod, entry_name);
    debug(user_context) << "Got function " << f << "\n";
    if (err != CUDA_SUCCESS) {
        error(user_context) << "CUDA: cuModuleGetFunction failed: "
                            << get_error_name(err);
        return err;
    }

    size_t num_args = 0;
    while (arg_sizes[num_args] != 0) {
        debug(user_context) << "    halide_cuda_run " << (int)num_args
                            << " " << (int)arg_sizes[num_args]
                            << " [" << (*((void **)args[num_args])) << " ...] "
                            << arg_is_buffer[num_args] << "\n";
        num_args++;
    }

    // We need storage for both the arg and the pointer to it if it
    // has to be translated.
    void **translated_args = (void **)malloc((num_args + 1) * sizeof(void *));
    uint64_t *dev_handles = (uint64_t *)malloc(num_args * sizeof(uint64_t));
    for (size_t i = 0; i <= num_args; i++) {  // Get NULL at end.
        if (arg_is_buffer[i]) {
            halide_assert(user_context, arg_sizes[i] == sizeof(uint64_t));
            dev_handles[i] = ((halide_buffer_t *)args[i])->device;
            translated_args[i] = &(dev_handles[i]);
            debug(user_context) << "    halide_cuda_run translated arg" << (int)i
                                << " [" << (*((void **)translated_args[i])) << " ...]\n";
        } else {
            translated_args[i] = args[i];
        }
    }

    CUstream stream = NULL;
    // We use whether this routine was defined in the cuda driver library
    // as a test for streams support in the cuda implementation.
    if (cuStreamSynchronize != NULL) {
        int result = halide_cuda_get_stream(user_context, ctx.context, &stream);
        if (result != 0) {
            error(user_context) << "CUDA: In halide_cuda_run, halide_cuda_get_stream returned " << result << "\n";
            free(dev_handles);
            free(translated_args);
            return result;
        }
    }

    err = cuLaunchKernel(f,
                         blocksX, blocksY, blocksZ,
                         threadsX, threadsY, threadsZ,
                         shared_mem_bytes,
                         stream,
                         translated_args,
                         NULL);
    free(dev_handles);
    free(translated_args);
    if (err != CUDA_SUCCESS) {
        error(user_context) << "CUDA: cuLaunchKernel failed: "
                            << get_error_name(err);
        return err;
    }

#ifdef DEBUG_RUNTIME
    err = cuCtxSynchronize();
    if (err != CUDA_SUCCESS) {
        error(user_context) << "CUDA: cuCtxSynchronize failed: "
                            << get_error_name(err);
        return err;
    }
    uint64_t t_after = halide_current_time_ns(user_context);
    debug(user_context) << "    Time: " << (t_after - t_before) / 1.0e6 << " ms\n";
#endif
    return 0;
}

WEAK int halide_cuda_device_and_host_malloc(void *user_context, struct halide_buffer_t *buf) {
    return halide_default_device_and_host_malloc(user_context, buf, &cuda_device_interface);
}

WEAK int halide_cuda_device_and_host_free(void *user_context, struct halide_buffer_t *buf) {
    return halide_default_device_and_host_free(user_context, buf, &cuda_device_interface);
}

WEAK int halide_cuda_wrap_device_ptr(void *user_context, struct halide_buffer_t *buf, uint64_t device_ptr) {
    halide_assert(user_context, buf->device == 0);
    if (buf->device != 0) {
        return -2;
    }
    buf->device = device_ptr;
    buf->device_interface = &cuda_device_interface;
    buf->device_interface->impl->use_module();
#ifdef DEBUG_RUNTIME
    if (!validate_device_pointer(user_context, buf)) {
        buf->device_interface->impl->release_module();
        buf->device = 0;
        buf->device_interface = NULL;
        return -3;
    }
#endif
    return 0;
}
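
// A hedged usage sketch (not from this file): wrapping memory allocated
// directly with the driver API so a Halide pipeline can use it. Error
// handling is elided, and "W" and "H" are hypothetical extents; the
// halide_buffer_t is assumed to already describe a W x H float image.
//
//   CUdeviceptr raw = 0;
//   cuMemAlloc(&raw, W * H * sizeof(float));
//   halide_cuda_wrap_device_ptr(nullptr, &buf, (uint64_t)raw);
//   // ... run pipelines against buf ...
//   halide_cuda_detach_device_ptr(nullptr, &buf);
//   cuMemFree(raw);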

WEAK int halide_cuda_detach_device_ptr(void *user_context, struct halide_buffer_t *buf) {
    if (buf->device == 0) {
        return 0;
    }
    halide_assert(user_context, buf->device_interface == &cuda_device_interface);
    buf->device_interface->impl->release_module();
    buf->device = 0;
    buf->device_interface = NULL;
    return 0;
}

WEAK uintptr_t halide_cuda_get_device_ptr(void *user_context, struct halide_buffer_t *buf) {
    if (buf->device == 0) {
        return 0;
    }
    halide_assert(user_context, buf->device_interface == &cuda_device_interface);
    return (uintptr_t)buf->device;
}

WEAK const halide_device_interface_t *halide_cuda_device_interface() {
    return &cuda_device_interface;
}

WEAK int halide_cuda_compute_capability(void *user_context, int *major, int *minor) {
    if (!lib_cuda) {
        // If cuda can't be found, we want to return 0, 0 and it's not
        // considered an error. So we should be very careful about
        // looking for libcuda without tripping any errors in the rest
        // of this runtime.
        void *sym = halide_cuda_get_symbol(user_context, "cuInit");
        if (!sym) {
            *major = *minor = 0;
            return 0;
        }
    }

    {
        Context ctx(user_context);
        if (ctx.error != 0) {
            return ctx.error;
        }

        CUresult err;

        CUdevice dev;
        err = cuCtxGetDevice(&dev);
        if (err != CUDA_SUCCESS) {
            error(user_context)
                << "CUDA: cuCtxGetDevice failed ("
                << Halide::Runtime::Internal::Cuda::get_error_name(err)
                << ")";
            return err;
        }

        err = cuDeviceGetAttribute(major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, dev);
        if (err == CUDA_SUCCESS) {
            err = cuDeviceGetAttribute(minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, dev);
        }

        if (err != CUDA_SUCCESS) {
            error(user_context)
                << "CUDA: cuDeviceGetAttribute failed ("
                << Halide::Runtime::Internal::Cuda::get_error_name(err)
                << ")";
            return err;
        }
    }

    return 0;
}

namespace {
WEAK __attribute__((destructor)) void halide_cuda_cleanup() {
    halide_cuda_device_release(NULL);
}
}  // namespace

}  // extern "C" linkage

namespace Halide {
namespace Runtime {
namespace Internal {
namespace Cuda {

WEAK const char *get_error_name(CUresult err) {
    switch (err) {
    case CUDA_SUCCESS:
        return "CUDA_SUCCESS";
    case CUDA_ERROR_INVALID_VALUE:
        return "CUDA_ERROR_INVALID_VALUE";
    case CUDA_ERROR_OUT_OF_MEMORY:
        return "CUDA_ERROR_OUT_OF_MEMORY";
    case CUDA_ERROR_NOT_INITIALIZED:
        return "CUDA_ERROR_NOT_INITIALIZED";
    case CUDA_ERROR_DEINITIALIZED:
        return "CUDA_ERROR_DEINITIALIZED";
    case CUDA_ERROR_PROFILER_DISABLED:
        return "CUDA_ERROR_PROFILER_DISABLED";
    case CUDA_ERROR_PROFILER_NOT_INITIALIZED:
        return "CUDA_ERROR_PROFILER_NOT_INITIALIZED";
    case CUDA_ERROR_PROFILER_ALREADY_STARTED:
        return "CUDA_ERROR_PROFILER_ALREADY_STARTED";
    case CUDA_ERROR_PROFILER_ALREADY_STOPPED:
        return "CUDA_ERROR_PROFILER_ALREADY_STOPPED";
    case CUDA_ERROR_NO_DEVICE:
        return "CUDA_ERROR_NO_DEVICE";
    case CUDA_ERROR_INVALID_DEVICE:
        return "CUDA_ERROR_INVALID_DEVICE";
    case CUDA_ERROR_INVALID_IMAGE:
        return "CUDA_ERROR_INVALID_IMAGE";
    case CUDA_ERROR_INVALID_CONTEXT:
        return "CUDA_ERROR_INVALID_CONTEXT";
    case CUDA_ERROR_CONTEXT_ALREADY_CURRENT:
        return "CUDA_ERROR_CONTEXT_ALREADY_CURRENT";
    case CUDA_ERROR_MAP_FAILED:
        return "CUDA_ERROR_MAP_FAILED";
    case CUDA_ERROR_UNMAP_FAILED:
        return "CUDA_ERROR_UNMAP_FAILED";
    case CUDA_ERROR_ARRAY_IS_MAPPED:
        return "CUDA_ERROR_ARRAY_IS_MAPPED";
    case CUDA_ERROR_ALREADY_MAPPED:
        return "CUDA_ERROR_ALREADY_MAPPED";
    case CUDA_ERROR_NO_BINARY_FOR_GPU:
        return "CUDA_ERROR_NO_BINARY_FOR_GPU";
    case CUDA_ERROR_ALREADY_ACQUIRED:
        return "CUDA_ERROR_ALREADY_ACQUIRED";
    case CUDA_ERROR_NOT_MAPPED:
        return "CUDA_ERROR_NOT_MAPPED";
    case CUDA_ERROR_NOT_MAPPED_AS_ARRAY:
        return "CUDA_ERROR_NOT_MAPPED_AS_ARRAY";
    case CUDA_ERROR_NOT_MAPPED_AS_POINTER:
        return "CUDA_ERROR_NOT_MAPPED_AS_POINTER";
    case CUDA_ERROR_ECC_UNCORRECTABLE:
        return "CUDA_ERROR_ECC_UNCORRECTABLE";
    case CUDA_ERROR_UNSUPPORTED_LIMIT:
        return "CUDA_ERROR_UNSUPPORTED_LIMIT";
    case CUDA_ERROR_CONTEXT_ALREADY_IN_USE:
        return "CUDA_ERROR_CONTEXT_ALREADY_IN_USE";
    case CUDA_ERROR_PEER_ACCESS_UNSUPPORTED:
        return "CUDA_ERROR_PEER_ACCESS_UNSUPPORTED";
    case CUDA_ERROR_INVALID_PTX:
        return "CUDA_ERROR_INVALID_PTX";
    case CUDA_ERROR_INVALID_GRAPHICS_CONTEXT:
        return "CUDA_ERROR_INVALID_GRAPHICS_CONTEXT";
    case CUDA_ERROR_NVLINK_UNCORRECTABLE:
        return "CUDA_ERROR_NVLINK_UNCORRECTABLE";
    case CUDA_ERROR_JIT_COMPILER_NOT_FOUND:
        return "CUDA_ERROR_JIT_COMPILER_NOT_FOUND";
    case CUDA_ERROR_INVALID_SOURCE:
        return "CUDA_ERROR_INVALID_SOURCE";
    case CUDA_ERROR_FILE_NOT_FOUND:
        return "CUDA_ERROR_FILE_NOT_FOUND";
    case CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND:
        return "CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND";
    case CUDA_ERROR_SHARED_OBJECT_INIT_FAILED:
        return "CUDA_ERROR_SHARED_OBJECT_INIT_FAILED";
    case CUDA_ERROR_OPERATING_SYSTEM:
        return "CUDA_ERROR_OPERATING_SYSTEM";
    case CUDA_ERROR_INVALID_HANDLE:
        return "CUDA_ERROR_INVALID_HANDLE";
    case CUDA_ERROR_NOT_FOUND:
        return "CUDA_ERROR_NOT_FOUND";
    case CUDA_ERROR_NOT_READY:
        return "CUDA_ERROR_NOT_READY";
    case CUDA_ERROR_ILLEGAL_ADDRESS:
        return "CUDA_ERROR_ILLEGAL_ADDRESS";
    case CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES:
        return "CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES";
    case CUDA_ERROR_LAUNCH_TIMEOUT:
        return "CUDA_ERROR_LAUNCH_TIMEOUT";
    case CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING:
        return "CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING";
    case CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED:
        return "CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED";
    case CUDA_ERROR_PEER_ACCESS_NOT_ENABLED:
        return "CUDA_ERROR_PEER_ACCESS_NOT_ENABLED";
    case CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE:
        return "CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE";
    case CUDA_ERROR_CONTEXT_IS_DESTROYED:
        return "CUDA_ERROR_CONTEXT_IS_DESTROYED";
    // A trap instruction produces the below error, which is how we codegen asserts on GPU
    case CUDA_ERROR_ILLEGAL_INSTRUCTION:
        return "Illegal instruction or Halide assertion failure inside kernel";
    case CUDA_ERROR_MISALIGNED_ADDRESS:
        return "CUDA_ERROR_MISALIGNED_ADDRESS";
    case CUDA_ERROR_INVALID_ADDRESS_SPACE:
        return "CUDA_ERROR_INVALID_ADDRESS_SPACE";
    case CUDA_ERROR_INVALID_PC:
        return "CUDA_ERROR_INVALID_PC";
    case CUDA_ERROR_LAUNCH_FAILED:
        return "CUDA_ERROR_LAUNCH_FAILED";
    case CUDA_ERROR_NOT_PERMITTED:
        return "CUDA_ERROR_NOT_PERMITTED";
    case CUDA_ERROR_NOT_SUPPORTED:
        return "CUDA_ERROR_NOT_SUPPORTED";
    case CUDA_ERROR_UNKNOWN:
        return "CUDA_ERROR_UNKNOWN";
    default:
        // This is unfortunate as usually get_error_name is called in the middle of
        // an error print, but dropping the number on the floor is worse.
        error(NULL) << "Unknown cuda error " << err << "\n";
        return "<Unknown error>";
    }
}

WEAK halide_device_interface_impl_t cuda_device_interface_impl = {
    halide_use_jit_module,
    halide_release_jit_module,
    halide_cuda_device_malloc,
    halide_cuda_device_free,
    halide_cuda_device_sync,
    halide_cuda_device_release,
    halide_cuda_copy_to_host,
    halide_cuda_copy_to_device,
    halide_cuda_device_and_host_malloc,
    halide_cuda_device_and_host_free,
    halide_cuda_buffer_copy,
    halide_cuda_device_crop,
    halide_cuda_device_slice,
    halide_cuda_device_release_crop,
    halide_cuda_wrap_device_ptr,
    halide_cuda_detach_device_ptr,
};

WEAK halide_device_interface_t cuda_device_interface = {
    halide_device_malloc,
    halide_device_free,
    halide_device_sync,
    halide_device_release,
    halide_copy_to_host,
    halide_copy_to_device,
    halide_device_and_host_malloc,
    halide_device_and_host_free,
    halide_buffer_copy,
    halide_device_crop,
    halide_device_slice,
    halide_device_release_crop,
    halide_device_wrap_native,
    halide_device_detach_native,
    halide_cuda_compute_capability,
    &cuda_device_interface_impl};

}  // namespace Cuda
}  // namespace Internal
}  // namespace Runtime
}  // namespace Halide