1 #include "HalideRuntimeOpenCL.h"
2 #include "device_buffer_utils.h"
3 #include "device_interface.h"
4 #include "printer.h"
5 #include "scoped_spin_lock.h"
6 
7 #include "mini_cl.h"
8 
9 namespace Halide {
10 namespace Runtime {
11 namespace Internal {
12 namespace OpenCL {
13 
14 // Define the function pointers for the OpenCL API. OpenCL 1.2
15 // currently disabled so we can work on build bots without it.
16 //#define HAVE_OPENCL_12
17 #define CL_FN(ret, fn, args) WEAK ret(CL_API_CALL *fn) args;
18 #include "cl_functions.h"
19 
20 // The default implementation of halide_opencl_get_symbol attempts to load
21 // the OpenCL runtime shared library/DLL, and then get the symbol from it.
22 WEAK void *lib_opencl = NULL;
23 
halide_opencl_get_symbol(void * user_context,const char * name)24 extern "C" WEAK void *halide_opencl_get_symbol(void *user_context, const char *name) {
25     // Only try to load the library if the library isn't already
26     // loaded, or we can't load the symbol from the process already.
27     void *symbol = halide_get_library_symbol(lib_opencl, name);
28     if (symbol) {
29         return symbol;
30     }
31 
32     const char *lib_names[] = {
33 #ifdef WINDOWS
34         "opencl.dll",
35 #else
36         "libOpenCL.so",
37         "/System/Library/Frameworks/OpenCL.framework/OpenCL",
38 #endif
39     };
40     for (size_t i = 0; i < sizeof(lib_names) / sizeof(lib_names[0]); i++) {
41         lib_opencl = halide_load_library(lib_names[i]);
42         if (lib_opencl) {
43             debug(user_context) << "    Loaded OpenCL runtime library: " << lib_names[i] << "\n";
44             break;
45         }
46     }
47 
48     return halide_get_library_symbol(lib_opencl, name);
49 }
50 
51 template<typename T>
get_cl_symbol(void * user_context,const char * name)52 ALWAYS_INLINE T get_cl_symbol(void *user_context, const char *name) {
53     T s = (T)halide_opencl_get_symbol(user_context, name);
54     if (!s) {
55         error(user_context) << "OpenCL API not found: " << name << "\n";
56     }
57     return s;
58 }
59 
60 // Load an OpenCL shared object/dll, and get the function pointers for the OpenCL API from it.
load_libopencl(void * user_context)61 WEAK void load_libopencl(void *user_context) {
62     debug(user_context) << "    load_libopencl (user_context: " << user_context << ")\n";
63     halide_assert(user_context, clCreateContext == NULL);
64 
65 #define CL_FN(ret, fn, args) fn = get_cl_symbol<ret(CL_API_CALL *) args>(user_context, #fn);
66 #include "cl_functions.h"
67 }
68 
69 extern WEAK halide_device_interface_t opencl_device_interface;
70 
71 WEAK const char *get_opencl_error_name(cl_int err);
72 WEAK int create_opencl_context(void *user_context, cl_context *ctx, cl_command_queue *q);
73 
74 // An OpenCL context/queue/synchronization lock defined in
75 // this module with weak linkage
76 cl_context WEAK context = 0;
77 cl_command_queue WEAK command_queue = 0;
78 volatile ScopedSpinLock::AtomicFlag WEAK thread_lock = 0;
79 
80 WEAK char platform_name[256];
81 WEAK ScopedSpinLock::AtomicFlag platform_name_lock = 0;
82 WEAK bool platform_name_initialized = false;
83 
84 WEAK char device_type[256];
85 WEAK ScopedSpinLock::AtomicFlag device_type_lock = 0;
86 WEAK bool device_type_initialized = false;
87 
88 WEAK char build_options[1024];
89 WEAK ScopedSpinLock::AtomicFlag build_options_lock = 0;
90 WEAK bool build_options_initialized = false;
91 
92 }  // namespace OpenCL
93 }  // namespace Internal
94 }  // namespace Runtime
95 }  // namespace Halide
96 
97 using namespace Halide::Runtime::Internal::OpenCL;
98 
99 // Allow OpenCL 1.1 features to be used.
100 #define ENABLE_OPENCL_11
101 
102 namespace {
halide_opencl_set_platform_name_internal(const char * n)103 void halide_opencl_set_platform_name_internal(const char *n) {
104     if (n) {
105         size_t buffer_size = sizeof(platform_name) / sizeof(platform_name[0]);
106         strncpy(platform_name, n, buffer_size);
107         platform_name[buffer_size - 1] = 0;
108     } else {
109         platform_name[0] = 0;
110     }
111     platform_name_initialized = true;
112 }
113 
halide_opencl_get_platform_name_internal(void * user_context)114 const char *halide_opencl_get_platform_name_internal(void *user_context) {
115     if (!platform_name_initialized) {
116         const char *name = getenv("HL_OCL_PLATFORM_NAME");
117         halide_opencl_set_platform_name_internal(name);
118     }
119     return platform_name;
120 }
121 
halide_opencl_set_device_type_internal(const char * n)122 void halide_opencl_set_device_type_internal(const char *n) {
123     if (n) {
124         size_t buffer_size = sizeof(device_type) / sizeof(device_type[0]);
125         strncpy(device_type, n, buffer_size);
126         device_type[buffer_size - 1] = 0;
127     } else {
128         device_type[0] = 0;
129     }
130     device_type_initialized = true;
131 }
132 
halide_opencl_get_device_type_internal(void * user_context)133 const char *halide_opencl_get_device_type_internal(void *user_context) {
134     if (!device_type_initialized) {
135         const char *name = getenv("HL_OCL_DEVICE_TYPE");
136         halide_opencl_set_device_type_internal(name);
137     }
138     return device_type;
139 }
140 
halide_opencl_set_build_options_internal(const char * n)141 void halide_opencl_set_build_options_internal(const char *n) {
142     if (n) {
143         size_t buffer_size = sizeof(build_options) / sizeof(build_options[0]);
144         strncpy(build_options, n, buffer_size);
145         build_options[buffer_size - 1] = 0;
146     } else {
147         build_options[0] = 0;
148     }
149     build_options_initialized = true;
150 }
151 
halide_opencl_get_build_options_internal(void * user_context)152 const char *halide_opencl_get_build_options_internal(void *user_context) {
153     if (!build_options_initialized) {
154         const char *name = getenv("HL_OCL_BUILD_OPTIONS");
155         halide_opencl_set_build_options_internal(name);
156     }
157     return build_options;
158 }
159 }  // namespace
160 
161 extern "C" {
162 
halide_opencl_set_platform_name(const char * n)163 WEAK void halide_opencl_set_platform_name(const char *n) {
164     ScopedSpinLock lock(&platform_name_lock);
165     halide_opencl_set_platform_name_internal(n);
166 }
167 
halide_opencl_get_platform_name(void * user_context)168 WEAK const char *halide_opencl_get_platform_name(void *user_context) {
169     ScopedSpinLock lock(&platform_name_lock);
170     return halide_opencl_get_platform_name_internal(user_context);
171 }
172 
halide_opencl_set_device_type(const char * n)173 WEAK void halide_opencl_set_device_type(const char *n) {
174     ScopedSpinLock lock(&device_type_lock);
175     halide_opencl_set_device_type_internal(n);
176 }
177 
halide_opencl_get_device_type(void * user_context)178 WEAK const char *halide_opencl_get_device_type(void *user_context) {
179     ScopedSpinLock lock(&device_type_lock);
180     return halide_opencl_get_device_type_internal(user_context);
181 }
182 
halide_opencl_set_build_options(const char * n)183 WEAK void halide_opencl_set_build_options(const char *n) {
184     ScopedSpinLock lock(&build_options_lock);
185     halide_opencl_set_build_options_internal(n);
186 }
187 
halide_opencl_get_build_options(void * user_context)188 WEAK const char *halide_opencl_get_build_options(void *user_context) {
189     ScopedSpinLock lock(&build_options_lock);
190     return halide_opencl_get_build_options_internal(user_context);
191 }
192 
193 // The default implementation of halide_acquire_cl_context uses the global
194 // pointers above, and serializes access with a spin lock.
195 // Overriding implementations of acquire/release must implement the following
196 // behavior:
197 // - halide_acquire_cl_context should always store a valid context/command
198 //   queue in ctx/q, or return an error code.
199 // - A call to halide_acquire_cl_context is followed by a matching call to
200 //   halide_release_cl_context. halide_acquire_cl_context should block while a
201 //   previous call (if any) has not yet been released via halide_release_cl_context.
halide_acquire_cl_context(void * user_context,cl_context * ctx,cl_command_queue * q,bool create=true)202 WEAK int halide_acquire_cl_context(void *user_context, cl_context *ctx, cl_command_queue *q, bool create = true) {
203     // TODO: Should we use a more "assertive" assert? These asserts do
204     // not block execution on failure.
205     halide_assert(user_context, ctx != NULL);
206     halide_assert(user_context, q != NULL);
207 
208     halide_assert(user_context, &thread_lock != NULL);
209     while (__atomic_test_and_set(&thread_lock, __ATOMIC_ACQUIRE)) {
210     }
211 
212     // If the context has not been initialized, initialize it now.
213     halide_assert(user_context, &context != NULL);
214     halide_assert(user_context, &command_queue != NULL);
215     if (!context && create) {
216         cl_int error = create_opencl_context(user_context, &context, &command_queue);
217         if (error != CL_SUCCESS) {
218             __atomic_clear(&thread_lock, __ATOMIC_RELEASE);
219             return error;
220         }
221     }
222 
223     *ctx = context;
224     *q = command_queue;
225     return 0;
226 }
227 
halide_release_cl_context(void * user_context)228 WEAK int halide_release_cl_context(void *user_context) {
229     __atomic_clear(&thread_lock, __ATOMIC_RELEASE);
230     return 0;
231 }
232 
233 }  // extern "C"
234 
235 namespace Halide {
236 namespace Runtime {
237 namespace Internal {
238 namespace OpenCL {
239 
240 // Helper object to acquire and release the OpenCL context.
241 class ClContext {
242     void *user_context;
243 
244 public:
245     cl_context context;
246     cl_command_queue cmd_queue;
247     cl_int error_code;
248 
249     // Constructor sets 'error_code' if any occurs.
ClContext(void * user_context)250     ALWAYS_INLINE ClContext(void *user_context)
251         : user_context(user_context),
252           context(NULL),
253           cmd_queue(NULL),
254           error_code(CL_SUCCESS) {
255         if (clCreateContext == NULL) {
256             load_libopencl(user_context);
257         }
258 
259 #ifdef DEBUG_RUNTIME
260         halide_start_clock(user_context);
261 #endif
262 
263         error_code = halide_acquire_cl_context(user_context, &context, &cmd_queue);
264         // don't abort: that would prevent host_supports_device_api() from being able work properly.
265         if (!context || !cmd_queue) {
266             error(user_context) << "OpenCL: null context or cmd_queue";
267             error_code = -1;
268         }
269     }
270 
~ClContext()271     ALWAYS_INLINE ~ClContext() {
272         halide_release_cl_context(user_context);
273     }
274 };
275 
276 // OpenCL doesn't support creating sub-buffers from some-buffers.  In
277 // order to support more generalized (and frankly, minimally useful)
278 // crop behavior, we store a cl_mem and an offset and then create
279 // sub-buffers as needed.
280 struct device_handle {
281     // Important: order these to avoid any padding between fields;
282     // some Win32 compiler optimizer configurations can inconsistently
283     // insert padding otherwise.
284     uint64_t offset;
285     cl_mem mem;
286 };
287 
288 // Structure to hold the state of a module attached to the context.
289 // Also used as a linked-list to keep track of all the different
290 // modules that are attached to a context in order to release them all
291 // when then context is released.
292 struct module_state {
293     cl_program program;
294     module_state *next;
295 };
296 WEAK module_state *state_list = NULL;
297 
validate_device_pointer(void * user_context,halide_buffer_t * buf,size_t size=0)298 WEAK bool validate_device_pointer(void *user_context, halide_buffer_t *buf, size_t size = 0) {
299     if (buf->device == 0) {
300         return true;
301     }
302 
303     // We may call this in situations where we haven't loaded the
304     // OpenCL API yet.
305     if (!clGetMemObjectInfo) {
306         load_libopencl(user_context);
307     }
308 
309     cl_mem dev_ptr = ((device_handle *)buf->device)->mem;
310     uint64_t offset = ((device_handle *)buf->device)->offset;
311 
312     size_t real_size;
313     cl_int result = clGetMemObjectInfo(dev_ptr, CL_MEM_SIZE, sizeof(size_t), &real_size, NULL);
314     if (result != CL_SUCCESS) {
315         error(user_context) << "CL: Bad device pointer " << (void *)dev_ptr
316                             << ": clGetMemObjectInfo returned "
317                             << get_opencl_error_name(result);
318         return false;
319     }
320 
321     debug(user_context) << "CL: validate " << (void *)dev_ptr << " offset: " << offset
322                         << ": asked for " << (uint64_t)size
323                         << ", actual allocated " << (uint64_t)real_size << "\n";
324 
325     if (size) {
326         halide_assert(user_context, real_size >= (size + offset) && "Validating pointer with insufficient size");
327     }
328     return true;
329 }
330 
331 // Initializes the context used by the default implementation
332 // of halide_acquire_context.
create_opencl_context(void * user_context,cl_context * ctx,cl_command_queue * q)333 WEAK int create_opencl_context(void *user_context, cl_context *ctx, cl_command_queue *q) {
334     debug(user_context)
335         << "    create_opencl_context (user_context: " << user_context << ")\n";
336 
337     halide_assert(user_context, ctx != NULL && *ctx == NULL);
338     halide_assert(user_context, q != NULL && *q == NULL);
339 
340     if (clGetPlatformIDs == NULL) {
341         error(user_context) << "CL: clGetPlatformIDs not found\n";
342         return -1;
343     }
344 
345     cl_int err = 0;
346 
347     const cl_uint max_platforms = 4;
348     cl_platform_id platforms[max_platforms];
349     cl_uint platform_count = 0;
350 
351     err = clGetPlatformIDs(max_platforms, platforms, &platform_count);
352     if (err != CL_SUCCESS) {
353         error(user_context) << "CL: clGetPlatformIDs failed: "
354                             << get_opencl_error_name(err) << " " << err;
355         return err;
356     }
357 
358     cl_platform_id platform = NULL;
359 
360     // Find the requested platform, or the first if none specified.
361     const char *name = halide_opencl_get_platform_name(user_context);
362     if (name != NULL) {
363         for (cl_uint i = 0; i < platform_count; ++i) {
364             const cl_uint max_platform_name = 256;
365             char platform_name[max_platform_name];
366             err = clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, max_platform_name, platform_name, NULL);
367             if (err != CL_SUCCESS) continue;
368             debug(user_context) << "CL: platform " << i << " " << platform_name << "\n";
369 
370             // A platform matches the request if it is a substring of the platform name.
371             if (strstr(platform_name, name)) {
372                 platform = platforms[i];
373                 break;
374             }
375         }
376     } else if (platform_count > 0) {
377         platform = platforms[0];
378     }
379     if (platform == NULL) {
380         error(user_context) << "CL: Failed to find platform\n";
381         return CL_INVALID_PLATFORM;
382     }
383 
384 #ifdef DEBUG_RUNTIME
385     const cl_uint max_platform_name = 256;
386     char platform_name[max_platform_name];
387     err = clGetPlatformInfo(platform, CL_PLATFORM_NAME, max_platform_name, platform_name, NULL);
388     if (err != CL_SUCCESS) {
389         debug(user_context) << "    clGetPlatformInfo(CL_PLATFORM_NAME) failed: "
390                             << get_opencl_error_name(err) << "\n";
391         // This is just debug info, report the error but don't fail context creation due to it.
392         //return err;
393     } else {
394         debug(user_context) << "    Got platform '" << platform_name
395                             << "', about to create context (t="
396                             << halide_current_time_ns(user_context)
397                             << ")\n";
398     }
399 #endif
400 
401     // Get the types of devices requested.
402     cl_device_type device_type = 0;
403     const char *dev_type = halide_opencl_get_device_type(user_context);
404     if (dev_type != NULL && *dev_type != '\0') {
405         if (strstr(dev_type, "cpu")) {
406             device_type |= CL_DEVICE_TYPE_CPU;
407         }
408         if (strstr(dev_type, "gpu")) {
409             device_type |= CL_DEVICE_TYPE_GPU;
410         }
411         if (strstr(dev_type, "acc")) {
412             device_type |= CL_DEVICE_TYPE_ACCELERATOR;
413         }
414     }
415     // If no device types are specified, use all the available
416     // devices.
417     if (device_type == 0) {
418         device_type = CL_DEVICE_TYPE_ALL;
419     }
420 
421     // Get all the devices of the specified type.
422     const cl_uint maxDevices = 128;
423     cl_device_id devices[maxDevices];
424     cl_uint deviceCount = 0;
425     err = clGetDeviceIDs(platform, device_type, maxDevices, devices, &deviceCount);
426     if (err != CL_SUCCESS) {
427         error(user_context) << "CL: clGetDeviceIDs failed: "
428                             << get_opencl_error_name(err);
429         return err;
430     }
431 
432     // If the user indicated a specific device index to use, use
433     // that. Note that this is an index within the set of devices
434     // specified by the device type. -1 means select a device
435     // automatically based on core count.
436     int device = halide_get_gpu_device(user_context);
437     if (device == -1 && deviceCount == 1) {
438         device = 0;
439     } else if (device == -1) {
440         debug(user_context) << "    Multiple CL devices detected. Selecting the one with the most cores.\n";
441         cl_uint best_core_count = 0;
442         for (cl_uint i = 0; i < deviceCount; i++) {
443             cl_device_id dev = devices[i];
444             cl_uint core_count = 0;
445             err = clGetDeviceInfo(dev, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), &core_count, NULL);
446             if (err != CL_SUCCESS) {
447                 debug(user_context) << "      Failed to get info on device " << i << "\n";
448                 continue;
449             }
450             debug(user_context) << "      Device " << i << " has " << core_count << " cores\n";
451             if (core_count >= best_core_count) {
452                 device = i;
453                 best_core_count = core_count;
454             }
455         }
456         debug(user_context) << "    Selected device " << device << "\n";
457     }
458 
459     if (device < 0 || device >= (int)deviceCount) {
460         error(user_context) << "CL: Failed to get device: " << device;
461         return CL_DEVICE_NOT_FOUND;
462     }
463 
464     cl_device_id dev = devices[device];
465 
466 #ifdef DEBUG_RUNTIME
467     // Declare variables for other state we want to query.
468     char device_name[256] = "";
469     char device_vendor[256] = "";
470     char device_profile[256] = "";
471     char device_version[256] = "";
472     char driver_version[256] = "";
473     cl_ulong global_mem_size = 0;
474     cl_ulong max_mem_alloc_size = 0;
475     cl_ulong local_mem_size = 0;
476     cl_uint max_compute_units = 0;
477     size_t max_work_group_size = 0;
478     cl_uint max_work_item_dimensions = 0;
479     size_t max_work_item_sizes[4] = {
480         0,
481     };
482 
483     struct {
484         void *dst;
485         size_t sz;
486         cl_device_info param;
487     } infos[] = {
488         {&device_name[0], sizeof(device_name), CL_DEVICE_NAME},
489         {&device_vendor[0], sizeof(device_vendor), CL_DEVICE_VENDOR},
490         {&device_profile[0], sizeof(device_profile), CL_DEVICE_PROFILE},
491         {&device_version[0], sizeof(device_version), CL_DEVICE_VERSION},
492         {&driver_version[0], sizeof(driver_version), CL_DRIVER_VERSION},
493         {&global_mem_size, sizeof(global_mem_size), CL_DEVICE_GLOBAL_MEM_SIZE},
494         {&max_mem_alloc_size, sizeof(max_mem_alloc_size), CL_DEVICE_MAX_MEM_ALLOC_SIZE},
495         {&local_mem_size, sizeof(local_mem_size), CL_DEVICE_LOCAL_MEM_SIZE},
496         {&max_compute_units, sizeof(max_compute_units), CL_DEVICE_MAX_COMPUTE_UNITS},
497         {&max_work_group_size, sizeof(max_work_group_size), CL_DEVICE_MAX_WORK_GROUP_SIZE},
498         {&max_work_item_dimensions, sizeof(max_work_item_dimensions), CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS},
499         {&max_work_item_sizes[0], sizeof(max_work_item_sizes), CL_DEVICE_MAX_WORK_ITEM_SIZES},
500         {NULL}};
501 
502     // Do all the queries.
503     for (int i = 0; infos[i].dst; i++) {
504         err = clGetDeviceInfo(dev, infos[i].param, infos[i].sz, infos[i].dst, NULL);
505         if (err != CL_SUCCESS) {
506             error(user_context) << "CL: clGetDeviceInfo failed: "
507                                 << get_opencl_error_name(err);
508             return err;
509         }
510     }
511 
512     debug(user_context)
513         << "      device name: " << device_name << "\n"
514         << "      device vendor: " << device_vendor << "\n"
515         << "      device profile: " << device_profile << "\n"
516         << "      global mem size: " << global_mem_size / (1024 * 1024) << " MB\n"
517         << "      max mem alloc size: " << max_mem_alloc_size / (1024 * 1024) << " MB\n"
518         << "      local mem size: " << local_mem_size << "\n"
519         << "      max compute units: " << max_compute_units << "\n"
520         << "      max workgroup size: " << (uint64_t)max_work_group_size << "\n"
521         << "      max work item dimensions: " << max_work_item_dimensions << "\n"
522         << "      max work item sizes: " << (uint64_t)max_work_item_sizes[0]
523         << "x" << (uint64_t)max_work_item_sizes[1]
524         << "x" << (uint64_t)max_work_item_sizes[2]
525         << "x" << (uint64_t)max_work_item_sizes[3] << "\n";
526 #endif
527 
528     // Create context and command queue.
529     cl_context_properties properties[] = {CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0};
530     debug(user_context) << "    clCreateContext -> ";
531     *ctx = clCreateContext(properties, 1, &dev, NULL, NULL, &err);
532     if (err != CL_SUCCESS) {
533         debug(user_context) << get_opencl_error_name(err);
534         error(user_context) << "CL: clCreateContext failed: "
535                             << get_opencl_error_name(err)
536                             << ":" << (int)err;
537         return err;
538     } else {
539         debug(user_context) << *ctx << "\n";
540     }
541 
542     debug(user_context) << "    clCreateCommandQueue ";
543     *q = clCreateCommandQueue(*ctx, dev, 0, &err);
544     if (err != CL_SUCCESS) {
545         debug(user_context) << get_opencl_error_name(err);
546         error(user_context) << "CL: clCreateCommandQueue failed: "
547                             << get_opencl_error_name(err);
548         return err;
549     } else {
550         debug(user_context) << *q << "\n";
551     }
552 
553     return err;
554 }
555 
556 }  // namespace OpenCL
557 }  // namespace Internal
558 }  // namespace Runtime
559 }  // namespace Halide
560 
561 extern "C" {
562 
halide_opencl_device_free(void * user_context,halide_buffer_t * buf)563 WEAK int halide_opencl_device_free(void *user_context, halide_buffer_t *buf) {
564     // halide_opencl_device_free, at present, can be exposed to clients and they
565     // should be allowed to call halide_opencl_device_free on any halide_buffer_t
566     // including ones that have never been used with a GPU.
567     if (buf->device == 0) {
568         return 0;
569     }
570 
571     cl_mem dev_ptr = ((device_handle *)buf->device)->mem;
572     halide_assert(user_context, (((device_handle *)buf->device)->offset == 0) && "halide_opencl_device_free on buffer obtained from halide_device_crop");
573 
574     debug(user_context)
575         << "CL: halide_opencl_device_free (user_context: " << user_context
576         << ", buf: " << buf << ") cl_mem: " << dev_ptr << "\n";
577 
578     ClContext ctx(user_context);
579     if (ctx.error_code != CL_SUCCESS) {
580         return ctx.error_code;
581     }
582 
583 #ifdef DEBUG_RUNTIME
584     uint64_t t_before = halide_current_time_ns(user_context);
585 #endif
586 
587     halide_assert(user_context, validate_device_pointer(user_context, buf));
588     debug(user_context) << "    clReleaseMemObject " << (void *)dev_ptr << "\n";
589     cl_int result = clReleaseMemObject((cl_mem)dev_ptr);
590     // If clReleaseMemObject fails, it is unlikely to succeed in a later call, so
591     // we just end our reference to it regardless.
592     free((device_handle *)buf->device);
593     buf->device = 0;
594     buf->device_interface->impl->release_module();
595     buf->device_interface = NULL;
596     if (result != CL_SUCCESS) {
597         // We may be called as a destructor, so don't raise an error
598         // here.
599         return result;
600     }
601 
602 #ifdef DEBUG_RUNTIME
603     uint64_t t_after = halide_current_time_ns(user_context);
604     debug(user_context) << "    Time: " << (t_after - t_before) / 1.0e6 << " ms\n";
605 #endif
606 
607     return 0;
608 }
609 
halide_opencl_initialize_kernels(void * user_context,void ** state_ptr,const char * src,int size)610 WEAK int halide_opencl_initialize_kernels(void *user_context, void **state_ptr, const char *src, int size) {
611     debug(user_context)
612         << "CL: halide_opencl_init_kernels (user_context: " << user_context
613         << ", state_ptr: " << state_ptr
614         << ", program: " << (void *)src
615         << ", size: " << size << "\n";
616 
617     ClContext ctx(user_context);
618     if (ctx.error_code != CL_SUCCESS) {
619         return ctx.error_code;
620     }
621 
622 #ifdef DEBUG_RUNTIME
623     uint64_t t_before = halide_current_time_ns(user_context);
624 #endif
625 
626     // Create the state object if necessary. This only happens once, regardless
627     // of how many times halide_init_kernels/halide_release is called.
628     // halide_release traverses this list and releases the program objects, but
629     // it does not modify the list nodes created/inserted here.
630     module_state **state = (module_state **)state_ptr;
631     if (!(*state)) {
632         *state = (module_state *)malloc(sizeof(module_state));
633         (*state)->program = NULL;
634         (*state)->next = state_list;
635         state_list = *state;
636     }
637 
638     // Create the program if necessary. TODO: The program object needs to not
639     // only already exist, but be created for the same context/device as the
640     // calling context/device.
641     if (!(*state && (*state)->program) && size > 1) {
642         cl_int err = 0;
643         cl_device_id dev;
644 
645         err = clGetContextInfo(ctx.context, CL_CONTEXT_DEVICES, sizeof(dev), &dev, NULL);
646         if (err != CL_SUCCESS) {
647             error(user_context) << "CL: clGetContextInfo(CL_CONTEXT_DEVICES) failed: "
648                                 << get_opencl_error_name(err);
649             return err;
650         }
651 
652         cl_device_id devices[] = {dev};
653 
654         // Get the max constant buffer size supported by this OpenCL implementation.
655         cl_ulong max_constant_buffer_size = 0;
656         err = clGetDeviceInfo(dev, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, sizeof(max_constant_buffer_size), &max_constant_buffer_size, NULL);
657         if (err != CL_SUCCESS) {
658             error(user_context) << "CL: clGetDeviceInfo (CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE) failed: "
659                                 << get_opencl_error_name(err);
660             return err;
661         }
662         // Get the max number of constant arguments supported by this OpenCL implementation.
663         cl_uint max_constant_args = 0;
664         err = clGetDeviceInfo(dev, CL_DEVICE_MAX_CONSTANT_ARGS, sizeof(max_constant_args), &max_constant_args, NULL);
665         if (err != CL_SUCCESS) {
666             error(user_context) << "CL: clGetDeviceInfo (CL_DEVICE_MAX_CONSTANT_ARGS) failed: "
667                                 << get_opencl_error_name(err);
668             return err;
669         }
670 
671         // Build the compile argument options.
672         stringstream options(user_context);
673         options << "-D MAX_CONSTANT_BUFFER_SIZE=" << max_constant_buffer_size
674                 << " -D MAX_CONSTANT_ARGS=" << max_constant_args;
675 
676         const char *extra_options = halide_opencl_get_build_options(user_context);
677         options << " " << extra_options;
678 
679         const char *sources[] = {src};
680         debug(user_context) << "    clCreateProgramWithSource -> ";
681         cl_program program = clCreateProgramWithSource(ctx.context, 1, &sources[0], NULL, &err);
682         if (err != CL_SUCCESS) {
683             debug(user_context) << get_opencl_error_name(err) << "\n";
684             error(user_context) << "CL: clCreateProgramWithSource failed: "
685                                 << get_opencl_error_name(err);
686             return err;
687         } else {
688             debug(user_context) << (void *)program << "\n";
689         }
690         (*state)->program = program;
691 
692         debug(user_context) << "    clBuildProgram " << (void *)program
693                             << " " << options.str() << "\n";
694         err = clBuildProgram(program, 1, devices, options.str(), NULL, NULL);
695         if (err != CL_SUCCESS) {
696 
697             {
698                 // Allocate an appropriately sized buffer for the build log.
699                 Printer<ErrorPrinter, 8192> p(user_context);
700 
701                 p << "CL: clBuildProgram failed: "
702                   << get_opencl_error_name(err)
703                   << "\nBuild Log:\n";
704 
705                 // Get build log
706                 if (clGetProgramBuildInfo(program, dev,
707                                           CL_PROGRAM_BUILD_LOG,
708                                           p.capacity() - p.size() - 1, p.dst,
709                                           NULL) != CL_SUCCESS) {
710                     p << "clGetProgramBuildInfo failed";
711                 }
712             }
713 
714             return err;
715         }
716     }
717 
718 #ifdef DEBUG_RUNTIME
719     uint64_t t_after = halide_current_time_ns(user_context);
720     debug(user_context) << "    Time: " << (t_after - t_before) / 1.0e6 << " ms\n";
721 #endif
722     return 0;
723 }
724 
725 // Used to generate correct timings when tracing
halide_opencl_device_sync(void * user_context,halide_buffer_t *)726 WEAK int halide_opencl_device_sync(void *user_context, halide_buffer_t *) {
727     debug(user_context) << "CL: halide_opencl_device_sync (user_context: " << user_context << ")\n";
728 
729     ClContext ctx(user_context);
730     if (ctx.error_code != CL_SUCCESS) {
731         return ctx.error_code;
732     }
733 
734 #ifdef DEBUG_RUNTIME
735     uint64_t t_before = halide_current_time_ns(user_context);
736 #endif
737 
738     cl_int err = clFinish(ctx.cmd_queue);
739     if (err != CL_SUCCESS) {
740         error(user_context) << "CL: clFinish failed: "
741                             << get_opencl_error_name(err);
742         return err;
743     }
744 
745 #ifdef DEBUG_RUNTIME
746     uint64_t t_after = halide_current_time_ns(user_context);
747     debug(user_context) << "    Time: " << (t_after - t_before) / 1.0e6 << " ms\n";
748 #endif
749 
750     return CL_SUCCESS;
751 }
752 
halide_opencl_device_release(void * user_context)753 WEAK int halide_opencl_device_release(void *user_context) {
754     debug(user_context)
755         << "CL: halide_opencl_device_release (user_context: " << user_context << ")\n";
756 
757     // The ClContext object does not allow the context storage to be modified,
758     // so we use halide_acquire_context directly.
759     int err;
760     cl_context ctx;
761     cl_command_queue q;
762     err = halide_acquire_cl_context(user_context, &ctx, &q, false);
763     if (err != 0) {
764         return err;
765     }
766 
767     if (ctx) {
768         err = clFinish(q);
769         halide_assert(user_context, err == CL_SUCCESS);
770 
771         // Unload the modules attached to this context. Note that the list
772         // nodes themselves are not freed, only the program objects are
773         // released. Subsequent calls to halide_init_kernels might re-create
774         // the program object using the same list node to store the program
775         // object.
776         module_state *state = state_list;
777         while (state) {
778             if (state->program) {
779                 debug(user_context) << "    clReleaseProgram " << state->program << "\n";
780                 err = clReleaseProgram(state->program);
781                 halide_assert(user_context, err == CL_SUCCESS);
782                 state->program = NULL;
783             }
784             state = state->next;
785         }
786 
787         // Release the context itself, if we created it.
788         if (ctx == context) {
789             debug(user_context) << "    clReleaseCommandQueue " << command_queue << "\n";
790             err = clReleaseCommandQueue(command_queue);
791             halide_assert(user_context, err == CL_SUCCESS);
792             command_queue = NULL;
793 
794             debug(user_context) << "    clReleaseContext " << context << "\n";
795             err = clReleaseContext(context);
796             halide_assert(user_context, err == CL_SUCCESS);
797             context = NULL;
798         }
799     }
800 
801     halide_release_cl_context(user_context);
802 
803     return 0;
804 }
805 
halide_opencl_device_malloc(void * user_context,halide_buffer_t * buf)806 WEAK int halide_opencl_device_malloc(void *user_context, halide_buffer_t *buf) {
807     debug(user_context)
808         << "CL: halide_opencl_device_malloc (user_context: " << user_context
809         << ", buf: " << buf << ")\n";
810 
811     ClContext ctx(user_context);
812     if (ctx.error_code != CL_SUCCESS) {
813         return ctx.error_code;
814     }
815 
816     size_t size = buf->size_in_bytes();
817     halide_assert(user_context, size != 0);
818     if (buf->device) {
819         halide_assert(user_context, validate_device_pointer(user_context, buf, size));
820         return 0;
821     }
822 
823     for (int i = 0; i < buf->dimensions; i++) {
824         halide_assert(user_context, buf->dim[i].stride >= 0);
825     }
826 
827     debug(user_context) << "    allocating " << *buf << "\n";
828 
829 #ifdef DEBUG_RUNTIME
830     uint64_t t_before = halide_current_time_ns(user_context);
831 #endif
832 
833     device_handle *dev_handle = (device_handle *)malloc(sizeof(device_handle));
834     if (dev_handle == NULL) {
835         return CL_OUT_OF_HOST_MEMORY;
836     }
837 
838     cl_int err;
839     debug(user_context) << "    clCreateBuffer -> " << (int)size << " ";
840     cl_mem dev_ptr = clCreateBuffer(ctx.context, CL_MEM_READ_WRITE, size, NULL, &err);
841     if (err != CL_SUCCESS || dev_ptr == 0) {
842         debug(user_context) << get_opencl_error_name(err) << "\n";
843         error(user_context) << "CL: clCreateBuffer failed: "
844                             << get_opencl_error_name(err);
845         free(dev_handle);
846         return err;
847     } else {
848         debug(user_context) << (void *)dev_ptr << " device_handle: " << dev_handle << "\n";
849     }
850 
851     dev_handle->mem = dev_ptr;
852     dev_handle->offset = 0;
853     buf->device = (uint64_t)dev_handle;
854     buf->device_interface = &opencl_device_interface;
855     buf->device_interface->impl->use_module();
856 
857     debug(user_context)
858         << "    Allocated device buffer " << (void *)buf->device
859         << " for buffer " << buf << "\n";
860 
861     halide_assert(user_context, validate_device_pointer(user_context, buf, size));
862 
863 #ifdef DEBUG_RUNTIME
864     uint64_t t_after = halide_current_time_ns(user_context);
865     debug(user_context) << "    Time: " << (t_after - t_before) / 1.0e6 << " ms\n";
866 #endif
867 
868     return CL_SUCCESS;
869 }
870 
871 namespace {
opencl_do_multidimensional_copy(void * user_context,ClContext & ctx,const device_copy & c,int64_t src_idx,int64_t dst_idx,int d,bool from_host,bool to_host)872 WEAK int opencl_do_multidimensional_copy(void *user_context, ClContext &ctx,
873                                          const device_copy &c,
874                                          int64_t src_idx, int64_t dst_idx,
875                                          int d, bool from_host, bool to_host) {
876     if (d > MAX_COPY_DIMS) {
877         error(user_context) << "Buffer has too many dimensions to copy to/from GPU\n";
878         return -1;
879     } else if (d == 0) {
880         cl_int err = 0;
881 
882         debug(user_context) << "    from " << (from_host ? "host" : "device")
883                             << " to " << (to_host ? "host" : "device") << ", "
884                             << (void *)c.src << " + " << src_idx
885                             << " -> " << (void *)c.dst << " + " << dst_idx
886                             << ", " << c.chunk_size << " bytes\n";
887         if (!from_host && to_host) {
888             err = clEnqueueReadBuffer(ctx.cmd_queue, ((device_handle *)c.src)->mem,
889                                       CL_FALSE, src_idx + ((device_handle *)c.src)->offset, c.chunk_size, (void *)(c.dst + dst_idx),
890                                       0, NULL, NULL);
891         } else if (from_host && !to_host) {
892             err = clEnqueueWriteBuffer(ctx.cmd_queue, ((device_handle *)c.dst)->mem,
893                                        CL_FALSE, dst_idx + ((device_handle *)c.dst)->offset, c.chunk_size, (void *)(c.src + src_idx),
894                                        0, NULL, NULL);
895         } else if (!from_host && !to_host) {
896             err = clEnqueueCopyBuffer(ctx.cmd_queue, ((device_handle *)c.src)->mem, ((device_handle *)c.dst)->mem,
897                                       src_idx + ((device_handle *)c.src)->offset, dst_idx + ((device_handle *)c.dst)->offset,
898                                       c.chunk_size, 0, NULL, NULL);
899         } else if ((c.dst + dst_idx) != (c.src + src_idx)) {
900             // Could reach here if a user called directly into the
901             // opencl API for a device->host copy on a source buffer
902             // with device_dirty = false.
903             memcpy((void *)(c.dst + dst_idx), (void *)(c.src + src_idx), c.chunk_size);
904         }
905 
906         if (err) {
907             error(user_context) << "CL: buffer copy failed: " << get_opencl_error_name(err);
908             return (int)err;
909         }
910     } else {
911         ssize_t src_off = 0, dst_off = 0;
912         for (int i = 0; i < (int)c.extent[d - 1]; i++) {
913             int err = opencl_do_multidimensional_copy(user_context, ctx, c,
914                                                       src_idx + src_off, dst_idx + dst_off,
915                                                       d - 1, from_host, to_host);
916             dst_off += c.dst_stride_bytes[d - 1];
917             src_off += c.src_stride_bytes[d - 1];
918             if (err) {
919                 return err;
920             }
921         }
922     }
923     return 0;
924 }
925 }  // namespace
926 
halide_opencl_buffer_copy(void * user_context,struct halide_buffer_t * src,const struct halide_device_interface_t * dst_device_interface,struct halide_buffer_t * dst)927 WEAK int halide_opencl_buffer_copy(void *user_context, struct halide_buffer_t *src,
928                                    const struct halide_device_interface_t *dst_device_interface,
929                                    struct halide_buffer_t *dst) {
930     // We only handle copies to opencl or to host
931     halide_assert(user_context, dst_device_interface == NULL ||
932                                     dst_device_interface == &opencl_device_interface);
933 
934     if ((src->device_dirty() || src->host == NULL) &&
935         src->device_interface != &opencl_device_interface) {
936         halide_assert(user_context, dst_device_interface == &opencl_device_interface);
937         // This is handled at the higher level.
938         return halide_error_code_incompatible_device_interface;
939     }
940 
941     bool from_host = (src->device_interface != &opencl_device_interface) ||
942                      (src->device == 0) ||
943                      (src->host_dirty() && src->host != NULL);
944     bool to_host = !dst_device_interface;
945 
946     halide_assert(user_context, from_host || src->device);
947     halide_assert(user_context, to_host || dst->device);
948 
949     device_copy c = make_buffer_copy(src, from_host, dst, to_host);
950 
951     int err = 0;
952     {
953         ClContext ctx(user_context);
954         if (ctx.error_code != CL_SUCCESS) {
955             return ctx.error_code;
956         }
957 
958         debug(user_context)
959             << "CL: halide_opencl_buffer_copy (user_context: " << user_context
960             << ", src: " << src << ", dst: " << dst << ")\n";
961 
962 #ifdef DEBUG_RUNTIME
963         uint64_t t_before = halide_current_time_ns(user_context);
964         if (!from_host) {
965             halide_assert(user_context, validate_device_pointer(user_context, src));
966         }
967         if (!to_host) {
968             halide_assert(user_context, validate_device_pointer(user_context, dst));
969         }
970 #endif
971 
972         err = opencl_do_multidimensional_copy(user_context, ctx, c, c.src_begin, 0, dst->dimensions, from_host, to_host);
973 
974         // The reads/writes above are all non-blocking, so empty the command
975         // queue before we proceed so that other host code won't write
976         // to the buffer while the above writes are still running.
977         clFinish(ctx.cmd_queue);
978 
979 #ifdef DEBUG_RUNTIME
980         uint64_t t_after = halide_current_time_ns(user_context);
981         debug(user_context) << "    Time: " << (t_after - t_before) / 1.0e6 << " ms\n";
982 #endif
983     }
984 
985     return err;
986 }
987 
halide_opencl_copy_to_device(void * user_context,halide_buffer_t * buf)988 WEAK int halide_opencl_copy_to_device(void *user_context, halide_buffer_t *buf) {
989     return halide_opencl_buffer_copy(user_context, buf, &opencl_device_interface, buf);
990 }
991 
halide_opencl_copy_to_host(void * user_context,halide_buffer_t * buf)992 WEAK int halide_opencl_copy_to_host(void *user_context, halide_buffer_t *buf) {
993     return halide_opencl_buffer_copy(user_context, buf, NULL, buf);
994 }
995 
halide_opencl_run(void * user_context,void * state_ptr,const char * entry_name,int blocksX,int blocksY,int blocksZ,int threadsX,int threadsY,int threadsZ,int shared_mem_bytes,size_t arg_sizes[],void * args[],int8_t arg_is_buffer[],int num_attributes,float * vertex_buffer,int num_coords_dim0,int num_coords_dim1)996 WEAK int halide_opencl_run(void *user_context,
997                            void *state_ptr,
998                            const char *entry_name,
999                            int blocksX, int blocksY, int blocksZ,
1000                            int threadsX, int threadsY, int threadsZ,
1001                            int shared_mem_bytes,
1002                            size_t arg_sizes[],
1003                            void *args[],
1004                            int8_t arg_is_buffer[],
1005                            int num_attributes,
1006                            float *vertex_buffer,
1007                            int num_coords_dim0,
1008                            int num_coords_dim1) {
1009     debug(user_context)
1010         << "CL: halide_opencl_run (user_context: " << user_context << ", "
1011         << "entry: " << entry_name << ", "
1012         << "blocks: " << blocksX << "x" << blocksY << "x" << blocksZ << ", "
1013         << "threads: " << threadsX << "x" << threadsY << "x" << threadsZ << ", "
1014         << "shmem: " << shared_mem_bytes << "\n";
1015 
1016     cl_int err;
1017     ClContext ctx(user_context);
1018     if (ctx.error_code != CL_SUCCESS) {
1019         return ctx.error_code;
1020     }
1021 
1022 #ifdef DEBUG_RUNTIME
1023     uint64_t t_before = halide_current_time_ns(user_context);
1024 #endif
1025 
1026     // Create kernel object for entry_name from the program for this module.
1027     halide_assert(user_context, state_ptr);
1028     cl_program program = ((module_state *)state_ptr)->program;
1029 
1030     halide_assert(user_context, program);
1031     debug(user_context) << "    clCreateKernel " << entry_name << " -> ";
1032     cl_kernel f = clCreateKernel(program, entry_name, &err);
1033     if (err != CL_SUCCESS) {
1034         debug(user_context) << get_opencl_error_name(err) << "\n";
1035         error(user_context) << "CL: clCreateKernel " << entry_name << " failed: "
1036                             << get_opencl_error_name(err) << "\n";
1037         return err;
1038     } else {
1039 #ifdef DEBUG_RUNTIME
1040         uint64_t t_create_kernel = halide_current_time_ns(user_context);
1041         debug(user_context) << "    Time: " << (t_create_kernel - t_before) / 1.0e6 << " ms\n";
1042 #endif
1043     }
1044 
1045     // Pack dims
1046     size_t global_dim[3] = {(size_t)blocksX * threadsX, (size_t)blocksY * threadsY, (size_t)blocksZ * threadsZ};
1047     size_t local_dim[3] = {(size_t)threadsX, (size_t)threadsY, (size_t)threadsZ};
1048 
1049     // Set args
1050     int i = 0;
1051 
1052     // Count sub buffers needed for crops.
1053     int sub_buffers_needed = 0;
1054     while (arg_sizes[i] != 0) {
1055         if (arg_is_buffer[i] &&
1056             ((device_handle *)((halide_buffer_t *)args[i])->device)->offset != 0) {
1057             sub_buffers_needed++;
1058         }
1059         i += 1;
1060     }
1061     cl_mem *sub_buffers = NULL;
1062     int sub_buffers_saved = 0;
1063     if (sub_buffers_needed > 0) {
1064         sub_buffers = (cl_mem *)malloc(sizeof(cl_mem) * sub_buffers_needed);
1065         if (sub_buffers == NULL) {
1066             return halide_error_code_out_of_memory;
1067         }
1068         memset(sub_buffers, 0, sizeof(cl_mem) * sub_buffers_needed);
1069     }
1070 
1071     i = 0;
1072     while (arg_sizes[i] != 0) {
1073         debug(user_context) << "    clSetKernelArg " << i
1074                             << " " << (int)arg_sizes[i]
1075                             << " [" << (*((void **)args[i])) << " ...] "
1076                             << arg_is_buffer[i] << "\n";
1077         void *this_arg = args[i];
1078         cl_int err = CL_SUCCESS;
1079 
1080         if (arg_is_buffer[i]) {
1081             halide_assert(user_context, arg_sizes[i] == sizeof(uint64_t));
1082             cl_mem mem = ((device_handle *)((halide_buffer_t *)this_arg)->device)->mem;
1083             uint64_t offset = ((device_handle *)((halide_buffer_t *)this_arg)->device)->offset;
1084 
1085             if (offset != 0) {
1086                 cl_buffer_region region = {(size_t)offset, ((halide_buffer_t *)this_arg)->size_in_bytes()};
1087                 // The sub-buffer encompasses the linear range of addresses that
1088                 // span the crop.
1089                 mem = clCreateSubBuffer(mem, CL_MEM_READ_WRITE, CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
1090                 sub_buffers[sub_buffers_saved++] = mem;
1091             }
1092             if (err == CL_SUCCESS) {
1093                 debug(user_context) << "Mapped dev handle is: " << (void *)mem << "\n";
1094                 err = clSetKernelArg(f, i, sizeof(mem), &mem);
1095             }
1096         } else {
1097             err = clSetKernelArg(f, i, arg_sizes[i], this_arg);
1098         }
1099 
1100         if (err != CL_SUCCESS) {
1101             error(user_context) << "CL: clSetKernelArg failed: "
1102                                 << get_opencl_error_name(err);
1103             for (int sub_buf_index = 0; sub_buf_index < sub_buffers_saved; sub_buf_index++) {
1104                 clReleaseMemObject(sub_buffers[sub_buf_index]);
1105             }
1106             free(sub_buffers);
1107             return err;
1108         }
1109         i++;
1110     }
1111     // Set the shared mem buffer last
1112     // Always set at least 1 byte of shmem, to keep the launch happy
1113     debug(user_context)
1114         << "    clSetKernelArg " << i << " " << shared_mem_bytes << " [NULL]\n";
1115     err = clSetKernelArg(f, i, (shared_mem_bytes > 0) ? shared_mem_bytes : 1, NULL);
1116     if (err != CL_SUCCESS) {
1117         error(user_context) << "CL: clSetKernelArg failed "
1118                             << get_opencl_error_name(err);
1119         return err;
1120     }
1121 
1122     // Launch kernel
1123     debug(user_context)
1124         << "    clEnqueueNDRangeKernel "
1125         << blocksX << "x" << blocksY << "x" << blocksZ << ", "
1126         << threadsX << "x" << threadsY << "x" << threadsZ << " -> ";
1127     err = clEnqueueNDRangeKernel(ctx.cmd_queue, f,
1128                                  // NDRange
1129                                  3, NULL, global_dim, local_dim,
1130                                  // Events
1131                                  0, NULL, NULL);
1132     debug(user_context) << get_opencl_error_name(err) << "\n";
1133 
1134     // Now that the kernel is enqueued, OpenCL is holding its own
1135     // references to sub buffers and the local ones can be released.
1136     for (int sub_buf_index = 0; sub_buf_index < sub_buffers_saved; sub_buf_index++) {
1137         clReleaseMemObject(sub_buffers[sub_buf_index]);
1138     }
1139     free(sub_buffers);
1140 
1141     if (err != CL_SUCCESS) {
1142         error(user_context) << "CL: clEnqueueNDRangeKernel failed: "
1143                             << get_opencl_error_name(err) << "\n";
1144         return err;
1145     }
1146 
1147     debug(user_context) << "    Releasing kernel " << (void *)f << "\n";
1148     clReleaseKernel(f);
1149     debug(user_context) << "    clReleaseKernel finished" << (void *)f << "\n";
1150 
1151 #ifdef DEBUG_RUNTIME
1152     err = clFinish(ctx.cmd_queue);
1153     if (err != CL_SUCCESS) {
1154         error(user_context) << "CL: clFinish failed (" << err << ")\n";
1155         return err;
1156     }
1157     uint64_t t_after = halide_current_time_ns(user_context);
1158     debug(user_context) << "    Time: " << (t_after - t_before) / 1.0e6 << " ms\n";
1159 #endif
1160     return 0;
1161 }
1162 
halide_opencl_device_and_host_malloc(void * user_context,struct halide_buffer_t * buf)1163 WEAK int halide_opencl_device_and_host_malloc(void *user_context, struct halide_buffer_t *buf) {
1164     return halide_default_device_and_host_malloc(user_context, buf, &opencl_device_interface);
1165 }
1166 
halide_opencl_device_and_host_free(void * user_context,struct halide_buffer_t * buf)1167 WEAK int halide_opencl_device_and_host_free(void *user_context, struct halide_buffer_t *buf) {
1168     return halide_default_device_and_host_free(user_context, buf, &opencl_device_interface);
1169 }
1170 
halide_opencl_wrap_cl_mem(void * user_context,struct halide_buffer_t * buf,uint64_t mem)1171 WEAK int halide_opencl_wrap_cl_mem(void *user_context, struct halide_buffer_t *buf, uint64_t mem) {
1172     halide_assert(user_context, buf->device == 0);
1173     if (buf->device != 0) {
1174         return -2;
1175     }
1176     device_handle *dev_handle = (device_handle *)malloc(sizeof(device_handle));
1177     if (dev_handle == NULL) {
1178         return halide_error_code_out_of_memory;
1179     }
1180     dev_handle->mem = (cl_mem)mem;
1181     dev_handle->offset = 0;
1182     buf->device = (uint64_t)dev_handle;
1183     buf->device_interface = &opencl_device_interface;
1184     buf->device_interface->impl->use_module();
1185 #ifdef DEBUG_RUNTIME
1186     if (!validate_device_pointer(user_context, buf)) {
1187         free((device_handle *)buf->device);
1188         buf->device = 0;
1189         buf->device_interface->impl->release_module();
1190         buf->device_interface = NULL;
1191         return -3;
1192     }
1193 #endif
1194     return 0;
1195 }
1196 
halide_opencl_detach_cl_mem(void * user_context,halide_buffer_t * buf)1197 WEAK int halide_opencl_detach_cl_mem(void *user_context, halide_buffer_t *buf) {
1198     if (buf->device == NULL) {
1199         return 0;
1200     }
1201     halide_assert(user_context, buf->device_interface == &opencl_device_interface);
1202     free((device_handle *)buf->device);
1203     buf->device = 0;
1204     buf->device_interface->impl->release_module();
1205     buf->device_interface = NULL;
1206     return 0;
1207 }
1208 
halide_opencl_get_cl_mem(void * user_context,halide_buffer_t * buf)1209 WEAK uintptr_t halide_opencl_get_cl_mem(void *user_context, halide_buffer_t *buf) {
1210     if (buf->device == NULL) {
1211         return 0;
1212     }
1213     halide_assert(user_context, buf->device_interface == &opencl_device_interface);
1214     return (uintptr_t)((device_handle *)buf->device)->mem;
1215 }
1216 
halide_opencl_get_crop_offset(void * user_context,halide_buffer_t * buf)1217 WEAK uint64_t halide_opencl_get_crop_offset(void *user_context, halide_buffer_t *buf) {
1218     if (buf->device == NULL) {
1219         return 0;
1220     }
1221     halide_assert(user_context, buf->device_interface == &opencl_device_interface);
1222     return ((device_handle *)buf->device)->offset;
1223 }
1224 
1225 namespace {
1226 
opencl_device_crop_from_offset(void * user_context,const struct halide_buffer_t * src,int64_t offset,struct halide_buffer_t * dst)1227 WEAK int opencl_device_crop_from_offset(void *user_context,
1228                                         const struct halide_buffer_t *src,
1229                                         int64_t offset,
1230                                         struct halide_buffer_t *dst) {
1231     ClContext ctx(user_context);
1232     if (ctx.error_code != CL_SUCCESS) {
1233         return ctx.error_code;
1234     }
1235 
1236     dst->device_interface = src->device_interface;
1237 
1238     device_handle *new_dev_handle = (device_handle *)malloc(sizeof(device_handle));
1239     if (new_dev_handle == NULL) {
1240         error(user_context) << "CL: malloc failed making device handle for crop.\n";
1241         return halide_error_code_out_of_memory;
1242     }
1243 
1244     clRetainMemObject(((device_handle *)src->device)->mem);
1245     new_dev_handle->mem = ((device_handle *)src->device)->mem;
1246     new_dev_handle->offset = ((device_handle *)src->device)->offset + offset;
1247     dst->device = (uint64_t)new_dev_handle;
1248 
1249     return 0;
1250 }
1251 
1252 }  // namespace
1253 
halide_opencl_device_crop(void * user_context,const struct halide_buffer_t * src,struct halide_buffer_t * dst)1254 WEAK int halide_opencl_device_crop(void *user_context,
1255                                    const struct halide_buffer_t *src,
1256                                    struct halide_buffer_t *dst) {
1257     const int64_t offset = calc_device_crop_byte_offset(src, dst);
1258     return opencl_device_crop_from_offset(user_context, src, offset, dst);
1259 }
1260 
halide_opencl_device_slice(void * user_context,const struct halide_buffer_t * src,int slice_dim,int slice_pos,struct halide_buffer_t * dst)1261 WEAK int halide_opencl_device_slice(void *user_context,
1262                                     const struct halide_buffer_t *src,
1263                                     int slice_dim,
1264                                     int slice_pos,
1265                                     struct halide_buffer_t *dst) {
1266     const int64_t offset = calc_device_slice_byte_offset(src, slice_dim, slice_pos);
1267     return opencl_device_crop_from_offset(user_context, src, offset, dst);
1268 }
1269 
halide_opencl_device_release_crop(void * user_context,struct halide_buffer_t * buf)1270 WEAK int halide_opencl_device_release_crop(void *user_context,
1271                                            struct halide_buffer_t *buf) {
1272     // Basically the same code as in halide_opencl_device_free, but with
1273     // enough differences to require separate code.
1274 
1275     cl_mem dev_ptr = ((device_handle *)buf->device)->mem;
1276 
1277     debug(user_context)
1278         << "CL: halide_opencl_device_release_crop(user_context: " << user_context
1279         << ", buf: " << buf << ") cl_mem: " << dev_ptr << " offset: " << ((device_handle *)buf->device)->offset << "\n";
1280 
1281     ClContext ctx(user_context);
1282     if (ctx.error_code != CL_SUCCESS) {
1283         return ctx.error_code;
1284     }
1285 
1286 #ifdef DEBUG_RUNTIME
1287     uint64_t t_before = halide_current_time_ns(user_context);
1288 #endif
1289 
1290     halide_assert(user_context, validate_device_pointer(user_context, buf));
1291     debug(user_context) << "    clReleaseMemObject " << (void *)dev_ptr << "\n";
1292     // Sub-buffers are released with clReleaseMemObject
1293     cl_int result = clReleaseMemObject((cl_mem)dev_ptr);
1294     free((device_handle *)buf->device);
1295     if (result != CL_SUCCESS) {
1296         // We may be called as a destructor, so don't raise an error
1297         // here.
1298         return result;
1299     }
1300 
1301 #ifdef DEBUG_RUNTIME
1302     uint64_t t_after = halide_current_time_ns(user_context);
1303     debug(user_context) << "    Time: " << (t_after - t_before) / 1.0e6 << " ms\n";
1304 #endif
1305 
1306     return 0;
1307 }
1308 
halide_opencl_device_interface()1309 WEAK const struct halide_device_interface_t *halide_opencl_device_interface() {
1310     return &opencl_device_interface;
1311 }
1312 
1313 namespace {
halide_opencl_cleanup()1314 WEAK __attribute__((destructor)) void halide_opencl_cleanup() {
1315     halide_opencl_device_release(NULL);
1316 }
1317 }  // namespace
1318 
1319 }  // extern "C" linkage
1320 
1321 namespace Halide {
1322 namespace Runtime {
1323 namespace Internal {
1324 namespace OpenCL {
get_opencl_error_name(cl_int err)1325 WEAK const char *get_opencl_error_name(cl_int err) {
1326     switch (err) {
1327     case CL_SUCCESS:
1328         return "CL_SUCCESS";
1329     case CL_DEVICE_NOT_FOUND:
1330         return "CL_DEVICE_NOT_FOUND";
1331     case CL_DEVICE_NOT_AVAILABLE:
1332         return "CL_DEVICE_NOT_AVAILABLE";
1333     case CL_COMPILER_NOT_AVAILABLE:
1334         return "CL_COMPILER_NOT_AVAILABLE";
1335     case CL_MEM_OBJECT_ALLOCATION_FAILURE:
1336         return "CL_MEM_OBJECT_ALLOCATION_FAILURE";
1337     case CL_OUT_OF_RESOURCES:
1338         return "CL_OUT_OF_RESOURCES";
1339     case CL_OUT_OF_HOST_MEMORY:
1340         return "CL_OUT_OF_HOST_MEMORY";
1341     case CL_PROFILING_INFO_NOT_AVAILABLE:
1342         return "CL_PROFILING_INFO_NOT_AVAILABLE";
1343     case CL_MEM_COPY_OVERLAP:
1344         return "CL_MEM_COPY_OVERLAP";
1345     case CL_IMAGE_FORMAT_MISMATCH:
1346         return "CL_IMAGE_FORMAT_MISMATCH";
1347     case CL_IMAGE_FORMAT_NOT_SUPPORTED:
1348         return "CL_IMAGE_FORMAT_NOT_SUPPORTED";
1349     case CL_BUILD_PROGRAM_FAILURE:
1350         return "CL_BUILD_PROGRAM_FAILURE";
1351     case CL_MAP_FAILURE:
1352         return "CL_MAP_FAILURE";
1353     case CL_MISALIGNED_SUB_BUFFER_OFFSET:
1354         return "CL_MISALIGNED_SUB_BUFFER_OFFSET";
1355     case CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST:
1356         return "CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST";
1357     case CL_COMPILE_PROGRAM_FAILURE:
1358         return "CL_COMPILE_PROGRAM_FAILURE";
1359     case CL_LINKER_NOT_AVAILABLE:
1360         return "CL_LINKER_NOT_AVAILABLE";
1361     case CL_LINK_PROGRAM_FAILURE:
1362         return "CL_LINK_PROGRAM_FAILURE";
1363     case CL_DEVICE_PARTITION_FAILED:
1364         return "CL_DEVICE_PARTITION_FAILED";
1365     case CL_KERNEL_ARG_INFO_NOT_AVAILABLE:
1366         return "CL_KERNEL_ARG_INFO_NOT_AVAILABLE";
1367     case CL_INVALID_VALUE:
1368         return "CL_INVALID_VALUE";
1369     case CL_INVALID_DEVICE_TYPE:
1370         return "CL_INVALID_DEVICE_TYPE";
1371     case CL_INVALID_PLATFORM:
1372         return "CL_INVALID_PLATFORM";
1373     case CL_INVALID_DEVICE:
1374         return "CL_INVALID_DEVICE";
1375     case CL_INVALID_CONTEXT:
1376         return "CL_INVALID_CONTEXT";
1377     case CL_INVALID_QUEUE_PROPERTIES:
1378         return "CL_INVALID_QUEUE_PROPERTIES";
1379     case CL_INVALID_COMMAND_QUEUE:
1380         return "CL_INVALID_COMMAND_QUEUE";
1381     case CL_INVALID_HOST_PTR:
1382         return "CL_INVALID_HOST_PTR";
1383     case CL_INVALID_MEM_OBJECT:
1384         return "CL_INVALID_MEM_OBJECT";
1385     case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR:
1386         return "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR";
1387     case CL_INVALID_IMAGE_SIZE:
1388         return "CL_INVALID_IMAGE_SIZE";
1389     case CL_INVALID_SAMPLER:
1390         return "CL_INVALID_SAMPLER";
1391     case CL_INVALID_BINARY:
1392         return "CL_INVALID_BINARY";
1393     case CL_INVALID_BUILD_OPTIONS:
1394         return "CL_INVALID_BUILD_OPTIONS";
1395     case CL_INVALID_PROGRAM:
1396         return "CL_INVALID_PROGRAM";
1397     case CL_INVALID_PROGRAM_EXECUTABLE:
1398         return "CL_INVALID_PROGRAM_EXECUTABLE";
1399     case CL_INVALID_KERNEL_NAME:
1400         return "CL_INVALID_KERNEL_NAME";
1401     case CL_INVALID_KERNEL_DEFINITION:
1402         return "CL_INVALID_KERNEL_DEFINITION";
1403     case CL_INVALID_KERNEL:
1404         return "CL_INVALID_KERNEL";
1405     case CL_INVALID_ARG_INDEX:
1406         return "CL_INVALID_ARG_INDEX";
1407     case CL_INVALID_ARG_VALUE:
1408         return "CL_INVALID_ARG_VALUE";
1409     case CL_INVALID_ARG_SIZE:
1410         return "CL_INVALID_ARG_SIZE";
1411     case CL_INVALID_KERNEL_ARGS:
1412         return "CL_INVALID_KERNEL_ARGS";
1413     case CL_INVALID_WORK_DIMENSION:
1414         return "CL_INVALID_WORK_DIMENSION";
1415     case CL_INVALID_WORK_GROUP_SIZE:
1416         return "CL_INVALID_WORK_GROUP_SIZE";
1417     case CL_INVALID_WORK_ITEM_SIZE:
1418         return "CL_INVALID_WORK_ITEM_SIZE";
1419     case CL_INVALID_GLOBAL_OFFSET:
1420         return "CL_INVALID_GLOBAL_OFFSET";
1421     case CL_INVALID_EVENT_WAIT_LIST:
1422         return "CL_INVALID_EVENT_WAIT_LIST";
1423     case CL_INVALID_EVENT:
1424         return "CL_INVALID_EVENT";
1425     case CL_INVALID_OPERATION:
1426         return "CL_INVALID_OPERATION";
1427     case CL_INVALID_GL_OBJECT:
1428         return "CL_INVALID_GL_OBJECT";
1429     case CL_INVALID_BUFFER_SIZE:
1430         return "CL_INVALID_BUFFER_SIZE";
1431     case CL_INVALID_MIP_LEVEL:
1432         return "CL_INVALID_MIP_LEVEL";
1433     case CL_INVALID_GLOBAL_WORK_SIZE:
1434         return "CL_INVALID_GLOBAL_WORK_SIZE";
1435     case CL_INVALID_PROPERTY:
1436         return "CL_INVALID_PROPERTY";
1437     case CL_INVALID_IMAGE_DESCRIPTOR:
1438         return "CL_INVALID_IMAGE_DESCRIPTOR";
1439     case CL_INVALID_COMPILER_OPTIONS:
1440         return "CL_INVALID_COMPILER_OPTIONS";
1441     case CL_INVALID_LINKER_OPTIONS:
1442         return "CL_INVALID_LINKER_OPTIONS";
1443     case CL_INVALID_DEVICE_PARTITION_COUNT:
1444         return "CL_INVALID_DEVICE_PARTITION_COUNT";
1445     default:
1446         return "<Unknown error>";
1447     }
1448 }
1449 
1450 WEAK halide_device_interface_impl_t opencl_device_interface_impl = {
1451     halide_use_jit_module,
1452     halide_release_jit_module,
1453     halide_opencl_device_malloc,
1454     halide_opencl_device_free,
1455     halide_opencl_device_sync,
1456     halide_opencl_device_release,
1457     halide_opencl_copy_to_host,
1458     halide_opencl_copy_to_device,
1459     halide_opencl_device_and_host_malloc,
1460     halide_opencl_device_and_host_free,
1461     halide_opencl_buffer_copy,
1462     halide_opencl_device_crop,
1463     halide_opencl_device_slice,
1464     halide_opencl_device_release_crop,
1465     halide_opencl_wrap_cl_mem,
1466     halide_opencl_detach_cl_mem,
1467 };
1468 
1469 WEAK halide_device_interface_t opencl_device_interface = {
1470     halide_device_malloc,
1471     halide_device_free,
1472     halide_device_sync,
1473     halide_device_release,
1474     halide_copy_to_host,
1475     halide_copy_to_device,
1476     halide_device_and_host_malloc,
1477     halide_device_and_host_free,
1478     halide_buffer_copy,
1479     halide_device_crop,
1480     halide_device_slice,
1481     halide_device_release_crop,
1482     halide_device_wrap_native,
1483     halide_device_detach_native,
1484     NULL,
1485     &opencl_device_interface_impl};
1486 
1487 }  // namespace OpenCL
1488 }  // namespace Internal
1489 }  // namespace Runtime
1490 }  // namespace Halide
1491