1 #include "HalideRuntimeOpenCL.h"
2 #include "device_buffer_utils.h"
3 #include "device_interface.h"
4 #include "printer.h"
5 #include "scoped_spin_lock.h"
6
7 #include "mini_cl.h"
8
9 namespace Halide {
10 namespace Runtime {
11 namespace Internal {
12 namespace OpenCL {
13
14 // Define the function pointers for the OpenCL API. OpenCL 1.2
15 // currently disabled so we can work on build bots without it.
16 //#define HAVE_OPENCL_12
17 #define CL_FN(ret, fn, args) WEAK ret(CL_API_CALL *fn) args;
18 #include "cl_functions.h"
19
20 // The default implementation of halide_opencl_get_symbol attempts to load
21 // the OpenCL runtime shared library/DLL, and then get the symbol from it.
22 WEAK void *lib_opencl = NULL;
23
halide_opencl_get_symbol(void * user_context,const char * name)24 extern "C" WEAK void *halide_opencl_get_symbol(void *user_context, const char *name) {
25 // Only try to load the library if the library isn't already
26 // loaded, or we can't load the symbol from the process already.
27 void *symbol = halide_get_library_symbol(lib_opencl, name);
28 if (symbol) {
29 return symbol;
30 }
31
32 const char *lib_names[] = {
33 #ifdef WINDOWS
34 "opencl.dll",
35 #else
36 "libOpenCL.so",
37 "/System/Library/Frameworks/OpenCL.framework/OpenCL",
38 #endif
39 };
40 for (size_t i = 0; i < sizeof(lib_names) / sizeof(lib_names[0]); i++) {
41 lib_opencl = halide_load_library(lib_names[i]);
42 if (lib_opencl) {
43 debug(user_context) << " Loaded OpenCL runtime library: " << lib_names[i] << "\n";
44 break;
45 }
46 }
47
48 return halide_get_library_symbol(lib_opencl, name);
49 }
50
51 template<typename T>
get_cl_symbol(void * user_context,const char * name)52 ALWAYS_INLINE T get_cl_symbol(void *user_context, const char *name) {
53 T s = (T)halide_opencl_get_symbol(user_context, name);
54 if (!s) {
55 error(user_context) << "OpenCL API not found: " << name << "\n";
56 }
57 return s;
58 }
59
60 // Load an OpenCL shared object/dll, and get the function pointers for the OpenCL API from it.
load_libopencl(void * user_context)61 WEAK void load_libopencl(void *user_context) {
62 debug(user_context) << " load_libopencl (user_context: " << user_context << ")\n";
63 halide_assert(user_context, clCreateContext == NULL);
64
65 #define CL_FN(ret, fn, args) fn = get_cl_symbol<ret(CL_API_CALL *) args>(user_context, #fn);
66 #include "cl_functions.h"
67 }
68
69 extern WEAK halide_device_interface_t opencl_device_interface;
70
71 WEAK const char *get_opencl_error_name(cl_int err);
72 WEAK int create_opencl_context(void *user_context, cl_context *ctx, cl_command_queue *q);
73
74 // An OpenCL context/queue/synchronization lock defined in
75 // this module with weak linkage
76 cl_context WEAK context = 0;
77 cl_command_queue WEAK command_queue = 0;
78 volatile ScopedSpinLock::AtomicFlag WEAK thread_lock = 0;
79
80 WEAK char platform_name[256];
81 WEAK ScopedSpinLock::AtomicFlag platform_name_lock = 0;
82 WEAK bool platform_name_initialized = false;
83
84 WEAK char device_type[256];
85 WEAK ScopedSpinLock::AtomicFlag device_type_lock = 0;
86 WEAK bool device_type_initialized = false;
87
88 WEAK char build_options[1024];
89 WEAK ScopedSpinLock::AtomicFlag build_options_lock = 0;
90 WEAK bool build_options_initialized = false;
91
92 } // namespace OpenCL
93 } // namespace Internal
94 } // namespace Runtime
95 } // namespace Halide
96
97 using namespace Halide::Runtime::Internal::OpenCL;
98
99 // Allow OpenCL 1.1 features to be used.
100 #define ENABLE_OPENCL_11
101
102 namespace {
halide_opencl_set_platform_name_internal(const char * n)103 void halide_opencl_set_platform_name_internal(const char *n) {
104 if (n) {
105 size_t buffer_size = sizeof(platform_name) / sizeof(platform_name[0]);
106 strncpy(platform_name, n, buffer_size);
107 platform_name[buffer_size - 1] = 0;
108 } else {
109 platform_name[0] = 0;
110 }
111 platform_name_initialized = true;
112 }
113
halide_opencl_get_platform_name_internal(void * user_context)114 const char *halide_opencl_get_platform_name_internal(void *user_context) {
115 if (!platform_name_initialized) {
116 const char *name = getenv("HL_OCL_PLATFORM_NAME");
117 halide_opencl_set_platform_name_internal(name);
118 }
119 return platform_name;
120 }
121
halide_opencl_set_device_type_internal(const char * n)122 void halide_opencl_set_device_type_internal(const char *n) {
123 if (n) {
124 size_t buffer_size = sizeof(device_type) / sizeof(device_type[0]);
125 strncpy(device_type, n, buffer_size);
126 device_type[buffer_size - 1] = 0;
127 } else {
128 device_type[0] = 0;
129 }
130 device_type_initialized = true;
131 }
132
halide_opencl_get_device_type_internal(void * user_context)133 const char *halide_opencl_get_device_type_internal(void *user_context) {
134 if (!device_type_initialized) {
135 const char *name = getenv("HL_OCL_DEVICE_TYPE");
136 halide_opencl_set_device_type_internal(name);
137 }
138 return device_type;
139 }
140
halide_opencl_set_build_options_internal(const char * n)141 void halide_opencl_set_build_options_internal(const char *n) {
142 if (n) {
143 size_t buffer_size = sizeof(build_options) / sizeof(build_options[0]);
144 strncpy(build_options, n, buffer_size);
145 build_options[buffer_size - 1] = 0;
146 } else {
147 build_options[0] = 0;
148 }
149 build_options_initialized = true;
150 }
151
halide_opencl_get_build_options_internal(void * user_context)152 const char *halide_opencl_get_build_options_internal(void *user_context) {
153 if (!build_options_initialized) {
154 const char *name = getenv("HL_OCL_BUILD_OPTIONS");
155 halide_opencl_set_build_options_internal(name);
156 }
157 return build_options;
158 }
159 } // namespace
160
161 extern "C" {
162
halide_opencl_set_platform_name(const char * n)163 WEAK void halide_opencl_set_platform_name(const char *n) {
164 ScopedSpinLock lock(&platform_name_lock);
165 halide_opencl_set_platform_name_internal(n);
166 }
167
halide_opencl_get_platform_name(void * user_context)168 WEAK const char *halide_opencl_get_platform_name(void *user_context) {
169 ScopedSpinLock lock(&platform_name_lock);
170 return halide_opencl_get_platform_name_internal(user_context);
171 }
172
halide_opencl_set_device_type(const char * n)173 WEAK void halide_opencl_set_device_type(const char *n) {
174 ScopedSpinLock lock(&device_type_lock);
175 halide_opencl_set_device_type_internal(n);
176 }
177
halide_opencl_get_device_type(void * user_context)178 WEAK const char *halide_opencl_get_device_type(void *user_context) {
179 ScopedSpinLock lock(&device_type_lock);
180 return halide_opencl_get_device_type_internal(user_context);
181 }
182
halide_opencl_set_build_options(const char * n)183 WEAK void halide_opencl_set_build_options(const char *n) {
184 ScopedSpinLock lock(&build_options_lock);
185 halide_opencl_set_build_options_internal(n);
186 }
187
halide_opencl_get_build_options(void * user_context)188 WEAK const char *halide_opencl_get_build_options(void *user_context) {
189 ScopedSpinLock lock(&build_options_lock);
190 return halide_opencl_get_build_options_internal(user_context);
191 }
192
193 // The default implementation of halide_acquire_cl_context uses the global
194 // pointers above, and serializes access with a spin lock.
195 // Overriding implementations of acquire/release must implement the following
196 // behavior:
197 // - halide_acquire_cl_context should always store a valid context/command
198 // queue in ctx/q, or return an error code.
199 // - A call to halide_acquire_cl_context is followed by a matching call to
200 // halide_release_cl_context. halide_acquire_cl_context should block while a
201 // previous call (if any) has not yet been released via halide_release_cl_context.
halide_acquire_cl_context(void * user_context,cl_context * ctx,cl_command_queue * q,bool create=true)202 WEAK int halide_acquire_cl_context(void *user_context, cl_context *ctx, cl_command_queue *q, bool create = true) {
203 // TODO: Should we use a more "assertive" assert? These asserts do
204 // not block execution on failure.
205 halide_assert(user_context, ctx != NULL);
206 halide_assert(user_context, q != NULL);
207
208 halide_assert(user_context, &thread_lock != NULL);
209 while (__atomic_test_and_set(&thread_lock, __ATOMIC_ACQUIRE)) {
210 }
211
212 // If the context has not been initialized, initialize it now.
213 halide_assert(user_context, &context != NULL);
214 halide_assert(user_context, &command_queue != NULL);
215 if (!context && create) {
216 cl_int error = create_opencl_context(user_context, &context, &command_queue);
217 if (error != CL_SUCCESS) {
218 __atomic_clear(&thread_lock, __ATOMIC_RELEASE);
219 return error;
220 }
221 }
222
223 *ctx = context;
224 *q = command_queue;
225 return 0;
226 }
227
halide_release_cl_context(void * user_context)228 WEAK int halide_release_cl_context(void *user_context) {
229 __atomic_clear(&thread_lock, __ATOMIC_RELEASE);
230 return 0;
231 }
232
233 } // extern "C"
234
235 namespace Halide {
236 namespace Runtime {
237 namespace Internal {
238 namespace OpenCL {
239
240 // Helper object to acquire and release the OpenCL context.
241 class ClContext {
242 void *user_context;
243
244 public:
245 cl_context context;
246 cl_command_queue cmd_queue;
247 cl_int error_code;
248
249 // Constructor sets 'error_code' if any occurs.
ClContext(void * user_context)250 ALWAYS_INLINE ClContext(void *user_context)
251 : user_context(user_context),
252 context(NULL),
253 cmd_queue(NULL),
254 error_code(CL_SUCCESS) {
255 if (clCreateContext == NULL) {
256 load_libopencl(user_context);
257 }
258
259 #ifdef DEBUG_RUNTIME
260 halide_start_clock(user_context);
261 #endif
262
263 error_code = halide_acquire_cl_context(user_context, &context, &cmd_queue);
264 // don't abort: that would prevent host_supports_device_api() from being able work properly.
265 if (!context || !cmd_queue) {
266 error(user_context) << "OpenCL: null context or cmd_queue";
267 error_code = -1;
268 }
269 }
270
~ClContext()271 ALWAYS_INLINE ~ClContext() {
272 halide_release_cl_context(user_context);
273 }
274 };
275
276 // OpenCL doesn't support creating sub-buffers from some-buffers. In
277 // order to support more generalized (and frankly, minimally useful)
278 // crop behavior, we store a cl_mem and an offset and then create
279 // sub-buffers as needed.
280 struct device_handle {
281 // Important: order these to avoid any padding between fields;
282 // some Win32 compiler optimizer configurations can inconsistently
283 // insert padding otherwise.
284 uint64_t offset;
285 cl_mem mem;
286 };
287
288 // Structure to hold the state of a module attached to the context.
289 // Also used as a linked-list to keep track of all the different
290 // modules that are attached to a context in order to release them all
291 // when then context is released.
292 struct module_state {
293 cl_program program;
294 module_state *next;
295 };
296 WEAK module_state *state_list = NULL;
297
validate_device_pointer(void * user_context,halide_buffer_t * buf,size_t size=0)298 WEAK bool validate_device_pointer(void *user_context, halide_buffer_t *buf, size_t size = 0) {
299 if (buf->device == 0) {
300 return true;
301 }
302
303 // We may call this in situations where we haven't loaded the
304 // OpenCL API yet.
305 if (!clGetMemObjectInfo) {
306 load_libopencl(user_context);
307 }
308
309 cl_mem dev_ptr = ((device_handle *)buf->device)->mem;
310 uint64_t offset = ((device_handle *)buf->device)->offset;
311
312 size_t real_size;
313 cl_int result = clGetMemObjectInfo(dev_ptr, CL_MEM_SIZE, sizeof(size_t), &real_size, NULL);
314 if (result != CL_SUCCESS) {
315 error(user_context) << "CL: Bad device pointer " << (void *)dev_ptr
316 << ": clGetMemObjectInfo returned "
317 << get_opencl_error_name(result);
318 return false;
319 }
320
321 debug(user_context) << "CL: validate " << (void *)dev_ptr << " offset: " << offset
322 << ": asked for " << (uint64_t)size
323 << ", actual allocated " << (uint64_t)real_size << "\n";
324
325 if (size) {
326 halide_assert(user_context, real_size >= (size + offset) && "Validating pointer with insufficient size");
327 }
328 return true;
329 }
330
331 // Initializes the context used by the default implementation
332 // of halide_acquire_context.
create_opencl_context(void * user_context,cl_context * ctx,cl_command_queue * q)333 WEAK int create_opencl_context(void *user_context, cl_context *ctx, cl_command_queue *q) {
334 debug(user_context)
335 << " create_opencl_context (user_context: " << user_context << ")\n";
336
337 halide_assert(user_context, ctx != NULL && *ctx == NULL);
338 halide_assert(user_context, q != NULL && *q == NULL);
339
340 if (clGetPlatformIDs == NULL) {
341 error(user_context) << "CL: clGetPlatformIDs not found\n";
342 return -1;
343 }
344
345 cl_int err = 0;
346
347 const cl_uint max_platforms = 4;
348 cl_platform_id platforms[max_platforms];
349 cl_uint platform_count = 0;
350
351 err = clGetPlatformIDs(max_platforms, platforms, &platform_count);
352 if (err != CL_SUCCESS) {
353 error(user_context) << "CL: clGetPlatformIDs failed: "
354 << get_opencl_error_name(err) << " " << err;
355 return err;
356 }
357
358 cl_platform_id platform = NULL;
359
360 // Find the requested platform, or the first if none specified.
361 const char *name = halide_opencl_get_platform_name(user_context);
362 if (name != NULL) {
363 for (cl_uint i = 0; i < platform_count; ++i) {
364 const cl_uint max_platform_name = 256;
365 char platform_name[max_platform_name];
366 err = clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, max_platform_name, platform_name, NULL);
367 if (err != CL_SUCCESS) continue;
368 debug(user_context) << "CL: platform " << i << " " << platform_name << "\n";
369
370 // A platform matches the request if it is a substring of the platform name.
371 if (strstr(platform_name, name)) {
372 platform = platforms[i];
373 break;
374 }
375 }
376 } else if (platform_count > 0) {
377 platform = platforms[0];
378 }
379 if (platform == NULL) {
380 error(user_context) << "CL: Failed to find platform\n";
381 return CL_INVALID_PLATFORM;
382 }
383
384 #ifdef DEBUG_RUNTIME
385 const cl_uint max_platform_name = 256;
386 char platform_name[max_platform_name];
387 err = clGetPlatformInfo(platform, CL_PLATFORM_NAME, max_platform_name, platform_name, NULL);
388 if (err != CL_SUCCESS) {
389 debug(user_context) << " clGetPlatformInfo(CL_PLATFORM_NAME) failed: "
390 << get_opencl_error_name(err) << "\n";
391 // This is just debug info, report the error but don't fail context creation due to it.
392 //return err;
393 } else {
394 debug(user_context) << " Got platform '" << platform_name
395 << "', about to create context (t="
396 << halide_current_time_ns(user_context)
397 << ")\n";
398 }
399 #endif
400
401 // Get the types of devices requested.
402 cl_device_type device_type = 0;
403 const char *dev_type = halide_opencl_get_device_type(user_context);
404 if (dev_type != NULL && *dev_type != '\0') {
405 if (strstr(dev_type, "cpu")) {
406 device_type |= CL_DEVICE_TYPE_CPU;
407 }
408 if (strstr(dev_type, "gpu")) {
409 device_type |= CL_DEVICE_TYPE_GPU;
410 }
411 if (strstr(dev_type, "acc")) {
412 device_type |= CL_DEVICE_TYPE_ACCELERATOR;
413 }
414 }
415 // If no device types are specified, use all the available
416 // devices.
417 if (device_type == 0) {
418 device_type = CL_DEVICE_TYPE_ALL;
419 }
420
421 // Get all the devices of the specified type.
422 const cl_uint maxDevices = 128;
423 cl_device_id devices[maxDevices];
424 cl_uint deviceCount = 0;
425 err = clGetDeviceIDs(platform, device_type, maxDevices, devices, &deviceCount);
426 if (err != CL_SUCCESS) {
427 error(user_context) << "CL: clGetDeviceIDs failed: "
428 << get_opencl_error_name(err);
429 return err;
430 }
431
432 // If the user indicated a specific device index to use, use
433 // that. Note that this is an index within the set of devices
434 // specified by the device type. -1 means select a device
435 // automatically based on core count.
436 int device = halide_get_gpu_device(user_context);
437 if (device == -1 && deviceCount == 1) {
438 device = 0;
439 } else if (device == -1) {
440 debug(user_context) << " Multiple CL devices detected. Selecting the one with the most cores.\n";
441 cl_uint best_core_count = 0;
442 for (cl_uint i = 0; i < deviceCount; i++) {
443 cl_device_id dev = devices[i];
444 cl_uint core_count = 0;
445 err = clGetDeviceInfo(dev, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), &core_count, NULL);
446 if (err != CL_SUCCESS) {
447 debug(user_context) << " Failed to get info on device " << i << "\n";
448 continue;
449 }
450 debug(user_context) << " Device " << i << " has " << core_count << " cores\n";
451 if (core_count >= best_core_count) {
452 device = i;
453 best_core_count = core_count;
454 }
455 }
456 debug(user_context) << " Selected device " << device << "\n";
457 }
458
459 if (device < 0 || device >= (int)deviceCount) {
460 error(user_context) << "CL: Failed to get device: " << device;
461 return CL_DEVICE_NOT_FOUND;
462 }
463
464 cl_device_id dev = devices[device];
465
466 #ifdef DEBUG_RUNTIME
467 // Declare variables for other state we want to query.
468 char device_name[256] = "";
469 char device_vendor[256] = "";
470 char device_profile[256] = "";
471 char device_version[256] = "";
472 char driver_version[256] = "";
473 cl_ulong global_mem_size = 0;
474 cl_ulong max_mem_alloc_size = 0;
475 cl_ulong local_mem_size = 0;
476 cl_uint max_compute_units = 0;
477 size_t max_work_group_size = 0;
478 cl_uint max_work_item_dimensions = 0;
479 size_t max_work_item_sizes[4] = {
480 0,
481 };
482
483 struct {
484 void *dst;
485 size_t sz;
486 cl_device_info param;
487 } infos[] = {
488 {&device_name[0], sizeof(device_name), CL_DEVICE_NAME},
489 {&device_vendor[0], sizeof(device_vendor), CL_DEVICE_VENDOR},
490 {&device_profile[0], sizeof(device_profile), CL_DEVICE_PROFILE},
491 {&device_version[0], sizeof(device_version), CL_DEVICE_VERSION},
492 {&driver_version[0], sizeof(driver_version), CL_DRIVER_VERSION},
493 {&global_mem_size, sizeof(global_mem_size), CL_DEVICE_GLOBAL_MEM_SIZE},
494 {&max_mem_alloc_size, sizeof(max_mem_alloc_size), CL_DEVICE_MAX_MEM_ALLOC_SIZE},
495 {&local_mem_size, sizeof(local_mem_size), CL_DEVICE_LOCAL_MEM_SIZE},
496 {&max_compute_units, sizeof(max_compute_units), CL_DEVICE_MAX_COMPUTE_UNITS},
497 {&max_work_group_size, sizeof(max_work_group_size), CL_DEVICE_MAX_WORK_GROUP_SIZE},
498 {&max_work_item_dimensions, sizeof(max_work_item_dimensions), CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS},
499 {&max_work_item_sizes[0], sizeof(max_work_item_sizes), CL_DEVICE_MAX_WORK_ITEM_SIZES},
500 {NULL}};
501
502 // Do all the queries.
503 for (int i = 0; infos[i].dst; i++) {
504 err = clGetDeviceInfo(dev, infos[i].param, infos[i].sz, infos[i].dst, NULL);
505 if (err != CL_SUCCESS) {
506 error(user_context) << "CL: clGetDeviceInfo failed: "
507 << get_opencl_error_name(err);
508 return err;
509 }
510 }
511
512 debug(user_context)
513 << " device name: " << device_name << "\n"
514 << " device vendor: " << device_vendor << "\n"
515 << " device profile: " << device_profile << "\n"
516 << " global mem size: " << global_mem_size / (1024 * 1024) << " MB\n"
517 << " max mem alloc size: " << max_mem_alloc_size / (1024 * 1024) << " MB\n"
518 << " local mem size: " << local_mem_size << "\n"
519 << " max compute units: " << max_compute_units << "\n"
520 << " max workgroup size: " << (uint64_t)max_work_group_size << "\n"
521 << " max work item dimensions: " << max_work_item_dimensions << "\n"
522 << " max work item sizes: " << (uint64_t)max_work_item_sizes[0]
523 << "x" << (uint64_t)max_work_item_sizes[1]
524 << "x" << (uint64_t)max_work_item_sizes[2]
525 << "x" << (uint64_t)max_work_item_sizes[3] << "\n";
526 #endif
527
528 // Create context and command queue.
529 cl_context_properties properties[] = {CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0};
530 debug(user_context) << " clCreateContext -> ";
531 *ctx = clCreateContext(properties, 1, &dev, NULL, NULL, &err);
532 if (err != CL_SUCCESS) {
533 debug(user_context) << get_opencl_error_name(err);
534 error(user_context) << "CL: clCreateContext failed: "
535 << get_opencl_error_name(err)
536 << ":" << (int)err;
537 return err;
538 } else {
539 debug(user_context) << *ctx << "\n";
540 }
541
542 debug(user_context) << " clCreateCommandQueue ";
543 *q = clCreateCommandQueue(*ctx, dev, 0, &err);
544 if (err != CL_SUCCESS) {
545 debug(user_context) << get_opencl_error_name(err);
546 error(user_context) << "CL: clCreateCommandQueue failed: "
547 << get_opencl_error_name(err);
548 return err;
549 } else {
550 debug(user_context) << *q << "\n";
551 }
552
553 return err;
554 }
555
556 } // namespace OpenCL
557 } // namespace Internal
558 } // namespace Runtime
559 } // namespace Halide
560
561 extern "C" {
562
halide_opencl_device_free(void * user_context,halide_buffer_t * buf)563 WEAK int halide_opencl_device_free(void *user_context, halide_buffer_t *buf) {
564 // halide_opencl_device_free, at present, can be exposed to clients and they
565 // should be allowed to call halide_opencl_device_free on any halide_buffer_t
566 // including ones that have never been used with a GPU.
567 if (buf->device == 0) {
568 return 0;
569 }
570
571 cl_mem dev_ptr = ((device_handle *)buf->device)->mem;
572 halide_assert(user_context, (((device_handle *)buf->device)->offset == 0) && "halide_opencl_device_free on buffer obtained from halide_device_crop");
573
574 debug(user_context)
575 << "CL: halide_opencl_device_free (user_context: " << user_context
576 << ", buf: " << buf << ") cl_mem: " << dev_ptr << "\n";
577
578 ClContext ctx(user_context);
579 if (ctx.error_code != CL_SUCCESS) {
580 return ctx.error_code;
581 }
582
583 #ifdef DEBUG_RUNTIME
584 uint64_t t_before = halide_current_time_ns(user_context);
585 #endif
586
587 halide_assert(user_context, validate_device_pointer(user_context, buf));
588 debug(user_context) << " clReleaseMemObject " << (void *)dev_ptr << "\n";
589 cl_int result = clReleaseMemObject((cl_mem)dev_ptr);
590 // If clReleaseMemObject fails, it is unlikely to succeed in a later call, so
591 // we just end our reference to it regardless.
592 free((device_handle *)buf->device);
593 buf->device = 0;
594 buf->device_interface->impl->release_module();
595 buf->device_interface = NULL;
596 if (result != CL_SUCCESS) {
597 // We may be called as a destructor, so don't raise an error
598 // here.
599 return result;
600 }
601
602 #ifdef DEBUG_RUNTIME
603 uint64_t t_after = halide_current_time_ns(user_context);
604 debug(user_context) << " Time: " << (t_after - t_before) / 1.0e6 << " ms\n";
605 #endif
606
607 return 0;
608 }
609
halide_opencl_initialize_kernels(void * user_context,void ** state_ptr,const char * src,int size)610 WEAK int halide_opencl_initialize_kernels(void *user_context, void **state_ptr, const char *src, int size) {
611 debug(user_context)
612 << "CL: halide_opencl_init_kernels (user_context: " << user_context
613 << ", state_ptr: " << state_ptr
614 << ", program: " << (void *)src
615 << ", size: " << size << "\n";
616
617 ClContext ctx(user_context);
618 if (ctx.error_code != CL_SUCCESS) {
619 return ctx.error_code;
620 }
621
622 #ifdef DEBUG_RUNTIME
623 uint64_t t_before = halide_current_time_ns(user_context);
624 #endif
625
626 // Create the state object if necessary. This only happens once, regardless
627 // of how many times halide_init_kernels/halide_release is called.
628 // halide_release traverses this list and releases the program objects, but
629 // it does not modify the list nodes created/inserted here.
630 module_state **state = (module_state **)state_ptr;
631 if (!(*state)) {
632 *state = (module_state *)malloc(sizeof(module_state));
633 (*state)->program = NULL;
634 (*state)->next = state_list;
635 state_list = *state;
636 }
637
638 // Create the program if necessary. TODO: The program object needs to not
639 // only already exist, but be created for the same context/device as the
640 // calling context/device.
641 if (!(*state && (*state)->program) && size > 1) {
642 cl_int err = 0;
643 cl_device_id dev;
644
645 err = clGetContextInfo(ctx.context, CL_CONTEXT_DEVICES, sizeof(dev), &dev, NULL);
646 if (err != CL_SUCCESS) {
647 error(user_context) << "CL: clGetContextInfo(CL_CONTEXT_DEVICES) failed: "
648 << get_opencl_error_name(err);
649 return err;
650 }
651
652 cl_device_id devices[] = {dev};
653
654 // Get the max constant buffer size supported by this OpenCL implementation.
655 cl_ulong max_constant_buffer_size = 0;
656 err = clGetDeviceInfo(dev, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, sizeof(max_constant_buffer_size), &max_constant_buffer_size, NULL);
657 if (err != CL_SUCCESS) {
658 error(user_context) << "CL: clGetDeviceInfo (CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE) failed: "
659 << get_opencl_error_name(err);
660 return err;
661 }
662 // Get the max number of constant arguments supported by this OpenCL implementation.
663 cl_uint max_constant_args = 0;
664 err = clGetDeviceInfo(dev, CL_DEVICE_MAX_CONSTANT_ARGS, sizeof(max_constant_args), &max_constant_args, NULL);
665 if (err != CL_SUCCESS) {
666 error(user_context) << "CL: clGetDeviceInfo (CL_DEVICE_MAX_CONSTANT_ARGS) failed: "
667 << get_opencl_error_name(err);
668 return err;
669 }
670
671 // Build the compile argument options.
672 stringstream options(user_context);
673 options << "-D MAX_CONSTANT_BUFFER_SIZE=" << max_constant_buffer_size
674 << " -D MAX_CONSTANT_ARGS=" << max_constant_args;
675
676 const char *extra_options = halide_opencl_get_build_options(user_context);
677 options << " " << extra_options;
678
679 const char *sources[] = {src};
680 debug(user_context) << " clCreateProgramWithSource -> ";
681 cl_program program = clCreateProgramWithSource(ctx.context, 1, &sources[0], NULL, &err);
682 if (err != CL_SUCCESS) {
683 debug(user_context) << get_opencl_error_name(err) << "\n";
684 error(user_context) << "CL: clCreateProgramWithSource failed: "
685 << get_opencl_error_name(err);
686 return err;
687 } else {
688 debug(user_context) << (void *)program << "\n";
689 }
690 (*state)->program = program;
691
692 debug(user_context) << " clBuildProgram " << (void *)program
693 << " " << options.str() << "\n";
694 err = clBuildProgram(program, 1, devices, options.str(), NULL, NULL);
695 if (err != CL_SUCCESS) {
696
697 {
698 // Allocate an appropriately sized buffer for the build log.
699 Printer<ErrorPrinter, 8192> p(user_context);
700
701 p << "CL: clBuildProgram failed: "
702 << get_opencl_error_name(err)
703 << "\nBuild Log:\n";
704
705 // Get build log
706 if (clGetProgramBuildInfo(program, dev,
707 CL_PROGRAM_BUILD_LOG,
708 p.capacity() - p.size() - 1, p.dst,
709 NULL) != CL_SUCCESS) {
710 p << "clGetProgramBuildInfo failed";
711 }
712 }
713
714 return err;
715 }
716 }
717
718 #ifdef DEBUG_RUNTIME
719 uint64_t t_after = halide_current_time_ns(user_context);
720 debug(user_context) << " Time: " << (t_after - t_before) / 1.0e6 << " ms\n";
721 #endif
722 return 0;
723 }
724
725 // Used to generate correct timings when tracing
halide_opencl_device_sync(void * user_context,halide_buffer_t *)726 WEAK int halide_opencl_device_sync(void *user_context, halide_buffer_t *) {
727 debug(user_context) << "CL: halide_opencl_device_sync (user_context: " << user_context << ")\n";
728
729 ClContext ctx(user_context);
730 if (ctx.error_code != CL_SUCCESS) {
731 return ctx.error_code;
732 }
733
734 #ifdef DEBUG_RUNTIME
735 uint64_t t_before = halide_current_time_ns(user_context);
736 #endif
737
738 cl_int err = clFinish(ctx.cmd_queue);
739 if (err != CL_SUCCESS) {
740 error(user_context) << "CL: clFinish failed: "
741 << get_opencl_error_name(err);
742 return err;
743 }
744
745 #ifdef DEBUG_RUNTIME
746 uint64_t t_after = halide_current_time_ns(user_context);
747 debug(user_context) << " Time: " << (t_after - t_before) / 1.0e6 << " ms\n";
748 #endif
749
750 return CL_SUCCESS;
751 }
752
halide_opencl_device_release(void * user_context)753 WEAK int halide_opencl_device_release(void *user_context) {
754 debug(user_context)
755 << "CL: halide_opencl_device_release (user_context: " << user_context << ")\n";
756
757 // The ClContext object does not allow the context storage to be modified,
758 // so we use halide_acquire_context directly.
759 int err;
760 cl_context ctx;
761 cl_command_queue q;
762 err = halide_acquire_cl_context(user_context, &ctx, &q, false);
763 if (err != 0) {
764 return err;
765 }
766
767 if (ctx) {
768 err = clFinish(q);
769 halide_assert(user_context, err == CL_SUCCESS);
770
771 // Unload the modules attached to this context. Note that the list
772 // nodes themselves are not freed, only the program objects are
773 // released. Subsequent calls to halide_init_kernels might re-create
774 // the program object using the same list node to store the program
775 // object.
776 module_state *state = state_list;
777 while (state) {
778 if (state->program) {
779 debug(user_context) << " clReleaseProgram " << state->program << "\n";
780 err = clReleaseProgram(state->program);
781 halide_assert(user_context, err == CL_SUCCESS);
782 state->program = NULL;
783 }
784 state = state->next;
785 }
786
787 // Release the context itself, if we created it.
788 if (ctx == context) {
789 debug(user_context) << " clReleaseCommandQueue " << command_queue << "\n";
790 err = clReleaseCommandQueue(command_queue);
791 halide_assert(user_context, err == CL_SUCCESS);
792 command_queue = NULL;
793
794 debug(user_context) << " clReleaseContext " << context << "\n";
795 err = clReleaseContext(context);
796 halide_assert(user_context, err == CL_SUCCESS);
797 context = NULL;
798 }
799 }
800
801 halide_release_cl_context(user_context);
802
803 return 0;
804 }
805
halide_opencl_device_malloc(void * user_context,halide_buffer_t * buf)806 WEAK int halide_opencl_device_malloc(void *user_context, halide_buffer_t *buf) {
807 debug(user_context)
808 << "CL: halide_opencl_device_malloc (user_context: " << user_context
809 << ", buf: " << buf << ")\n";
810
811 ClContext ctx(user_context);
812 if (ctx.error_code != CL_SUCCESS) {
813 return ctx.error_code;
814 }
815
816 size_t size = buf->size_in_bytes();
817 halide_assert(user_context, size != 0);
818 if (buf->device) {
819 halide_assert(user_context, validate_device_pointer(user_context, buf, size));
820 return 0;
821 }
822
823 for (int i = 0; i < buf->dimensions; i++) {
824 halide_assert(user_context, buf->dim[i].stride >= 0);
825 }
826
827 debug(user_context) << " allocating " << *buf << "\n";
828
829 #ifdef DEBUG_RUNTIME
830 uint64_t t_before = halide_current_time_ns(user_context);
831 #endif
832
833 device_handle *dev_handle = (device_handle *)malloc(sizeof(device_handle));
834 if (dev_handle == NULL) {
835 return CL_OUT_OF_HOST_MEMORY;
836 }
837
838 cl_int err;
839 debug(user_context) << " clCreateBuffer -> " << (int)size << " ";
840 cl_mem dev_ptr = clCreateBuffer(ctx.context, CL_MEM_READ_WRITE, size, NULL, &err);
841 if (err != CL_SUCCESS || dev_ptr == 0) {
842 debug(user_context) << get_opencl_error_name(err) << "\n";
843 error(user_context) << "CL: clCreateBuffer failed: "
844 << get_opencl_error_name(err);
845 free(dev_handle);
846 return err;
847 } else {
848 debug(user_context) << (void *)dev_ptr << " device_handle: " << dev_handle << "\n";
849 }
850
851 dev_handle->mem = dev_ptr;
852 dev_handle->offset = 0;
853 buf->device = (uint64_t)dev_handle;
854 buf->device_interface = &opencl_device_interface;
855 buf->device_interface->impl->use_module();
856
857 debug(user_context)
858 << " Allocated device buffer " << (void *)buf->device
859 << " for buffer " << buf << "\n";
860
861 halide_assert(user_context, validate_device_pointer(user_context, buf, size));
862
863 #ifdef DEBUG_RUNTIME
864 uint64_t t_after = halide_current_time_ns(user_context);
865 debug(user_context) << " Time: " << (t_after - t_before) / 1.0e6 << " ms\n";
866 #endif
867
868 return CL_SUCCESS;
869 }
870
871 namespace {
opencl_do_multidimensional_copy(void * user_context,ClContext & ctx,const device_copy & c,int64_t src_idx,int64_t dst_idx,int d,bool from_host,bool to_host)872 WEAK int opencl_do_multidimensional_copy(void *user_context, ClContext &ctx,
873 const device_copy &c,
874 int64_t src_idx, int64_t dst_idx,
875 int d, bool from_host, bool to_host) {
876 if (d > MAX_COPY_DIMS) {
877 error(user_context) << "Buffer has too many dimensions to copy to/from GPU\n";
878 return -1;
879 } else if (d == 0) {
880 cl_int err = 0;
881
882 debug(user_context) << " from " << (from_host ? "host" : "device")
883 << " to " << (to_host ? "host" : "device") << ", "
884 << (void *)c.src << " + " << src_idx
885 << " -> " << (void *)c.dst << " + " << dst_idx
886 << ", " << c.chunk_size << " bytes\n";
887 if (!from_host && to_host) {
888 err = clEnqueueReadBuffer(ctx.cmd_queue, ((device_handle *)c.src)->mem,
889 CL_FALSE, src_idx + ((device_handle *)c.src)->offset, c.chunk_size, (void *)(c.dst + dst_idx),
890 0, NULL, NULL);
891 } else if (from_host && !to_host) {
892 err = clEnqueueWriteBuffer(ctx.cmd_queue, ((device_handle *)c.dst)->mem,
893 CL_FALSE, dst_idx + ((device_handle *)c.dst)->offset, c.chunk_size, (void *)(c.src + src_idx),
894 0, NULL, NULL);
895 } else if (!from_host && !to_host) {
896 err = clEnqueueCopyBuffer(ctx.cmd_queue, ((device_handle *)c.src)->mem, ((device_handle *)c.dst)->mem,
897 src_idx + ((device_handle *)c.src)->offset, dst_idx + ((device_handle *)c.dst)->offset,
898 c.chunk_size, 0, NULL, NULL);
899 } else if ((c.dst + dst_idx) != (c.src + src_idx)) {
900 // Could reach here if a user called directly into the
901 // opencl API for a device->host copy on a source buffer
902 // with device_dirty = false.
903 memcpy((void *)(c.dst + dst_idx), (void *)(c.src + src_idx), c.chunk_size);
904 }
905
906 if (err) {
907 error(user_context) << "CL: buffer copy failed: " << get_opencl_error_name(err);
908 return (int)err;
909 }
910 } else {
911 ssize_t src_off = 0, dst_off = 0;
912 for (int i = 0; i < (int)c.extent[d - 1]; i++) {
913 int err = opencl_do_multidimensional_copy(user_context, ctx, c,
914 src_idx + src_off, dst_idx + dst_off,
915 d - 1, from_host, to_host);
916 dst_off += c.dst_stride_bytes[d - 1];
917 src_off += c.src_stride_bytes[d - 1];
918 if (err) {
919 return err;
920 }
921 }
922 }
923 return 0;
924 }
925 } // namespace
926
halide_opencl_buffer_copy(void * user_context,struct halide_buffer_t * src,const struct halide_device_interface_t * dst_device_interface,struct halide_buffer_t * dst)927 WEAK int halide_opencl_buffer_copy(void *user_context, struct halide_buffer_t *src,
928 const struct halide_device_interface_t *dst_device_interface,
929 struct halide_buffer_t *dst) {
930 // We only handle copies to opencl or to host
931 halide_assert(user_context, dst_device_interface == NULL ||
932 dst_device_interface == &opencl_device_interface);
933
934 if ((src->device_dirty() || src->host == NULL) &&
935 src->device_interface != &opencl_device_interface) {
936 halide_assert(user_context, dst_device_interface == &opencl_device_interface);
937 // This is handled at the higher level.
938 return halide_error_code_incompatible_device_interface;
939 }
940
941 bool from_host = (src->device_interface != &opencl_device_interface) ||
942 (src->device == 0) ||
943 (src->host_dirty() && src->host != NULL);
944 bool to_host = !dst_device_interface;
945
946 halide_assert(user_context, from_host || src->device);
947 halide_assert(user_context, to_host || dst->device);
948
949 device_copy c = make_buffer_copy(src, from_host, dst, to_host);
950
951 int err = 0;
952 {
953 ClContext ctx(user_context);
954 if (ctx.error_code != CL_SUCCESS) {
955 return ctx.error_code;
956 }
957
958 debug(user_context)
959 << "CL: halide_opencl_buffer_copy (user_context: " << user_context
960 << ", src: " << src << ", dst: " << dst << ")\n";
961
962 #ifdef DEBUG_RUNTIME
963 uint64_t t_before = halide_current_time_ns(user_context);
964 if (!from_host) {
965 halide_assert(user_context, validate_device_pointer(user_context, src));
966 }
967 if (!to_host) {
968 halide_assert(user_context, validate_device_pointer(user_context, dst));
969 }
970 #endif
971
972 err = opencl_do_multidimensional_copy(user_context, ctx, c, c.src_begin, 0, dst->dimensions, from_host, to_host);
973
974 // The reads/writes above are all non-blocking, so empty the command
975 // queue before we proceed so that other host code won't write
976 // to the buffer while the above writes are still running.
977 clFinish(ctx.cmd_queue);
978
979 #ifdef DEBUG_RUNTIME
980 uint64_t t_after = halide_current_time_ns(user_context);
981 debug(user_context) << " Time: " << (t_after - t_before) / 1.0e6 << " ms\n";
982 #endif
983 }
984
985 return err;
986 }
987
halide_opencl_copy_to_device(void * user_context,halide_buffer_t * buf)988 WEAK int halide_opencl_copy_to_device(void *user_context, halide_buffer_t *buf) {
989 return halide_opencl_buffer_copy(user_context, buf, &opencl_device_interface, buf);
990 }
991
halide_opencl_copy_to_host(void * user_context,halide_buffer_t * buf)992 WEAK int halide_opencl_copy_to_host(void *user_context, halide_buffer_t *buf) {
993 return halide_opencl_buffer_copy(user_context, buf, NULL, buf);
994 }
995
halide_opencl_run(void * user_context,void * state_ptr,const char * entry_name,int blocksX,int blocksY,int blocksZ,int threadsX,int threadsY,int threadsZ,int shared_mem_bytes,size_t arg_sizes[],void * args[],int8_t arg_is_buffer[],int num_attributes,float * vertex_buffer,int num_coords_dim0,int num_coords_dim1)996 WEAK int halide_opencl_run(void *user_context,
997 void *state_ptr,
998 const char *entry_name,
999 int blocksX, int blocksY, int blocksZ,
1000 int threadsX, int threadsY, int threadsZ,
1001 int shared_mem_bytes,
1002 size_t arg_sizes[],
1003 void *args[],
1004 int8_t arg_is_buffer[],
1005 int num_attributes,
1006 float *vertex_buffer,
1007 int num_coords_dim0,
1008 int num_coords_dim1) {
1009 debug(user_context)
1010 << "CL: halide_opencl_run (user_context: " << user_context << ", "
1011 << "entry: " << entry_name << ", "
1012 << "blocks: " << blocksX << "x" << blocksY << "x" << blocksZ << ", "
1013 << "threads: " << threadsX << "x" << threadsY << "x" << threadsZ << ", "
1014 << "shmem: " << shared_mem_bytes << "\n";
1015
1016 cl_int err;
1017 ClContext ctx(user_context);
1018 if (ctx.error_code != CL_SUCCESS) {
1019 return ctx.error_code;
1020 }
1021
1022 #ifdef DEBUG_RUNTIME
1023 uint64_t t_before = halide_current_time_ns(user_context);
1024 #endif
1025
1026 // Create kernel object for entry_name from the program for this module.
1027 halide_assert(user_context, state_ptr);
1028 cl_program program = ((module_state *)state_ptr)->program;
1029
1030 halide_assert(user_context, program);
1031 debug(user_context) << " clCreateKernel " << entry_name << " -> ";
1032 cl_kernel f = clCreateKernel(program, entry_name, &err);
1033 if (err != CL_SUCCESS) {
1034 debug(user_context) << get_opencl_error_name(err) << "\n";
1035 error(user_context) << "CL: clCreateKernel " << entry_name << " failed: "
1036 << get_opencl_error_name(err) << "\n";
1037 return err;
1038 } else {
1039 #ifdef DEBUG_RUNTIME
1040 uint64_t t_create_kernel = halide_current_time_ns(user_context);
1041 debug(user_context) << " Time: " << (t_create_kernel - t_before) / 1.0e6 << " ms\n";
1042 #endif
1043 }
1044
1045 // Pack dims
1046 size_t global_dim[3] = {(size_t)blocksX * threadsX, (size_t)blocksY * threadsY, (size_t)blocksZ * threadsZ};
1047 size_t local_dim[3] = {(size_t)threadsX, (size_t)threadsY, (size_t)threadsZ};
1048
1049 // Set args
1050 int i = 0;
1051
1052 // Count sub buffers needed for crops.
1053 int sub_buffers_needed = 0;
1054 while (arg_sizes[i] != 0) {
1055 if (arg_is_buffer[i] &&
1056 ((device_handle *)((halide_buffer_t *)args[i])->device)->offset != 0) {
1057 sub_buffers_needed++;
1058 }
1059 i += 1;
1060 }
1061 cl_mem *sub_buffers = NULL;
1062 int sub_buffers_saved = 0;
1063 if (sub_buffers_needed > 0) {
1064 sub_buffers = (cl_mem *)malloc(sizeof(cl_mem) * sub_buffers_needed);
1065 if (sub_buffers == NULL) {
1066 return halide_error_code_out_of_memory;
1067 }
1068 memset(sub_buffers, 0, sizeof(cl_mem) * sub_buffers_needed);
1069 }
1070
1071 i = 0;
1072 while (arg_sizes[i] != 0) {
1073 debug(user_context) << " clSetKernelArg " << i
1074 << " " << (int)arg_sizes[i]
1075 << " [" << (*((void **)args[i])) << " ...] "
1076 << arg_is_buffer[i] << "\n";
1077 void *this_arg = args[i];
1078 cl_int err = CL_SUCCESS;
1079
1080 if (arg_is_buffer[i]) {
1081 halide_assert(user_context, arg_sizes[i] == sizeof(uint64_t));
1082 cl_mem mem = ((device_handle *)((halide_buffer_t *)this_arg)->device)->mem;
1083 uint64_t offset = ((device_handle *)((halide_buffer_t *)this_arg)->device)->offset;
1084
1085 if (offset != 0) {
1086 cl_buffer_region region = {(size_t)offset, ((halide_buffer_t *)this_arg)->size_in_bytes()};
1087 // The sub-buffer encompasses the linear range of addresses that
1088 // span the crop.
1089 mem = clCreateSubBuffer(mem, CL_MEM_READ_WRITE, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err);
1090 sub_buffers[sub_buffers_saved++] = mem;
1091 }
1092 if (err == CL_SUCCESS) {
1093 debug(user_context) << "Mapped dev handle is: " << (void *)mem << "\n";
1094 err = clSetKernelArg(f, i, sizeof(mem), &mem);
1095 }
1096 } else {
1097 err = clSetKernelArg(f, i, arg_sizes[i], this_arg);
1098 }
1099
1100 if (err != CL_SUCCESS) {
1101 error(user_context) << "CL: clSetKernelArg failed: "
1102 << get_opencl_error_name(err);
1103 for (int sub_buf_index = 0; sub_buf_index < sub_buffers_saved; sub_buf_index++) {
1104 clReleaseMemObject(sub_buffers[sub_buf_index]);
1105 }
1106 free(sub_buffers);
1107 return err;
1108 }
1109 i++;
1110 }
1111 // Set the shared mem buffer last
1112 // Always set at least 1 byte of shmem, to keep the launch happy
1113 debug(user_context)
1114 << " clSetKernelArg " << i << " " << shared_mem_bytes << " [NULL]\n";
1115 err = clSetKernelArg(f, i, (shared_mem_bytes > 0) ? shared_mem_bytes : 1, NULL);
1116 if (err != CL_SUCCESS) {
1117 error(user_context) << "CL: clSetKernelArg failed "
1118 << get_opencl_error_name(err);
1119 return err;
1120 }
1121
1122 // Launch kernel
1123 debug(user_context)
1124 << " clEnqueueNDRangeKernel "
1125 << blocksX << "x" << blocksY << "x" << blocksZ << ", "
1126 << threadsX << "x" << threadsY << "x" << threadsZ << " -> ";
1127 err = clEnqueueNDRangeKernel(ctx.cmd_queue, f,
1128 // NDRange
1129 3, NULL, global_dim, local_dim,
1130 // Events
1131 0, NULL, NULL);
1132 debug(user_context) << get_opencl_error_name(err) << "\n";
1133
1134 // Now that the kernel is enqueued, OpenCL is holding its own
1135 // references to sub buffers and the local ones can be released.
1136 for (int sub_buf_index = 0; sub_buf_index < sub_buffers_saved; sub_buf_index++) {
1137 clReleaseMemObject(sub_buffers[sub_buf_index]);
1138 }
1139 free(sub_buffers);
1140
1141 if (err != CL_SUCCESS) {
1142 error(user_context) << "CL: clEnqueueNDRangeKernel failed: "
1143 << get_opencl_error_name(err) << "\n";
1144 return err;
1145 }
1146
1147 debug(user_context) << " Releasing kernel " << (void *)f << "\n";
1148 clReleaseKernel(f);
1149 debug(user_context) << " clReleaseKernel finished" << (void *)f << "\n";
1150
1151 #ifdef DEBUG_RUNTIME
1152 err = clFinish(ctx.cmd_queue);
1153 if (err != CL_SUCCESS) {
1154 error(user_context) << "CL: clFinish failed (" << err << ")\n";
1155 return err;
1156 }
1157 uint64_t t_after = halide_current_time_ns(user_context);
1158 debug(user_context) << " Time: " << (t_after - t_before) / 1.0e6 << " ms\n";
1159 #endif
1160 return 0;
1161 }
1162
halide_opencl_device_and_host_malloc(void * user_context,struct halide_buffer_t * buf)1163 WEAK int halide_opencl_device_and_host_malloc(void *user_context, struct halide_buffer_t *buf) {
1164 return halide_default_device_and_host_malloc(user_context, buf, &opencl_device_interface);
1165 }
1166
halide_opencl_device_and_host_free(void * user_context,struct halide_buffer_t * buf)1167 WEAK int halide_opencl_device_and_host_free(void *user_context, struct halide_buffer_t *buf) {
1168 return halide_default_device_and_host_free(user_context, buf, &opencl_device_interface);
1169 }
1170
halide_opencl_wrap_cl_mem(void * user_context,struct halide_buffer_t * buf,uint64_t mem)1171 WEAK int halide_opencl_wrap_cl_mem(void *user_context, struct halide_buffer_t *buf, uint64_t mem) {
1172 halide_assert(user_context, buf->device == 0);
1173 if (buf->device != 0) {
1174 return -2;
1175 }
1176 device_handle *dev_handle = (device_handle *)malloc(sizeof(device_handle));
1177 if (dev_handle == NULL) {
1178 return halide_error_code_out_of_memory;
1179 }
1180 dev_handle->mem = (cl_mem)mem;
1181 dev_handle->offset = 0;
1182 buf->device = (uint64_t)dev_handle;
1183 buf->device_interface = &opencl_device_interface;
1184 buf->device_interface->impl->use_module();
1185 #ifdef DEBUG_RUNTIME
1186 if (!validate_device_pointer(user_context, buf)) {
1187 free((device_handle *)buf->device);
1188 buf->device = 0;
1189 buf->device_interface->impl->release_module();
1190 buf->device_interface = NULL;
1191 return -3;
1192 }
1193 #endif
1194 return 0;
1195 }
1196
halide_opencl_detach_cl_mem(void * user_context,halide_buffer_t * buf)1197 WEAK int halide_opencl_detach_cl_mem(void *user_context, halide_buffer_t *buf) {
1198 if (buf->device == NULL) {
1199 return 0;
1200 }
1201 halide_assert(user_context, buf->device_interface == &opencl_device_interface);
1202 free((device_handle *)buf->device);
1203 buf->device = 0;
1204 buf->device_interface->impl->release_module();
1205 buf->device_interface = NULL;
1206 return 0;
1207 }
1208
halide_opencl_get_cl_mem(void * user_context,halide_buffer_t * buf)1209 WEAK uintptr_t halide_opencl_get_cl_mem(void *user_context, halide_buffer_t *buf) {
1210 if (buf->device == NULL) {
1211 return 0;
1212 }
1213 halide_assert(user_context, buf->device_interface == &opencl_device_interface);
1214 return (uintptr_t)((device_handle *)buf->device)->mem;
1215 }
1216
halide_opencl_get_crop_offset(void * user_context,halide_buffer_t * buf)1217 WEAK uint64_t halide_opencl_get_crop_offset(void *user_context, halide_buffer_t *buf) {
1218 if (buf->device == NULL) {
1219 return 0;
1220 }
1221 halide_assert(user_context, buf->device_interface == &opencl_device_interface);
1222 return ((device_handle *)buf->device)->offset;
1223 }
1224
1225 namespace {
1226
opencl_device_crop_from_offset(void * user_context,const struct halide_buffer_t * src,int64_t offset,struct halide_buffer_t * dst)1227 WEAK int opencl_device_crop_from_offset(void *user_context,
1228 const struct halide_buffer_t *src,
1229 int64_t offset,
1230 struct halide_buffer_t *dst) {
1231 ClContext ctx(user_context);
1232 if (ctx.error_code != CL_SUCCESS) {
1233 return ctx.error_code;
1234 }
1235
1236 dst->device_interface = src->device_interface;
1237
1238 device_handle *new_dev_handle = (device_handle *)malloc(sizeof(device_handle));
1239 if (new_dev_handle == NULL) {
1240 error(user_context) << "CL: malloc failed making device handle for crop.\n";
1241 return halide_error_code_out_of_memory;
1242 }
1243
1244 clRetainMemObject(((device_handle *)src->device)->mem);
1245 new_dev_handle->mem = ((device_handle *)src->device)->mem;
1246 new_dev_handle->offset = ((device_handle *)src->device)->offset + offset;
1247 dst->device = (uint64_t)new_dev_handle;
1248
1249 return 0;
1250 }
1251
1252 } // namespace
1253
halide_opencl_device_crop(void * user_context,const struct halide_buffer_t * src,struct halide_buffer_t * dst)1254 WEAK int halide_opencl_device_crop(void *user_context,
1255 const struct halide_buffer_t *src,
1256 struct halide_buffer_t *dst) {
1257 const int64_t offset = calc_device_crop_byte_offset(src, dst);
1258 return opencl_device_crop_from_offset(user_context, src, offset, dst);
1259 }
1260
halide_opencl_device_slice(void * user_context,const struct halide_buffer_t * src,int slice_dim,int slice_pos,struct halide_buffer_t * dst)1261 WEAK int halide_opencl_device_slice(void *user_context,
1262 const struct halide_buffer_t *src,
1263 int slice_dim,
1264 int slice_pos,
1265 struct halide_buffer_t *dst) {
1266 const int64_t offset = calc_device_slice_byte_offset(src, slice_dim, slice_pos);
1267 return opencl_device_crop_from_offset(user_context, src, offset, dst);
1268 }
1269
halide_opencl_device_release_crop(void * user_context,struct halide_buffer_t * buf)1270 WEAK int halide_opencl_device_release_crop(void *user_context,
1271 struct halide_buffer_t *buf) {
1272 // Basically the same code as in halide_opencl_device_free, but with
1273 // enough differences to require separate code.
1274
1275 cl_mem dev_ptr = ((device_handle *)buf->device)->mem;
1276
1277 debug(user_context)
1278 << "CL: halide_opencl_device_release_crop(user_context: " << user_context
1279 << ", buf: " << buf << ") cl_mem: " << dev_ptr << " offset: " << ((device_handle *)buf->device)->offset << "\n";
1280
1281 ClContext ctx(user_context);
1282 if (ctx.error_code != CL_SUCCESS) {
1283 return ctx.error_code;
1284 }
1285
1286 #ifdef DEBUG_RUNTIME
1287 uint64_t t_before = halide_current_time_ns(user_context);
1288 #endif
1289
1290 halide_assert(user_context, validate_device_pointer(user_context, buf));
1291 debug(user_context) << " clReleaseMemObject " << (void *)dev_ptr << "\n";
1292 // Sub-buffers are released with clReleaseMemObject
1293 cl_int result = clReleaseMemObject((cl_mem)dev_ptr);
1294 free((device_handle *)buf->device);
1295 if (result != CL_SUCCESS) {
1296 // We may be called as a destructor, so don't raise an error
1297 // here.
1298 return result;
1299 }
1300
1301 #ifdef DEBUG_RUNTIME
1302 uint64_t t_after = halide_current_time_ns(user_context);
1303 debug(user_context) << " Time: " << (t_after - t_before) / 1.0e6 << " ms\n";
1304 #endif
1305
1306 return 0;
1307 }
1308
halide_opencl_device_interface()1309 WEAK const struct halide_device_interface_t *halide_opencl_device_interface() {
1310 return &opencl_device_interface;
1311 }
1312
1313 namespace {
halide_opencl_cleanup()1314 WEAK __attribute__((destructor)) void halide_opencl_cleanup() {
1315 halide_opencl_device_release(NULL);
1316 }
1317 } // namespace
1318
1319 } // extern "C" linkage
1320
1321 namespace Halide {
1322 namespace Runtime {
1323 namespace Internal {
1324 namespace OpenCL {
get_opencl_error_name(cl_int err)1325 WEAK const char *get_opencl_error_name(cl_int err) {
1326 switch (err) {
1327 case CL_SUCCESS:
1328 return "CL_SUCCESS";
1329 case CL_DEVICE_NOT_FOUND:
1330 return "CL_DEVICE_NOT_FOUND";
1331 case CL_DEVICE_NOT_AVAILABLE:
1332 return "CL_DEVICE_NOT_AVAILABLE";
1333 case CL_COMPILER_NOT_AVAILABLE:
1334 return "CL_COMPILER_NOT_AVAILABLE";
1335 case CL_MEM_OBJECT_ALLOCATION_FAILURE:
1336 return "CL_MEM_OBJECT_ALLOCATION_FAILURE";
1337 case CL_OUT_OF_RESOURCES:
1338 return "CL_OUT_OF_RESOURCES";
1339 case CL_OUT_OF_HOST_MEMORY:
1340 return "CL_OUT_OF_HOST_MEMORY";
1341 case CL_PROFILING_INFO_NOT_AVAILABLE:
1342 return "CL_PROFILING_INFO_NOT_AVAILABLE";
1343 case CL_MEM_COPY_OVERLAP:
1344 return "CL_MEM_COPY_OVERLAP";
1345 case CL_IMAGE_FORMAT_MISMATCH:
1346 return "CL_IMAGE_FORMAT_MISMATCH";
1347 case CL_IMAGE_FORMAT_NOT_SUPPORTED:
1348 return "CL_IMAGE_FORMAT_NOT_SUPPORTED";
1349 case CL_BUILD_PROGRAM_FAILURE:
1350 return "CL_BUILD_PROGRAM_FAILURE";
1351 case CL_MAP_FAILURE:
1352 return "CL_MAP_FAILURE";
1353 case CL_MISALIGNED_SUB_BUFFER_OFFSET:
1354 return "CL_MISALIGNED_SUB_BUFFER_OFFSET";
1355 case CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST:
1356 return "CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST";
1357 case CL_COMPILE_PROGRAM_FAILURE:
1358 return "CL_COMPILE_PROGRAM_FAILURE";
1359 case CL_LINKER_NOT_AVAILABLE:
1360 return "CL_LINKER_NOT_AVAILABLE";
1361 case CL_LINK_PROGRAM_FAILURE:
1362 return "CL_LINK_PROGRAM_FAILURE";
1363 case CL_DEVICE_PARTITION_FAILED:
1364 return "CL_DEVICE_PARTITION_FAILED";
1365 case CL_KERNEL_ARG_INFO_NOT_AVAILABLE:
1366 return "CL_KERNEL_ARG_INFO_NOT_AVAILABLE";
1367 case CL_INVALID_VALUE:
1368 return "CL_INVALID_VALUE";
1369 case CL_INVALID_DEVICE_TYPE:
1370 return "CL_INVALID_DEVICE_TYPE";
1371 case CL_INVALID_PLATFORM:
1372 return "CL_INVALID_PLATFORM";
1373 case CL_INVALID_DEVICE:
1374 return "CL_INVALID_DEVICE";
1375 case CL_INVALID_CONTEXT:
1376 return "CL_INVALID_CONTEXT";
1377 case CL_INVALID_QUEUE_PROPERTIES:
1378 return "CL_INVALID_QUEUE_PROPERTIES";
1379 case CL_INVALID_COMMAND_QUEUE:
1380 return "CL_INVALID_COMMAND_QUEUE";
1381 case CL_INVALID_HOST_PTR:
1382 return "CL_INVALID_HOST_PTR";
1383 case CL_INVALID_MEM_OBJECT:
1384 return "CL_INVALID_MEM_OBJECT";
1385 case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR:
1386 return "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR";
1387 case CL_INVALID_IMAGE_SIZE:
1388 return "CL_INVALID_IMAGE_SIZE";
1389 case CL_INVALID_SAMPLER:
1390 return "CL_INVALID_SAMPLER";
1391 case CL_INVALID_BINARY:
1392 return "CL_INVALID_BINARY";
1393 case CL_INVALID_BUILD_OPTIONS:
1394 return "CL_INVALID_BUILD_OPTIONS";
1395 case CL_INVALID_PROGRAM:
1396 return "CL_INVALID_PROGRAM";
1397 case CL_INVALID_PROGRAM_EXECUTABLE:
1398 return "CL_INVALID_PROGRAM_EXECUTABLE";
1399 case CL_INVALID_KERNEL_NAME:
1400 return "CL_INVALID_KERNEL_NAME";
1401 case CL_INVALID_KERNEL_DEFINITION:
1402 return "CL_INVALID_KERNEL_DEFINITION";
1403 case CL_INVALID_KERNEL:
1404 return "CL_INVALID_KERNEL";
1405 case CL_INVALID_ARG_INDEX:
1406 return "CL_INVALID_ARG_INDEX";
1407 case CL_INVALID_ARG_VALUE:
1408 return "CL_INVALID_ARG_VALUE";
1409 case CL_INVALID_ARG_SIZE:
1410 return "CL_INVALID_ARG_SIZE";
1411 case CL_INVALID_KERNEL_ARGS:
1412 return "CL_INVALID_KERNEL_ARGS";
1413 case CL_INVALID_WORK_DIMENSION:
1414 return "CL_INVALID_WORK_DIMENSION";
1415 case CL_INVALID_WORK_GROUP_SIZE:
1416 return "CL_INVALID_WORK_GROUP_SIZE";
1417 case CL_INVALID_WORK_ITEM_SIZE:
1418 return "CL_INVALID_WORK_ITEM_SIZE";
1419 case CL_INVALID_GLOBAL_OFFSET:
1420 return "CL_INVALID_GLOBAL_OFFSET";
1421 case CL_INVALID_EVENT_WAIT_LIST:
1422 return "CL_INVALID_EVENT_WAIT_LIST";
1423 case CL_INVALID_EVENT:
1424 return "CL_INVALID_EVENT";
1425 case CL_INVALID_OPERATION:
1426 return "CL_INVALID_OPERATION";
1427 case CL_INVALID_GL_OBJECT:
1428 return "CL_INVALID_GL_OBJECT";
1429 case CL_INVALID_BUFFER_SIZE:
1430 return "CL_INVALID_BUFFER_SIZE";
1431 case CL_INVALID_MIP_LEVEL:
1432 return "CL_INVALID_MIP_LEVEL";
1433 case CL_INVALID_GLOBAL_WORK_SIZE:
1434 return "CL_INVALID_GLOBAL_WORK_SIZE";
1435 case CL_INVALID_PROPERTY:
1436 return "CL_INVALID_PROPERTY";
1437 case CL_INVALID_IMAGE_DESCRIPTOR:
1438 return "CL_INVALID_IMAGE_DESCRIPTOR";
1439 case CL_INVALID_COMPILER_OPTIONS:
1440 return "CL_INVALID_COMPILER_OPTIONS";
1441 case CL_INVALID_LINKER_OPTIONS:
1442 return "CL_INVALID_LINKER_OPTIONS";
1443 case CL_INVALID_DEVICE_PARTITION_COUNT:
1444 return "CL_INVALID_DEVICE_PARTITION_COUNT";
1445 default:
1446 return "<Unknown error>";
1447 }
1448 }
1449
1450 WEAK halide_device_interface_impl_t opencl_device_interface_impl = {
1451 halide_use_jit_module,
1452 halide_release_jit_module,
1453 halide_opencl_device_malloc,
1454 halide_opencl_device_free,
1455 halide_opencl_device_sync,
1456 halide_opencl_device_release,
1457 halide_opencl_copy_to_host,
1458 halide_opencl_copy_to_device,
1459 halide_opencl_device_and_host_malloc,
1460 halide_opencl_device_and_host_free,
1461 halide_opencl_buffer_copy,
1462 halide_opencl_device_crop,
1463 halide_opencl_device_slice,
1464 halide_opencl_device_release_crop,
1465 halide_opencl_wrap_cl_mem,
1466 halide_opencl_detach_cl_mem,
1467 };
1468
1469 WEAK halide_device_interface_t opencl_device_interface = {
1470 halide_device_malloc,
1471 halide_device_free,
1472 halide_device_sync,
1473 halide_device_release,
1474 halide_copy_to_host,
1475 halide_copy_to_device,
1476 halide_device_and_host_malloc,
1477 halide_device_and_host_free,
1478 halide_buffer_copy,
1479 halide_device_crop,
1480 halide_device_slice,
1481 halide_device_release_crop,
1482 halide_device_wrap_native,
1483 halide_device_detach_native,
1484 NULL,
1485 &opencl_device_interface_impl};
1486
1487 } // namespace OpenCL
1488 } // namespace Internal
1489 } // namespace Runtime
1490 } // namespace Halide
1491