1 /* pocl_cl.h - local runtime library declarations. 2 3 Copyright (c) 2011 Universidad Rey Juan Carlos 4 2011-2019 Pekka Jääskeläinen 5 6 Permission is hereby granted, free of charge, to any person obtaining a copy 7 of this software and associated documentation files (the "Software"), to 8 deal in the Software without restriction, including without limitation the 9 rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 10 sell copies of the Software, and to permit persons to whom the Software is 11 furnished to do so, subject to the following conditions: 12 13 The above copyright notice and this permission notice shall be included in 14 all copies or substantial portions of the Software. 15 16 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 21 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 22 IN THE SOFTWARE. 
*/

#ifndef POCL_CL_H
#define POCL_CL_H

#include "config.h"

#include <assert.h>
#include <stdio.h>

#ifdef HAVE_VALGRIND
#include <valgrind/helgrind.h>
#endif

#ifdef _WIN32
# include "vccompat.hpp"
#endif
/* To get adaptive mutex type */
#ifndef __USE_GNU
#define __USE_GNU
#endif

#include <pthread.h>
#ifdef HAVE_CLOCK_GETTIME
#include <time.h>
#endif

/* Thin aliases over pthreads so the locking/threading primitives used
   throughout the runtime could be retargeted to another backend. */
typedef pthread_mutex_t pocl_lock_t;
typedef pthread_cond_t pocl_cond_t;
typedef pthread_t pocl_thread_t;
#define POCL_LOCK_INITIALIZER PTHREAD_MUTEX_INITIALIZER

#ifdef BUILD_ICD
# include "pocl_icd.h"
#endif
#include "pocl.h"
#include "pocl_tracing.h"
#include "pocl_debug.h"
#include "pocl_hash.h"
#include "pocl_runtime_config.h"
#include "common.h"

/* __func__ is C99; provide a best-effort fallback on older compilers. */
#if __STDC_VERSION__ < 199901L
# if __GNUC__ >= 2
#  define __func__ __PRETTY_FUNCTION__
# else
#  define __func__ UNKNOWN_FUNCTION
# endif
#endif

#if defined(__GNUC__) || defined(__clang__)

/* These return the new value. */
/* See: https://gcc.gnu.org/onlinedocs/gcc-4.1.2/gcc/Atomic-Builtins.html */
#define POCL_ATOMIC_INC(x) __sync_add_and_fetch (&x, 1)
#define POCL_ATOMIC_DEC(x) __sync_sub_and_fetch (&x, 1)
#define POCL_ATOMIC_CAS(ptr, oldval, newval) \
  __sync_val_compare_and_swap (ptr, oldval, newval)

#elif defined(_WIN32)
#define POCL_ATOMIC_INC(x) InterlockedIncrement64 (&x)
#define POCL_ATOMIC_DEC(x) InterlockedDecrement64 (&x)
#define POCL_ATOMIC_CAS(ptr, oldval, newval) \
  InterlockedCompareExchange64 (ptr, newval, oldval)
#else
#error Need atomic_inc() builtin for this compiler
#endif

/* Helgrind annotations: teach Valgrind about the happens-before edges
   created by the reference counting. Compiled to no-ops without Valgrind. */
#ifdef HAVE_VALGRIND
#define VG_REFC_ZERO(var)                                                     \
  ANNOTATE_HAPPENS_AFTER (&var->pocl_refcount);                               \
  ANNOTATE_HAPPENS_BEFORE_FORGET_ALL (&var->pocl_refcount)
#define VG_REFC_NONZERO(var) ANNOTATE_HAPPENS_BEFORE (&var->pocl_refcount)
#else
#define VG_REFC_ZERO(var) (void)0
#define VG_REFC_NONZERO(var) (void)0
#endif

/* Pad/align a member to a cache line to avoid false sharing (Linux only;
   HOST_CPU_CACHELINE_SIZE comes from config.h). */
#ifdef __linux__
#define ALIGN_CACHE(x) x __attribute__ ((aligned (HOST_CPU_CACHELINE_SIZE)))
#else
#define ALIGN_CACHE(x) x
#endif

/* Generic functionality for handling different types of
   OpenCL (host) objects. */

/* Mutex wrappers; the pthread return codes are checked via assert, so
   failures are only caught in non-NDEBUG builds. */
#define POCL_LOCK(__LOCK__)                                                   \
  do                                                                          \
    {                                                                         \
      int r = pthread_mutex_lock (&(__LOCK__));                               \
      assert (r == 0);                                                        \
    }                                                                         \
  while (0)
#define POCL_UNLOCK(__LOCK__)                                                 \
  do                                                                          \
    {                                                                         \
      int r = pthread_mutex_unlock (&(__LOCK__));                             \
      assert (r == 0);                                                        \
    }                                                                         \
  while (0)
#define POCL_INIT_LOCK(__LOCK__)                                              \
  do                                                                          \
    {                                                                         \
      int r = pthread_mutex_init (&(__LOCK__), NULL);                         \
      assert (r == 0);                                                        \
    }                                                                         \
  while (0)
/* We recycle OpenCL objects by not actually freeing them until the
   very end. Thus, the lock should not be destroyed at the refcount 0.
*/ 133 #define POCL_DESTROY_LOCK(__LOCK__) \ 134 do \ 135 { \ 136 int r = pthread_mutex_destroy (&(__LOCK__)); \ 137 assert (r == 0); \ 138 } \ 139 while (0) 140 141 142 /* If available, use an Adaptive mutex for locking in the pthread driver, 143 otherwise fallback to simple mutexes */ 144 #define POCL_FAST_LOCK_T pthread_mutex_t 145 #define POCL_FAST_LOCK(l) POCL_LOCK(l) 146 #define POCL_FAST_UNLOCK(l) POCL_UNLOCK(l) 147 148 #ifdef PTHREAD_ADAPTIVE_MUTEX_INITIALIZER_NP 149 #define POCL_FAST_INIT(l) \ 150 do { \ 151 pthread_mutexattr_t attrs; \ 152 pthread_mutexattr_init (&attrs); \ 153 int r = pthread_mutexattr_settype (&attrs, PTHREAD_MUTEX_ADAPTIVE_NP); \ 154 assert (r == 0); \ 155 pthread_mutex_init(&l, &attrs); \ 156 pthread_mutexattr_destroy(&attrs);\ 157 } while (0) 158 #else 159 #define POCL_FAST_INIT(l) POCL_INIT_LOCK (l) 160 #endif 161 162 #define POCL_FAST_DESTROY(l) POCL_DESTROY_LOCK(l) 163 164 #define POCL_INIT_COND(c) pthread_cond_init (&c, NULL) 165 #define POCL_DESTROY_COND(c) pthread_cond_destroy (&c) 166 #define POCL_SIGNAL_COND(c) pthread_cond_signal (&c) 167 #define POCL_BROADCAST_COND(c) pthread_cond_broadcast (&c) 168 #define POCL_WAIT_COND(c, m) pthread_cond_wait (&c, &m) 169 #define POCL_TIMEDWAIT_COND(c, m, t) pthread_cond_timedwait (&c, &m, &t) 170 171 #define POCL_CREATE_THREAD(thr, func, arg) \ 172 pthread_create (&thr, NULL, func, arg) 173 #define POCL_JOIN_THREAD(thr) pthread_join (thr, NULL) 174 #define POCL_JOIN_THREAD2(thr, res_ptr) pthread_join (thr, res_ptr) 175 #define POCL_EXIT_THREAD(res) pthread_exit (res) 176 177 //############################################################################ 178 179 #ifdef ENABLE_EXTRA_VALIDITY_CHECKS 180 #define POCL_MAGIC_1 0xBE8906A1A83D8D23ULL 181 #define POCL_MAGIC_2 0x071AC830215FD807ULL 182 #define IS_CL_OBJECT_VALID(__OBJ__) \ 183 (((__OBJ__) != NULL) && ((__OBJ__)->magic_1 == POCL_MAGIC_1) \ 184 && ((__OBJ__)->magic_2 == POCL_MAGIC_2)) 185 #define CHECK_VALIDITY_MARKERS \ 186 assert 
((__OBJ__)->magic_1 == POCL_MAGIC_1); \ 187 assert ((__OBJ__)->magic_2 == POCL_MAGIC_2); 188 #define SET_VALIDITY_MARKERS \ 189 (__OBJ__)->magic_1 = POCL_MAGIC_1; \ 190 (__OBJ__)->magic_2 = POCL_MAGIC_2; 191 #define UNSET_VALIDITY_MARKERS \ 192 (__OBJ__)->magic_1 = 0; \ 193 (__OBJ__)->magic_2 = 0; 194 #else 195 #define IS_CL_OBJECT_VALID(__OBJ__) ((__OBJ__) != NULL) 196 #define CHECK_VALIDITY_MARKERS 197 #define SET_VALIDITY_MARKERS 198 #define UNSET_VALIDITY_MARKERS 199 #endif 200 201 #define POCL_LOCK_OBJ(__OBJ__) \ 202 do \ 203 { \ 204 CHECK_VALIDITY_MARKERS; \ 205 POCL_LOCK ((__OBJ__)->pocl_lock); \ 206 assert ((__OBJ__)->pocl_refcount > 0); \ 207 } \ 208 while (0) 209 #define POCL_UNLOCK_OBJ(__OBJ__) \ 210 do \ 211 { \ 212 CHECK_VALIDITY_MARKERS; \ 213 assert ((__OBJ__)->pocl_refcount >= 0); \ 214 POCL_UNLOCK ((__OBJ__)->pocl_lock); \ 215 } \ 216 while (0) 217 218 #define POCL_RELEASE_OBJECT(__OBJ__, __NEW_REFCOUNT__) \ 219 do \ 220 { \ 221 POCL_LOCK_OBJ (__OBJ__); \ 222 __NEW_REFCOUNT__ = --(__OBJ__)->pocl_refcount; \ 223 POCL_UNLOCK_OBJ (__OBJ__); \ 224 } \ 225 while (0) 226 227 #define POCL_RETAIN_OBJECT_UNLOCKED(__OBJ__) \ 228 ++((__OBJ__)->pocl_refcount) 229 230 #define POCL_RETAIN_OBJECT_REFCOUNT(__OBJ__, R) \ 231 do { \ 232 POCL_LOCK_OBJ (__OBJ__); \ 233 R = POCL_RETAIN_OBJECT_UNLOCKED (__OBJ__); \ 234 POCL_UNLOCK_OBJ (__OBJ__); \ 235 } while (0) 236 237 #define POCL_RETAIN_OBJECT(__OBJ__) \ 238 do { \ 239 POCL_LOCK_OBJ (__OBJ__); \ 240 POCL_RETAIN_OBJECT_UNLOCKED (__OBJ__); \ 241 POCL_UNLOCK_OBJ (__OBJ__); \ 242 } while (0) 243 244 245 extern uint64_t last_object_id; 246 247 /* The reference counter is initialized to 1, 248 when it goes to 0 object can be freed. 
*/ 249 #define POCL_INIT_OBJECT_NO_ICD(__OBJ__) \ 250 do \ 251 { \ 252 SET_VALIDITY_MARKERS; \ 253 (__OBJ__)->pocl_refcount = 1; \ 254 POCL_INIT_LOCK ((__OBJ__)->pocl_lock); \ 255 (__OBJ__)->id = POCL_ATOMIC_INC (last_object_id); \ 256 } \ 257 while (0) 258 259 #define POCL_MEM_FREE(F_PTR) \ 260 do { \ 261 free((F_PTR)); \ 262 (F_PTR) = NULL; \ 263 } while (0) 264 265 #ifdef BUILD_ICD 266 /* Most (all?) object must also initialize the ICD field */ 267 # define POCL_INIT_OBJECT(__OBJ__) \ 268 do { \ 269 POCL_INIT_OBJECT_NO_ICD(__OBJ__); \ 270 POCL_INIT_ICD_OBJECT(__OBJ__); \ 271 } while (0) 272 #else 273 # define POCL_INIT_OBJECT(__OBJ__) \ 274 POCL_INIT_OBJECT_NO_ICD(__OBJ__) 275 #endif 276 277 #define POCL_DESTROY_OBJECT(__OBJ__) \ 278 do \ 279 { \ 280 UNSET_VALIDITY_MARKERS; \ 281 POCL_DESTROY_LOCK ((__OBJ__)->pocl_lock); \ 282 } \ 283 while (0) 284 285 /* Declares the generic pocl object attributes inside a struct. */ 286 #define POCL_OBJECT \ 287 uint64_t magic_1; \ 288 uint64_t id; \ 289 pocl_lock_t pocl_lock; \ 290 uint64_t magic_2; \ 291 int pocl_refcount 292 293 #ifdef __APPLE__ 294 /* Note: OSX doesn't support aliases because it doesn't use ELF */ 295 296 # define POname(name) name 297 # define POdeclsym(name) 298 # define POsym(name) 299 # define POsymAlways(name) 300 301 #elif defined(_WIN32) 302 /* Visual Studio does not support this magic either */ 303 # define POname(name) name 304 # define POdeclsym(name) 305 # define POsym(name) 306 # define POsymAlways(name) 307 # define POdeclsym(name) 308 309 #else 310 /* Symbol aliases are supported */ 311 312 # define POname(name) PO##name 313 314 #define POdeclsym(name) __typeof__ (name) PO##name; 315 # define POCL_ALIAS_OPENCL_SYMBOL(name) \ 316 __typeof__(name) name __attribute__((alias ("PO" #name), visibility("default"))); 317 # define POsymAlways(name) POCL_ALIAS_OPENCL_SYMBOL(name) 318 # if !defined(BUILD_ICD) 319 # define POsym(name) POCL_ALIAS_OPENCL_SYMBOL(name) 320 # else 321 # define POsym(name) 322 
# endif 323 324 #endif 325 326 327 /* The ICD compatibility part. This must be first in the objects where 328 * it is used (as the ICD loader assumes that)*/ 329 #ifdef BUILD_ICD 330 # define POCL_ICD_OBJECT struct _cl_icd_dispatch *dispatch; 331 # define POCL_ICD_OBJECT_PLATFORM_ID POCL_ICD_OBJECT 332 # define POsymICD(name) POsym(name) 333 # define POdeclsymICD(name) POdeclsym(name) 334 #else 335 # define POCL_ICD_OBJECT 336 # define POCL_ICD_OBJECT_PLATFORM_ID unsigned long id; 337 # define POsymICD(name) 338 # define POdeclsymICD(name) 339 #endif 340 341 #include "pocl_intfn.h" 342 343 /* fields for cl_kernel -> has_arg_metadata */ 344 #define POCL_HAS_KERNEL_ARG_ADDRESS_QUALIFIER 1 345 #define POCL_HAS_KERNEL_ARG_ACCESS_QUALIFIER 2 346 #define POCL_HAS_KERNEL_ARG_TYPE_NAME 4 347 #define POCL_HAS_KERNEL_ARG_TYPE_QUALIFIER 8 348 #define POCL_HAS_KERNEL_ARG_NAME 16 349 350 /* pocl specific flag, for "hidden" default queues allocated in each context */ 351 #define CL_QUEUE_HIDDEN (1 << 10) 352 353 typedef struct pocl_argument { 354 uint64_t size; 355 /* The "offset" is used to simplify subbuffer handling. 356 * At enqueue time, subbuffers are converted to buffers + offset into them. 357 */ 358 uint64_t offset; 359 void *value; 360 /* 1 if this argument has been set by clSetKernelArg */ 361 char is_set; 362 /* 1 if the argument is read-only according to kernel metadata. 
   So either
   * a buffer with "const" qualifier, or an image with read_only qualifier */
  char is_readonly;
  /* 1 if the argument pointer is SVM direct pointer, not a cl_mem */
  char is_svm;
} pocl_argument;

typedef struct event_node event_node;

/**
 * Enumeration for kernel argument types
 */
typedef enum {
  POCL_ARG_TYPE_NONE = 0,
  POCL_ARG_TYPE_POINTER = 1,
  POCL_ARG_TYPE_IMAGE = 2,
  POCL_ARG_TYPE_SAMPLER = 3,
} pocl_argument_type;

#define ARG_IS_LOCAL(a) (a.address_qualifier == CL_KERNEL_ARG_ADDRESS_LOCAL)
#define ARGP_IS_LOCAL(a) (a->address_qualifier == CL_KERNEL_ARG_ADDRESS_LOCAL)

/* Kernel argument metadata, as reported via clGetKernelArgInfo. */
typedef struct pocl_argument_info {
  char* type_name;
  char* name;
  cl_kernel_arg_address_qualifier address_qualifier;
  cl_kernel_arg_access_qualifier access_qualifier;
  cl_kernel_arg_type_qualifier type_qualifier;
  pocl_argument_type type;
  unsigned type_size;
} pocl_argument_info;

/* represents a single buffer to host memory mapping */

/* The per-driver callback table: one instance is shared by all devices of
 * the same driver type. Optional callbacks are documented as such and may
 * be NULL. */
struct pocl_device_ops {
  const char *device_name;

  /* New driver api extension for out-of-order execution and
     asynchronous devices.
     See this for reference: http://URN.fi/URN:NBN:fi:tty-201412051583
     See basic and pthread driver for reference. */

  /* submit gives the command for the device. The command may be left in the cq
     or stored to the device driver owning the cq. submit is called
     with node->event locked, and must return with it unlocked. */
  void (*submit) (_cl_command_node *node, cl_command_queue cq);

  /* join is called by clFinish and this function blocks until all the enqueued
     commands are finished. Called by the user thread; see notify_cmdq_finished
     for the driver thread counterpart. */
  void (*join) (cl_device_id device, cl_command_queue cq);

  /* flush is called when clFlush is called. This function ensures that
     commands will be eventually executed. It is up to the device what happens
     here, if anything. See basic and pthread for reference.*/
  void (*flush) (cl_device_id device, cl_command_queue cq);

  /* notify is used to communicate to a device driver that an event it has
     been waiting on has been completed. Upon call, both events are locked, and
     must be locked also on return.*/
  void (*notify) (cl_device_id device, cl_event event, cl_event finished);

  /* broadcast is (has to be) called by the device driver when a command is
     completed.
     It is used to broadcast notifications to device drivers waiting
     this event to complete.
     There is a default implementation for this. Use it if there is no need
     to do anything special here.
     The default implementation calls notify(event, target_event) for the
     list of events waiting on 'event'. */
  void (*broadcast) (cl_event event);

  /* wait_event is called by clWaitForEvents() and blocks the execution until
   * the waited event is complete or failed. Called by user threads; see the
   * notify_event_finished() callback for driver thread counterpart.
   * Called (and must return) with unlocked event. */
  void (*wait_event) (cl_device_id device, cl_event event);

  /* update_event is an extra callback called during handling of event status
   * changes, useful if something device specific needs to be done. May be
   * NULL; no need to implement if not needed.
   *
   * Called via pocl_update_event_* functions in pocl_util.c
   * All pocl_update_event_* (except COMPLETE) must be called (and return)
   * with LOCKED event.
   */
  void (*update_event) (cl_device_id device, cl_event event);

  /* free_event_data may be called when event is freed. Event data may only be
     used by the device driver owning the corresponding command.
     No need to implement this if the device does not need any event data. */
  void (*free_event_data) (cl_event event);

  /* Called from driver threads to notify every user thread waiting on
   * command queue finish. See join() for user counterpart.
   * Driver may choose to not implement this, which will result in
   * undefined behaviour in multi-threaded user programs. */
  void (*notify_cmdq_finished) (cl_command_queue cq);

  /* Called from driver threads to notify every user thread waiting on
   * a specific event. See wait_event() for user counterpart.
   * Driver may choose to not implement this, which will result in
   * undefined behaviour in multi-threaded user programs. */
  void (*notify_event_finished) (cl_event event);

  /* /New driver api extension */

  /* Detects & returns the number of available devices the driver finds on the system. */
  unsigned int (*probe) (struct pocl_device_ops *ops);
  /* Device initialization. Parameters:
   * j : progressive index for the devices of the same type
   * device : struct to initialize
   * parameters : optional environment with device-specific parameters
   */
  cl_int (*init) (unsigned j, cl_device_id device, const char *parameters);
  /* uninitializes the driver for a particular device. May free hardware resources. */
  cl_int (*uninit) (unsigned j, cl_device_id device);
  /* reinitializes the driver for a particular device. Called after uninit;
   * the first initialization is done by 'init'. May be NULL */
  cl_int (*reinit) (unsigned j, cl_device_id device);

  /* allocate a buffer in device memory */
  cl_int (*alloc_mem_obj) (cl_device_id device, cl_mem mem_obj, void* host_ptr);
  /* free a device buffer */
  void (*free) (cl_device_id device, cl_mem mem_obj);

  /* return >0 if driver can migrate directly between devices.
   * Priority between devices signalled by larger numbers. */
  int (*can_migrate_d2d) (cl_device_id dest, cl_device_id source);
  /* migrate buffer content directly between devices */
  int (*migrate_d2d) (cl_device_id src_dev,
                      cl_device_id dst_dev,
                      cl_mem mem,
                      pocl_mem_identifier *src_mem_id,
                      pocl_mem_identifier *dst_mem_id);

  /* SVM Ops */
  void (*svm_free) (cl_device_id dev, void *svm_ptr);
  void *(*svm_alloc) (cl_device_id dev, cl_svm_mem_flags flags, size_t size);
  void (*svm_map) (cl_device_id dev, void *svm_ptr);
  void (*svm_unmap) (cl_device_id dev, void *svm_ptr);
  /* these are optional. If the driver needs to do anything to be able
   * to use host memory, it should do it (and undo it) in these callbacks.
   * Currently used by HSA.
   * See pocl_basic_alloc and pocl_basic_free for details. */
  void (*svm_register) (cl_device_id dev, void *host_ptr, size_t size);
  void (*svm_unregister) (cl_device_id dev, void *host_ptr, size_t size);

  /* we can use restrict here, because Spec says overlapping copy should return
   * with CL_MEM_COPY_OVERLAP error. */
  void (*svm_copy) (cl_device_id dev, void *__restrict__ dst,
                    const void *__restrict__ src, size_t size);
  void (*svm_fill) (cl_device_id dev, void *__restrict__ svm_ptr, size_t size,
                    void *__restrict__ pattern, size_t pattern_size);

  /* the following callbacks only deal with buffers (and IMAGE1D_BUFFER which
   * is backed by a buffer), not images. */

  /* clEnqReadBuffer */
  void (*read) (void *data,
                void *__restrict__ dst_host_ptr,
                pocl_mem_identifier * src_mem_id,
                cl_mem src_buf,
                size_t offset,
                size_t size);
  /* clEnqReadBufferRect */
  void (*read_rect) (void *data,
                     void *__restrict__ dst_host_ptr,
                     pocl_mem_identifier * src_mem_id,
                     cl_mem src_buf,
                     const size_t *buffer_origin,
                     const size_t *host_origin,
                     const size_t *region,
                     size_t buffer_row_pitch,
                     size_t buffer_slice_pitch,
                     size_t host_row_pitch,
                     size_t host_slice_pitch);
  /* clEnqWriteBuffer */
  void (*write) (void *data,
                 const void *__restrict__ src_host_ptr,
                 pocl_mem_identifier * dst_mem_id,
                 cl_mem dst_buf,
                 size_t offset,
                 size_t size);
  /* clEnqWriteBufferRect */
  void (*write_rect) (void *data,
                      const void *__restrict__ src_host_ptr,
                      pocl_mem_identifier * dst_mem_id,
                      cl_mem dst_buf,
                      const size_t *buffer_origin,
                      const size_t *host_origin,
                      const size_t *region,
                      size_t buffer_row_pitch,
                      size_t buffer_slice_pitch,
                      size_t host_row_pitch,
                      size_t host_slice_pitch);
  /* clEnqCopyBuffer */
  void (*copy) (void *data,
                pocl_mem_identifier * dst_mem_id,
                cl_mem dst_buf,
                pocl_mem_identifier * src_mem_id,
                cl_mem src_buf,
                size_t dst_offset,
                size_t src_offset,
                size_t size);
  /* clEnqCopyBufferRect */
  void (*copy_rect) (void *data,
                     pocl_mem_identifier * dst_mem_id,
                     cl_mem dst_buf,
                     pocl_mem_identifier * src_mem_id,
                     cl_mem src_buf,
                     const size_t *dst_origin,
                     const size_t *src_origin,
                     const size_t *region,
                     size_t dst_row_pitch,
                     size_t dst_slice_pitch,
                     size_t src_row_pitch,
                     size_t src_slice_pitch);

  /* clEnqCopyBuffer with the cl_pocl_content_size extension.
     This callback is optional */
  void (*copy_with_size) (void *data,
                          pocl_mem_identifier *dst_mem_id,
                          cl_mem dst_buf,
                          pocl_mem_identifier *src_mem_id,
                          cl_mem src_buf,
                          pocl_mem_identifier *content_size_buf_mem_id,
                          cl_mem content_size_buf,
                          size_t dst_offset,
                          size_t src_offset,
                          size_t size);

  /* clEnqFillBuffer */
  void (*memfill) (void *data,
                   pocl_mem_identifier * dst_mem_id,
                   cl_mem dst_buf,
                   size_t size,
                   size_t offset,
                   const void *__restrict__ pattern,
                   size_t pattern_size);

  /* Maps 'size' bytes of device global memory at  + offset to
     host-accessible memory. This might or might not involve copying
     the block from the device. */
  cl_int (*map_mem) (void *data,
                     pocl_mem_identifier * src_mem_id,
                     cl_mem src_buf,
                     mem_mapping_t *map);
  cl_int (*unmap_mem) (void *data,
                       pocl_mem_identifier * dst_mem_id,
                       cl_mem dst_buf,
                       mem_mapping_t *map);

  /* these don't actually do the mapping, only return a pointer
   * where the driver will map in future. Separate API from map/unmap
   * because 1) unlike other driver ops, this is called from the user thread,
   * so it can be called in parallel with map/unmap or any command executing
   * in the driver; 2) most drivers can share the code for these */
  cl_int (*get_mapping_ptr) (void *data, pocl_mem_identifier *mem_id,
                             cl_mem mem, mem_mapping_t *map);
  cl_int (*free_mapping_ptr) (void *data, pocl_mem_identifier *mem_id,
                              cl_mem mem, mem_mapping_t *map);

  /* if the driver needs to do something at kernel create/destroy time */
  int (*create_kernel) (cl_device_id device, cl_program program,
                        cl_kernel kernel, unsigned program_device_i);
  int (*free_kernel) (cl_device_id device, cl_program program,
                      cl_kernel kernel, unsigned program_device_i);

  /* program building callbacks */
  int (*build_source) (
      cl_program program, cl_uint device_i,

      /* these are filled by clCompileProgram(), otherwise NULLs */
      cl_uint num_input_headers, const cl_program *input_headers,
      const char **header_include_names,

      /* 1 = compile & link, 0 = compile only, linked later via clLinkProgram*/
      int link_program);

  int (*build_binary) (
      cl_program program, cl_uint device_i,

      /* 1 = compile & link, 0 = compile only, linked later via clLinkProgram*/
      int link_program, int spir_build);

  /* build a program with builtin kernels. */
  int (*build_builtin) (cl_program program, cl_uint device_i);

  int (*link_program) (cl_program program, cl_uint device_i,

                       cl_uint num_input_programs,
                       const cl_program *input_programs,

                       /* 1 = create library, 0 = create executable*/
                       int create_library);

  /* optional. called after build/link and after metadata setup. */
  int (*post_build_program) (cl_program program, cl_uint device_i);
  /* optional. Ensures that everything is built for
   * returning a poclbinary to the user. E.g. for CPU driver this means
   * building a dynamic WG sized parallel.bc */
  int (*build_poclbinary) (cl_program program, cl_uint device_i);

  /* Optional. If the driver uses the default build_poclbinary implementation
   * from common_driver.c, that implementation calls this to compile a
   * "dynamic WG size" kernel. */
  void (*compile_kernel) (_cl_command_node *cmd, cl_kernel kernel,
                          cl_device_id device, int specialize);

  /* Optional. driver should free the content of "program->data" here,
   * if it fills it. */
  int (*free_program) (cl_device_id device, cl_program program,
                       unsigned program_device_i);

  /* Driver should setup kernel metadata here, if it can, and return non-zero
   * on success. This is called after compilation/build/link. E.g. CPU driver
   * parses the LLVM metadata. */
  int (*setup_metadata) (cl_device_id device, cl_program program,
                         unsigned program_device_i);

  /* Driver should examine the binary and return non-zero if it can load it.
   * Note that it's never called with pocl-binaries; those are automatically
   * accepted if device-hash in the binary's header matches the device. */
  int (*supports_binary) (cl_device_id device, const size_t length,
                          const char *binary);

  /* Optional. if the driver needs to use hardware resources
   * for command queues, it should use these callbacks */
  int (*init_queue) (cl_device_id device, cl_command_queue queue);
  int (*free_queue) (cl_device_id device, cl_command_queue queue);

  /* clEnqueueNDRangeKernel */
  void (*run) (void *data, _cl_command_node *cmd);
  /* for clEnqueueNativeKernel. may be NULL */
  void (*run_native) (void *data, _cl_command_node *cmd);

  /* Perform initialization steps and can return additional
     build options that are required for the device. The caller
     owns the returned string. may be NULL */
  char* (*init_build) (void *data);

  /* may be NULL */
  void (*init_target_machine) (void *data, void *target_machine);

  /* returns a hash string that should identify the device. This string
   * is used when writing/loading pocl binaries to decide compatibility. */
  char* (*build_hash) (cl_device_id device);

  /* the following callbacks deal with images ONLY, with the exception of
   * IMAGE1D_BUFFER type (which is implemented as a buffer).
   * If the device does not support images, all of these may be NULL. */

  /* creates a device-specific hardware resource for sampler. May be NULL */
  int (*create_sampler) (cl_device_id device,
                         cl_sampler samp,
                         unsigned context_device_i);
  int (*free_sampler) (cl_device_id device,
                       cl_sampler samp,
                       unsigned context_device_i);

  /* copies image to image, on the same device (or same global memory). */
  cl_int (*copy_image_rect) (void *data,
                             cl_mem src_image,
                             cl_mem dst_image,
                             pocl_mem_identifier *src_mem_id,
                             pocl_mem_identifier *dst_mem_id,
                             const size_t *src_origin,
                             const size_t *dst_origin,
                             const size_t *region);

  /* copies a region from host OR device buffer to device image.
   * clEnqueueCopyBufferToImage: src_mem_id = buffer,
   *   src_host_ptr = NULL, src_row_pitch = src_slice_pitch = 0
   * clEnqueueWriteImage: src_mem_id = NULL,
   *   src_host_ptr = host pointer, src_offset = 0
   */
  cl_int (*write_image_rect ) (void *data,
                               cl_mem dst_image,
                               pocl_mem_identifier *dst_mem_id,
                               const void *__restrict__ src_host_ptr,
                               pocl_mem_identifier *src_mem_id,
                               const size_t *origin,
                               const size_t *region,
                               size_t src_row_pitch,
                               size_t src_slice_pitch,
                               size_t src_offset);

  /* copies a region from device image to host or device buffer
   * clEnqueueCopyImageToBuffer: dst_mem_id = buffer,
   *   dst_host_ptr = NULL, dst_row_pitch = dst_slice_pitch = 0
   * clEnqueueReadImage: dst_mem_id = NULL,
   *   dst_host_ptr = host pointer, dst_offset = 0
   */
  cl_int (*read_image_rect) (void *data,
                             cl_mem src_image,
                             pocl_mem_identifier *src_mem_id,
                             void *__restrict__ dst_host_ptr,
                             pocl_mem_identifier *dst_mem_id,
                             const size_t *origin,
                             const size_t *region,
                             size_t dst_row_pitch,
                             size_t dst_slice_pitch,
                             size_t dst_offset);

  /* maps the entire image from device to host */
  cl_int (*map_image) (void *data,
                       pocl_mem_identifier *mem_id,
                       cl_mem src_image,
                       mem_mapping_t *map);

  /* unmaps the entire image from host to device */
  cl_int (*unmap_image) (void *data,
                         pocl_mem_identifier *mem_id,
                         cl_mem dst_image,
                         mem_mapping_t *map);

  /* fill image with pattern */
  cl_int (*fill_image) (void *data, cl_mem image, pocl_mem_identifier *mem_id,
                        const size_t *origin, const size_t *region,
                        cl_uint4 orig_pixel, pixel_t fill_pixel,
                        size_t pixel_size);

  /* custom device functionality */

  /* The device can override this function to perform driver-specific
   * optimizations to the local size dimensions, whenever the decision
   * is left to the runtime.
   */
  void (*compute_local_size) (cl_device_id dev, size_t global_x,
                              size_t global_y, size_t global_z,
                              size_t *local_x, size_t *local_y,
                              size_t *local_z);

  /* Driver hook for answering device-specific clGetDeviceInfo queries. */
  cl_int (*get_device_info_ext) (cl_device_id dev, cl_device_info param_name,
                                 size_t param_value_size, void * param_value,
                                 size_t * param_value_size_ret);
};

/* Accounting struct for a global memory region; guarded by pocl_lock. */
typedef struct pocl_global_mem_t {
  pocl_lock_t pocl_lock;
  cl_ulong max_ever_allocated;
  cl_ulong currently_allocated;
  cl_ulong total_alloc_limit;
} pocl_global_mem_t;

/**
 * Enumeration for different modes of converting automatic locals
 */
typedef enum
{
  POCL_AUTOLOCALS_TO_ARGS_NEVER = 0,
  POCL_AUTOLOCALS_TO_ARGS_ALWAYS = 1,
  // convert autolocals to args only if there are dynamic local memory function
  // arguments in the kernel.
  POCL_AUTOLOCALS_TO_ARGS_ONLY_IF_DYNAMIC_LOCALS_PRESENT = 2,
} pocl_autolocals_to_args_strategy;

#define NUM_OPENCL_IMAGE_TYPES 6

/* The implementation behind cl_device_id. Mostly the values reported by
 * the clGetDeviceInfo queries, plus pocl-internal driver state. */
struct _cl_device_id {
  POCL_ICD_OBJECT
  POCL_OBJECT;
  /* queries */
  cl_device_type type;
  cl_uint vendor_id;
  cl_uint max_compute_units;

  /* for subdevice support */
  cl_device_id parent_device;
  unsigned core_start;
  unsigned core_count;

  cl_uint max_work_item_dimensions;
  /* when enabled, Workgroup LLVM pass will replace all printf() calls
   * with calls to __pocl_printf and recursively change functions to
   * add printf buffer arguments from pocl_context.
   * Currently the pthread/basic devices require this; other devices
   * implement printf their own way. */
  int device_side_printf;
  size_t max_work_item_sizes[3];
  size_t max_work_group_size;
  size_t preferred_wg_size_multiple;
  cl_uint preferred_vector_width_char;
  cl_uint preferred_vector_width_short;
  cl_uint preferred_vector_width_int;
  cl_uint preferred_vector_width_long;
  cl_uint preferred_vector_width_float;
  cl_uint preferred_vector_width_double;
  cl_uint preferred_vector_width_half;
  cl_uint native_vector_width_char;
  cl_uint native_vector_width_short;
  cl_uint native_vector_width_int;
  cl_uint native_vector_width_long;
  cl_uint native_vector_width_float;
  cl_uint native_vector_width_double;
  cl_uint native_vector_width_half;
  cl_uint max_clock_frequency;
  cl_uint address_bits;
  cl_ulong max_mem_alloc_size;
  cl_bool image_support;
  cl_uint max_read_image_args;
  cl_uint max_write_image_args;
  cl_uint max_read_write_image_args;
  size_t image2d_max_width;
  size_t image2d_max_height;
  size_t image3d_max_width;
  size_t image3d_max_height;
  size_t image3d_max_depth;
  size_t image_max_buffer_size;
  size_t image_max_array_size;
  cl_uint max_samplers;
  size_t max_parameter_size;
  cl_uint mem_base_addr_align;
  cl_uint min_data_type_align_size;
  cl_device_fp_config half_fp_config;
  cl_device_fp_config single_fp_config;
  cl_device_fp_config double_fp_config;
  cl_device_mem_cache_type global_mem_cache_type;
  cl_uint global_mem_cacheline_size;
  cl_ulong global_mem_cache_size;
  cl_ulong global_mem_size;
  size_t global_var_pref_size;
  size_t global_var_max_size;
  cl_ulong max_constant_buffer_size;
  cl_uint max_constant_args;
  cl_device_local_mem_type local_mem_type;
  cl_ulong local_mem_size;
  cl_bool error_correction_support;
  cl_bool host_unified_memory;
  size_t profiling_timer_resolution;
  cl_bool endian_little;
  cl_bool available;
  cl_bool compiler_available;
  cl_bool linker_available;
  /* Is the target a Single Program Multiple Data machine? If not,
     we need to generate work-item loops to execute all the work-items
     in the WG. For SPMD machines, the hardware spawns the WIs. */
  cl_bool spmd;
  /* The device uses an HSA-like kernel ABI with a single argument buffer as
     an input. */
  cl_bool arg_buffer_launcher;
  /* The Workgroup pass creates launcher functions and replaces work-item
     placeholder global variables (e.g. _local_size_, _global_offset_ etc) with
     loads from the context struct passed as a kernel argument. This flag
     enables or disables this pass. */
  cl_bool workgroup_pass;
  cl_device_exec_capabilities execution_capabilities;
  cl_command_queue_properties queue_properties;
  cl_platform_id platform;
  cl_uint max_sub_devices;
  size_t num_partition_properties;
  cl_device_partition_property *partition_properties;
  size_t num_partition_types;
  cl_device_partition_property *partition_type;
  size_t printf_buffer_size;
  char *short_name;
  char *long_name;

  const char *vendor;
  const char *driver_version;
  const char *profile;
  const char *version;
  const char *extensions;
  const char *cl_version_std; // "CL2.0"
  cl_ulong cl_version_int;    // 200

  /* Driver-private data; its meaning is defined by each device driver. */
  void *data;
  const char* llvm_target_triplet; /* the llvm target triplet to use */
  const char* llvm_cpu;            /* the llvm CPU variant to use */
  /* A running number (starting from zero) across all the device instances.
     Used for indexing arrays in data structures with device specific
     entries. */
  int dev_id;
  int global_mem_id; /* identifier for device global memory */
  /* pointer to an accounting struct for global memory */
  pocl_global_mem_t *global_memory;
  /* Does the device have 64bit longs */
  int has_64bit_long;
  /* Does the device set the event times in update_event() callback ?
   * if zero, the default event change handlers set the event times based on
   * the host's system time (pocl_gettimemono_ns). */
  int has_own_timer;

  /* If the driver wants SPIR-V input directly, without translation to
   * LLVM IR with "spir" triple, set this to 1,
   * and make sure device->ops->supports_binary returns 1 for SPIR-V */
  int consumes_il_directly;

  /* Convert automatic local variables to kernel arguments? */
  pocl_autolocals_to_args_strategy autolocals_to_args;
  /* Allocate local buffers device side in the work-group launcher instead of
     having a disjoint physical local memory per work-group or having the
     runtime/driver allocate the local space. */
  int device_alloca_locals;

  /* If > 0, specialized versions of the work-group functions are generated
     which assume each grid dimension is of at most the given width. This
     assumption can be then taken in account in IR optimization and codegen
     to reduce address computation overheads etc. */
  size_t grid_width_specialization_limit;

  /* Device-specific linker flags that should be appended to the clang's
     argument list for a final linkage call when producing the final binary
     that can be uploaded to the device using the default LLVM-based
     codegen. The final entry in the list must be NULL.

     The flags will be added after the following command line:
     clang -o final.bin input.obj [flags]
  */

  const char **final_linkage_flags;

  /* Auxiliary functions required by the device binary which should
     be retained across the kernel compilation unused code pruning
     process. */
  const char **device_aux_functions;

  /* semicolon separated list of builtin kernels*/
  char *builtin_kernel_list;

  /* The target specific IDs for the different OpenCL address spaces. */
  unsigned global_as_id;
  unsigned local_as_id;
  unsigned constant_as_id;

  /* The address space where the argument data is passed. */
  unsigned args_as_id;

  /* The address space where the grid context data is passed. */
  unsigned context_as_id;

  /* Set to >0 if the device supports SVM.
   * When creating context with multiple devices, the device with
   * largest priority will have the responsibility of allocating
   * shared buffers residing in Shared Virtual Memory areas.
   * This allows using both CPU and HSA for SVM allocations,
   * with HSA having priority in multi-device context */
  cl_uint svm_allocation_priority;
  /* OpenCL 2.0 properties */
  cl_device_svm_capabilities svm_caps;
  cl_uint max_events;
  cl_uint max_queues;
  cl_uint max_pipe_args;
  cl_uint max_pipe_active_res;
  cl_uint max_pipe_packet_size;
  cl_uint dev_queue_pref_size;
  cl_uint dev_queue_max_size;
  cl_command_queue_properties on_dev_queue_props;
  cl_command_queue_properties on_host_queue_props;
  /* OpenCL 2.1 */
  char *spirv_version;

  /* image formats supported by the device, per image type */
  const cl_image_format *image_formats[NUM_OPENCL_IMAGE_TYPES];
  cl_uint num_image_formats[NUM_OPENCL_IMAGE_TYPES];

  /* Device operations, shared among devices of the same type */
  struct pocl_device_ops *ops;

  /* OpenCL 3.0 properties */
  cl_device_atomic_capabilities atomic_memory_capabilities;
  cl_device_atomic_capabilities atomic_fence_capabilities;
};

/* Convenience predicates over the device's SVM capability bits. */
#define DEVICE_SVM_FINEGR(dev) (dev->svm_caps & (CL_DEVICE_SVM_FINE_GRAIN_BUFFER \
                                                 | CL_DEVICE_SVM_FINE_GRAIN_SYSTEM))
#define DEVICE_SVM_ATOM(dev) (dev->svm_caps & CL_DEVICE_SVM_ATOMICS)

/* With fine-grained SVM + atomics, map/unmap needs no data movement. */
#define DEVICE_MMAP_IS_NOP(dev) (DEVICE_SVM_FINEGR(dev) && DEVICE_SVM_ATOM(dev))

#define CHECK_DEVICE_AVAIL_RET(dev) if(!dev->available) { POCL_MSG_ERR("This cl_device is not
available.\n"); return CL_INVALID_DEVICE; } 1032 #define CHECK_DEVICE_AVAIL_RETV(dev) if(!dev->available) { POCL_MSG_ERR("This cl_device is not available.\n"); return; } 1033 1034 #define OPENCL_MAX_DIMENSION 3 1035 1036 struct _cl_platform_id { 1037 POCL_ICD_OBJECT_PLATFORM_ID 1038 }; 1039 1040 struct _cl_context { 1041 POCL_ICD_OBJECT 1042 POCL_OBJECT; 1043 /* queries */ 1044 cl_device_id *devices; 1045 cl_context_properties *properties; 1046 /* implementation */ 1047 unsigned num_devices; 1048 unsigned num_properties; 1049 1050 /*********************************************************************/ 1051 /* these values depend on which devices are in context; 1052 * they're calculated by pocl_setup_context() */ 1053 1054 /* The largest of max_mem_alloc_size of all devices in context */ 1055 size_t max_mem_alloc_size; 1056 1057 /* union of image formats supported by all of the devices in context, 1058 * per image-type (there are 6 image types) 1059 TODO the getSupportedImageFormats is supposed to also respect flags, 1060 but for now we ignore that. */ 1061 cl_image_format *image_formats[NUM_OPENCL_IMAGE_TYPES]; 1062 cl_uint num_image_formats[NUM_OPENCL_IMAGE_TYPES]; 1063 1064 /* The device that should allocate SVM (might be == host) 1065 * NULL if none of devices in the context is SVM capable */ 1066 cl_device_id svm_allocdev; 1067 1068 /* for enqueueing migration commands. Two reasons: 1069 * 1) since migration commands can execute in parallel 1070 * to other commands, we can increase paralelism 1071 * 2) in some cases (migration between 2 devices through 1072 * host memory), we need to put two commands in two queues, 1073 * and the clEnqueueX only gives us one (on the destination 1074 * device). */ 1075 cl_command_queue *default_queues; 1076 1077 /* The minimal required buffer alignment for all devices in the context. 1078 * E.g. 
for clCreateSubBuffer: 1079 * CL_MISALIGNED_SUB_BUFFER_OFFSET is returned in errcode_ret if there are no 1080 * devices in context associated with buffer for which the origin value 1081 * is aligned to the CL_DEVICE_MEM_BASE_ADDR_ALIGN value. 1082 */ 1083 size_t min_buffer_alignment; 1084 1085 #ifdef ENABLE_LLVM 1086 void *llvm_context_data; 1087 #endif 1088 }; 1089 1090 typedef struct _pocl_data_sync_item pocl_data_sync_item; 1091 struct _pocl_data_sync_item { 1092 cl_event event; 1093 pocl_data_sync_item *next; 1094 }; 1095 1096 struct _cl_event; 1097 struct _cl_command_queue { 1098 POCL_ICD_OBJECT 1099 POCL_OBJECT; 1100 /* queries */ 1101 cl_context context; 1102 cl_device_id device; 1103 cl_command_queue_properties properties; 1104 /* implementation */ 1105 cl_event events; /* events of the enqueued commands in enqueue order */ 1106 struct _cl_event *barrier; 1107 unsigned long command_count; /* counter for unfinished command enqueued */ 1108 pocl_data_sync_item last_event; 1109 1110 /* device specific data */ 1111 void *data; 1112 }; 1113 1114 #define POCL_ON_SUB_MISALIGN(mem, que, operation) \ 1115 do \ 1116 { \ 1117 if (mem->parent != NULL) { \ 1118 operation ( \ 1119 (mem->origin % que->device->mem_base_addr_align != 0), \ 1120 CL_MISALIGNED_SUB_BUFFER_OFFSET, \ 1121 "SubBuffer is not " \ 1122 "properly aligned for this device"); \ 1123 } \ 1124 } \ 1125 while (0) 1126 1127 #define POCL_RETURN_ON_SUB_MISALIGN(mem, que) \ 1128 POCL_ON_SUB_MISALIGN(mem, que, POCL_RETURN_ERROR_ON) 1129 1130 #define POCL_GOTO_ON_SUB_MISALIGN(mem, que) \ 1131 POCL_ON_SUB_MISALIGN(mem, que, POCL_GOTO_ERROR_ON) 1132 1133 #define POCL_CONVERT_SUBBUFFER_OFFSET(mem, offset) \ 1134 if (mem->parent != NULL) \ 1135 { \ 1136 offset += mem->origin; \ 1137 mem = mem->parent; \ 1138 } 1139 1140 #define DEVICE_IMAGE_SIZE_SUPPORT 1 1141 #define DEVICE_IMAGE_FORMAT_SUPPORT 2 1142 1143 #define DEVICE_DOESNT_SUPPORT_IMAGE(mem, dev_i) \ 1144 (mem->device_supports_this_image[dev_i] == 0) 1145 1146 
/* Invoke "operation" (an error macro) when device "dev" cannot use the
   image "mem": no image support at all, unsupported image format/type,
   or unsupported image size.  "dev" is looked up in mem's context to
   find its per-context index. */
#define POCL_ON_UNSUPPORTED_IMAGE(mem, dev, operation)                        \
  do                                                                          \
    {                                                                         \
      unsigned dev_i;                                                         \
      for (dev_i = 0; dev_i < mem->context->num_devices; ++dev_i)             \
        if (mem->context->devices[dev_i] == dev)                              \
          break;                                                              \
      assert (dev_i < mem->context->num_devices);                             \
      operation (                                                             \
          (mem->context->devices[dev_i]->image_support == CL_FALSE),          \
          CL_INVALID_OPERATION, "Device %s does not support images\n",        \
          mem->context->devices[dev_i]->long_name);                           \
      operation (                                                             \
          ((mem->device_supports_this_image[dev_i]                            \
            & DEVICE_IMAGE_FORMAT_SUPPORT)                                    \
           == 0),                                                             \
          CL_IMAGE_FORMAT_NOT_SUPPORTED,                                      \
          "The image type is not supported by this device\n");                \
      operation (                                                             \
          ((mem->device_supports_this_image[dev_i]                            \
            & DEVICE_IMAGE_SIZE_SUPPORT)                                      \
           == 0),                                                             \
          CL_INVALID_IMAGE_SIZE,                                              \
          "The image size is not supported by this device\n");                \
    }                                                                         \
  while (0)


#define POCL_RETURN_ON_UNSUPPORTED_IMAGE(mem, dev)                            \
  POCL_ON_UNSUPPORTED_IMAGE(mem, dev, POCL_RETURN_ERROR_ON)

#define POCL_GOTO_ON_UNSUPPORTED_IMAGE(mem, dev)                              \
  POCL_ON_UNSUPPORTED_IMAGE(mem, dev, POCL_GOTO_ERROR_ON)



typedef struct _cl_mem cl_mem_t;
struct _cl_mem {
  POCL_ICD_OBJECT
  POCL_OBJECT;
  cl_context context;
  cl_mem_object_type type;
  cl_mem_flags flags;

  size_t size;
  size_t origin; /* for sub-buffers */

  /* host backing memory for a buffer.
   *
   * This is either user provided host-ptr, or driver allocated,
   * or temporary allocation by a migration command. Since it
   * can have multiple users, it's refcounted. */
  void *mem_host_ptr;
  /* version of buffer content in mem_host_ptr */
  uint64_t mem_host_ptr_version;
  /* reference count; when it reaches 0,
   * the mem_host_ptr is automatically freed.
   * NOTE(review): "uint" is not ISO C; presumably provided by a system
   * or compat header included earlier -- verify on all platforms. */
  uint mem_host_ptr_refcount;

  /* array of device-specific memory bookkeeping structs.
     The location of some device's struct is determined by
     the device's global_mem_id. */
  pocl_mem_identifier *device_ptrs;

  /* for content tracking;
   *
   * this is the valid (highest) version of the buffer's content;
   * if any device has lower version in device_ptrs[]->version,
   * the buffer content on that device is invalid */
  uint64_t latest_version;
  /* the event that last changed (written to) the buffer, this
   * is used as a "from" dependency for any migration commands */
  cl_event last_event;


  /* A linked list of regions of the buffer mapped to the
     host memory */
  mem_mapping_t *mappings;
  size_t map_count;

  /* in case this is a sub buffer, this points to the parent
     buffer */
  cl_mem_t *parent;
  /* A linked list of destructor callbacks */
  mem_destructor_callback_t *destructor_callbacks;

  /* These two are for cl_pocl_content_size extension.
   * They link two buffers together, like this:
   * mem->size_buffer->content_buffer = mem
   * mem->content_buffer->size_buffer = mem
   */
  cl_mem size_buffer;
  cl_mem content_buffer;

  /* for images, a flag for each device in context,
   * whether that device supports this
   * (see DEVICE_IMAGE_SIZE_SUPPORT / DEVICE_IMAGE_FORMAT_SUPPORT bits) */
  int *device_supports_this_image;

  /* if the memory backing mem_host_ptr is "permanent" =
   * valid through the entire lifetime of the buffer,
   * we can make some assumptions and optimizations */
  cl_bool mem_host_ptr_is_permanent;

  /* Image flags */
  cl_bool is_image;
  cl_channel_order image_channel_order;
  cl_channel_type image_channel_data_type;
  size_t image_width;
  size_t image_height;
  size_t image_depth;
  size_t image_array_size;
  size_t image_row_pitch;
  size_t image_slice_pitch;
  size_t image_elem_size;
  size_t image_channels;
  cl_uint num_mip_levels;
  cl_uint num_samples;
  cl_mem buffer;

  /* pipe flags */
  cl_bool is_pipe;
  size_t pipe_packet_size;
  size_t pipe_max_packets;

  /* list of SVM buffers */
  struct _cl_mem *prev, *next;
};

/* SHA-1 digest stored as a hex string: two chars per digest byte + NUL. */
typedef uint8_t SHA1_digest_t[SHA1_DIGEST_SIZE * 2 + 1];

/* Metadata for one kernel of a program, shared by every cl_kernel
   instance created for that kernel name. */
typedef struct pocl_kernel_metadata_s
{
  cl_uint num_args;
  cl_uint num_locals;
  size_t *local_sizes;
  char *name;
  char *attributes;
  struct pocl_argument_info *arg_info;
  cl_bitfield has_arg_metadata;
  size_t reqd_wg_size[OPENCL_MAX_DIMENSION];

  /* if we know the size of _every_ kernel argument, we store
   * the total size here. see struct _cl_kernel on why */
  size_t total_argument_storage_size;

  /* array[program->num_devices] */
  pocl_kernel_hash_t *build_hash;

  /* If this is a BI kernel descriptor, they are statically defined in
     the custom device driver, thus should not be freed. */
  cl_bitfield builtin_kernel;

  /* device-specific METAdata, void* array[program->num_devices] */
  void **data;
} pocl_kernel_metadata_t;

struct _cl_program {
  POCL_ICD_OBJECT
  POCL_OBJECT;
  /* queries */
  cl_context context;
  /* -cl-denorms-are-zero build option */
  unsigned flush_denorms;

  /* list of devices "associated with the program" (quote from Specs)
   * ... IOW for which we *can* build the program.
   * this is setup once, at clCreateProgramWith{Source,Binaries,...} time */
  cl_device_id *associated_devices;
  cl_uint associated_num_devices;
  /* list of devices for which we actually did build the program.
   * this changes on every rebuild to device arguments given to clBuildProgram
   */
  cl_uint num_devices;
  cl_device_id *devices;

  /* all the program sources appended together, terminated with a zero */
  char *source;
  /* The options in the last clBuildProgram call for this Program. */
  char *compiler_options;

  /* per-device binaries, in device-specific format */
  size_t *binary_sizes;
  unsigned char **binaries;

  /* If this is a program with built-in kernels, this is the list of kernel
     names it contains. */
  size_t num_builtin_kernels;
  char **builtin_kernel_names;
  char *concated_builtin_names;

  /* Poclcc binary format. */
  /* per-device poclbinary-format binaries. */
  size_t *pocl_binary_sizes;
  unsigned char **pocl_binaries;
  /* device-specific data, per each device */
  void **data;

  /* kernel number and the metadata for each kernel */
  size_t num_kernels;
  pocl_kernel_metadata_t *kernel_meta;

  /* list of attached cl_kernel instances */
  cl_kernel kernels;
  /* Per-device program hash after build */
  SHA1_digest_t* build_hash;
  /* Per-device build logs, for the case when we don't yet have the program's cachedir */
  char** build_log;
  /* Per-program build log, for the case when we aren't yet building for devices */
  char main_build_log[640];
  /* Use to store build status */
  cl_build_status build_status;
  /* Use to store binary type */
  cl_program_binary_type binary_type;

  /* Store SPIR-V binary from clCreateProgramWithIL() */
  char *program_il;
  size_t program_il_size;
};

struct _cl_kernel {
  POCL_ICD_OBJECT
  POCL_OBJECT;
  /* -------- */
  cl_context context;
  cl_program program;
  pocl_kernel_metadata_t *meta;
  /* device-specific data, per each device. This is different from meta->data,
   * as this is per-instance of cl_kernel, while there is just one meta->data
   * for all instances of the kernel of the same name. */
  void **data;
  /* just a convenience pointer to meta->name */
  const char *name;

  /* The kernel arguments that are set with clSetKernelArg().
     These are copied to the command queue command at enqueue. */
  struct pocl_argument *dyn_arguments;

  /* if total_argument_storage_size is known, we preallocate storage for
   * actual kernel arguments here, instead of allocating it by one for
   * each argument separately. The "offsets" store pointers calculated as
   * "dyn_argument_storage + offset-of-argument-N".
   *
   * The pointer to actual value for argument N, used by drivers, is stored
   * in dyn_arguments[N].value; if total_argument_storage_size is not known,
   * the .value must be allocated separately for every argument in
   * clSetKernelArg; if it is known, clSetKernelArg sets the .value to
   * dyn_argument_offsets[N] and copies the value there.
   *
   * We must keep both ways, because not every driver can know kernel
   * argument sizes beforehand.
   */
  char *dyn_argument_storage;
  void **dyn_argument_offsets;

  /* for program's linked list of kernels */
  struct _cl_kernel *next;
};

/* One registered clSetEventCallback entry of an event's callback list. */
typedef struct event_callback_item event_callback_item;
struct event_callback_item
{
  void(CL_CALLBACK *callback_function) (cl_event, cl_int, void *);
  void *user_data;
  cl_int trigger_status;
  struct event_callback_item *next;
};


/* Singly linked list of events (used for notify/wait lists below).
   NOTE(review): the "next" member uses the bare name "event_node",
   which requires a "typedef struct event_node event_node;" earlier in
   this file -- verify it exists; it is not visible in this region. */
struct event_node
{
  cl_event event;
  event_node *next;
};

/* Optional metadata for events for improved profile data readability etc. */
typedef struct _pocl_event_md
{
  /* The kernel executed by the NDRange command associated with the event,
     if any. */
  cl_kernel kernel;
} pocl_event_md;

typedef struct _cl_event _cl_event;
struct _cl_event {
  POCL_ICD_OBJECT
  POCL_OBJECT;
  cl_context context;
  cl_command_queue queue;
  cl_command_type command_type;
  _cl_command_node *command;

  /* list of callback functions */
  event_callback_item *callback_list;

  /* list of devices needing completion notification for this event */
  event_node *notify_list;
  event_node *wait_list;

  /* OoO doesn't use sync points -> put used buffers here */
  size_t num_buffers;
  cl_mem *mem_objs;

  /* Profiling data: time stamps of the different phases of execution. */
  cl_ulong time_queue;  /* the enqueue time */
  cl_ulong time_submit; /* the time the command was submitted to the device */
  cl_ulong time_start;  /* the time the command actually started executing */
  cl_ulong time_end;    /* the finish time of the command */

  /* Device specific data */
  void *data;

  /* Additional (optional data) used to make profile data more readable etc. */
  pocl_event_md *meta_data;

  /* The execution status of the command this event is monitoring. */
  cl_int status;
  /* implicit event = an event for pocl's internal use, not visible to user */
  short implicit_event;
  /* if set, at the completion of event, the mem_host_ptr_refcount should be
   * lowered and memory freed if it's 0 */
  short release_mem_host_ptr_after;

  _cl_event *next;
  _cl_event *prev;
};

/* Per-user-event data: condition variable used to wake up threads
   blocked in clWaitForEvents on a user event. */
typedef struct _pocl_user_event_data
{
  pocl_cond_t wakeup_cond;
} pocl_user_event_data;

typedef struct _cl_sampler cl_sampler_t;
struct _cl_sampler {
  POCL_ICD_OBJECT
  POCL_OBJECT;
  cl_context context;
  cl_bool normalized_coords;
  cl_addressing_mode addressing_mode;
  cl_filter_mode filter_mode;
  void** device_data; /* per-device sampler data */
};

#define CL_FAILED (-1)

#ifndef __cplusplus

/* NOTE: classic double-evaluation macros -- do not pass arguments with
   side effects. */
#define min(a,b) (((a) < (b)) ? (a) : (b))
#define max(a,b) (((a) > (b)) ? (a) : (b))

#endif

/* Little-endian conversion helpers: map the htoleNN/leNNtoh names onto
   whatever this platform provides (identity on platforms/libcs that
   lack them but are known little-endian). */
#ifdef __APPLE__
#include <libkern/OSByteOrder.h>
#define htole16(x) OSSwapHostToLittleInt16(x)
#define le16toh(x) OSSwapLittleToHostInt16(x)
#define htole32(x) OSSwapHostToLittleInt32(x)
#define le32toh(x) OSSwapLittleToHostInt32(x)
#define htole64(x) OSSwapHostToLittleInt64(x)
#define le64toh(x) OSSwapLittleToHostInt64(x)
#elif defined(__FreeBSD__) || defined(__DragonFly__)
#include <sys/endian.h>
#elif defined (_WIN32)
#ifndef htole64
#define htole64(x) (x)
#endif
#ifndef htole32
#define htole32(x) (x)
#endif
#ifndef htole16
#define htole16(x) (x)
#endif
#ifndef le64toh
#define le64toh(x) (x)
#endif
#ifndef le32toh
#define le32toh(x) (x)
#endif
#ifndef le16toh
#define le16toh(x) (x)
#endif
#else
#include <endian.h>
/* glibc < 2.9 on x86-64 lacks the htole/letoh macros; x86-64 is
   little-endian, so identity definitions are correct there. */
#if defined(__GLIBC__) && __GLIBC__ == 2 &&                                  \
    defined(__GLIBC_MINOR__) && __GLIBC_MINOR__ < 9 &&                       \
    defined(__x86_64__)
#ifndef htole64
#define htole64(x) (x)
#endif
#ifndef htole32
#define htole32(x) (x)
#endif
#ifndef htole16
#define htole16(x) (x)
#endif
#ifndef le64toh
#define le64toh(x) (x)
#endif
#ifndef le32toh
#define le32toh(x) (x)
#endif
#ifndef le16toh
#define le16toh(x) (x)
#endif
#endif
#endif

/* LTTNG tracepoint wrappers for object lifetime events; each TP_* macro
   expands to nothing when LTTNG userspace tracing is not available. */
#ifdef HAVE_LTTNG_UST

#include "pocl_lttng.h"

#define TP_CREATE_QUEUE(context_id, queue_id)                                 \
  tracepoint (pocl_trace, create_queue, context_id, queue_id);
#define TP_FREE_QUEUE(context_id, queue_id)                                   \
  tracepoint (pocl_trace, free_queue, context_id, queue_id);

#define TP_CREATE_BUFFER(context_id, buffer_id)                               \
  tracepoint (pocl_trace, create_buffer, context_id, buffer_id);
#define TP_FREE_BUFFER(context_id, buffer_id)                                 \
  tracepoint (pocl_trace, free_buffer, context_id, buffer_id);

#define TP_CREATE_PROGRAM(context_id, program_id)                             \
  tracepoint (pocl_trace, create_program, context_id, program_id);
#define TP_BUILD_PROGRAM(context_id, program_id)                              \
  tracepoint (pocl_trace, build_program, context_id, program_id);
#define TP_FREE_PROGRAM(context_id, program_id)                               \
  tracepoint (pocl_trace, free_program, context_id, program_id);

#define TP_CREATE_KERNEL(context_id, kernel_id, kernel_name)                  \
  tracepoint (pocl_trace, create_kernel, context_id, kernel_id, kernel_name);
#define TP_FREE_KERNEL(context_id, kernel_id, kernel_name)                    \
  tracepoint (pocl_trace, free_kernel, context_id, kernel_id, kernel_name);

#define TP_CREATE_IMAGE(context_id, image_id)                                 \
  tracepoint (pocl_trace, create_image, context_id, image_id);
#define TP_FREE_IMAGE(context_id, image_id)                                   \
  tracepoint (pocl_trace, free_image, context_id, image_id);

#define TP_CREATE_SAMPLER(context_id, sampler_id)                             \
  tracepoint (pocl_trace, create_sampler, context_id, sampler_id);
#define TP_FREE_SAMPLER(context_id, sampler_id)                               \
  tracepoint (pocl_trace, free_sampler, context_id, sampler_id);

#else

#define TP_CREATE_QUEUE(context_id, queue_id)
#define TP_FREE_QUEUE(context_id, queue_id)

#define TP_CREATE_BUFFER(context_id, buffer_id)
#define TP_FREE_BUFFER(context_id, buffer_id)

#define TP_CREATE_PROGRAM(context_id, program_id)
#define TP_BUILD_PROGRAM(context_id, program_id)
#define TP_FREE_PROGRAM(context_id, program_id)

#define TP_CREATE_KERNEL(context_id, kernel_id, kernel_name)
#define TP_FREE_KERNEL(context_id, kernel_id, kernel_name)

#define TP_CREATE_IMAGE(context_id, image_id)
#define TP_FREE_IMAGE(context_id, image_id)

#define TP_CREATE_SAMPLER(context_id, sampler_id)
#define TP_FREE_SAMPLER(context_id, sampler_id)

#endif

#endif /* POCL_CL_H */