1 /* pocl_cl.h - local runtime library declarations.
2 
3    Copyright (c) 2011 Universidad Rey Juan Carlos
4                  2011-2019 Pekka Jääskeläinen
5 
6    Permission is hereby granted, free of charge, to any person obtaining a copy
7    of this software and associated documentation files (the "Software"), to
8    deal in the Software without restriction, including without limitation the
9    rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10    sell copies of the Software, and to permit persons to whom the Software is
11    furnished to do so, subject to the following conditions:
12 
13    The above copyright notice and this permission notice shall be included in
14    all copies or substantial portions of the Software.
15 
16    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
22    IN THE SOFTWARE.
23 */
24 
25 #ifndef POCL_CL_H
26 #define POCL_CL_H
27 
28 #include "config.h"
29 
30 #include <assert.h>
31 #include <stdio.h>
32 
33 #ifdef HAVE_VALGRIND
34 #include <valgrind/helgrind.h>
35 #endif
36 
37 #ifdef _WIN32
38 #  include "vccompat.hpp"
39 #endif
40 /* To get adaptive mutex type */
41 #ifndef __USE_GNU
42   #define __USE_GNU
43 #endif
44 
45 #include <pthread.h>
46 #ifdef HAVE_CLOCK_GETTIME
47   #include <time.h>
48 #endif
49 
/* Thin aliases over the pthreads primitives used for all pocl runtime
   locking. */
typedef pthread_mutex_t pocl_lock_t; /* mutual exclusion lock */
typedef pthread_cond_t pocl_cond_t;  /* condition variable */
typedef pthread_t pocl_thread_t;     /* thread handle */
/* Static initializer for a pocl_lock_t. */
#define POCL_LOCK_INITIALIZER PTHREAD_MUTEX_INITIALIZER
54 
55 #ifdef BUILD_ICD
56 #  include "pocl_icd.h"
57 #endif
58 #include "pocl.h"
59 #include "pocl_tracing.h"
60 #include "pocl_debug.h"
61 #include "pocl_hash.h"
62 #include "pocl_runtime_config.h"
63 #include "common.h"
64 
/* C99 provides __func__; on older compilers fall back to GCC's
   __PRETTY_FUNCTION__, or a placeholder identifier otherwise. */
#if __STDC_VERSION__ < 199901L
# if __GNUC__ >= 2
#  define __func__ __PRETTY_FUNCTION__
# else
#  define __func__ UNKNOWN_FUNCTION
# endif
#endif
72 
#if defined(__GNUC__) || defined(__clang__)

/* POCL_ATOMIC_INC/DEC return the NEW value of x; POCL_ATOMIC_CAS returns
   the value *ptr held BEFORE the operation (legacy __sync builtins). */
/* See: https://gcc.gnu.org/onlinedocs/gcc-4.1.2/gcc/Atomic-Builtins.html */
#define POCL_ATOMIC_INC(x) __sync_add_and_fetch (&x, 1)
#define POCL_ATOMIC_DEC(x) __sync_sub_and_fetch (&x, 1)
#define POCL_ATOMIC_CAS(ptr, oldval, newval)                                  \
  __sync_val_compare_and_swap (ptr, oldval, newval)

#elif defined(_WIN32)
/* Interlocked{Increment,Decrement}64 return the new value;
   InterlockedCompareExchange64 returns the initial value, matching the
   GCC branch above. NOTE(review): these require the Windows API headers —
   presumably pulled in via vccompat.hpp above; verify. */
#define POCL_ATOMIC_INC(x) InterlockedIncrement64 (&x)
#define POCL_ATOMIC_DEC(x) InterlockedDecrement64 (&x)
#define POCL_ATOMIC_CAS(ptr, oldval, newval)                                  \
  InterlockedCompareExchange64 (ptr, newval, oldval)
#else
#error Need atomic_inc() builtin for this compiler
#endif
90 
/* Helgrind annotations describing the happens-before ordering implied by
   the reference count transitions; no-ops when valgrind support is not
   compiled in. */
#ifdef HAVE_VALGRIND
#define VG_REFC_ZERO(var)                                                     \
  ANNOTATE_HAPPENS_AFTER (&var->pocl_refcount);                               \
  ANNOTATE_HAPPENS_BEFORE_FORGET_ALL (&var->pocl_refcount)
#define VG_REFC_NONZERO(var) ANNOTATE_HAPPENS_BEFORE (&var->pocl_refcount)
#else
#define VG_REFC_ZERO(var) (void)0
#define VG_REFC_NONZERO(var) (void)0
#endif
100 
/* Align a declaration to the host CPU cacheline size (e.g. to avoid false
   sharing); only enabled on Linux where HOST_CPU_CACHELINE_SIZE is
   provided by the build configuration. */
#ifdef __linux__
#define ALIGN_CACHE(x) x __attribute__ ((aligned (HOST_CPU_CACHELINE_SIZE)))
#else
#define ALIGN_CACHE(x) x
#endif
106 
107 /* Generic functionality for handling different types of
108    OpenCL (host) objects. */
109 
/* Acquire a pocl_lock_t, aborting (via assert) on any pthread error. */
#define POCL_LOCK(__LOCK__)                                                   \
  do                                                                          \
    {                                                                         \
      int r = pthread_mutex_lock (&(__LOCK__));                               \
      assert (r == 0);                                                        \
    }                                                                         \
  while (0)
/* Release a pocl_lock_t, aborting (via assert) on any pthread error. */
#define POCL_UNLOCK(__LOCK__)                                                 \
  do                                                                          \
    {                                                                         \
      int r = pthread_mutex_unlock (&(__LOCK__));                             \
      assert (r == 0);                                                        \
    }                                                                         \
  while (0)
/* Initialize a pocl_lock_t with default mutex attributes. */
#define POCL_INIT_LOCK(__LOCK__)                                              \
  do                                                                          \
    {                                                                         \
      int r = pthread_mutex_init (&(__LOCK__), NULL);                         \
      assert (r == 0);                                                        \
    }                                                                         \
  while (0)
/* We recycle OpenCL objects by not actually freeing them until the
   very end. Thus, the lock should not be destroyed at refcount 0. */
#define POCL_DESTROY_LOCK(__LOCK__)                                           \
  do                                                                          \
    {                                                                         \
      int r = pthread_mutex_destroy (&(__LOCK__));                            \
      assert (r == 0);                                                        \
    }                                                                         \
  while (0)
140 
141 
/* If available, use an Adaptive mutex for locking in the pthread driver,
   otherwise fallback to simple mutexes */
#define POCL_FAST_LOCK_T pthread_mutex_t
#define POCL_FAST_LOCK(l) POCL_LOCK(l)
#define POCL_FAST_UNLOCK(l) POCL_UNLOCK(l)

/* The initializer macro doubles as a feature test for the glibc-specific
   PTHREAD_MUTEX_ADAPTIVE_NP mutex type. All pthread return values are
   asserted, consistent with POCL_INIT_LOCK (previously only
   pthread_mutexattr_settype was checked). */
#ifdef PTHREAD_ADAPTIVE_MUTEX_INITIALIZER_NP
  #define POCL_FAST_INIT(l) \
    do { \
      pthread_mutexattr_t attrs; \
      int r = pthread_mutexattr_init (&attrs); \
      assert (r == 0); \
      r = pthread_mutexattr_settype (&attrs, PTHREAD_MUTEX_ADAPTIVE_NP); \
      assert (r == 0); \
      r = pthread_mutex_init (&l, &attrs); \
      assert (r == 0); \
      pthread_mutexattr_destroy (&attrs); \
    } while (0)
#else
#define POCL_FAST_INIT(l) POCL_INIT_LOCK (l)
#endif

#define POCL_FAST_DESTROY(l) POCL_DESTROY_LOCK(l)

/* Condition variable wrappers; unlike the lock macros these pass the
   pthread return value through to the caller. */
#define POCL_INIT_COND(c) pthread_cond_init (&c, NULL)
#define POCL_DESTROY_COND(c) pthread_cond_destroy (&c)
#define POCL_SIGNAL_COND(c) pthread_cond_signal (&c)
#define POCL_BROADCAST_COND(c) pthread_cond_broadcast (&c)
#define POCL_WAIT_COND(c, m) pthread_cond_wait (&c, &m)
#define POCL_TIMEDWAIT_COND(c, m, t) pthread_cond_timedwait (&c, &m, &t)

/* Thread creation/join/exit wrappers. */
#define POCL_CREATE_THREAD(thr, func, arg)                                    \
  pthread_create (&thr, NULL, func, arg)
#define POCL_JOIN_THREAD(thr) pthread_join (thr, NULL)
#define POCL_JOIN_THREAD2(thr, res_ptr) pthread_join (thr, res_ptr)
#define POCL_EXIT_THREAD(res) pthread_exit (res)
176 
177 //############################################################################
178 
#ifdef ENABLE_EXTRA_VALIDITY_CHECKS
/* Magic marker values stored in every pocl object; a stale or corrupted
   object is unlikely to carry both intact. */
#define POCL_MAGIC_1 0xBE8906A1A83D8D23ULL
#define POCL_MAGIC_2 0x071AC830215FD807ULL
/* Nonzero if __OBJ__ is non-NULL and both validity markers are intact. */
#define IS_CL_OBJECT_VALID(__OBJ__)                                           \
  (((__OBJ__) != NULL) && ((__OBJ__)->magic_1 == POCL_MAGIC_1)                \
   && ((__OBJ__)->magic_2 == POCL_MAGIC_2))
/* These must be function-like macros taking the object: as object-like
   macros their literal __OBJ__ token would never be substituted by the
   enclosing macro's parameter during rescanning, breaking every build with
   ENABLE_EXTRA_VALIDITY_CHECKS enabled. */
#define CHECK_VALIDITY_MARKERS(__OBJ__)                                       \
      assert ((__OBJ__)->magic_1 == POCL_MAGIC_1);                            \
      assert ((__OBJ__)->magic_2 == POCL_MAGIC_2)
#define SET_VALIDITY_MARKERS(__OBJ__)                                         \
      (__OBJ__)->magic_1 = POCL_MAGIC_1;                                      \
      (__OBJ__)->magic_2 = POCL_MAGIC_2
#define UNSET_VALIDITY_MARKERS(__OBJ__)                                       \
      (__OBJ__)->magic_1 = 0;                                                 \
      (__OBJ__)->magic_2 = 0
#else
#define IS_CL_OBJECT_VALID(__OBJ__)   ((__OBJ__) != NULL)
/* (void)0 keeps "MACRO (obj);" a valid statement when the checks are off. */
#define CHECK_VALIDITY_MARKERS(__OBJ__) (void)0
#define SET_VALIDITY_MARKERS(__OBJ__) (void)0
#define UNSET_VALIDITY_MARKERS(__OBJ__) (void)0
#endif

/* Lock an OpenCL object's mutex after sanity-checking it. The refcount must
   be positive: locking an already-released object is a use-after-free. */
#define POCL_LOCK_OBJ(__OBJ__)                                                \
  do                                                                          \
    {                                                                         \
      CHECK_VALIDITY_MARKERS (__OBJ__);                                       \
      POCL_LOCK ((__OBJ__)->pocl_lock);                                       \
      assert ((__OBJ__)->pocl_refcount > 0);                                  \
    }                                                                         \
  while (0)
/* Unlock an OpenCL object's mutex. The refcount may legally be zero here:
   the caller may have just dropped the last reference. */
#define POCL_UNLOCK_OBJ(__OBJ__)                                              \
  do                                                                          \
    {                                                                         \
      CHECK_VALIDITY_MARKERS (__OBJ__);                                       \
      assert ((__OBJ__)->pocl_refcount >= 0);                                 \
      POCL_UNLOCK ((__OBJ__)->pocl_lock);                                     \
    }                                                                         \
  while (0)

/* Decrement the refcount under the object lock, storing the new value in
   __NEW_REFCOUNT__; the caller may free the object when it reached 0. */
#define POCL_RELEASE_OBJECT(__OBJ__, __NEW_REFCOUNT__)                        \
  do                                                                          \
    {                                                                         \
      POCL_LOCK_OBJ (__OBJ__);                                                \
      __NEW_REFCOUNT__ = --(__OBJ__)->pocl_refcount;                          \
      POCL_UNLOCK_OBJ (__OBJ__);                                              \
    }                                                                         \
  while (0)

/* Increment the refcount; evaluates to the new value. The caller must
   already hold the object lock. */
#define POCL_RETAIN_OBJECT_UNLOCKED(__OBJ__)    \
    ++((__OBJ__)->pocl_refcount)

/* Increment the refcount under the object lock; new value stored in R. */
#define POCL_RETAIN_OBJECT_REFCOUNT(__OBJ__, R) \
  do {                                          \
    POCL_LOCK_OBJ (__OBJ__);                    \
    R = POCL_RETAIN_OBJECT_UNLOCKED (__OBJ__);  \
    POCL_UNLOCK_OBJ (__OBJ__);                  \
  } while (0)

/* Increment the refcount under the object lock. */
#define POCL_RETAIN_OBJECT(__OBJ__)             \
  do {                                          \
    POCL_LOCK_OBJ (__OBJ__);                    \
    POCL_RETAIN_OBJECT_UNLOCKED (__OBJ__);      \
    POCL_UNLOCK_OBJ (__OBJ__);                  \
  } while (0)


/* Monotonically increasing source of the per-object 'id' field. */
extern uint64_t last_object_id;

/* The reference counter is initialized to 1,
   when it goes to 0 object can be freed. */
#define POCL_INIT_OBJECT_NO_ICD(__OBJ__)                                      \
  do                                                                          \
    {                                                                         \
      SET_VALIDITY_MARKERS (__OBJ__);                                         \
      (__OBJ__)->pocl_refcount = 1;                                           \
      POCL_INIT_LOCK ((__OBJ__)->pocl_lock);                                  \
      (__OBJ__)->id = POCL_ATOMIC_INC (last_object_id);                       \
    }                                                                         \
  while (0)

/* free() the pointer and NULL it, defending against double free and
   use-after-free. free(NULL) is a no-op, so F_PTR may already be NULL. */
#define POCL_MEM_FREE(F_PTR)                      \
  do {                                            \
      free((F_PTR));                              \
      (F_PTR) = NULL;                             \
  } while (0)

#ifdef BUILD_ICD
/* Most (all?) objects must also initialize the ICD dispatch field. */
#  define POCL_INIT_OBJECT(__OBJ__)                \
    do {                                           \
      POCL_INIT_OBJECT_NO_ICD(__OBJ__);            \
      POCL_INIT_ICD_OBJECT(__OBJ__);               \
    } while (0)
#else
#  define POCL_INIT_OBJECT(__OBJ__)                \
      POCL_INIT_OBJECT_NO_ICD(__OBJ__)
#endif

/* Counterpart of POCL_INIT_OBJECT_NO_ICD: invalidate the markers and
   destroy the lock. */
#define POCL_DESTROY_OBJECT(__OBJ__)                                          \
  do                                                                          \
    {                                                                         \
      UNSET_VALIDITY_MARKERS (__OBJ__);                                       \
      POCL_DESTROY_LOCK ((__OBJ__)->pocl_lock);                               \
    }                                                                         \
  while (0)
284 
/* Declares the generic pocl object attributes inside a struct.
   magic_1/magic_2 are the validity markers checked when
   ENABLE_EXTRA_VALIDITY_CHECKS is enabled. */
#define POCL_OBJECT                                                           \
  uint64_t magic_1;                                                           \
  uint64_t id;            /* unique id, taken from last_object_id */          \
  pocl_lock_t pocl_lock;  /* protects the object's mutable state */           \
  uint64_t magic_2;                                                           \
  int pocl_refcount       /* object may be freed when this reaches 0 */
292 
#ifdef __APPLE__
/* Note: OSX doesn't support aliases because it doesn't use ELF */

#  define POname(name) name
#  define POdeclsym(name)
#  define POsym(name)
#  define POsymAlways(name)

#elif defined(_WIN32)
/* Visual Studio does not support this magic either.
   (A duplicated POdeclsym definition was removed here.) */
#  define POname(name) name
#  define POdeclsym(name)
#  define POsym(name)
#  define POsymAlways(name)

#else
/* Symbol aliases are supported: the implementation lives in PO-prefixed
   functions and the standard OpenCL names are exported as ELF aliases. */

#  define POname(name) PO##name

/* Declares the PO-prefixed implementation with the same type as the
   standard symbol 'name'. */
#define POdeclsym(name) __typeof__ (name) PO##name;
#  define POCL_ALIAS_OPENCL_SYMBOL(name)                                \
  __typeof__(name) name __attribute__((alias ("PO" #name), visibility("default")));
/* Always export the alias, even in ICD builds. */
#  define POsymAlways(name) POCL_ALIAS_OPENCL_SYMBOL(name)
/* Export the alias only in non-ICD builds (the ICD loader provides the
   standard names otherwise). */
#  if !defined(BUILD_ICD)
#    define POsym(name) POCL_ALIAS_OPENCL_SYMBOL(name)
#  else
#    define POsym(name)
#  endif

#endif
325 
326 
/* The ICD compatibility part. This must be the first member in the objects
 * where it is used (the ICD loader assumes that). */
#ifdef BUILD_ICD
/* First struct member: pointer to the ICD dispatch table. */
#  define POCL_ICD_OBJECT struct _cl_icd_dispatch *dispatch;
#  define POCL_ICD_OBJECT_PLATFORM_ID POCL_ICD_OBJECT
/* ICD entry points reuse the PO-aliasing machinery. */
#  define POsymICD(name) POsym(name)
#  define POdeclsymICD(name) POdeclsym(name)
#else
/* Without ICD, no dispatch pointer; the platform carries a plain id. */
#  define POCL_ICD_OBJECT
#  define POCL_ICD_OBJECT_PLATFORM_ID unsigned long id;
#  define POsymICD(name)
#  define POdeclsymICD(name)
#endif
340 
341 #include "pocl_intfn.h"
342 
/* Bit flags for cl_kernel -> has_arg_metadata, recording which kinds of
   kernel argument metadata are available. */
#define POCL_HAS_KERNEL_ARG_ADDRESS_QUALIFIER  1
#define POCL_HAS_KERNEL_ARG_ACCESS_QUALIFIER   2
#define POCL_HAS_KERNEL_ARG_TYPE_NAME          4
#define POCL_HAS_KERNEL_ARG_TYPE_QUALIFIER     8
#define POCL_HAS_KERNEL_ARG_NAME               16

/* pocl specific flag, for "hidden" default queues allocated in each context */
#define CL_QUEUE_HIDDEN (1 << 10)
352 
/* Holds the state of a single kernel argument as set by clSetKernelArg. */
typedef struct pocl_argument {
  /* size of the argument value in bytes */
  uint64_t size;
  /* The "offset" is used to simplify subbuffer handling.
   * At enqueue time, subbuffers are converted to buffers + offset into them.
   */
  uint64_t offset;
  /* the argument value; for SVM arguments (is_svm) a direct pointer —
     exact contents for other argument kinds depend on the setter, see
     call sites */
  void *value;
  /* 1 if this argument has been set by clSetKernelArg */
  char is_set;
  /* 1 if the argument is read-only according to kernel metadata. So either
   * a buffer with "const" qualifier, or an image with read_only qualifier  */
  char is_readonly;
  /* 1 if the argument pointer is SVM direct pointer, not a cl_mem */
  char is_svm;
} pocl_argument;
368 
/* Forward declaration; struct event_node is defined elsewhere. */
typedef struct event_node event_node;

/**
 * Enumeration for kernel argument types
 */
typedef enum {
  POCL_ARG_TYPE_NONE = 0,    /* not a pointer, image or sampler */
  POCL_ARG_TYPE_POINTER = 1, /* buffer / pointer argument */
  POCL_ARG_TYPE_IMAGE = 2,   /* image argument */
  POCL_ARG_TYPE_SAMPLER = 3, /* sampler argument */
} pocl_argument_type;
380 
/* Nonzero if the argument metadata 'a' (by value / by pointer respectively)
   declares the argument in the __local address space. */
#define ARG_IS_LOCAL(a) (a.address_qualifier == CL_KERNEL_ARG_ADDRESS_LOCAL)
#define ARGP_IS_LOCAL(a) (a->address_qualifier == CL_KERNEL_ARG_ADDRESS_LOCAL)

/* Metadata describing a single kernel argument (see the
   POCL_HAS_KERNEL_ARG_* availability flags above). */
typedef struct pocl_argument_info {
  char* type_name;          /* textual type of the argument */
  char* name;               /* argument name */
  cl_kernel_arg_address_qualifier address_qualifier;
  cl_kernel_arg_access_qualifier access_qualifier;
  cl_kernel_arg_type_qualifier type_qualifier;
  pocl_argument_type type;  /* pointer/image/sampler classification */
  unsigned type_size;       /* size of the argument's type in bytes */
} pocl_argument_info;
393 
/* mem_mapping_t (used by the callbacks below) represents a single buffer
   to host memory mapping. */

396 struct pocl_device_ops {
397   const char *device_name;
398 
399   /* New driver api extension for out-of-order execution and
400      asynchronous devices.
401      See this for reference: http://URN.fi/URN:NBN:fi:tty-201412051583
402      See basic and pthread driver for reference. */
403 
404   /* submit gives the command for the device. The command may be left in the cq
405      or stored to the device driver owning the cq. submit is called
406      with node->event locked, and must return with it unlocked. */
407   void (*submit) (_cl_command_node *node, cl_command_queue cq);
408 
409   /* join is called by clFinish and this function blocks until all the enqueued
410      commands are finished. Called by the user thread; see notify_cmdq_finished
411      for the driver thread counterpart. */
412   void (*join) (cl_device_id device, cl_command_queue cq);
413 
414   /* flush is called when clFlush is called. This function ensures that
415      commands will be eventually executed. It is up to the device what happens
416      here, if anything. See basic and pthread for reference.*/
417   void (*flush) (cl_device_id device, cl_command_queue cq);
418 
  /* notify is used to communicate to a device driver that an event it has
     been waiting on has completed. Upon call, both events are locked, and
     must be locked also on return. */
422   void (*notify) (cl_device_id device, cl_event event, cl_event finished);
423 
424   /* broadcast is(has to be) called by the device driver when a command is
425      completed.
426      It is used to broadcast notifications to device drivers waiting
427      this event to complete.
428      There is a default implementation for this. Use it if there is no need
429      to do anything special here.
430      The default implementation calls notify(event, target_event) for the
431      list of events waiting on 'event'. */
432   void (*broadcast) (cl_event event);
433 
434   /* wait_event is called by clWaitForEvents() and blocks the execution until
435    * the waited event is complete or failed. Called by user threads; see the
436    * notify_event_finished() callback for driver thread counterpart.
437    * Called (and must return) with unlocked event. */
438   void (*wait_event) (cl_device_id device, cl_event event);
439 
440   /* update_event is an extra callback called during handling of event status
441    * changes, useful if something device specific needs to be done. May be
442    * NULL; no need to implement if not needed.
443    *
444    * Called via pocl_update_event_* functions in pocl_util.c
445    * All pocl_update_event_* (except COMPLETE) must be called (and return)
446    * with LOCKED event.
447    */
448   void (*update_event) (cl_device_id device, cl_event event);
449 
450   /* free_event_data may be called when event is freed. Event data may only be
451      used by the device driver owning the corresponding command.
452      No need to implement this if the device does not need any event data. */
453   void (*free_event_data) (cl_event event);
454 
455   /* Called from driver threads to notify every user thread waiting on
456    * command queue finish. See join() for user counterpart.
   * Driver may choose not to implement this, which will result in
   * undefined behaviour in multi-threaded user programs. */
459   void (*notify_cmdq_finished) (cl_command_queue cq);
460 
461   /* Called from driver threads to notify every user thread waiting on
462    * a specific event. See wait_event() for user counterpart.
   * Driver may choose not to implement this, which will result in
   * undefined behaviour in multi-threaded user programs. */
465   void (*notify_event_finished) (cl_event event);
466 
467   /* /New driver api extension */
468 
469   /* Detects & returns the number of available devices the driver finds on the system. */
470   unsigned int (*probe) (struct pocl_device_ops *ops);
471   /* Device initialization. Parameters:
472    *  j : progressive index for the devices of the same type
473    *  device : struct to initialize
474    *  parameters : optional environment with device-specific parameters
475    */
476   cl_int (*init) (unsigned j, cl_device_id device, const char *parameters);
477   /* uninitializes the driver for a particular device. May free hardware resources. */
478   cl_int (*uninit) (unsigned j, cl_device_id device);
479   /* reinitializes the driver for a particular device. Called after uninit;
480    * the first initialization is done by 'init'. May be NULL */
481   cl_int (*reinit) (unsigned j, cl_device_id device);
482 
483   /* allocate a buffer in device memory */
484   cl_int (*alloc_mem_obj) (cl_device_id device, cl_mem mem_obj, void* host_ptr);
485   /* free a device buffer */
486   void (*free) (cl_device_id device, cl_mem mem_obj);
487 
488   /* return >0 if driver can migrate directly between devices.
489    * Priority between devices signalled by larger numbers. */
490   int (*can_migrate_d2d) (cl_device_id dest, cl_device_id source);
491   /* migrate buffer content directly between devices */
492   int (*migrate_d2d) (cl_device_id src_dev,
493                       cl_device_id dst_dev,
494                       cl_mem mem,
495                       pocl_mem_identifier *src_mem_id,
496                       pocl_mem_identifier *dst_mem_id);
497 
498   /* SVM Ops */
499   void (*svm_free) (cl_device_id dev, void *svm_ptr);
500   void *(*svm_alloc) (cl_device_id dev, cl_svm_mem_flags flags, size_t size);
501   void (*svm_map) (cl_device_id dev, void *svm_ptr);
502   void (*svm_unmap) (cl_device_id dev, void *svm_ptr);
503   /* these are optional. If the driver needs to do anything to be able
504    * to use host memory, it should do it (and undo it) in these callbacks.
505    * Currently used by HSA.
506    * See pocl_basic_alloc and pocl_basic_free for details. */
507   void (*svm_register) (cl_device_id dev, void *host_ptr, size_t size);
508   void (*svm_unregister) (cl_device_id dev, void *host_ptr, size_t size);
509 
510   /* we can use restrict here, because Spec says overlapping copy should return
511    * with CL_MEM_COPY_OVERLAP error. */
512   void (*svm_copy) (cl_device_id dev, void *__restrict__ dst,
513                     const void *__restrict__ src, size_t size);
514   void (*svm_fill) (cl_device_id dev, void *__restrict__ svm_ptr, size_t size,
515                     void *__restrict__ pattern, size_t pattern_size);
516 
517   /* the following callbacks only deal with buffers (and IMAGE1D_BUFFER which
518    * is backed by a buffer), not images.  */
519 
520   /* clEnqReadBuffer */
521   void (*read) (void *data,
522                 void *__restrict__  dst_host_ptr,
523                 pocl_mem_identifier * src_mem_id,
524                 cl_mem src_buf,
525                 size_t offset,
526                 size_t size);
527   /* clEnqReadBufferRect */
528   void (*read_rect) (void *data,
529                      void *__restrict__ dst_host_ptr,
530                      pocl_mem_identifier * src_mem_id,
531                      cl_mem src_buf,
532                      const size_t *buffer_origin,
533                      const size_t *host_origin,
534                      const size_t *region,
535                      size_t buffer_row_pitch,
536                      size_t buffer_slice_pitch,
537                      size_t host_row_pitch,
538                      size_t host_slice_pitch);
539   /* clEnqWriteBuffer */
540   void (*write) (void *data,
541                  const void *__restrict__  src_host_ptr,
542                  pocl_mem_identifier * dst_mem_id,
543                  cl_mem dst_buf,
544                  size_t offset,
545                  size_t size);
546   /* clEnqWriteBufferRect */
547   void (*write_rect) (void *data,
548                       const void *__restrict__ src_host_ptr,
549                       pocl_mem_identifier * dst_mem_id,
550                       cl_mem dst_buf,
551                       const size_t *buffer_origin,
552                       const size_t *host_origin,
553                       const size_t *region,
554                       size_t buffer_row_pitch,
555                       size_t buffer_slice_pitch,
556                       size_t host_row_pitch,
557                       size_t host_slice_pitch);
558   /* clEnqCopyBuffer */
559   void (*copy) (void *data,
560                 pocl_mem_identifier * dst_mem_id,
561                 cl_mem dst_buf,
562                 pocl_mem_identifier * src_mem_id,
563                 cl_mem src_buf,
564                 size_t dst_offset,
565                 size_t src_offset,
566                 size_t size);
567   /* clEnqCopyBufferRect */
568   void (*copy_rect) (void *data,
569                      pocl_mem_identifier * dst_mem_id,
570                      cl_mem dst_buf,
571                      pocl_mem_identifier * src_mem_id,
572                      cl_mem src_buf,
573                      const size_t *dst_origin,
574                      const size_t *src_origin,
575                      const size_t *region,
576                      size_t dst_row_pitch,
577                      size_t dst_slice_pitch,
578                      size_t src_row_pitch,
579                      size_t src_slice_pitch);
580 
581   /* clEnqCopyBuffer with the cl_pocl_content_size extension. This callback is optional */
582   void (*copy_with_size) (void *data,
583                           pocl_mem_identifier *dst_mem_id,
584                           cl_mem dst_buf,
585                           pocl_mem_identifier *src_mem_id,
586                           cl_mem src_buf,
587                           pocl_mem_identifier *content_size_buf_mem_id,
588                           cl_mem content_size_buf,
589                           size_t dst_offset,
590                           size_t src_offset,
591                           size_t size);
592 
593   /* clEnqFillBuffer */
594   void (*memfill) (void *data,
595                    pocl_mem_identifier * dst_mem_id,
596                    cl_mem dst_buf,
597                    size_t size,
598                    size_t offset,
599                    const void *__restrict__  pattern,
600                    size_t pattern_size);
601 
  /* Maps 'size' bytes of device global memory at the given offset to
     host-accessible memory. This might or might not involve copying
     the block from the device. */
605   cl_int (*map_mem) (void *data,
606                      pocl_mem_identifier * src_mem_id,
607                      cl_mem src_buf,
608                      mem_mapping_t *map);
609   cl_int (*unmap_mem) (void *data,
610                        pocl_mem_identifier * dst_mem_id,
611                        cl_mem dst_buf,
612                        mem_mapping_t *map);
613 
614   /* these don't actually do the mapping, only return a pointer
615    * where the driver will map in future. Separate API from map/unmap
616    * because 1) unlike other driver ops, this is called from the user thread,
617    * so it can be called in parallel with map/unmap or any command executing
618    * in the driver; 2) most drivers can share the code for these */
619   cl_int (*get_mapping_ptr) (void *data, pocl_mem_identifier *mem_id,
620                              cl_mem mem, mem_mapping_t *map);
621   cl_int (*free_mapping_ptr) (void *data, pocl_mem_identifier *mem_id,
622                               cl_mem mem, mem_mapping_t *map);
623 
624   /* if the driver needs to do something at kernel create/destroy time */
625   int (*create_kernel) (cl_device_id device, cl_program program,
626                         cl_kernel kernel, unsigned program_device_i);
627   int (*free_kernel) (cl_device_id device, cl_program program,
628                       cl_kernel kernel, unsigned program_device_i);
629 
630   /* program building callbacks */
631   int (*build_source) (
632       cl_program program, cl_uint device_i,
633 
634       /* these are filled by clCompileProgram(), otherwise NULLs */
635       cl_uint num_input_headers, const cl_program *input_headers,
636       const char **header_include_names,
637 
638       /* 1 = compile & link, 0 = compile only, linked later via clLinkProgram*/
639       int link_program);
640 
641   int (*build_binary) (
642       cl_program program, cl_uint device_i,
643 
644       /* 1 = compile & link, 0 = compile only, linked later via clLinkProgram*/
645       int link_program, int spir_build);
646 
647   /* build a program with builtin kernels. */
648   int (*build_builtin) (cl_program program, cl_uint device_i);
649 
650   int (*link_program) (cl_program program, cl_uint device_i,
651 
652                        cl_uint num_input_programs,
653                        const cl_program *input_programs,
654 
655                        /* 1 = create library, 0 = create executable*/
656                        int create_library);
657 
658   /* optional. called after build/link and after metadata setup. */
659   int (*post_build_program) (cl_program program, cl_uint device_i);
660   /* optional. Ensures that everything is built for
661    * returning a poclbinary to the user. E.g. for CPU driver this means
662    * building a dynamic WG sized parallel.bc */
663   int (*build_poclbinary) (cl_program program, cl_uint device_i);
664 
665   /* Optional. If the driver uses the default build_poclbinary implementation
666    * from common_driver.c, that implementation calls this to compile a
667    * "dynamic WG size" kernel. */
668   void (*compile_kernel) (_cl_command_node *cmd, cl_kernel kernel,
669                           cl_device_id device, int specialize);
670 
671   /* Optional. driver should free the content of "program->data" here,
672    * if it fills it. */
673   int (*free_program) (cl_device_id device, cl_program program,
674                        unsigned program_device_i);
675 
676   /* Driver should setup kernel metadata here, if it can, and return non-zero
677    * on success. This is called after compilation/build/link. E.g. CPU driver
678    * parses the LLVM metadata. */
679   int (*setup_metadata) (cl_device_id device, cl_program program,
680                          unsigned program_device_i);
681 
682   /* Driver should examine the binary and return non-zero if it can load it.
683    * Note that it's never called with pocl-binaries; those are automatically
684    * accepted if device-hash in the binary's header matches the device. */
685   int (*supports_binary) (cl_device_id device, const size_t length,
686                           const char *binary);
687 
688   /* Optional. if the driver needs to use hardware resources
689    * for command queues, it should use these callbacks */
690   int (*init_queue) (cl_device_id device, cl_command_queue queue);
691   int (*free_queue) (cl_device_id device, cl_command_queue queue);
692 
693   /* clEnqueueNDRangeKernel */
694   void (*run) (void *data, _cl_command_node *cmd);
695   /* for clEnqueueNativeKernel. may be NULL */
696   void (*run_native) (void *data, _cl_command_node *cmd);
697 
698   /* Perform initialization steps and can return additional
699      build options that are required for the device. The caller
700      owns the returned string. may be NULL */
701   char* (*init_build) (void *data);
702 
703   /* may be NULL */
704   void (*init_target_machine) (void *data, void *target_machine);
705 
706   /* returns a hash string that should identify the device. This string
707    * is used when writing/loading pocl binaries to decide compatibility. */
708   char* (*build_hash) (cl_device_id device);
709 
710   /* the following callbacks deal with images ONLY, with the exception of
711    * IMAGE1D_BUFFER type (which is implemented as a buffer).
712    * If the device does not support images, all of these may be NULL. */
713 
714   /* creates a device-specific hardware resource for sampler. May be NULL */
715   int (*create_sampler) (cl_device_id device,
716                          cl_sampler samp,
717                          unsigned context_device_i);
718   int (*free_sampler) (cl_device_id device,
719                        cl_sampler samp,
720                        unsigned context_device_i);
721 
722   /* copies image to image, on the same device (or same global memory). */
723   cl_int (*copy_image_rect) (void *data,
724                              cl_mem src_image,
725                              cl_mem dst_image,
726                              pocl_mem_identifier *src_mem_id,
727                              pocl_mem_identifier *dst_mem_id,
728                              const size_t *src_origin,
729                              const size_t *dst_origin,
730                              const size_t *region);
731 
732   /* copies a region from host OR device buffer to device image.
733    * clEnqueueCopyBufferToImage: src_mem_id = buffer,
734    *     src_host_ptr = NULL, src_row_pitch = src_slice_pitch = 0
735    * clEnqueueWriteImage: src_mem_id = NULL,
736    *     src_host_ptr = host pointer, src_offset = 0
737    */
738   cl_int (*write_image_rect ) (void *data,
739                                cl_mem dst_image,
740                                pocl_mem_identifier *dst_mem_id,
741                                const void *__restrict__ src_host_ptr,
742                                pocl_mem_identifier *src_mem_id,
743                                const size_t *origin,
744                                const size_t *region,
745                                size_t src_row_pitch,
746                                size_t src_slice_pitch,
747                                size_t src_offset);
748 
749   /* copies a region from device image to host or device buffer
750    * clEnqueueCopyImageToBuffer: dst_mem_id = buffer,
751    *     dst_host_ptr = NULL, dst_row_pitch = dst_slice_pitch = 0
752    * clEnqueueReadImage: dst_mem_id = NULL,
753    *     dst_host_ptr = host pointer, dst_offset = 0
754    */
755   cl_int (*read_image_rect) (void *data,
756                              cl_mem src_image,
757                              pocl_mem_identifier *src_mem_id,
758                              void *__restrict__ dst_host_ptr,
759                              pocl_mem_identifier *dst_mem_id,
760                              const size_t *origin,
761                              const size_t *region,
762                              size_t dst_row_pitch,
763                              size_t dst_slice_pitch,
764                              size_t dst_offset);
765 
766   /* maps the entire image from device to host */
767   cl_int (*map_image) (void *data,
768                        pocl_mem_identifier *mem_id,
769                        cl_mem src_image,
770                        mem_mapping_t *map);
771 
772   /* unmaps the entire image from host to device */
773   cl_int (*unmap_image) (void *data,
774                          pocl_mem_identifier *mem_id,
775                          cl_mem dst_image,
776                          mem_mapping_t *map);
777 
778   /* fill image with pattern */
779   cl_int (*fill_image) (void *data, cl_mem image, pocl_mem_identifier *mem_id,
780                         const size_t *origin, const size_t *region,
781                         cl_uint4 orig_pixel, pixel_t fill_pixel,
782                         size_t pixel_size);
783 
784   /* custom device functionality */
785 
786   /* The device can override this function to perform driver-specific
787    * optimizations to the local size dimensions, whenever the decision
788    * is left to the runtime. */
789   void (*compute_local_size) (cl_device_id dev, size_t global_x,
790                               size_t global_y, size_t global_z,
791                               size_t *local_x, size_t *local_y,
792                               size_t *local_z);
793 
794   cl_int (*get_device_info_ext) (cl_device_id dev, cl_device_info param_name,
795                                  size_t param_value_size, void * param_value,
796                                  size_t * param_value_size_ret);
797 };
798 
/* Accounting data for one global memory region. Devices sharing the same
 * global_mem_id point at the same instance (see _cl_device_id). */
typedef struct pocl_global_mem_t {
  pocl_lock_t pocl_lock;        /* guards the counters below */
  cl_ulong max_ever_allocated;  /* high-water mark of allocated bytes */
  cl_ulong currently_allocated; /* bytes currently allocated */
  cl_ulong total_alloc_limit;   /* maximum total allocation allowed */
} pocl_global_mem_t;
805 
806 /**
807  * Enumeration for different modes of converting automatic locals
808  */
809 typedef enum
810 {
811   POCL_AUTOLOCALS_TO_ARGS_NEVER = 0,
812   POCL_AUTOLOCALS_TO_ARGS_ALWAYS = 1,
813   // convert autolocals to args only if there are dynamic local memory function
814   // arguments in the kernel.
815   POCL_AUTOLOCALS_TO_ARGS_ONLY_IF_DYNAMIC_LOCALS_PRESENT = 2,
816 } pocl_autolocals_to_args_strategy;
817 
818 #define NUM_OPENCL_IMAGE_TYPES 6
819 
/* A single OpenCL device (or sub-device). The first section mirrors the
 * clGetDeviceInfo() queries; the remainder is pocl-internal driver state
 * (LLVM target info, compilation strategy flags, image format tables and
 * the shared pocl_device_ops vtable). */
struct _cl_device_id {
  POCL_ICD_OBJECT
  POCL_OBJECT;
  /* queries */
  cl_device_type type;
  cl_uint vendor_id;
  cl_uint max_compute_units;

  /* for subdevice support */
  cl_device_id parent_device;
  /* first core and number of cores owned by this (sub)device */
  unsigned core_start;
  unsigned core_count;

  cl_uint max_work_item_dimensions;
  /* when enabled, Workgroup LLVM pass will replace all printf() calls
   * with calls to __pocl_printf and recursively change functions to
   * add printf buffer arguments from pocl_context.
   * Currently the pthread/basic devices require this; other devices
   * implement printf their own way. */
  int device_side_printf;
  size_t max_work_item_sizes[3];
  size_t max_work_group_size;
  size_t preferred_wg_size_multiple;
  cl_uint preferred_vector_width_char;
  cl_uint preferred_vector_width_short;
  cl_uint preferred_vector_width_int;
  cl_uint preferred_vector_width_long;
  cl_uint preferred_vector_width_float;
  cl_uint preferred_vector_width_double;
  cl_uint preferred_vector_width_half;
  cl_uint native_vector_width_char;
  cl_uint native_vector_width_short;
  cl_uint native_vector_width_int;
  cl_uint native_vector_width_long;
  cl_uint native_vector_width_float;
  cl_uint native_vector_width_double;
  cl_uint native_vector_width_half;
  cl_uint max_clock_frequency;
  cl_uint address_bits;
  cl_ulong max_mem_alloc_size;
  cl_bool image_support;
  cl_uint max_read_image_args;
  cl_uint max_write_image_args;
  cl_uint max_read_write_image_args;
  size_t image2d_max_width;
  size_t image2d_max_height;
  size_t image3d_max_width;
  size_t image3d_max_height;
  size_t image3d_max_depth;
  size_t image_max_buffer_size;
  size_t image_max_array_size;
  cl_uint max_samplers;
  size_t max_parameter_size;
  cl_uint mem_base_addr_align;
  cl_uint min_data_type_align_size;
  cl_device_fp_config half_fp_config;
  cl_device_fp_config single_fp_config;
  cl_device_fp_config double_fp_config;
  cl_device_mem_cache_type global_mem_cache_type;
  cl_uint global_mem_cacheline_size;
  cl_ulong global_mem_cache_size;
  cl_ulong global_mem_size;
  size_t global_var_pref_size;
  size_t global_var_max_size;
  cl_ulong max_constant_buffer_size;
  cl_uint max_constant_args;
  cl_device_local_mem_type local_mem_type;
  cl_ulong local_mem_size;
  cl_bool error_correction_support;
  cl_bool host_unified_memory;
  size_t profiling_timer_resolution;
  cl_bool endian_little;
  cl_bool available;
  cl_bool compiler_available;
  cl_bool linker_available;
  /* Is the target a Single Program Multiple Data machine? If not,
     we need to generate work-item loops to execute all the work-items
     in the WG. For SPMD machines, the hardware spawns the WIs. */
  cl_bool spmd;
  /* The device uses an HSA-like kernel ABI with a single argument buffer as
     an input. */
  cl_bool arg_buffer_launcher;
  /* The Workgroup pass creates launcher functions and replaces work-item
     placeholder global variables (e.g. _local_size_, _global_offset_ etc) with
     loads from the context struct passed as a kernel argument. This flag
     enables or disables this pass. */
  cl_bool workgroup_pass;
  cl_device_exec_capabilities execution_capabilities;
  cl_command_queue_properties queue_properties;
  cl_platform_id platform;
  cl_uint max_sub_devices;
  size_t num_partition_properties;
  cl_device_partition_property *partition_properties;
  size_t num_partition_types;
  cl_device_partition_property *partition_type;
  size_t printf_buffer_size;
  char *short_name;
  char *long_name;

  const char *vendor;
  const char *driver_version;
  const char *profile;
  const char *version;
  const char *extensions;
  const char *cl_version_std;  // "CL2.0"
  cl_ulong cl_version_int;     // 200

  /* Opaque per-driver device data. */
  void *data;
  const char* llvm_target_triplet; /* the llvm target triplet to use */
  const char* llvm_cpu; /* the llvm CPU variant to use */
  /* A running number (starting from zero) across all the device instances.
     Used for indexing arrays in data structures with device specific
     entries. */
  int dev_id;
  int global_mem_id; /* identifier for device global memory */
  /* pointer to an accounting struct for global memory */
  pocl_global_mem_t *global_memory;
  /* Does the device have 64bit longs */
  int has_64bit_long;
  /* Does the device set the event times in update_event() callback ?
   * if zero, the default event change handlers set the event times based on
   * the host's system time (pocl_gettimemono_ns). */
  int has_own_timer;

  /* If the driver wants SPIR-V input directly, without translation to
   * LLVM IR with "spir" triple, set this to 1,
   * and make sure device->ops->supports_binary returns 1 for SPIR-V */
  int consumes_il_directly;

  /* Convert automatic local variables to kernel arguments? */
  pocl_autolocals_to_args_strategy autolocals_to_args;
  /* Allocate local buffers device side in the work-group launcher instead of
     having a disjoint physical local memory per work-group or having the
     runtime/driver allocate the local space. */
  int device_alloca_locals;

  /* If > 0, specialized versions of the work-group functions are generated
     which assume each grid dimension is of at most the given width. This
     assumption can be then taken in account in IR optimization and codegen
     to reduce address computation overheads etc. */
  size_t grid_width_specialization_limit;

  /* Device-specific linker flags that should be appended to the clang's
     argument list for a final linkage call when producing the final binary
     that can be uploaded to the device using the default LLVM-based
     codegen. The final entry in the list must be NULL.

     The flags will be added after the following command line:
     clang -o final.bin input.obj [flags]
  */

  const char **final_linkage_flags;

  /* Auxiliary functions required by the device binary which should
     be retained across the kernel compilation unused code pruning
     process. */
  const char **device_aux_functions;

  /* semicolon separated list of builtin kernels*/
  char *builtin_kernel_list;

  /* The target specific IDs for the different OpenCL address spaces. */
  unsigned global_as_id;
  unsigned local_as_id;
  unsigned constant_as_id;

  /* The address space where the argument data is passed. */
  unsigned args_as_id;

  /* The address space where the grid context data is passed. */
  unsigned context_as_id;

  /* Set to >0 if the device supports SVM.
   * When creating context with multiple devices, the device with
   * largest priority will have the responsibility of allocating
   * shared buffers residing in Shared Virtual Memory areas.
   * This allows using both CPU and HSA for SVM allocations,
   * with HSA having priority in multi-device context */
  cl_uint svm_allocation_priority;
  /* OpenCL 2.0 properties */
  cl_device_svm_capabilities svm_caps;
  cl_uint max_events;
  cl_uint max_queues;
  cl_uint max_pipe_args;
  cl_uint max_pipe_active_res;
  cl_uint max_pipe_packet_size;
  cl_uint dev_queue_pref_size;
  cl_uint dev_queue_max_size;
  cl_command_queue_properties on_dev_queue_props;
  cl_command_queue_properties on_host_queue_props;
  /* OpenCL 2.1 */
  char *spirv_version;

  /* image formats supported by the device, per image type */
  const cl_image_format *image_formats[NUM_OPENCL_IMAGE_TYPES];
  cl_uint num_image_formats[NUM_OPENCL_IMAGE_TYPES];

  /* Device operations, shared among devices of the same type */
  struct pocl_device_ops *ops;

  /* OpenCL 3.0 properties */
  cl_device_atomic_capabilities atomic_memory_capabilities;
  cl_device_atomic_capabilities atomic_fence_capabilities;
};
1024 
/* Nonzero if the device supports any fine-grained SVM flavor.
 * Note: the macro parameter is parenthesized so that non-trivial
 * expressions (e.g. conditional expressions) expand correctly. */
#define DEVICE_SVM_FINEGR(dev)                                                \
  ((dev)->svm_caps                                                            \
   & (CL_DEVICE_SVM_FINE_GRAIN_BUFFER | CL_DEVICE_SVM_FINE_GRAIN_SYSTEM))
/* Nonzero if the device supports SVM atomics. */
#define DEVICE_SVM_ATOM(dev) ((dev)->svm_caps & CL_DEVICE_SVM_ATOMICS)

/* Buffer map/unmap is a no-op when the device has fine-grained SVM with
 * atomics (host and device see coherent memory). */
#define DEVICE_MMAP_IS_NOP(dev) (DEVICE_SVM_FINEGR (dev) && DEVICE_SVM_ATOM (dev))
1030 
/* Return CL_INVALID_DEVICE from the calling function if the device is not
 * available. Wrapped in do { } while (0) so the macro acts as a single
 * statement and is safe inside unbraced if/else bodies. */
#define CHECK_DEVICE_AVAIL_RET(dev)                                           \
  do                                                                          \
    {                                                                         \
      if (!(dev)->available)                                                  \
        {                                                                     \
          POCL_MSG_ERR ("This cl_device is not available.\n");                \
          return CL_INVALID_DEVICE;                                           \
        }                                                                     \
    }                                                                         \
  while (0)

/* Same as CHECK_DEVICE_AVAIL_RET, for functions returning void. */
#define CHECK_DEVICE_AVAIL_RETV(dev)                                          \
  do                                                                          \
    {                                                                         \
      if (!(dev)->available)                                                  \
        {                                                                     \
          POCL_MSG_ERR ("This cl_device is not available.\n");                \
          return;                                                             \
        }                                                                     \
    }                                                                         \
  while (0)
1033 
1034 #define OPENCL_MAX_DIMENSION 3
1035 
/* Platform object: carries only the ICD dispatch machinery. */
struct _cl_platform_id {
  POCL_ICD_OBJECT_PLATFORM_ID
};
1039 
/* An OpenCL context: the set of devices it spans plus properties derived
 * from that set by pocl_setup_context() (max allocation size, common image
 * formats, SVM allocator device, minimal buffer alignment). */
struct _cl_context {
  POCL_ICD_OBJECT
  POCL_OBJECT;
  /* queries */
  cl_device_id *devices;
  cl_context_properties *properties;
  /* implementation */
  unsigned num_devices;
  unsigned num_properties;

  /*********************************************************************/
  /* these values depend on which devices are in context;
   * they're calculated by pocl_setup_context() */

  /* The largest of max_mem_alloc_size of all devices in context */
  size_t max_mem_alloc_size;

  /* union of image formats supported by all of the devices in context,
   * per image-type (there are 6 image types)
     TODO the getSupportedImageFormats is supposed to also respect flags,
     but for now we ignore that. */
  cl_image_format *image_formats[NUM_OPENCL_IMAGE_TYPES];
  cl_uint num_image_formats[NUM_OPENCL_IMAGE_TYPES];

  /* The device that should allocate SVM (might be == host)
   * NULL if none of devices in the context is SVM capable */
  cl_device_id svm_allocdev;

  /* for enqueueing migration commands. Two reasons:
   * 1) since migration commands can execute in parallel
   * to other commands, we can increase paralelism
   * 2) in some cases (migration between 2 devices through
   * host memory), we need to put two commands in two queues,
   * and the clEnqueueX only gives us one (on the destination
   * device). */
  cl_command_queue *default_queues;

  /* The minimal required buffer alignment for all devices in the context.
   * E.g. for clCreateSubBuffer:
   * CL_MISALIGNED_SUB_BUFFER_OFFSET is returned in errcode_ret if there are no
   * devices in context associated with buffer for which the origin value
   * is aligned to the CL_DEVICE_MEM_BASE_ADDR_ALIGN value.
   */
  size_t min_buffer_alignment;

#ifdef ENABLE_LLVM
  /* Opaque per-context state owned by the LLVM compilation layer. */
  void *llvm_context_data;
#endif
};
1089 
/* Singly-linked list node holding one event; used for event-based
 * synchronization bookkeeping (e.g. _cl_command_queue::last_event). */
typedef struct _pocl_data_sync_item pocl_data_sync_item;
struct _pocl_data_sync_item {
  cl_event event;
  pocl_data_sync_item *next;
};
1095 
struct _cl_event;
/* An OpenCL command queue bound to one device of a context. */
struct _cl_command_queue {
  POCL_ICD_OBJECT
  POCL_OBJECT;
  /* queries */
  cl_context context;
  cl_device_id device;
  cl_command_queue_properties properties;
  /* implementation */
  cl_event events; /* events of the enqueued commands in enqueue order */
  /* last enqueued barrier command, if any */
  struct _cl_event *barrier;
  unsigned long command_count; /* counter for unfinished command enqueued */
  pocl_data_sync_item last_event;

  /* device specific data */
  void *data;
};
1113 
/* Expand "operation" (an error-check macro such as POCL_RETURN_ERROR_ON)
 * with CL_MISALIGNED_SUB_BUFFER_OFFSET when "mem" is a sub-buffer whose
 * origin is not aligned to the queue device's mem_base_addr_align. */
#define POCL_ON_SUB_MISALIGN(mem, que, operation)                             \
  do                                                                          \
    {                                                                         \
      if (mem->parent != NULL)  {                                             \
        operation (                                                           \
            (mem->origin % que->device->mem_base_addr_align != 0),            \
            CL_MISALIGNED_SUB_BUFFER_OFFSET,                                  \
            "SubBuffer is not "                                               \
            "properly aligned for this device");                              \
        }                                                                     \
    }                                                                         \
  while (0)

/* Variant that returns the error code from the calling function. */
#define POCL_RETURN_ON_SUB_MISALIGN(mem, que)                                 \
  POCL_ON_SUB_MISALIGN(mem, que, POCL_RETURN_ERROR_ON)

/* Variant that jumps to the function's ERROR label. */
#define POCL_GOTO_ON_SUB_MISALIGN(mem, que)                                   \
  POCL_ON_SUB_MISALIGN(mem, que, POCL_GOTO_ERROR_ON)
1132 
/* If "mem" is a sub-buffer, redirect to the parent buffer: add the
 * sub-buffer origin to "offset" and repoint "mem" at the parent.
 * Both arguments are modified in place.
 * Wrapped in do { } while (0) so the macro expands to a single statement
 * (the previous bare-if form was a dangling-else hazard), and the
 * parameters are parenthesized against operator-precedence surprises. */
#define POCL_CONVERT_SUBBUFFER_OFFSET(mem, offset)                            \
  do                                                                          \
    {                                                                         \
      if ((mem)->parent != NULL)                                              \
        {                                                                     \
          (offset) += (mem)->origin;                                          \
          (mem) = (mem)->parent;                                              \
        }                                                                     \
    }                                                                         \
  while (0)
1139 
/* Bit flags stored per device in cl_mem::device_supports_this_image. */
#define DEVICE_IMAGE_SIZE_SUPPORT 1
#define DEVICE_IMAGE_FORMAT_SUPPORT 2

/* Nonzero when the device at context index dev_i cannot use this image. */
#define DEVICE_DOESNT_SUPPORT_IMAGE(mem, dev_i)                               \
  (mem->device_supports_this_image[dev_i] == 0)
1145 
/* Expand "operation" (an error-check macro) when image "mem" cannot be used
 * on device "dev": checks, in order, general image support, format support
 * and size support, using the per-device flags cached in
 * mem->device_supports_this_image. "dev" must belong to mem's context
 * (asserted). */
#define POCL_ON_UNSUPPORTED_IMAGE(mem, dev, operation)                        \
  do                                                                          \
    {                                                                         \
      unsigned dev_i;                                                         \
      for (dev_i = 0; dev_i < mem->context->num_devices; ++dev_i)             \
        if (mem->context->devices[dev_i] == dev)                              \
          break;                                                              \
      assert (dev_i < mem->context->num_devices);                             \
      operation (                                                  \
          (mem->context->devices[dev_i]->image_support == CL_FALSE),          \
          CL_INVALID_OPERATION, "Device %s does not support images\n",        \
          mem->context->devices[dev_i]->long_name);                           \
      operation (                                                  \
          ((mem->device_supports_this_image[dev_i]                            \
            & DEVICE_IMAGE_FORMAT_SUPPORT)                                    \
           == 0),                                                             \
          CL_IMAGE_FORMAT_NOT_SUPPORTED,                                      \
          "The image type is not supported by this device\n");                \
      operation (                                                  \
          ((mem->device_supports_this_image[dev_i]                            \
            & DEVICE_IMAGE_SIZE_SUPPORT)                                      \
           == 0),                                                             \
          CL_INVALID_IMAGE_SIZE,                                              \
          "The image size is not supported by this device\n");                \
    }                                                                         \
  while (0)


/* Variant that returns the error code from the calling function. */
#define POCL_RETURN_ON_UNSUPPORTED_IMAGE(mem, dev)                            \
  POCL_ON_UNSUPPORTED_IMAGE(mem, dev, POCL_RETURN_ERROR_ON)

/* Variant that jumps to the function's ERROR label. */
#define POCL_GOTO_ON_UNSUPPORTED_IMAGE(mem, dev)                              \
  POCL_ON_UNSUPPORTED_IMAGE(mem, dev, POCL_GOTO_ERROR_ON)
1179 
1180 
1181 
1182 typedef struct _cl_mem cl_mem_t;
1183 struct _cl_mem {
1184   POCL_ICD_OBJECT
1185   POCL_OBJECT;
1186   cl_context context;
1187   cl_mem_object_type type;
1188   cl_mem_flags flags;
1189 
1190   size_t size;
1191   size_t origin; /* for sub-buffers */
1192 
1193   /* host backing memory for a buffer.
1194    *
1195    * This is either user provided host-ptr, or driver allocated,
1196    * or temporary allocation by a migration command. Since it
1197    * can have multiple users, it's refcounted. */
1198   void *mem_host_ptr;
1199   /* version of buffer content in mem_host_ptr */
1200   uint64_t mem_host_ptr_version;
1201   /* reference count; when it reaches 0,
1202    * the mem_host_ptr is automatically freed */
1203   uint mem_host_ptr_refcount;
1204 
1205   /* array of device-specific memory bookkeeping structs.
1206      The location of some device's struct is determined by
1207      the device's global_mem_id. */
1208   pocl_mem_identifier *device_ptrs;
1209 
1210   /* for content tracking;
1211    *
1212    * this is the valid (highest) version of the buffer's content;
1213    * if any device has lower version in device_ptrs[]->version,
1214    * the buffer content on that device is invalid */
1215   uint64_t latest_version;
1216   /* the event that last changed (written to) the buffer, this
1217    * is used as a "from "dependency for any migration commands */
1218   cl_event last_event;
1219 
1220 
1221   /* A linked list of regions of the buffer mapped to the
1222      host memory */
1223   mem_mapping_t *mappings;
1224   size_t map_count;
1225 
1226   /* in case this is a sub buffer, this points to the parent
1227      buffer */
1228   cl_mem_t *parent;
1229   /* A linked list of destructor callbacks */
1230   mem_destructor_callback_t *destructor_callbacks;
1231 
1232   /* These two are for cl_pocl_content_size extension.
1233    * They link two buffers together, like this:
1234    * mem->size_buffer->content_buffer = mem
1235    * mem->content_buffer->size_buffer = mem
1236    */
1237   cl_mem size_buffer;
1238   cl_mem content_buffer;
1239 
1240   /* for images, a flag for each device in context,
1241    * whether that device supports this */
1242   int *device_supports_this_image;
1243 
1244   /* if the memory backing mem_host_ptr is "permanent" =
1245    * valid through the entire lifetime of the buffer,
1246    * we can make some assumptions and optimizations */
1247   cl_bool mem_host_ptr_is_permanent;
1248 
1249   /* Image flags */
1250   cl_bool                 is_image;
1251   cl_channel_order        image_channel_order;
1252   cl_channel_type         image_channel_data_type;
1253   size_t                  image_width;
1254   size_t                  image_height;
1255   size_t                  image_depth;
1256   size_t                  image_array_size;
1257   size_t                  image_row_pitch;
1258   size_t                  image_slice_pitch;
1259   size_t                  image_elem_size;
1260   size_t                  image_channels;
1261   cl_uint                 num_mip_levels;
1262   cl_uint                 num_samples;
1263   cl_mem                  buffer;
1264 
1265   /* pipe flags */
1266   cl_bool                 is_pipe;
1267   size_t                  pipe_packet_size;
1268   size_t                  pipe_max_packets;
1269 
1270   /* list of SVM buffers */
1271   struct _cl_mem *prev, *next;
1272 };
1273 
1274 typedef uint8_t SHA1_digest_t[SHA1_DIGEST_SIZE * 2 + 1];
1275 
/* Per-kernel-name metadata shared by all cl_kernel instances of the same
 * kernel in a program (argument info, required WG size, build hashes). */
typedef struct pocl_kernel_metadata_s
{
  cl_uint num_args;   /* number of explicit kernel arguments */
  cl_uint num_locals; /* number of automatic local buffers */
  size_t *local_sizes;
  char *name;
  char *attributes;
  struct pocl_argument_info *arg_info;
  cl_bitfield has_arg_metadata;
  size_t reqd_wg_size[OPENCL_MAX_DIMENSION];

  /* if we know the size of _every_ kernel argument, we store
   * the total size here. see struct _cl_kernel on why */
  size_t total_argument_storage_size;

  /* array[program->num_devices] */
  pocl_kernel_hash_t *build_hash;

  /* If this is a BI kernel descriptor, they are statically defined in
     the custom device driver, thus should not be freed. */
  cl_bitfield builtin_kernel;

  /* device-specific METAdata, void* array[program->num_devices] */
  void **data;
} pocl_kernel_metadata_t;
1301 
/* An OpenCL program: sources and/or per-device binaries, the devices it is
 * associated with / built for, and the metadata of its kernels. */
struct _cl_program {
  POCL_ICD_OBJECT
  POCL_OBJECT;
  /* queries */
  cl_context context;
  /* -cl-denorms-are-zero build option */
  unsigned flush_denorms;

  /* list of devices "associated with the program" (quote from Specs)
   * ... IOW for which we *can* build the program.
   * this is setup once, at clCreateProgramWith{Source,Binaries,...} time */
  cl_device_id *associated_devices;
  cl_uint associated_num_devices;
  /* list of devices for which we actually did build the program.
   * this changes on every rebuild to device arguments given to clBuildProgram
   */
  cl_uint num_devices;
  cl_device_id *devices;

  /* all the program sources appended together, terminated with a zero */
  char *source;
  /* The options in the last clBuildProgram call for this Program. */
  char *compiler_options;

  /* per-device binaries, in device-specific format */
  size_t *binary_sizes;
  unsigned char **binaries;

  /* If this is a program with built-in kernels, this is the list of kernel
     names it contains. */
  size_t num_builtin_kernels;
  char **builtin_kernel_names;
  /* the same names as one semicolon-separated string */
  char *concated_builtin_names;

  /* Poclcc binary format.  */
  /* per-device poclbinary-format binaries.  */
  size_t *pocl_binary_sizes;
  unsigned char **pocl_binaries;
  /* device-specific data, per each device */
  void **data;

  /* kernel number and the metadata for each kernel */
  size_t num_kernels;
  pocl_kernel_metadata_t *kernel_meta;

  /* list of attached cl_kernel instances */
  cl_kernel kernels;
  /* Per-device program hash after build */
  SHA1_digest_t* build_hash;
  /* Per-device build logs, for the case when we don't yet have the program's cachedir */
  char** build_log;
  /* Per-program build log, for the case when we aren't yet building for devices */
  char main_build_log[640];
  /* Use to store build status */
  cl_build_status build_status;
  /* Use to store binary type */
  cl_program_binary_type binary_type;

  /* Store SPIR-V binary from clCreateProgramWithIL() */
  char *program_il;
  size_t program_il_size;
};
1364 
/* A cl_kernel instance. Name/argument metadata lives in the shared
 * meta (pocl_kernel_metadata_t); this struct holds per-instance state
 * such as the arguments set with clSetKernelArg(). */
struct _cl_kernel {
  POCL_ICD_OBJECT
  POCL_OBJECT;
  /* -------- */
  cl_context context;
  cl_program program;
  pocl_kernel_metadata_t *meta;
  /* device-specific data, per each device. This is different from meta->data,
   * as this is per-instance of cl_kernel, while there is just one meta->data
   * for all instances of the kernel of the same name. */
  void **data;
  /* just a convenience pointer to meta->name */
  const char *name;

  /* The kernel arguments that are set with clSetKernelArg().
     These are copied to the command queue command at enqueue. */
  struct pocl_argument *dyn_arguments;

  /* if total_argument_storage_size is known, we preallocate storage for
   * actual kernel arguments here, instead of allocating it by one for
   * each argument separately. The "offsets" store pointers calculated as
   * "dyn_argument_storage + offset-of-argument-N".
   *
   * The pointer to actual value for argument N, used by drivers, is stored
   * in dyn_arguments[N].value; if total_argument_storage_size is not known,
   * the .value must be allocated separately for every argument in
   * clSetKernelArg; if it is known, clSetKernelArg sets the .value to
   * dyn_argument_offsets[N] and copies the value there.
   *
   * We must keep both ways, because not every driver can know kernel
   * argument sizes beforehand.
   */
  char *dyn_argument_storage;
  void **dyn_argument_offsets;

  /* for program's linked list of kernels */
  struct _cl_kernel *next;
};
1403 
/* Node in a cl_event's list of user callbacks (clSetEventCallback). */
typedef struct event_callback_item event_callback_item;
struct event_callback_item
{
  void(CL_CALLBACK *callback_function) (cl_event, cl_int, void *);
  void *user_data;       /* passed verbatim to the callback */
  cl_int trigger_status; /* event status the callback was registered for */
  struct event_callback_item *next;
};
1412 
1413 
/* Singly-linked list node for an event's notify/wait lists.
 * NOTE(review): the bare "event_node" below relies on a
 * "typedef struct event_node event_node;" earlier in the file — confirm. */
struct event_node
{
  cl_event event;
  event_node *next;
};
1419 
/* Optional metadata for events for improved profile data readability etc.
 * Attached to _cl_event::meta_data. */
typedef struct _pocl_event_md
{
  /* The kernel executed by the NDRange command associated with the event,
     if any. */
  cl_kernel kernel;
} pocl_event_md;
1427 
typedef struct _cl_event _cl_event;
/* An OpenCL event: tracks the status, dependencies, callbacks and profiling
 * time stamps of one enqueued command. */
struct _cl_event {
  POCL_ICD_OBJECT
  POCL_OBJECT;
  cl_context context;
  cl_command_queue queue;
  cl_command_type command_type;
  _cl_command_node *command;

  /* list of callback functions */
  event_callback_item *callback_list;

  /* list of devices needing completion notification for this event */
  event_node *notify_list;
  event_node *wait_list;

  /* OoO doesn't use sync points -> put used buffers here */
  size_t num_buffers;
  cl_mem *mem_objs;

  /* Profiling data: time stamps of the different phases of execution. */
  cl_ulong time_queue;  /* the enqueue time */
  cl_ulong time_submit; /* the time the command was submitted to the device */
  cl_ulong time_start;  /* the time the command actually started executing */
  cl_ulong time_end;    /* the finish time of the command */

  /* Device specific data */
  void *data;

  /* Additional (optional data) used to make profile data more readable etc. */
  pocl_event_md *meta_data;

  /* The execution status of the command this event is monitoring. */
  cl_int status;
  /* implicit event = an event for pocl's internal use, not visible to user */
  short implicit_event;
  /* if set, at the completion of event, the mem_host_ptr_refcount should be
   * lowered and memory freed if it's 0 */
  short release_mem_host_ptr_after;

  /* doubly-linked list pointers for the queue's event list */
  _cl_event *next;
  _cl_event *prev;
};
1471 
/* Private data attached to user events: a condition variable, presumably
   signalled on status changes to wake threads blocked waiting on the
   event -- confirm against the user-event implementation. */
typedef struct _pocl_user_event_data
{
  pocl_cond_t wakeup_cond;
} pocl_user_event_data;
1476 
/* An OpenCL sampler object holding the three standard sampler properties
   plus per-device driver data. */
typedef struct _cl_sampler cl_sampler_t;
struct _cl_sampler {
  POCL_ICD_OBJECT
  POCL_OBJECT;
  /* The context this sampler was created in. */
  cl_context context;
  cl_bool             normalized_coords;
  cl_addressing_mode  addressing_mode;
  cl_filter_mode      filter_mode;
  /* Driver-specific data, one slot per device (indexing scheme not
     visible here -- confirm against the drivers). */
  void**              device_data;
};
1487 
/* Generic internal failure value; note this is not one of the standard
   CL_* error codes. */
#define CL_FAILED (-1)

#ifndef __cplusplus

/* Classic double-evaluation macros: each argument is evaluated twice, so
   arguments must be free of side effects (no min(i++, n)).  Guarded out
   of C++ translation units, where these names would clash. */
#define min(a,b) (((a) < (b)) ? (a) : (b))
#define max(a,b) (((a) > (b)) ? (a) : (b))

#endif
1496 
/* Portability shims for the little-endian conversion macros
   (htole16/32/64, le16/32/64toh), which are nonstandard and live in
   different headers (or are missing) per platform. */
#ifdef __APPLE__
  /* macOS has no <endian.h>; map onto the OSByteOrder primitives. */
  #include <libkern/OSByteOrder.h>
  #define htole16(x) OSSwapHostToLittleInt16(x)
  #define le16toh(x) OSSwapLittleToHostInt16(x)
  #define htole32(x) OSSwapHostToLittleInt32(x)
  #define le32toh(x) OSSwapLittleToHostInt32(x)
  #define htole64(x) OSSwapHostToLittleInt64(x)
  #define le64toh(x) OSSwapLittleToHostInt64(x)
#elif defined(__FreeBSD__) || defined(__DragonFly__)
  /* BSDs provide the macros, but in <sys/endian.h>. */
  #include <sys/endian.h>
#elif defined (_WIN32)
    /* Windows targets are little-endian, so the conversions are
       identity; define only if not already provided. */
    #ifndef htole64
      #define htole64(x) (x)
    #endif
    #ifndef htole32
      #define htole32(x) (x)
    #endif
    #ifndef htole16
      #define htole16(x) (x)
    #endif
    #ifndef le64toh
      #define le64toh(x) (x)
    #endif
    #ifndef le32toh
      #define le32toh(x) (x)
    #endif
    #ifndef le16toh
      #define le16toh(x) (x)
    #endif
#else
  #include <endian.h>
  /* glibc gained these macros only in 2.9; on older glibc fall back to
     identity, which is correct only on little-endian targets -- hence
     the explicit restriction to x86_64. */
  #if defined(__GLIBC__) && __GLIBC__ == 2 && \
      defined(__GLIBC_MINOR__) && __GLIBC_MINOR__ < 9 && \
      defined(__x86_64__)
    #ifndef htole64
      #define htole64(x) (x)
    #endif
    #ifndef htole32
      #define htole32(x) (x)
    #endif
    #ifndef htole16
      #define htole16(x) (x)
    #endif
    #ifndef le64toh
      #define le64toh(x) (x)
    #endif
    #ifndef le32toh
      #define le32toh(x) (x)
    #endif
    #ifndef le16toh
      #define le16toh(x) (x)
    #endif
  #endif
#endif
1551 
/* Object-lifetime tracing hooks.  When built with LTTng userspace
   tracing support, each TP_* macro emits an LTTng tracepoint for the
   creation/destruction of the corresponding OpenCL object; otherwise the
   macros expand to nothing.  The two branches below must be kept in
   lockstep: every macro defined in one must exist in the other with the
   same parameter list. */
#ifdef HAVE_LTTNG_UST

#include "pocl_lttng.h"

#define TP_CREATE_QUEUE(context_id, queue_id)                                 \
  tracepoint (pocl_trace, create_queue, context_id, queue_id);
#define TP_FREE_QUEUE(context_id, queue_id)                                   \
  tracepoint (pocl_trace, free_queue, context_id, queue_id);

#define TP_CREATE_BUFFER(context_id, buffer_id)                               \
  tracepoint (pocl_trace, create_buffer, context_id, buffer_id);
#define TP_FREE_BUFFER(context_id, buffer_id)                                 \
  tracepoint (pocl_trace, free_buffer, context_id, buffer_id);

#define TP_CREATE_PROGRAM(context_id, program_id)                             \
  tracepoint (pocl_trace, create_program, context_id, program_id);
#define TP_BUILD_PROGRAM(context_id, program_id)                              \
  tracepoint (pocl_trace, build_program, context_id, program_id);
#define TP_FREE_PROGRAM(context_id, program_id)                               \
  tracepoint (pocl_trace, free_program, context_id, program_id);

#define TP_CREATE_KERNEL(context_id, kernel_id, kernel_name)                  \
  tracepoint (pocl_trace, create_kernel, context_id, kernel_id, kernel_name);
#define TP_FREE_KERNEL(context_id, kernel_id, kernel_name)                    \
  tracepoint (pocl_trace, free_kernel, context_id, kernel_id, kernel_name);

#define TP_CREATE_IMAGE(context_id, image_id)                                 \
  tracepoint (pocl_trace, create_image, context_id, image_id);
#define TP_FREE_IMAGE(context_id, image_id)                                   \
  tracepoint (pocl_trace, free_image, context_id, image_id);

#define TP_CREATE_SAMPLER(context_id, sampler_id)                             \
  tracepoint (pocl_trace, create_sampler, context_id, sampler_id);
#define TP_FREE_SAMPLER(context_id, sampler_id)                               \
  tracepoint (pocl_trace, free_sampler, context_id, sampler_id);

#else

/* Tracing disabled: all hooks are no-ops. */
#define TP_CREATE_QUEUE(context_id, queue_id)
#define TP_FREE_QUEUE(context_id, queue_id)

#define TP_CREATE_BUFFER(context_id, buffer_id)
#define TP_FREE_BUFFER(context_id, buffer_id)

#define TP_CREATE_PROGRAM(context_id, program_id)
#define TP_BUILD_PROGRAM(context_id, program_id)
#define TP_FREE_PROGRAM(context_id, program_id)

#define TP_CREATE_KERNEL(context_id, kernel_id, kernel_name)
#define TP_FREE_KERNEL(context_id, kernel_id, kernel_name)

#define TP_CREATE_IMAGE(context_id, image_id)
#define TP_FREE_IMAGE(context_id, image_id)

#define TP_CREATE_SAMPLER(context_id, sampler_id)
#define TP_FREE_SAMPLER(context_id, sampler_id)

#endif
1610 
1611 #endif /* POCL_CL_H */
1612