/* OpenCL runtime library: pocl_util utility functions

   Copyright (c) 2012-2019 Pekka Jääskeläinen

   Permission is hereby granted, free of charge, to any person obtaining a copy
   of this software and associated documentation files (the "Software"), to deal
   in the Software without restriction, including without limitation the rights
   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
   copies of the Software, and to permit persons to whom the Software is
   furnished to do so, subject to the following conditions:

   The above copyright notice and this permission notice shall be included in
   all copies or substantial portions of the Software.

   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
   THE SOFTWARE.
*/

#include <errno.h>
#include <limits.h>
#include <stdlib.h>
#include <string.h>

#include <time.h>

#ifndef _WIN32
#include <dirent.h>
#include <string.h>
#include <sys/resource.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>
#include <utime.h>
#else
#  include "vccompat.hpp"
#endif

#include "pocl_util.h"
#include "pocl_timing.h"
#include "pocl_llvm.h"
#include "utlist.h"
#include "common.h"
#include "pocl_mem_management.h"
#include "devices.h"
#include "pocl_runtime_config.h"

/* required for setting SSE/AVX flush denorms to zero flag */
#if defined(__x86_64__) && defined(__GNUC__)
#include <x86intrin.h>
#endif

struct list_item;

typedef struct list_item
{
  void *value;
  struct list_item *next;
} list_item;

void
pocl_restore_ftz (unsigned ftz)
{
#if defined(__x86_64__) && defined(__GNUC__)

#ifdef _MM_FLUSH_ZERO_ON
  if (ftz & _MM_FLUSH_ZERO_ON)
    _MM_SET_FLUSH_ZERO_MODE (_MM_FLUSH_ZERO_ON);
  else
    _MM_SET_FLUSH_ZERO_MODE (_MM_FLUSH_ZERO_OFF);
#endif
#ifdef _MM_DENORMALS_ZERO_ON
  if (ftz & _MM_DENORMALS_ZERO_ON)
    _MM_SET_DENORMALS_ZERO_MODE (_MM_DENORMALS_ZERO_ON);
  else
    _MM_SET_DENORMALS_ZERO_MODE (_MM_DENORMALS_ZERO_OFF);
#endif

#endif
}

unsigned
pocl_save_ftz ()
{
#if defined(__x86_64__) && defined(__GNUC__)

  unsigned s = 0;
#ifdef _MM_FLUSH_ZERO_ON
  if (_MM_GET_FLUSH_ZERO_MODE ())
    s |= _MM_FLUSH_ZERO_ON;
  else
    s &= (~_MM_FLUSH_ZERO_ON);
#endif
#ifdef _MM_DENORMALS_ZERO_ON
  if (_MM_GET_DENORMALS_ZERO_MODE ())
    s |= _MM_DENORMALS_ZERO_ON;
  else
    s &= (~_MM_DENORMALS_ZERO_ON);
#endif
  return s;

#else
  return 0;
#endif
}

void
pocl_set_ftz (unsigned ftz)
{
#if defined(__x86_64__) && defined(__GNUC__)
  if (ftz)
    {
#ifdef _MM_FLUSH_ZERO_ON
      _MM_SET_FLUSH_ZERO_MODE (_MM_FLUSH_ZERO_ON);
#endif

#ifdef _MM_DENORMALS_ZERO_ON
      _MM_SET_DENORMALS_ZERO_MODE (_MM_DENORMALS_ZERO_ON);
#endif
    }
  else
    {
#ifdef _MM_FLUSH_ZERO_OFF
      _MM_SET_FLUSH_ZERO_MODE (_MM_FLUSH_ZERO_OFF);
#endif

#ifdef _MM_DENORMALS_ZERO_OFF
      _MM_SET_DENORMALS_ZERO_MODE (_MM_DENORMALS_ZERO_OFF);
#endif
    }
#endif
}
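
/* Illustrative usage of the save/set/restore pair above (a sketch, not
 * code from this file; kernel_ftz_mode is a hypothetical name): a driver
 * typically saves the caller's FTZ state, forces the state the kernel was
 * compiled for, and restores the original state afterwards:
 *
 *   unsigned saved = pocl_save_ftz ();
 *   pocl_set_ftz (kernel_ftz_mode);
 *   ... run the work-group function ...
 *   pocl_restore_ftz (saved);
 */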

void
pocl_set_default_rm ()
{
#if defined(__x86_64__) && defined(__GNUC__) && defined(_MM_ROUND_NEAREST)
  unsigned rm = _MM_GET_ROUNDING_MODE ();
  if (rm != _MM_ROUND_NEAREST)
    _MM_SET_ROUNDING_MODE (_MM_ROUND_NEAREST);
#endif
}

unsigned
pocl_save_rm ()
{
#if defined(__x86_64__) && defined(__GNUC__) && defined(_MM_ROUND_NEAREST)
  return _MM_GET_ROUNDING_MODE ();
#else
  return 0;
#endif
}

void
pocl_restore_rm (unsigned rm)
{
#if defined(__x86_64__) && defined(__GNUC__) && defined(_MM_ROUND_NEAREST)
  _MM_SET_ROUNDING_MODE (rm);
#endif
}

uint32_t
byteswap_uint32_t (uint32_t word, char should_swap)
{
    union word_union
    {
        uint32_t full_word;
        unsigned char bytes[4];
    } old, neww;
    if (!should_swap) return word;

    old.full_word = word;
    neww.bytes[0] = old.bytes[3];
    neww.bytes[1] = old.bytes[2];
    neww.bytes[2] = old.bytes[1];
    neww.bytes[3] = old.bytes[0];
    return neww.full_word;
}
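
/* For example, byteswap_uint32_t (0x11223344, 1) returns 0x44332211,
 * while a zero should_swap returns the word unchanged. */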

float
byteswap_float (float word, char should_swap)
{
    union word_union
    {
        float full_word;
        unsigned char bytes[4];
    } old, neww;
    if (!should_swap) return word;

    old.full_word = word;
    neww.bytes[0] = old.bytes[3];
    neww.bytes[1] = old.bytes[2];
    neww.bytes[2] = old.bytes[1];
    neww.bytes[3] = old.bytes[0];
    return neww.full_word;
}

size_t
pocl_size_ceil2(size_t x) {
  /* Rounds up to the next highest power of two without branching and
   * is as fast as a BSR instruction on x86, see:
   *
   * https://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2
   */
  --x;
  x |= x >> 1;
  x |= x >> 2;
  x |= x >> 4;
  x |= x >> 8;
  x |= x >> 16;
#if SIZE_MAX > 0xFFFFFFFF
  x |= x >> 32;
#endif
  return ++x;
}
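
/* Examples: pocl_size_ceil2 (1) == 1, pocl_size_ceil2 (17) == 32 and
 * pocl_size_ceil2 (32) == 32. Note that an input of 0 wraps around and
 * maps to 0. */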

uint64_t
pocl_size_ceil2_64 (uint64_t x)
{
  /* Rounds up to the next highest power of two without branching and
   * is as fast as a BSR instruction on x86, see:
   *
   * https://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2
   */
  --x;
  x |= x >> 1;
  x |= x >> 2;
  x |= x >> 4;
  x |= x >> 8;
  x |= x >> 16;
  x |= x >> 32;
  return ++x;
}

#if defined(_WIN32) || defined(HAVE_POSIX_MEMALIGN) || defined(__ANDROID__)    \
    || (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L))
#define HAVE_ALIGNED_ALLOC
#else
#error aligned malloc unavailable
#endif

static void*
pocl_memalign_alloc(size_t align_width, size_t size)
{
  void *ptr;
  int status;

#ifdef __ANDROID__
  ptr = memalign (align_width, size);
  return ptr;
#elif defined(HAVE_POSIX_MEMALIGN)
  status = posix_memalign (&ptr, align_width, size);
  return ((status == 0) ? ptr : NULL);
#elif defined(_MSC_VER)
  return _aligned_malloc(size, align_width);
#elif defined(__MINGW32__)
  return __mingw_aligned_malloc(size, align_width);
#elif (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L))
  return aligned_alloc (align_width, size);
#else
#error Cannot find aligned malloc
#endif
}

void *
pocl_aligned_malloc (size_t alignment, size_t size)
{
#ifdef HAVE_ALIGNED_ALLOC
  assert (alignment > 0);
  /* make sure that size is a multiple of alignment, as posix_memalign
   * does not perform this test, whereas aligned_alloc does */
  if ((size & (alignment - 1)) != 0)
    {
      size = size | (alignment - 1);
      size += 1;
    }

  /* posix_memalign requires alignment to be at least sizeof(void *) */
  if (alignment < sizeof(void *))
    alignment = sizeof(void *);

  void* result;

  result = pocl_memalign_alloc(alignment, size);
  if (result == NULL)
    {
      errno = ENOMEM; /* -1 is not a valid errno value */
      return NULL;
    }

  return result;

#else
#error Cannot find aligned malloc
#endif

#if 0
  /* this code works in theory, but there are many places in pocl
   * where aligned memory is used via the same pointers
   * as memory allocated by other means */
  /* allow zero-sized allocations, force alignment to 1 */
  if (!size)
    alignment = 1;

  /* make sure alignment is a non-zero power of two and that
   * size is a multiple of alignment */
  size_t mask = alignment - 1;
  if (!alignment || ((alignment & mask) != 0) || ((size & mask) != 0))
    {
      errno = EINVAL;
      return NULL;
    }

  /* allocate memory plus space for alignment header */
  uintptr_t address = (uintptr_t)malloc(size + mask + sizeof(void *));
  if (!address)
    return NULL;

  /* align the address, and store the original pointer for future use
   * with free in the preceding bytes */
  uintptr_t aligned_address = (address + mask + sizeof(void *)) & ~mask;
  void** address_ptr = (void **)(aligned_address - sizeof(void *));
  *address_ptr = (void *)address;
  return (void *)aligned_address;

#endif
}
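
/* For example, pocl_aligned_malloc (64, 100) rounds the size up to 128
 * bytes (the next multiple of the 64-byte alignment) before allocating,
 * so the C11 aligned_alloc size-multiple requirement is always satisfied
 * regardless of which backend above is used. */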

#if 0
void
pocl_aligned_free (void *ptr)
{
#ifdef HAVE_ALIGNED_ALLOC
  POCL_MEM_FREE (ptr);
#else
#error Cannot find aligned malloc
  /* extract the original pointer from the allocation header and free it */
  if (ptr)
    free(*(void **)((uintptr_t)ptr - sizeof(void *)));
#endif
}
#endif

void
pocl_lock_events_inorder (cl_event ev1, cl_event ev2)
{
  assert (ev1 != ev2);
  assert (ev1->id != ev2->id);
  if (ev1->id < ev2->id)
    {
      POCL_LOCK_OBJ (ev1);
      POCL_LOCK_OBJ (ev2);
    }
  else
    {
      POCL_LOCK_OBJ (ev2);
      POCL_LOCK_OBJ (ev1);
    }
}

void
pocl_unlock_events_inorder (cl_event ev1, cl_event ev2)
{
  assert (ev1 != ev2);
  assert (ev1->id != ev2->id);
  if (ev1->id < ev2->id)
    {
      POCL_UNLOCK_OBJ (ev1);
      POCL_UNLOCK_OBJ (ev2);
    }
  else
    {
      POCL_UNLOCK_OBJ (ev2);
      POCL_UNLOCK_OBJ (ev1);
    }
}
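
/* Taking the two locks in ascending event-ID order makes the pairwise
 * locking deadlock-free: if thread 1 holds the lower-id event's lock and
 * waits for the higher-id one, any thread 2 working on the same pair must
 * also acquire the lower-id lock first, so it can never already hold the
 * higher-id lock while waiting for the lower-id one. */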

/* This is required because e.g. NDRange commands could have the same buffer
 * multiple times as an argument, or CopyBuffer could have src == dst buffer.
 *
 * If a buffer that appears multiple times in the list is on another device,
 * we don't want to enqueue more than one migration for the same buffer.
 */
static void
sort_and_uniq (cl_mem *objs, char *readonly_flags, size_t *num_objs)
{
  size_t i;
  ssize_t j;
  size_t n = *num_objs;
  assert (n > 1);

  /* if the buffer is an image backed by buffer storage,
   * replace it with the actual storage */
  for (i = 0; i < n; ++i)
    if (objs[i]->buffer)
      objs[i] = objs[i]->buffer;

  /* sort by obj id */
  for (i = 1; i < n; ++i)
    {
      cl_mem buf = objs[i];
      char c = readonly_flags[i];
      for (j = (i - 1); ((j >= 0) && (objs[j]->id > buf->id)); --j)
        {
          objs[j + 1] = objs[j];
          readonly_flags[j + 1] = readonly_flags[j];
        }
      objs[j + 1] = buf;
      readonly_flags[j + 1] = c;
    }

  /* skip the first i objects which are different */
  for (i = 1; i < n; ++i)
    if (objs[i - 1] == objs[i])
      break;

  /* uniq */
  size_t k = i;
  while (i < n)
    {
      if (objs[k - 1] != objs[i])
        {
          objs[k] = objs[i];
          readonly_flags[k] = readonly_flags[i];
          ++k;
        }
      else
        {
          /* duplicate of the last kept buffer: merge the flags so that
           * any read-write use makes the merged entry read-write */
          readonly_flags[k - 1] = readonly_flags[k - 1] & readonly_flags[i];
        }
      ++i;
    }

  *num_objs = k;
}
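
/* For example, given objs = {B, A, B} (with id(A) < id(B)) and
 * readonly_flags = {1, 0, 0}, the arrays become objs = {A, B} and
 * readonly_flags = {0, 0} with *num_objs == 2: the duplicate B is
 * dropped and its flags are ANDed together, so a single read-write use
 * of a buffer wins over any read-only uses. */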

extern unsigned long event_c;
extern unsigned long uevent_c;

cl_int
pocl_create_event (cl_event *event, cl_command_queue command_queue,
                   cl_command_type command_type, size_t num_buffers,
                   const cl_mem *buffers, cl_context context)
{
  static uint64_t event_id_counter = 0;

  if (context == NULL)
    return CL_INVALID_CONTEXT;

  assert (event != NULL);
  *event = pocl_mem_manager_new_event ();
  if (*event == NULL)
    return CL_OUT_OF_HOST_MEMORY;

  (*event)->context = context;
  (*event)->queue = command_queue;

  /* user events have a NULL command queue, don't retain it */
  if (command_queue)
    POname (clRetainCommandQueue) (command_queue);
  else
    POname (clRetainContext) (context);

  (*event)->command_type = command_type;
  (*event)->id = POCL_ATOMIC_INC (event_id_counter);
  (*event)->num_buffers = num_buffers;
  if (num_buffers > 0)
    {
      (*event)->mem_objs = (cl_mem *)malloc (num_buffers * sizeof (cl_mem));
      memcpy ((*event)->mem_objs, buffers, num_buffers * sizeof (cl_mem));
    }
  (*event)->status = CL_QUEUED;

  if (command_type == CL_COMMAND_USER)
    POCL_ATOMIC_INC (uevent_c);
  else
    POCL_ATOMIC_INC (event_c);

  POCL_MSG_PRINT_EVENTS ("Created event %p / ID %" PRIu64 " / Command %s\n",
                         (*event), (*event)->id,
                         pocl_command_to_str (command_type));

  return CL_SUCCESS;
}

static int
pocl_create_event_sync (cl_event waiting_event, cl_event notifier_event)
{
  event_node *notify_target = NULL;
  event_node *wait_list_item = NULL;

  if (notifier_event == NULL)
    return CL_SUCCESS;

  POCL_MSG_PRINT_EVENTS ("create event sync: waiting %" PRIu64
                         " , notifier %" PRIu64 "\n",
                         waiting_event->id, notifier_event->id);

  pocl_lock_events_inorder (waiting_event, notifier_event);

  assert (notifier_event->pocl_refcount != 0);
  assert (waiting_event != notifier_event);

  LL_FOREACH (waiting_event->wait_list, wait_list_item)
    {
      if (wait_list_item->event == notifier_event)
        {
          POCL_MSG_PRINT_EVENTS ("Skipping event sync creation\n");
          goto FINISH;
        }
    }

  if (notifier_event->status == CL_COMPLETE)
    goto FINISH;
  notify_target = pocl_mem_manager_new_event_node();
  wait_list_item = pocl_mem_manager_new_event_node();
  if (!notify_target || !wait_list_item)
    return CL_OUT_OF_HOST_MEMORY;

  notify_target->event = waiting_event;
  wait_list_item->event = notifier_event;
  LL_PREPEND (notifier_event->notify_list, notify_target);
  LL_PREPEND (waiting_event->wait_list, wait_list_item);

FINISH:
  pocl_unlock_events_inorder (waiting_event, notifier_event);
  return CL_SUCCESS;
}

/* preallocate the buffers on the destination device;
 * if any allocation fails, we can't run this command. */
static int
can_run_command (cl_device_id dev, size_t num_objs, cl_mem *objs)
{
  size_t i;
  int errcode;

  for (i = 0; i < num_objs; ++i)
    {
      pocl_mem_identifier *p = &objs[i]->device_ptrs[dev->global_mem_id];
      // skip already allocated
      if (p->mem_ptr)
        continue;

      assert (dev->ops->alloc_mem_obj);
      errcode = dev->ops->alloc_mem_obj (dev, objs[i], NULL);
      if (errcode != CL_SUCCESS)
        return CL_FALSE;
    }

  return CL_TRUE;
}

static cl_int
pocl_create_command_struct (_cl_command_node **cmd,
                            cl_command_queue command_queue,
                            cl_command_type command_type, cl_event *event_p,
                            cl_uint num_events, const cl_event *wait_list,
                            size_t num_buffers, const cl_mem *buffers)
{
  unsigned i;
  int err;
  cl_event *event = NULL;

  *cmd = pocl_mem_manager_new_command ();
  if (*cmd == NULL)
    return CL_OUT_OF_HOST_MEMORY;

  (*cmd)->type = command_type;

  event = &((*cmd)->event);
  err = pocl_create_event (event, command_queue, command_type, num_buffers,
                           buffers, command_queue->context);

  if (err != CL_SUCCESS)
    {
      POCL_MEM_FREE(*cmd);
      return err;
    }
  (*event)->command_type = command_type;

  /* if the host application wants this command's event, keep one
     reference for the host and one for the runtime/driver */
  if (event_p)
    {
      POCL_MSG_PRINT_EVENTS ("event pointer provided\n");
      *event_p = *event;
      (*event)->implicit_event = 0;
      (*event)->pocl_refcount = 2;
    }
  else
    {
      (*event)->implicit_event = 1;
      (*event)->pocl_refcount = 1;
    }

  (*cmd)->device = command_queue->device;
  (*cmd)->event->command = (*cmd);

  /* Form event synchronizations based on the given wait list */
  for (i = 0; i < num_events; ++i)
    {
      cl_event wle = wait_list[i];
      pocl_create_event_sync ((*event), wle);
    }
  POCL_MSG_PRINT_EVENTS (
      "Created command struct: CMD %p (event %" PRIu64 " / %p, type: %s)\n", *cmd,
      (*event)->id, *event, pocl_command_to_str (command_type));
  return CL_SUCCESS;
}

static int
pocl_create_migration_commands (cl_device_id dev, cl_event final_event,
                                cl_mem mem, pocl_mem_identifier *p,
                                const char readonly,
                                cl_command_type command_type,
                                cl_mem_migration_flags mig_flags)
{
  int errcode = CL_SUCCESS;

  cl_event ev_export = NULL, ev_import = NULL, previous_last_event = NULL,
           last_migration_event = NULL;
  _cl_command_node *cmd_export = NULL, *cmd_import = NULL;
  cl_device_id ex_dev = NULL;
  cl_command_queue ex_cq = NULL, dev_cq = NULL;
  int can_directly_mig = 0;
  size_t i;

  /* "export" means copying the buffer content from the source device to
   * mem_host_ptr;
   *
   * "import" means copying the mem_host_ptr content to the destination
   * device, or copying directly between devices;
   *
   * "need_hostptr", if set, increases mem_host_ptr_refcount
   * to keep the mem_host_ptr backing memory around */
  int do_import = 0, do_export = 0, do_need_hostptr = 0;

  /*****************************************************************/

  /* This part only:
   *   sets up the buffer content versions according to the requested
   *   migration type;
   *   sets the buffer->last_event pointer to final_event;
   *   decides what actually needs to be done (import, export) but does
   *   not do it;
   *
   * ... so that any following command sees a correct buffer state.
   * The actual migration commands are enqueued afterwards. */
  POCL_LOCK_OBJ (mem);

  /* Retain the buffer for the duration of the command, except for Unmaps,
   * because the corresponding Maps retain twice. */
  if (command_type != CL_COMMAND_UNMAP_MEM_OBJECT)
    POCL_RETAIN_OBJECT_UNLOCKED (mem);

  /* save the buffer's current last_event as previous_last_event,
   * then set the last_event pointer to the actual command's event
   * (final_event).
   *
   * We'll need the "previous" event to properly chain events, but
   * will release it after we've enqueued the required commands. */
  previous_last_event = mem->last_event;
  mem->last_event = final_event;

  /* find the device/gmem with the latest memory version and the fastest
   * migration path.
   * ex_dev = device with the latest memory _other than dev_
   * dev_cq = default command queue for the destination dev */
  int highest_d2d_mig_priority = 0;
  for (i = 0; i < mem->context->num_devices; ++i)
    {
      cl_device_id d = mem->context->devices[i];
      cl_command_queue cq = mem->context->default_queues[i];
      if (d == dev)
        dev_cq = cq;
      else if (mem->device_ptrs[d->global_mem_id].version == mem->latest_version)
        {
          int cur_d2d_mig_priority = 0;
          if (d->ops->can_migrate_d2d)
            cur_d2d_mig_priority = d->ops->can_migrate_d2d (dev, d);

          // if we can directly migrate, and we found a better device, use it
          if (cur_d2d_mig_priority > highest_d2d_mig_priority)
            {
              ex_dev = d;
              ex_cq = cq;
              highest_d2d_mig_priority = cur_d2d_mig_priority;
            }

          // if we can't migrate D2D, just use plain old through-host migration
          if (highest_d2d_mig_priority == 0)
            {
              ex_dev = d;
              ex_cq = cq;
            }
        }
    }

  assert (dev);
  assert (dev_cq);
  /* ex_dev can be NULL, or non-NULL != dev */
  assert (ex_dev != dev);

  /* if mem_host_ptr_version < latest_version, one of the devices must
   * have it;
   *
   * it could be that latest_version == mem_host_ptr_version == some
   * p->version for some p, and so i < ndev; in that case,
   * we leave ex_dev set since D2D is the preferred migration path;
   *
   * otherwise it must be that
   * mem_host_ptr_version == latest_version & > all p->version */

  if ((mem->mem_host_ptr_version < mem->latest_version) && (p->version != mem->latest_version))
    assert ((ex_dev != NULL) && (mem->device_ptrs[ex_dev->global_mem_id].version == mem->latest_version));

  /* if ex_dev is NULL, either we have the latest or it's in mem_host_ptr */
  if (ex_dev == NULL)
    assert ((p->version == mem->latest_version) ||
            (mem->mem_host_ptr_version == mem->latest_version));

  /*****************************************************************/

  /* the buffer must already be allocated on this device's global memory */
  assert (p->mem_ptr != NULL);

  /* we're migrating to host mem only: clEnqueueMigrateMemObjects() with
   * the HOST flag */
  if (mig_flags & CL_MIGRATE_MEM_OBJECT_HOST)
    {
      do_import = 0;
      do_export = 0;
      do_need_hostptr = 1;
      if (mem->mem_host_ptr_version < mem->latest_version)
        {
          mem->mem_host_ptr_version = mem->latest_version;
          /* migrate content only if needed */
          if ((mig_flags & CL_MIGRATE_MEM_OBJECT_CONTENT_UNDEFINED) == 0)
            {
              /* It could be that the destination dev has the latest version;
               * we still need to migrate it to host mem */
              if (ex_dev == NULL)
                {
                  ex_dev = dev; ex_cq = dev_cq;
                }
              do_export = 1;
              POCL_RETAIN_OBJECT_UNLOCKED (mem);
            }
        }

      goto FINISH_VER_SETUP;
    }

  /* otherwise, we're migrating to a device memory. */
  /* check if we can migrate to the device associated with command_queue
   * without incurring the overhead of migrating the contents */
  if (mig_flags & CL_MIGRATE_MEM_OBJECT_CONTENT_UNDEFINED)
    p->version = mem->latest_version;

  /* if we don't need to migrate, skip to the end */
  if (p->version >= mem->latest_version)
    {
      do_import = 0;
      do_export = 0;
      goto FINISH_VER_SETUP;
    }

  can_directly_mig = highest_d2d_mig_priority > 0;

  /* if mem_host_ptr is outdated AND the devices can't migrate
   * between each other directly, we need an export command */
  if ((mem->mem_host_ptr_version != mem->latest_version)
      && (can_directly_mig == 0))
    {
      /* we need two migration commands; one on the "source" device's hidden
       * queue, and one on the destination device. */
      do_import = 1;
      do_export = 1;
      do_need_hostptr = 1;

      /* because the two migrate commands will clRelease the buffer */
      POCL_RETAIN_OBJECT_UNLOCKED (mem);
      POCL_RETAIN_OBJECT_UNLOCKED (mem);
      mem->mem_host_ptr_version = mem->latest_version;
      p->version = mem->latest_version;
    }
  /* otherwise either:
   * 1) mem_host_ptr is the latest, and we need to migrate mem_host_ptr to
   *    the device, or
   * 2) mem_host_ptr is not the latest, but the devices can migrate directly
   *    between each other.
   * In both cases we only need one migration command on the destination
   * device. */
  else
    {
      do_import = 1;
      do_export = 0;
      do_need_hostptr = 1;

      /* because the corresponding migrate command will clRelease the buffer */
      POCL_RETAIN_OBJECT_UNLOCKED (mem);
      p->version = mem->latest_version;
    }

FINISH_VER_SETUP:
  /* if the command is a write-use, increase the version. */
  if (!readonly)
    {
      ++p->version;
      mem->latest_version = p->version;
    }

  if (do_need_hostptr)
    {
      /* increase the refcount for the two migration commands */
      if (do_export)
        ++mem->mem_host_ptr_refcount;
      if (do_import)
        ++mem->mem_host_ptr_refcount;

      /* allocate mem_host_ptr here if needed... */
      if (mem->mem_host_ptr == NULL)
        {
          size_t align = max (mem->context->min_buffer_alignment, 16);
          mem->mem_host_ptr = pocl_aligned_malloc (align, mem->size);
          assert ((mem->mem_host_ptr != NULL)
                  && "Cannot allocate backing memory for mem_host_ptr!\n");
        }
    }

  POCL_UNLOCK_OBJ (mem);

  /*****************************************************************/

  /* enqueue a command for the export.
   * Put the previous last event into its waitlist. */
  if (do_export)
    {
      assert (ex_cq);
      assert (ex_dev);
      errcode = pocl_create_command_struct (
          &cmd_export, ex_cq, CL_COMMAND_MIGRATE_MEM_OBJECTS,
          &ev_export, // event_p
          (previous_last_event ? 1 : 0),
          (previous_last_event ? &previous_last_event : NULL), // waitlist
          1, &mem                                              // buffer list
      );
      assert (errcode == CL_SUCCESS);
      if (do_need_hostptr)
        ev_export->release_mem_host_ptr_after = 1;

      cmd_export->command.migrate.mem_id
          = &mem->device_ptrs[ex_dev->global_mem_id];
      cmd_export->command.migrate.type = ENQUEUE_MIGRATE_TYPE_D2H;

      pocl_command_enqueue (ex_cq, cmd_export);

      last_migration_event = ev_export;
    }

  /* enqueue a command for the import.
   * Put either the previous last event, or the export event, into its
   * waitlist. */
  if (do_import)
    {
      /* the import command must depend on (wait for) either the export
       * command, or the buffer's previous last event. Can be NULL if there's
       * no last event or export command */
      cl_event import_wait_ev = (ev_export ? ev_export : previous_last_event);

      errcode = pocl_create_command_struct (
          &cmd_import, dev_cq, CL_COMMAND_MIGRATE_MEM_OBJECTS,
          &ev_import, // event_p
          (import_wait_ev ? 1 : 0),
          (import_wait_ev ? &import_wait_ev : NULL), // waitlist
          1, &mem                                    // buffer list
      );
      assert (errcode == CL_SUCCESS);
      if (do_need_hostptr)
        ev_import->release_mem_host_ptr_after = 1;

      if (can_directly_mig)
        {
          cmd_import->command.migrate.type = ENQUEUE_MIGRATE_TYPE_D2D;
          cmd_import->command.migrate.src_device = ex_dev;
          cmd_import->command.migrate.src_id
              = &mem->device_ptrs[ex_dev->global_mem_id];
          cmd_import->command.migrate.dst_id
              = &mem->device_ptrs[dev->global_mem_id];
        }
      else
        {
          cmd_import->command.migrate.type = ENQUEUE_MIGRATE_TYPE_H2D;
          cmd_import->command.migrate.mem_id
              = &mem->device_ptrs[dev->global_mem_id];
        }

      pocl_command_enqueue (dev_cq, cmd_import);

      /* because of the explicit event */
      if (ev_export)
        POname (clReleaseEvent) (ev_export);

      last_migration_event = ev_import;
    }

  /* we don't need it anymore. */
  if (previous_last_event)
    POname (clReleaseEvent) (previous_last_event);

  /* the final event must depend on the export/import commands */
  if (last_migration_event)
    {
      pocl_create_event_sync (final_event, last_migration_event);
      /* if the event itself only reads from the buffer,
       * set the last buffer event to last_migration_event,
       * instead of the actual command event;
       * this avoids unnecessary waits e.g. on kernels
       * which only read from buffers */
      if (readonly)
        {
          POCL_LOCK_OBJ (mem);
          mem->last_event = last_migration_event;
          POCL_UNLOCK_OBJ (mem);
          POname (clReleaseEvent) (final_event);
        }
      else /* because of the explicit event */
        POname (clReleaseEvent) (last_migration_event);
    }

  return CL_SUCCESS;
}
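
/* A sketch of the resulting event chain: for a through-host migration
 * the ordering is
 *
 *   previous_last_event -> ev_export (D2H, source device's hidden queue)
 *                       -> ev_import (H2D, destination queue)
 *                       -> final_event
 *
 * while a direct D2D migration or a host-only migration needs only one of
 * the two middle commands. Each arrow is a pocl_create_event_sync edge. */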

static cl_int
pocl_create_command_full (_cl_command_node **cmd,
                          cl_command_queue command_queue,
                          cl_command_type command_type, cl_event *event_p,
                          cl_uint num_events, const cl_event *wait_list,
                          size_t num_buffers, cl_mem *buffers,
                          char *readonly_flags,
                          cl_mem_migration_flags mig_flags)
{
  cl_device_id dev = pocl_real_dev (command_queue->device);
  int err = CL_SUCCESS;
  size_t i;

  POCL_RETURN_ERROR_ON ((dev->available == CL_FALSE), CL_INVALID_DEVICE,
                        "device is not available\n");

  if (num_buffers >= 1)
    {
      assert (buffers);
      assert (readonly_flags);

      if (num_buffers > 1)
        sort_and_uniq (buffers, readonly_flags, &num_buffers);

      if (can_run_command (dev, num_buffers, buffers) == CL_FALSE)
        return CL_OUT_OF_RESOURCES;
    }

  /* the waitlist here only contains the user-provided events;
   * migration events are added to the waitlist later */
  err = pocl_create_command_struct (cmd, command_queue, command_type, event_p,
                                    num_events, wait_list, num_buffers,
                                    buffers);
  if (err)
    return err;
  cl_event final_event = (*cmd)->event;

  /* retain once for every buffer; this is because we set every buffer's
   * "last event" to this event, and some following command enqueue
   * (or clReleaseMemObject) will release it.
   */
  POCL_LOCK_OBJ (final_event);
  final_event->pocl_refcount += num_buffers;
  POCL_UNLOCK_OBJ (final_event);

  for (i = 0; i < num_buffers; ++i)
    {
      pocl_create_migration_commands (
          dev, final_event, buffers[i],
          &buffers[i]->device_ptrs[dev->global_mem_id], readonly_flags[i],
          command_type, mig_flags);
    }

  return err;
}

cl_int
pocl_create_command_migrate (_cl_command_node **cmd,
                             cl_command_queue command_queue,
                             cl_mem_migration_flags flags, cl_event *event_p,
                             cl_uint num_events, const cl_event *wait_list,
                             size_t num_buffers, cl_mem *buffers,
                             char *readonly_flags)
{
  return pocl_create_command_full (
      cmd, command_queue, CL_COMMAND_MIGRATE_MEM_OBJECTS, event_p, num_events,
      wait_list, num_buffers, buffers, readonly_flags, flags);
}

cl_int
pocl_create_command (_cl_command_node **cmd, cl_command_queue command_queue,
                     cl_command_type command_type, cl_event *event_p,
                     cl_uint num_events, const cl_event *wait_list,
                     size_t num_buffers, cl_mem *buffers, char *readonly_flags)
{
  return pocl_create_command_full (cmd, command_queue, command_type, event_p,
                                   num_events, wait_list, num_buffers, buffers,
                                   readonly_flags, 0);
}

/* call with node->event UNLOCKED */
void pocl_command_enqueue (cl_command_queue command_queue,
                          _cl_command_node *node)
{
  cl_event event;

  POCL_LOCK_OBJ (command_queue);

  ++command_queue->command_count;

  /* in case of an in-order queue, synchronize to the previously enqueued
     command, if available */
  if (!(command_queue->properties & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE))
    {
      POCL_MSG_PRINT_EVENTS ("In-order Q; adding event syncs\n");
      if (command_queue->last_event.event)
        {
          pocl_create_event_sync (node->event,
                                  command_queue->last_event.event);
        }
    }
  /* The command queue is an out-of-order queue. If the command type is a
     barrier, synchronize to all previously enqueued commands to make sure
     they are executed before the barrier. */
  else if ((node->type == CL_COMMAND_BARRIER
            || node->type == CL_COMMAND_MARKER)
           && node->command.barrier.has_wait_list == 0)
    {
      POCL_MSG_PRINT_EVENTS ("Barrier; adding event syncs\n");
      DL_FOREACH (command_queue->events, event)
        {
          pocl_create_event_sync (node->event, event);
        }
    }

  if (node->type == CL_COMMAND_BARRIER)
    command_queue->barrier = node->event;
  else
    {
      if (command_queue->barrier)
        {
          pocl_create_event_sync (node->event, command_queue->barrier);
        }
    }
  DL_APPEND (command_queue->events, node->event);

  POCL_MSG_PRINT_EVENTS ("Pushed Event %" PRIu64 " to CQ %" PRIu64 ".\n",
                         node->event->id, command_queue->id);
  command_queue->last_event.event = node->event;
  POCL_UNLOCK_OBJ (command_queue);

  POCL_LOCK_OBJ (node->event);
  assert (node->event->status == CL_QUEUED);
  assert (command_queue == node->event->queue);
  pocl_update_event_queued (node->event);
  command_queue->device->ops->submit(node, command_queue);
  /* node->event is unlocked by device_ops->submit */
}
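
/* A sketch of the synchronization rules above: in an in-order queue every
 * command gets an implicit dependency on the queue's previous command, so
 *
 *   clEnqueueWriteBuffer (q, ...);   // event A
 *   clEnqueueNDRangeKernel (q, ...); // event B, waits on A automatically
 *
 * whereas in an out-of-order queue only barriers/markers without wait
 * lists (and, afterwards, commands enqueued behind a barrier) pick up
 * such implicit dependencies. */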

int
pocl_alloc_or_retain_mem_host_ptr (cl_mem mem)
{
  if (mem->mem_host_ptr == NULL)
    {
      size_t align = max (mem->context->min_buffer_alignment, 16);
      mem->mem_host_ptr = pocl_aligned_malloc (align, mem->size);
      if (mem->mem_host_ptr == NULL)
        return -1;
      mem->mem_host_ptr_version = 0;
      mem->mem_host_ptr_refcount = 0;
    }
  ++mem->mem_host_ptr_refcount;
  return 0;
}

int
pocl_release_mem_host_ptr (cl_mem mem)
{
  assert (mem->mem_host_ptr_refcount > 0);
  --mem->mem_host_ptr_refcount;
  if (mem->mem_host_ptr_refcount == 0 && mem->mem_host_ptr != NULL)
    {
      pocl_aligned_free (mem->mem_host_ptr);
      mem->mem_host_ptr = NULL;
      mem->mem_host_ptr_version = 0;
    }
  return 0;
}

/* call (and return) with node->event locked */
void
pocl_command_push (_cl_command_node *node,
                   _cl_command_node **ready_list,
                   _cl_command_node **pending_list)
{
  assert (node != NULL);

  /* If the last command inserted is a barrier,
     the command is necessarily not ready */

  if ((*ready_list) != NULL && (*ready_list)->prev
      && (*ready_list)->prev->type == CL_COMMAND_BARRIER)
    {
      CDL_PREPEND ((*pending_list), node);
      return;
    }
  if (pocl_command_is_ready(node->event))
    {
      pocl_update_event_submitted (node->event);
      CDL_PREPEND ((*ready_list), node);
    }
  else
    {
      CDL_PREPEND ((*pending_list), node);
    }
}

void
pocl_unmap_command_finished (cl_device_id dev, pocl_mem_identifier *mem_id,
                             cl_mem mem, mem_mapping_t *map)
{
  POCL_LOCK_OBJ (mem);
  assert (map->unmap_requested > 0);
  dev->ops->free_mapping_ptr (dev->data, mem_id, mem, map);
  DL_DELETE (mem->mappings, map);
  mem->map_count--;
  POCL_MEM_FREE (map);
  POCL_UNLOCK_OBJ (mem);
}

void
pocl_unmap_command_finished2 (cl_event event, _cl_command_t *cmd)
{
  cl_device_id dev = event->queue->device;
  pocl_mem_identifier *mem_id = NULL;
  cl_mem mem = NULL;
  mem = event->mem_objs[0];
  mem_id = &mem->device_ptrs[dev->global_mem_id];
  pocl_unmap_command_finished (dev, mem_id, mem, cmd->unmap.mapping);
}

void
pocl_cl_mem_inherit_flags (cl_mem mem, cl_mem from_buffer, cl_mem_flags flags)
{
  if ((flags & CL_MEM_READ_WRITE) | (flags & CL_MEM_READ_ONLY)
      | (flags & CL_MEM_WRITE_ONLY))
    {
      mem->flags = (flags & CL_MEM_READ_WRITE) | (flags & CL_MEM_READ_ONLY)
                   | (flags & CL_MEM_WRITE_ONLY);
    }
  else
    {
      mem->flags = (from_buffer->flags & CL_MEM_READ_WRITE)
                   | (from_buffer->flags & CL_MEM_READ_ONLY)
                   | (from_buffer->flags & CL_MEM_WRITE_ONLY);
    }

  if ((flags & CL_MEM_HOST_NO_ACCESS) | (flags & CL_MEM_HOST_READ_ONLY)
      | (flags & CL_MEM_HOST_WRITE_ONLY))
    {
      mem->flags = mem->flags | ((flags & CL_MEM_HOST_NO_ACCESS)
                                 | (flags & CL_MEM_HOST_READ_ONLY)
                                 | (flags & CL_MEM_HOST_WRITE_ONLY));
    }
  else
    {
      mem->flags
          = mem->flags | ((from_buffer->flags & CL_MEM_HOST_NO_ACCESS)
                          | (from_buffer->flags & CL_MEM_HOST_READ_ONLY)
                          | (from_buffer->flags & CL_MEM_HOST_WRITE_ONLY));
    }

  mem->flags = mem->flags | (from_buffer->flags & CL_MEM_USE_HOST_PTR)
               | (from_buffer->flags & CL_MEM_ALLOC_HOST_PTR)
               | (from_buffer->flags & CL_MEM_COPY_HOST_PTR);
}

int pocl_buffer_boundcheck(cl_mem buffer, size_t offset, size_t size) {
  POCL_RETURN_ERROR_ON ((offset > buffer->size), CL_INVALID_VALUE,
                        "offset(%zu) > buffer->size(%zu)\n", offset,
                        buffer->size);
  POCL_RETURN_ERROR_ON ((size > buffer->size), CL_INVALID_VALUE,
                        "size(%zu) > buffer->size(%zu)\n", size, buffer->size);
  POCL_RETURN_ERROR_ON ((offset + size > buffer->size), CL_INVALID_VALUE,
                        "offset + size (%zu) > buffer->size(%zu)\n",
                        (offset + size), buffer->size);
  return CL_SUCCESS;
}

int pocl_buffer_boundcheck_3d(const size_t buffer_size,
                              const size_t *origin,
                              const size_t *region,
                              size_t *row_pitch,
                              size_t *slice_pitch,
                              const char* prefix)
{
  size_t rp = *row_pitch;
  size_t sp = *slice_pitch;

  /* CL_INVALID_VALUE if row_pitch is not 0 and is less than region[0]. */
  POCL_RETURN_ERROR_ON((rp != 0 && rp < region[0]),
    CL_INVALID_VALUE, "%srow_pitch is not 0 and is less than region[0]\n", prefix);

  if (rp == 0) rp = region[0];

  /* CL_INVALID_VALUE if slice_pitch is not 0 and is less than
   * region[1] * row_pitch, or if slice_pitch is not 0 and is not a
   * multiple of row_pitch.
   */
  POCL_RETURN_ERROR_ON((sp != 0 && sp < (region[1] * rp)),
    CL_INVALID_VALUE, "%sslice_pitch is not 0 and is less than "
      "region[1] * %srow_pitch\n", prefix, prefix);
  POCL_RETURN_ERROR_ON((sp != 0 && (sp % rp != 0)),
    CL_INVALID_VALUE, "%sslice_pitch is not 0 and is not a multiple "
      "of %srow_pitch\n", prefix, prefix);

  if (sp == 0) sp = region[1] * rp;

  *row_pitch = rp;
  *slice_pitch = sp;

  size_t byte_offset_begin = origin[2] * sp + origin[1] * rp + origin[0];

  size_t byte_offset_end = origin[0] + region[0] - 1
                           + rp * (origin[1] + region[1] - 1)
                           + sp * (origin[2] + region[2] - 1);

  POCL_RETURN_ERROR_ON((byte_offset_begin > buffer_size), CL_INVALID_VALUE,
            "%sorigin is outside the %sbuffer\n", prefix, prefix);
  POCL_RETURN_ERROR_ON((byte_offset_end >= buffer_size), CL_INVALID_VALUE,
            "%sorigin+region is outside the %sbuffer\n", prefix, prefix);
  return CL_SUCCESS;
}
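
/* A worked example of the pitch math above: with origin = {4, 2, 1},
 * region = {8, 4, 2} and row_pitch = slice_pitch = 0, the pitches default
 * to rp = 8 and sp = 32, so byte_offset_begin = 1*32 + 2*8 + 4 = 52 and
 * byte_offset_end = (4+7) + 8*(2+3) + 32*(1+1) = 115; a buffer of at
 * least 116 bytes passes the check. */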


int pocl_buffers_boundcheck(cl_mem src_buffer,
                            cl_mem dst_buffer,
                            size_t src_offset,
                            size_t dst_offset,
                            size_t size) {
  POCL_RETURN_ERROR_ON((src_offset > src_buffer->size), CL_INVALID_VALUE,
            "src_offset(%zu) > src_buffer->size(%zu)", src_offset, src_buffer->size);
  POCL_RETURN_ERROR_ON((size > src_buffer->size), CL_INVALID_VALUE,
            "size(%zu) > src_buffer->size(%zu)", size, src_buffer->size);
  POCL_RETURN_ERROR_ON((src_offset + size > src_buffer->size), CL_INVALID_VALUE,
            "src_offset + size (%zu) > src_buffer->size(%zu)", (src_offset+size), src_buffer->size);

  POCL_RETURN_ERROR_ON((dst_offset > dst_buffer->size), CL_INVALID_VALUE,
            "dst_offset(%zu) > dst_buffer->size(%zu)", dst_offset, dst_buffer->size);
  POCL_RETURN_ERROR_ON((size > dst_buffer->size), CL_INVALID_VALUE,
            "size(%zu) > dst_buffer->size(%zu)", size, dst_buffer->size);
  POCL_RETURN_ERROR_ON((dst_offset + size > dst_buffer->size), CL_INVALID_VALUE,
            "dst_offset + size (%zu) > dst_buffer->size(%zu)", (dst_offset+size), dst_buffer->size);
  return CL_SUCCESS;
}

int pocl_buffers_overlap(cl_mem src_buffer,
                         cl_mem dst_buffer,
                         size_t src_offset,
                         size_t dst_offset,
                         size_t size) {
  /* The regions overlap if src_offset <= dst_offset <= src_offset + size - 1,
   * or if dst_offset <= src_offset <= dst_offset + size - 1.
   */
  if (src_buffer == dst_buffer) {
    POCL_RETURN_ERROR_ON(((src_offset <= dst_offset) && (dst_offset <=
      (src_offset + size - 1))), CL_MEM_COPY_OVERLAP, "dst_offset lies inside \
      the src region and the src_buffer == dst_buffer");
    POCL_RETURN_ERROR_ON(((dst_offset <= src_offset) && (src_offset <=
      (dst_offset + size - 1))), CL_MEM_COPY_OVERLAP, "src_offset lies inside \
      the dst region and the src_buffer == dst_buffer");
  }

  /* sub buffers overlap check */
  if (src_buffer->parent && dst_buffer->parent &&
        (src_buffer->parent == dst_buffer->parent)) {
      src_offset = src_buffer->origin + src_offset;
      dst_offset = dst_buffer->origin + dst_offset;

      POCL_RETURN_ERROR_ON (((src_offset <= dst_offset)
                             && (dst_offset <= (src_offset + size - 1))),
                            CL_MEM_COPY_OVERLAP, "dst_offset lies inside \
      the src region and src_buffer + dst_buffer are subbuffers of the same buffer");
      POCL_RETURN_ERROR_ON (((dst_offset <= src_offset)
                             && (src_offset <= (dst_offset + size - 1))),
                            CL_MEM_COPY_OVERLAP, "src_offset lies inside \
      the dst region and src_buffer + dst_buffer are subbuffers of the same buffer");

  }

  return CL_SUCCESS;
}
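
/* For example, with src_buffer == dst_buffer, src_offset = 0,
 * dst_offset = 4 and size = 8, the copy ranges [0,7] and [4,11] overlap
 * (0 <= 4 <= 7), so CL_MEM_COPY_OVERLAP is returned. */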

/*
 * Copyright (c) 2011 The Khronos Group Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy of this
 * software and/or associated documentation files (the "Materials"), to deal in the Materials
 * without restriction, including without limitation the rights to use, copy, modify, merge,
 * publish, distribute, sublicense, and/or sell copies of the Materials, and to permit persons to
 * whom the Materials are furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be included
 * in all copies or substantial portions of the Materials.
 *
 * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE MATERIALS OR THE USE OR OTHER DEALINGS IN
 * THE MATERIALS.
 */

int
check_copy_overlap(const size_t src_offset[3],
                   const size_t dst_offset[3],
                   const size_t region[3],
                   const size_t row_pitch, const size_t slice_pitch)
{
  const size_t src_min[] = {src_offset[0], src_offset[1], src_offset[2]};
  const size_t src_max[] = {src_offset[0] + region[0],
                            src_offset[1] + region[1],
                            src_offset[2] + region[2]};
  const size_t dst_min[] = {dst_offset[0], dst_offset[1], dst_offset[2]};
  const size_t dst_max[] = {dst_offset[0] + region[0],
                            dst_offset[1] + region[1],
                            dst_offset[2] + region[2]};
  int overlap = 1;
  unsigned i;
  for (i=0; i != 3; ++i)
  {
    overlap = overlap && (src_min[i] < dst_max[i])
                      && (src_max[i] > dst_min[i]);
  }

  size_t dst_start =  dst_offset[2] * slice_pitch +
                      dst_offset[1] * row_pitch + dst_offset[0];
  size_t dst_end = dst_start + (region[2] * slice_pitch +
                                region[1] * row_pitch + region[0]);
  size_t src_start =  src_offset[2] * slice_pitch +
                      src_offset[1] * row_pitch + src_offset[0];
  size_t src_end = src_start + (region[2] * slice_pitch +
                                region[1] * row_pitch + region[0]);

  if (!overlap)
  {
    size_t delta_src_x = (src_offset[0] + region[0] > row_pitch) ?
                          src_offset[0] + region[0] - row_pitch : 0;
    size_t delta_dst_x = (dst_offset[0] + region[0] > row_pitch) ?
                          dst_offset[0] + region[0] - row_pitch : 0;
    if ( (delta_src_x > 0 && delta_src_x > dst_offset[0]) ||
          (delta_dst_x > 0 && delta_dst_x > src_offset[0]) )
      {
        if ( (src_start <= dst_start && dst_start < src_end) ||
          (dst_start <= src_start && src_start < dst_end) )
          overlap = 1;
      }

    if (region[2] > 1)
    {
      size_t src_height = slice_pitch / row_pitch;
      size_t dst_height = slice_pitch / row_pitch;

      size_t delta_src_y = (src_offset[1] + region[1] > src_height) ?
                            src_offset[1] + region[1] - src_height : 0;
      size_t delta_dst_y = (dst_offset[1] + region[1] > dst_height) ?
                            dst_offset[1] + region[1] - dst_height : 0;

      if ( (delta_src_y > 0 && delta_src_y > dst_offset[1]) ||
            (delta_dst_y > 0 && delta_dst_y > src_offset[1]) )
      {
        if ( (src_start <= dst_start && dst_start < src_end) ||
              (dst_start <= src_start && src_start < dst_end) )
              overlap = 1;
      }
    }
  }

  return overlap;
}

/* For a subdevice parameter, return the actual device it belongs to. */
cl_device_id
pocl_real_dev (const cl_device_id dev)
{
  cl_device_id ret = dev;
  while (ret->parent_device)
    ret = ret->parent_device;
  return ret;
}

/* Make a list of unique devices. If any device is a subdevice,
 * replace it with its parent, then remove duplicate parents. */
cl_device_id * pocl_unique_device_list(const cl_device_id * in, cl_uint num, cl_uint *real)
{
  cl_uint real_num = num;
  cl_device_id *out = (cl_device_id *)calloc (num, sizeof (cl_device_id));
  if (!out)
    return NULL;

  unsigned i;
  for (i=0; i < num; ++i)
    out[i] = (in[i] ? pocl_real_dev (in[i]) : NULL);

  i=1;
  unsigned device_i=0;
  while (i < real_num)
    {
      device_i=0;
      while (device_i < i)
        {
          if (out[device_i] == out[i])
            {
              out[device_i] = out[--real_num];
              out[real_num] = NULL;
            }
          else
            device_i++;
        }
      i++;
    }

  *real = real_num;
  return out;
}
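
/* For example, if in = {subdev_of_A, A, B}, the sub-device is first
 * replaced by its parent A, the duplicate A is then removed by swapping
 * in the last entry, and the result contains A and B exactly once
 * (in unspecified order) with *real == 2. */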

int
pocl_device_supports_builtin_kernel (cl_device_id dev, const char *kernel_name)
{
  if (kernel_name == NULL)
    return 0;

  if (dev->builtin_kernel_list == NULL)
    return 0;

  char *temp = strdup (dev->builtin_kernel_list);
  char *token;
  char *rest = temp;

  while ((token = strtok_r (rest, ";", &rest)))
    {
      if (strcmp (token, kernel_name) == 0)
        {
          free (temp);
          return 1;
        }
    }

  free (temp);
  return 0;
}
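
/* For example, with a hypothetical dev->builtin_kernel_list of
 * "pocl.add.i32;pocl.mul.i32", the function returns 1 for "pocl.add.i32"
 * and 0 for "pocl.sub.i32"; the strdup is needed because strtok_r
 * modifies the string it tokenizes. */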

static void
image_format_union (const cl_image_format *dev_formats,
                    cl_uint               num_dev_formats,
                    cl_image_format       **context_formats,
                    cl_uint               *num_context_formats)
{
  if ((dev_formats == NULL) || (num_dev_formats == 0))
    return;

  if ((*num_context_formats == 0) || (*context_formats == NULL))
    {
      // alloc & copy
      *context_formats = (cl_image_format *)malloc (sizeof (cl_image_format)
                                                    * num_dev_formats);
      memcpy (*context_formats, dev_formats,
              sizeof (cl_image_format) * num_dev_formats);
      *num_context_formats = num_dev_formats;
    }
  else
    {
      // realloc & merge
      cl_uint i, j;
      cl_uint ncf = *num_context_formats;
      size_t size = sizeof (cl_image_format) * (num_dev_formats + ncf);
      cl_image_format *ctf
          = (cl_image_format *)realloc (*context_formats, size);
      assert (ctf);
      for (i = 0; i < num_dev_formats; ++i)
        {
          for (j = 0; j < ncf; ++j)
            if (memcmp (ctf + j, dev_formats + i, sizeof (cl_image_format))
                == 0)
              break;
          if (j < ncf)
            {
              // format already in context, skip
              continue;
            }
          else
            {
              memcpy (ctf + ncf, dev_formats + i, sizeof (cl_image_format));
              ++ncf;
            }
        }
      *context_formats = ctf;
      *num_context_formats = ncf;
    }
}
1528 
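/* Illustrative sketch (not part of pocl) of the merge semantics with two
 * hypothetical device format lists; the context list ends up holding the
 * set union in first-seen order:
 *
 *   cl_image_format a[] = { { CL_RGBA, CL_UNORM_INT8 },
 *                           { CL_RGBA, CL_FLOAT } };
 *   cl_image_format b[] = { { CL_RGBA, CL_FLOAT },
 *                           { CL_BGRA, CL_UNORM_INT8 } };
 *   cl_image_format *u = NULL;
 *   cl_uint n = 0;
 *   image_format_union (a, 2, &u, &n);   // n == 2
 *   image_format_union (b, 2, &u, &n);   // n == 3, the duplicate is skipped
 *   free (u);
 */
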
/* Set up context info that later API calls rely on. */
void
pocl_setup_context (cl_context context)
{
  unsigned i, j;
  int err;
  size_t alignment = context->devices[0]->mem_base_addr_align;
  context->max_mem_alloc_size = 0;
  context->svm_allocdev = NULL;
  assert (context->default_queues);

  memset (context->image_formats, 0, sizeof (void *) * NUM_OPENCL_IMAGE_TYPES);
  memset (context->num_image_formats, 0,
          sizeof (cl_uint) * NUM_OPENCL_IMAGE_TYPES);

  for (i = 0; i < context->num_devices; ++i)
    {
      cl_device_id dev = context->devices[i];

      /* Pick the device with the highest SVM allocation priority as the
       * context's SVM allocator. */
      if (dev->svm_allocation_priority > 0)
        {
          if (context->svm_allocdev == NULL
              || context->svm_allocdev->svm_allocation_priority
                     < dev->svm_allocation_priority)
            context->svm_allocdev = dev;
        }

      /* Track the smallest device alignment and the largest allocation
       * limit across the context's devices. */
      if (dev->mem_base_addr_align < alignment)
        alignment = dev->mem_base_addr_align;

      if (dev->max_mem_alloc_size > context->max_mem_alloc_size)
        context->max_mem_alloc_size = dev->max_mem_alloc_size;

      if (dev->image_support == CL_TRUE)
        {
          for (j = 0; j < NUM_OPENCL_IMAGE_TYPES; ++j)
            image_format_union (
                dev->image_formats[j], dev->num_image_formats[j],
                &context->image_formats[j], &context->num_image_formats[j]);
        }

      context->default_queues[i] = POname (clCreateCommandQueue) (
          context, dev,
          (CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | CL_QUEUE_HIDDEN
           | CL_QUEUE_PROFILING_ENABLE),
          &err);
      assert (err == CL_SUCCESS);
      assert (context->default_queues[i]);
    }

  assert (alignment > 0);
  context->min_buffer_alignment = alignment;
}

int
pocl_check_event_wait_list (cl_command_queue command_queue,
                            cl_uint num_events_in_wait_list,
                            const cl_event *event_wait_list)
{
  POCL_RETURN_ERROR_COND (
      (event_wait_list == NULL && num_events_in_wait_list > 0),
      CL_INVALID_EVENT_WAIT_LIST);

  POCL_RETURN_ERROR_COND (
      (event_wait_list != NULL && num_events_in_wait_list == 0),
      CL_INVALID_EVENT_WAIT_LIST);

  if (event_wait_list)
    {
      unsigned i;
      for (i = 0; i < num_events_in_wait_list; i++)
        {
          POCL_RETURN_ERROR_COND ((!IS_CL_OBJECT_VALID (event_wait_list[i])),
                                  CL_INVALID_EVENT_WAIT_LIST);
          POCL_RETURN_ERROR_COND (
              (event_wait_list[i]->context != command_queue->context),
              CL_INVALID_CONTEXT);
        }
    }

  return CL_SUCCESS;
}

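/* Illustrative usage sketch (not part of pocl): enqueue-style APIs would
 * typically validate the wait list before building the command, e.g.:
 *
 *   int err = pocl_check_event_wait_list (command_queue,
 *                                         num_events_in_wait_list,
 *                                         event_wait_list);
 *   if (err != CL_SUCCESS)
 *     return err;
 */
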
const char *
pocl_status_to_str (int status)
{
  /* Indexed by the non-negative OpenCL execution statuses:
   * CL_COMPLETE (0), CL_RUNNING (1), CL_SUBMITTED (2), CL_QUEUED (3).
   * Negative (error) statuses must not be passed here. */
  static const char *status_to_str[] = {
    "complete",
    "running",
    "submitted",
    "queued" };
  assert (status >= CL_COMPLETE && status <= CL_QUEUED);
  return status_to_str[status];
}

void
pocl_abort_on_pthread_error (int status, unsigned line, const char *func)
{
  if (status != 0)
    {
      POCL_MSG_PRINT2 (HSA, func, line, "Error from pthread call:\n");
      POCL_ABORT ("%s\n", strerror (status));
    }
}

/* Convert a command type to its representation string. */
const char *
pocl_command_to_str (cl_command_type cmd)
{
  switch (cmd)
    {
    case CL_COMMAND_NDRANGE_KERNEL:
      return "ndrange_kernel";
    case CL_COMMAND_TASK:
      return "task_kernel";
    case CL_COMMAND_NATIVE_KERNEL:
      return "native_kernel";
    case CL_COMMAND_READ_BUFFER:
      return "read_buffer";
    case CL_COMMAND_WRITE_BUFFER:
      return "write_buffer";
    case CL_COMMAND_COPY_BUFFER:
      return "copy_buffer";
    case CL_COMMAND_READ_IMAGE:
      return "read_image";
    case CL_COMMAND_WRITE_IMAGE:
      return "write_image";
    case CL_COMMAND_COPY_IMAGE:
      return "copy_image";
    case CL_COMMAND_COPY_IMAGE_TO_BUFFER:
      return "copy_image_to_buffer";
    case CL_COMMAND_COPY_BUFFER_TO_IMAGE:
      return "copy_buffer_to_image";
    case CL_COMMAND_MAP_BUFFER:
      return "map_buffer";
    case CL_COMMAND_MAP_IMAGE:
      return "map_image";
    case CL_COMMAND_UNMAP_MEM_OBJECT:
      return "unmap_mem_object";
    case CL_COMMAND_MARKER:
      return "marker";
    case CL_COMMAND_ACQUIRE_GL_OBJECTS:
      return "acquire_gl_objects";
    case CL_COMMAND_RELEASE_GL_OBJECTS:
      return "release_gl_objects";
    case CL_COMMAND_READ_BUFFER_RECT:
      return "read_buffer_rect";
    case CL_COMMAND_WRITE_BUFFER_RECT:
      return "write_buffer_rect";
    case CL_COMMAND_COPY_BUFFER_RECT:
      return "copy_buffer_rect";
    case CL_COMMAND_USER:
      return "user";
    case CL_COMMAND_BARRIER:
      return "barrier";
    case CL_COMMAND_MIGRATE_MEM_OBJECTS:
      return "migrate_mem_objects";
    case CL_COMMAND_FILL_BUFFER:
      return "fill_buffer";
    case CL_COMMAND_FILL_IMAGE:
      return "fill_image";
    case CL_COMMAND_SVM_FREE:
      return "svm_free";
    case CL_COMMAND_SVM_MEMCPY:
      return "svm_memcpy";
    case CL_COMMAND_SVM_MEMFILL:
      return "svm_memfill";
    case CL_COMMAND_SVM_MAP:
      return "svm_map";
    case CL_COMMAND_SVM_UNMAP:
      return "svm_unmap";
    }

  return "unknown";
}

/*
 * This replaces a simple system(), because:
 *
 * 1) system() was causing issues (GPU lockups) with HSA when
 * compiling code (via compile_parallel_bc_to_brig)
 * with OpenCL 2.0 atomics (like CalcPie from the AMD SDK).
 * The reason for the lockups is unknown (yet).
 *
 * 2) system() uses fork(), which copies the page table maps and can run
 * out of address space when pocl has already allocated huge buffers in
 * memory. This happened in llvm_codegen().
 *
 * vfork() does not copy the page tables.
 */
int
pocl_run_command (char *const *args)
{
  POCL_MSG_PRINT_INFO ("Launching: %s\n", args[0]);
#ifdef HAVE_VFORK
  pid_t p = vfork ();
#elif defined(HAVE_FORK)
  pid_t p = fork ();
#elif defined(_WIN32)
  STARTUPINFO si;
  ZeroMemory (&si, sizeof (si));
  si.cb = sizeof (si);
  PROCESS_INFORMATION pi;
  ZeroMemory (&pi, sizeof (pi));
  DWORD dwProcessFlags = 0;
  /* Note: only args[0] is passed as the command line here. */
  char *cmd = strdup (args[0]);
  int p = CreateProcess (NULL, cmd, NULL, NULL, 1, dwProcessFlags, NULL, NULL,
                         &si, &pi) != 0;
  free (cmd);
  if (!p)
    return EXIT_FAILURE;
  DWORD waitRc = WaitForSingleObject (pi.hProcess, INFINITE);
  DWORD exit_code = 0;
  if (waitRc != WAIT_FAILED)
    p = GetExitCodeProcess (pi.hProcess, &exit_code) != 0;
  /* Avoid leaking the process and thread handles. */
  CloseHandle (pi.hProcess);
  CloseHandle (pi.hThread);
  if (waitRc == WAIT_FAILED || !p)
    return EXIT_FAILURE;
  return exit_code;
#else
#error Must have fork() or vfork() system calls for HSA
#endif
  if (p == 0)
    {
      execv (args[0], args);
      /* execv() only returns on failure; after vfork() the child must not
       * return from this function, so bail out with _exit(). */
      _exit (EXIT_FAILURE);
    }
  else
    {
      if (p < 0)
        return EXIT_FAILURE;
      int status;
      if (waitpid (p, &status, 0) < 0)
        POCL_ABORT ("pocl: waitpid() failed.\n");
      if (WIFEXITED (status))
        return WEXITSTATUS (status);
      else if (WIFSIGNALED (status))
        return WTERMSIG (status);
      else
        return EXIT_FAILURE;
    }
}

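/* Illustrative usage sketch (not part of pocl): args is a NULL-terminated
 * argv-style array, and args[0] must be the full path of the executable
 * because the child uses execv() (no PATH search):
 *
 *   char *const args[] = { "/usr/bin/clang", "-c", "kernel.c",
 *                          "-o", "kernel.o", NULL };
 *   if (pocl_run_command (args) != EXIT_SUCCESS)
 *     ...  // compilation failed
 */
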
/* Called with the event lock held. */
void
pocl_update_event_queued (cl_event event)
{
  assert (event != NULL);

  event->status = CL_QUEUED;
  cl_command_queue cq = event->queue;
  if ((cq->properties & CL_QUEUE_PROFILING_ENABLE)
      && (cq->device->has_own_timer == 0))
    event->time_queue = pocl_gettimemono_ns ();

  POCL_MSG_PRINT_EVENTS ("Event queued: %" PRIu64 "\n", event->id);

  if (cq->device->ops->update_event)
    cq->device->ops->update_event (cq->device, event);
  pocl_event_updated (event, CL_QUEUED);
}

/* Called with the event lock held. */
void
pocl_update_event_submitted (cl_event event)
{
  assert (event != NULL);
  assert (event->status == CL_QUEUED);

  cl_command_queue cq = event->queue;
  event->status = CL_SUBMITTED;
  if ((cq->properties & CL_QUEUE_PROFILING_ENABLE)
      && (cq->device->has_own_timer == 0))
    event->time_submit = pocl_gettimemono_ns ();

  POCL_MSG_PRINT_EVENTS ("Event submitted: %" PRIu64 "\n", event->id);

  if (cq->device->ops->update_event)
    cq->device->ops->update_event (cq->device, event);
  pocl_event_updated (event, CL_SUBMITTED);
}

void
pocl_update_event_running_unlocked (cl_event event)
{
  assert (event != NULL);
  assert (event->status == CL_SUBMITTED);

  cl_command_queue cq = event->queue;
  event->status = CL_RUNNING;
  if ((cq->properties & CL_QUEUE_PROFILING_ENABLE)
      && (cq->device->has_own_timer == 0))
    event->time_start = pocl_gettimemono_ns ();

  POCL_MSG_PRINT_EVENTS ("Event running: %" PRIu64 "\n", event->id);

  if (cq->device->ops->update_event)
    cq->device->ops->update_event (cq->device, event);
  pocl_event_updated (event, CL_RUNNING);
}

void
pocl_update_event_running (cl_event event)
{
  POCL_LOCK_OBJ (event);
  pocl_update_event_running_unlocked (event);
  POCL_UNLOCK_OBJ (event);
}

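/* Illustrative driver-side sketch (not part of pocl): a device
 * implementation moves an event through the states in order, e.g.:
 *
 *   POCL_LOCK_OBJ (event);
 *   pocl_update_event_submitted (event);   // CL_QUEUED -> CL_SUBMITTED
 *   POCL_UNLOCK_OBJ (event);
 *   pocl_update_event_running (event);     // CL_SUBMITTED -> CL_RUNNING
 *   ...execute the command...
 *   pocl_update_event_complete_msg (__func__, __LINE__, event, "Read Buffer");
 *
 * Each transition stamps the matching profiling counter when
 * CL_QUEUE_PROFILING_ENABLE is set and notifies registered callbacks.
 */
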
/* Status can be CL_COMPLETE or a failure (< 0). */
void
pocl_update_event_finished_msg (cl_int status, const char *func, unsigned line,
                                cl_event event, const char *msg)
{
  assert (event != NULL);
  assert (event->queue != NULL);
  assert (event->status > CL_COMPLETE);

  cl_command_queue cq = event->queue;
  POCL_LOCK_OBJ (cq);
  POCL_LOCK_OBJ (event);
  if ((cq->properties & CL_QUEUE_PROFILING_ENABLE)
      && (cq->device->has_own_timer == 0))
    event->time_end = pocl_gettimemono_ns ();

  struct pocl_device_ops *ops = cq->device->ops;
  event->status = status;
  if (ops->update_event)
    ops->update_event (cq->device, event);

  if (status == CL_COMPLETE)
    POCL_MSG_PRINT_EVENTS ("%s: Command complete, event %" PRIu64 "\n",
                           cq->device->short_name, event->id);
  else
    POCL_MSG_PRINT_EVENTS ("%s: Command FAILED, event %" PRIu64 "\n",
                           cq->device->short_name, event->id);

  assert (cq->command_count > 0);
  --cq->command_count;
  if (cq->barrier == event)
    cq->barrier = NULL;
  if (cq->last_event.event == event)
    cq->last_event.event = NULL;
  DL_DELETE (cq->events, event);

  if (ops->notify_cmdq_finished && (cq->command_count == 0))
    ops->notify_cmdq_finished (cq);

  if (ops->notify_event_finished)
    ops->notify_event_finished (event);

  POCL_UNLOCK_OBJ (cq);
  /* Note that we must unlock the CmdQ before calling pocl_event_updated,
   * because it calls event callbacks, which can have calls to
   * clEnqueueSomething(). */
  pocl_event_updated (event, status);
  POCL_UNLOCK_OBJ (event);
  ops->broadcast (event);

#ifdef POCL_DEBUG_MESSAGES
  if (msg != NULL)
    {
      pocl_debug_print_duration (
          func, line, msg, (uint64_t) (event->time_end - event->time_start));
    }
#endif

  /* Drop the references the command held on its buffers. */
  size_t i;
  for (i = 0; i < event->num_buffers; ++i)
    {
      cl_mem mem = event->mem_objs[i];
      if (event->release_mem_host_ptr_after)
        {
          POCL_LOCK_OBJ (mem);
          pocl_release_mem_host_ptr (mem);
          POCL_UNLOCK_OBJ (mem);
        }
      POname (clReleaseMemObject) (mem);
    }
  POCL_MEM_FREE (event->mem_objs);

  POname (clReleaseEvent) (event);
}

/* Called with the event lock held; the lock is dropped temporarily because
 * pocl_update_event_finished_msg() takes it itself. */
void
pocl_update_event_failed (cl_event event)
{
  POCL_UNLOCK_OBJ (event);
  pocl_update_event_finished_msg (CL_FAILED, NULL, 0, event, NULL);
  POCL_LOCK_OBJ (event);
}

void
pocl_update_event_complete_msg (const char *func, unsigned line,
                                cl_event event, const char *msg)
{
  pocl_update_event_finished_msg (CL_COMPLETE, func, line, event, msg);
}

/*
 * float to half / half to float conversion, using branchless
 * bit manipulation of the IEEE 754 representations.
 */

static int const shift = 13;
static int const shiftSign = 16;

static int32_t const infN = 0x7F800000;  /* flt32 infinity */
static int32_t const maxN = 0x477FE000;  /* max flt16 normal as a flt32 */
static int32_t const minN = 0x38800000;  /* min flt16 normal as a flt32 */
static int32_t const signN = 0x80000000; /* flt32 sign bit */

/* static int32_t const infC = infN >> shift;
 * static int32_t const infC = 0x3FC00;
 * static int32_t const nanN = (infC + 1) << shift; // minimum flt16 nan as a flt32
 */
static int32_t const nanN = 0x7f802000;
/* static int32_t const maxC = maxN >> shift; */
static int32_t const maxC = 0x23bff;
/* static int32_t const minC = minN >> shift;
 * static int32_t const minC = 0x1c400;
 * static int32_t const signC = signN >> shiftSign; // flt16 sign bit
 */
/* ((uint32_t)signN >> shiftSign) == 0x8000; the previous precomputed value
 * (0x40000) did not match the derivation above and would never mask the
 * sign bit of a 16-bit half value. */
static int32_t const signC = 0x8000; /* flt16 sign bit */

static int32_t const mulN = 0x52000000; /* (1 << 23) / minN */
static int32_t const mulC = 0x33800000; /* minN / (1 << (23 - shift)) */

static int32_t const subC = 0x003FF; /* max flt32 subnormal down shifted */
static int32_t const norC = 0x00400; /* min flt32 normal down shifted */

/* static int32_t const maxD = infC - maxC - 1; */
static int32_t const maxD = 0x1c000;
/* static int32_t const minD = minC - subC - 1; */
static int32_t const minD = 0x1c000;

typedef union
{
  float f;
  int32_t si;
  uint32_t ui;
} H2F_Bits;

float
half_to_float (uint16_t value)
{
  H2F_Bits v;
  v.ui = value;
  /* Extract the flt16 sign bit and move it to the flt32 position. */
  int32_t sign = v.si & signC;
  v.si ^= sign;
  sign <<= shiftSign;
  /* Re-bias the exponent of normals, and again for inf/NaN. */
  v.si ^= ((v.si + minD) ^ v.si) & -(v.si > subC);
  v.si ^= ((v.si + maxD) ^ v.si) & -(v.si > maxC);
  /* Scale subnormals with a float multiply, then branchlessly select
   * between the subnormal result and the shifted normal result. */
  H2F_Bits s;
  s.si = mulC;
  s.f *= v.si;
  int32_t mask = -(norC > v.si);
  v.si <<= shift;
  v.si ^= (s.si ^ v.si) & mask;
  v.si |= sign;
  return v.f;
}

uint16_t
float_to_half (float value)
{
  H2F_Bits v, s;
  v.f = value;
  /* Extract the flt32 sign bit; shift it down to the flt16 position. */
  uint32_t sign = v.si & signN;
  v.si ^= sign;
  sign >>= shiftSign;
  /* Values that become flt16 subnormals are rounded via float scaling. */
  s.si = mulN;
  s.si = s.f * v.f;
  v.si ^= (s.si ^ v.si) & -(minN > v.si);
  /* Clamp overflows to infinity and keep NaNs NaN. */
  v.si ^= (infN ^ v.si) & -((infN > v.si) & (v.si > maxN));
  v.si ^= (nanN ^ v.si) & -((nanN > v.si) & (v.si > infN));
  v.ui >>= shift;
  /* Undo the exponent re-bias applied to normals and inf/NaN. */
  v.si ^= ((v.si - maxD) ^ v.si) & -(v.si > maxC);
  v.si ^= ((v.si - minD) ^ v.si) & -(v.si > subC);
  return v.ui | sign;
}

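/* Illustrative round trip (not part of pocl):
 *
 *   uint16_t h = float_to_half (1.5f);   // 0x3E00
 *   float f = half_to_float (h);         // 1.5f again
 *
 * Values above the flt16 range (|x| > 65504) clamp to +/- infinity, and
 * flt16 subnormals go through the mulN/mulC scaling paths.
 */
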
/* SPIR-V magic header */
#define SPIRV_MAGIC 0x07230203U
/* OpCapability: word count 2, opcode 17 */
#define OpCapab 0x00020011
/* The Kernel capability (6) is declared by OpenCL-flavored SPIR-V modules */
#define KernelExecModel 0x6

int
bitcode_is_spirv_kernel (const char *bitcode, size_t size)
{
  const uint32_t *bc32 = (const uint32_t *)bitcode;
  unsigned location = 0;

  /* Check the size before reading anything: a SPIR-V header is 5 words. */
  if (size < 20)
    return 0;

  /* SPIR-V words are stored little-endian here. */
  uint32_t header_magic = le32toh (bc32[location++]);
  if (header_magic != SPIRV_MAGIC)
    return 0;

  /* Skip version, generator, bound, schema. */
  location += 4;
  int is_opencl = 0;
  uint32_t instruction, value;
  /* OpCapability instructions come first in a module; scan them (with a
   * bounds check) for the Kernel capability. */
  while ((location + 2) * sizeof (uint32_t) <= size)
    {
      instruction = le32toh (bc32[location++]);
      value = le32toh (bc32[location++]);
      if (instruction != OpCapab)
        break;
      if (value == KernelExecModel)
        is_opencl = 1;
    }

  /* SPIR-V but not OpenCL-type. */
  if (!is_opencl)
    {
      POCL_MSG_ERR ("SPIR-V binary provided, but it is not using the Kernel "
                    "capability. pocl can't process this binary.\n");
    }

  return is_opencl;
}
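
/* Illustrative usage sketch (not part of pocl), assuming pocl's
 * pocl_read_file() helper for slurping a file into memory:
 *
 *   char *content = NULL;
 *   uint64_t fsize = 0;
 *   if (pocl_read_file ("program.spv", &content, &fsize) == 0
 *       && bitcode_is_spirv_kernel (content, (size_t)fsize))
 *     ...  // treat as an OpenCL-flavored SPIR-V module
 *   free (content);
 */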