1 /* OpenCL runtime library: pocl_util utility functions
2
3 Copyright (c) 2012-2019 Pekka Jääskeläinen
4
5 Permission is hereby granted, free of charge, to any person obtaining a copy
6 of this software and associated documentation files (the "Software"), to deal
7 in the Software without restriction, including without limitation the rights
8 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 copies of the Software, and to permit persons to whom the Software is
10 furnished to do so, subject to the following conditions:
11
12 The above copyright notice and this permission notice shall be included in
13 all copies or substantial portions of the Software.
14
15 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 THE SOFTWARE.
22 */
23
24 #include <errno.h>
25 #include <limits.h>
26 #include <stdlib.h>
27 #include <string.h>
28
29 #include <time.h>
30
31 #ifndef _WIN32
32 #include <dirent.h>
33 #include <string.h>
34 #include <sys/resource.h>
35 #include <sys/stat.h>
36 #include <sys/time.h>
37 #include <sys/types.h>
38 #include <sys/wait.h>
39 #include <unistd.h>
40 #include <utime.h>
41 #else
42 # include "vccompat.hpp"
43 #endif
44
45 #include "pocl_util.h"
46 #include "pocl_timing.h"
47 #include "pocl_llvm.h"
48 #include "utlist.h"
49 #include "common.h"
50 #include "pocl_mem_management.h"
51 #include "devices.h"
52 #include "pocl_runtime_config.h"
53
54 /* required for setting SSE/AVX flush denorms to zero flag */
55 #if defined(__x86_64__) && defined(__GNUC__)
56 #include <x86intrin.h>
57 #endif
58
struct list_item;

/* Minimal singly-linked list node carrying an opaque payload pointer.
 * NOTE(review): not referenced in the visible portion of this file;
 * presumably used by pocl_util helpers outside this chunk — confirm
 * before removing. */
typedef struct list_item
{
  void *value;            /* opaque payload */
  struct list_item *next; /* next node, or NULL at the end of the list */
} list_item;
66
/* Restore the SSE flush-to-zero / denormals-are-zero state previously
 * captured by pocl_save_ftz().  FTZ is a bitmask using the MXCSR flag
 * values; a no-op on non-x86-64 or non-GNU builds. */
void
pocl_restore_ftz (unsigned ftz)
{
#if defined(__x86_64__) && defined(__GNUC__)

#ifdef _MM_FLUSH_ZERO_ON
  _MM_SET_FLUSH_ZERO_MODE ((ftz & _MM_FLUSH_ZERO_ON) ? _MM_FLUSH_ZERO_ON
                                                     : _MM_FLUSH_ZERO_OFF);
#endif
#ifdef _MM_DENORMALS_ZERO_ON
  _MM_SET_DENORMALS_ZERO_MODE ((ftz & _MM_DENORMALS_ZERO_ON)
                                   ? _MM_DENORMALS_ZERO_ON
                                   : _MM_DENORMALS_ZERO_OFF);
#endif

#endif
}
87
/* Capture the current SSE flush-to-zero / denormals-are-zero state as a
 * bitmask of MXCSR flag values, suitable for pocl_restore_ftz().
 * Returns 0 on non-x86-64 or non-GNU builds.
 *
 * Cleanup: the original's `else s &= ~FLAG` branches were dead code —
 * `s` starts at 0, so clearing never-set bits was a no-op. */
unsigned
pocl_save_ftz (void)
{
#if defined(__x86_64__) && defined(__GNUC__)

  unsigned s = 0;
#ifdef _MM_FLUSH_ZERO_ON
  if (_MM_GET_FLUSH_ZERO_MODE ())
    s |= _MM_FLUSH_ZERO_ON;
#endif
#ifdef _MM_DENORMALS_ZERO_ON
  if (_MM_GET_DENORMALS_ZERO_MODE ())
    s |= _MM_DENORMALS_ZERO_ON;
#endif
  return s;

#else
  return 0;
#endif
}
112
/* Enable (ftz nonzero) or disable (ftz zero) both SSE denormal-handling
 * modes: flush-to-zero and denormals-are-zero.  No-op on non-x86-64 or
 * non-GNU builds. */
void
pocl_set_ftz (unsigned ftz)
{
#if defined(__x86_64__) && defined(__GNUC__)
  if (!ftz)
    {
      /* Guard clause: switch both modes off and we are done. */
#ifdef _MM_FLUSH_ZERO_OFF
      _MM_SET_FLUSH_ZERO_MODE (_MM_FLUSH_ZERO_OFF);
#endif
#ifdef _MM_DENORMALS_ZERO_OFF
      _MM_SET_DENORMALS_ZERO_MODE (_MM_DENORMALS_ZERO_OFF);
#endif
      return;
    }

#ifdef _MM_FLUSH_ZERO_ON
  _MM_SET_FLUSH_ZERO_MODE (_MM_FLUSH_ZERO_ON);
#endif
#ifdef _MM_DENORMALS_ZERO_ON
  _MM_SET_DENORMALS_ZERO_MODE (_MM_DENORMALS_ZERO_ON);
#endif
#endif
}
139
140
/* Reset the SSE rounding mode to round-to-nearest (the OpenCL default)
 * if it is currently anything else.  No-op on non-x86-64 builds. */
void
pocl_set_default_rm ()
{
#if defined(__x86_64__) && defined(__GNUC__) && defined(_MM_ROUND_NEAREST)
  /* Only touch MXCSR when the mode actually differs. */
  if (_MM_GET_ROUNDING_MODE () != _MM_ROUND_NEAREST)
    _MM_SET_ROUNDING_MODE (_MM_ROUND_NEAREST);
#endif
}
150
/* Capture the current SSE rounding mode for a later pocl_restore_rm().
 * Returns 0 (which equals round-to-nearest) on non-x86-64 builds. */
unsigned
pocl_save_rm ()
{
#if defined(__x86_64__) && defined(__GNUC__) && defined(_MM_ROUND_NEAREST)
  unsigned mode = _MM_GET_ROUNDING_MODE ();
  return mode;
#else
  return 0;
#endif
}
160
/* Restore an SSE rounding mode previously captured by pocl_save_rm().
 * RM is the raw _MM_GET_ROUNDING_MODE() value.  No-op (and RM is unused)
 * on non-x86-64 or non-GNU builds. */
void
pocl_restore_rm (unsigned rm)
{
#if defined(__x86_64__) && defined(__GNUC__) && defined(_MM_ROUND_NEAREST)
  _MM_SET_ROUNDING_MODE (rm);
#endif
}
168
/* Reverse the byte order of WORD when SHOULD_SWAP is nonzero; return WORD
 * unchanged otherwise.
 *
 * Rewritten with shifts and masks instead of union type-punning: the shift
 * form is endianness-independent by construction and modern compilers
 * recognize it as a single byte-swap instruction. */
uint32_t
byteswap_uint32_t (uint32_t word, char should_swap)
{
  if (!should_swap)
    return word;

  return (word << 24) | ((word & 0x0000FF00u) << 8)
         | ((word >> 8) & 0x0000FF00u) | (word >> 24);
}
186
/* Reverse the byte order of the 4-byte float WORD when SHOULD_SWAP is
 * nonzero; return WORD unchanged otherwise.  The bytes are mirrored
 * through a union, as in the uint32_t variant above. */
float
byteswap_float (float word, char should_swap)
{
  union word_union
  {
    float full_word;
    unsigned char bytes[4];
  } src, dst;
  unsigned b;

  if (!should_swap)
    return word;

  src.full_word = word;
  for (b = 0; b < 4; ++b)
    dst.bytes[b] = src.bytes[3 - b];
  return dst.full_word;
}
204
/* Round X up to the next power of two (returns X itself when it already
 * is one).  Bit-smearing technique from
 * https://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2;
 * the doubling-shift loop covers any size_t width, replacing the
 * original's unrolled #if SIZE_MAX chain. */
size_t
pocl_size_ceil2 (size_t x)
{
  unsigned shift;

  --x;
  for (shift = 1; shift < sizeof (size_t) * 8; shift <<= 1)
    x |= x >> shift;
  return ++x;
}
223
/* Round X up to the next power of two in 64 bits (returns X itself when
 * it already is one).  Same branch-free bit-smearing trick as
 * pocl_size_ceil2, see
 * https://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2 */
uint64_t
pocl_size_ceil2_64 (uint64_t x)
{
  uint64_t r = x - 1;

  r |= r >> 1;
  r |= r >> 2;
  r |= r >> 4;
  r |= r >> 8;
  r |= r >> 16;
  r |= r >> 32;
  return r + 1;
}
241
#if defined(_WIN32) || defined(HAVE_POSIX_MEMALIGN) || defined(__ANDROID__) \
    || (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L))
#define HAVE_ALIGNED_ALLOC
#else
#error aligned malloc unavailable
#endif

/* Platform dispatch for aligned allocation.  Returns NULL on failure.
 * Variables are declared inside the branch that uses them so the other
 * platform paths do not emit unused-variable warnings. */
static void *
pocl_memalign_alloc (size_t align_width, size_t size)
{
#ifdef __ANDROID__
  return memalign (align_width, size);
#elif defined(HAVE_POSIX_MEMALIGN)
  void *ptr = NULL;
  int status = posix_memalign (&ptr, align_width, size);
  return ((status == 0) ? ptr : NULL);
#elif defined(_MSC_VER)
  return _aligned_malloc (size, align_width);
#elif defined(__MINGW32__)
  return __mingw_aligned_malloc (size, align_width);
#elif (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L))
  return aligned_alloc (align_width, size);
#else
#error Cannot find aligned malloc
#endif
}

/* Allocate SIZE bytes aligned to ALIGNMENT (which must be a power of two).
 * SIZE is rounded up to a multiple of ALIGNMENT because aligned_alloc
 * requires it (posix_memalign does not check).  Returns NULL and sets
 * errno to ENOMEM on failure.
 *
 * Fix: the original set `errno = -1` on failure, which is not a valid
 * errno value; ENOMEM is the conventional out-of-memory code.
 * The disabled manual-alignment fallback (`#if 0`) that used to trail
 * this function was dead code and has been removed. */
void *
pocl_aligned_malloc (size_t alignment, size_t size)
{
#ifdef HAVE_ALIGNED_ALLOC
  assert (alignment > 0);
  /* make sure that size is a multiple of alignment, as posix_memalign
   * does not perform this test, whereas aligned_alloc does */
  if ((size & (alignment - 1)) != 0)
    {
      size = size | (alignment - 1);
      size += 1;
    }

  /* posix_memalign requires alignment to be at least sizeof(void *) */
  if (alignment < sizeof (void *))
    alignment = sizeof (void *);

  void *result = pocl_memalign_alloc (alignment, size);
  if (result == NULL)
    {
      errno = ENOMEM;
      return NULL;
    }

  return result;

#else
#error Cannot find aligned malloc
#endif
}
335
/* NOTE(review): dead code kept for reference — this would be the free()
 * counterpart of a manual-alignment fallback allocator that is likewise
 * compiled out; pocl frees aligned allocations with POCL_MEM_FREE
 * directly.  Consider deleting once the fallback is confirmed unneeded. */
#if 0
void
pocl_aligned_free (void *ptr)
{
#ifdef HAVE_ALIGNED_ALLOC
  POCL_MEM_FREE (ptr);
#else
#error Cannot find aligned malloc
  /* extract pointer from original allocation and free it */
  if (ptr)
    free(*(void **)((uintptr_t)ptr - sizeof(void *)));
#endif
}
#endif
350
351 void
pocl_lock_events_inorder(cl_event ev1,cl_event ev2)352 pocl_lock_events_inorder (cl_event ev1, cl_event ev2)
353 {
354 assert (ev1 != ev2);
355 assert (ev1->id != ev2->id);
356 if (ev1->id < ev2->id)
357 {
358 POCL_LOCK_OBJ (ev1);
359 POCL_LOCK_OBJ (ev2);
360 }
361 else
362 {
363 POCL_LOCK_OBJ (ev2);
364 POCL_LOCK_OBJ (ev1);
365 }
366 }
367
368 void
pocl_unlock_events_inorder(cl_event ev1,cl_event ev2)369 pocl_unlock_events_inorder (cl_event ev1, cl_event ev2)
370 {
371 assert (ev1 != ev2);
372 assert (ev1->id != ev2->id);
373 if (ev1->id < ev2->id)
374 {
375 POCL_UNLOCK_OBJ (ev1);
376 POCL_UNLOCK_OBJ (ev2);
377 }
378 else
379 {
380 POCL_UNLOCK_OBJ (ev2);
381 POCL_UNLOCK_OBJ (ev1);
382 }
383 }
384
385 /* This is required because e.g. NDRange commands could have the same buffer
386 * multiple times as argument, or CopyBuffer could have src == dst buffer.
387 *
388 * If the buffer that appears multiple times in the list, is on another device,
389 * we don't want to enqueue >1 migrations for the same buffer.
390 */
391 static void
sort_and_uniq(cl_mem * objs,char * readonly_flags,size_t * num_objs)392 sort_and_uniq (cl_mem *objs, char *readonly_flags, size_t *num_objs)
393 {
394 size_t i;
395 ssize_t j;
396 size_t n = *num_objs;
397 assert (n > 1);
398
399 /* if the buffer is an image backed by buffer storage,
400 * replace with actual storage */
401 for (i = 0; i < n; ++i)
402 if (objs[i]->buffer)
403 objs[i] = objs[i]->buffer;
404
405 /* sort by obj id */
406 for (i = 1; i < n; ++i)
407 {
408 cl_mem buf = objs[i];
409 char c = readonly_flags[i];
410 for (j = (i - 1); ((j >= 0) && (objs[j]->id > buf->id)); --j)
411 {
412 objs[j + 1] = objs[j];
413 readonly_flags[j + 1] = readonly_flags[j];
414 }
415 objs[j + 1] = buf;
416 readonly_flags[j + 1] = c;
417 }
418
419 /* skip the first i objects which are different */
420 for (i = 1; i < n; ++i)
421 if (objs[i - 1] == objs[i])
422 break;
423
424 /* uniq */
425 size_t k = i;
426 while (i < n)
427 {
428 if (objs[k] != objs[i])
429 {
430 objs[k] = objs[i];
431 readonly_flags[k] = readonly_flags[i];
432 ++k;
433 }
434 else
435 {
436 readonly_flags[k] = readonly_flags[k] & readonly_flags[i];
437 }
438 ++i;
439 }
440
441 *num_objs = k;
442 }
443
444 extern unsigned long event_c;
445 extern unsigned long uevent_c;
446
447 cl_int
pocl_create_event(cl_event * event,cl_command_queue command_queue,cl_command_type command_type,size_t num_buffers,const cl_mem * buffers,cl_context context)448 pocl_create_event (cl_event *event, cl_command_queue command_queue,
449 cl_command_type command_type, size_t num_buffers,
450 const cl_mem *buffers, cl_context context)
451 {
452 static uint64_t event_id_counter = 0;
453
454 if (context == NULL)
455 return CL_INVALID_CONTEXT;
456
457 assert (event != NULL);
458 *event = pocl_mem_manager_new_event ();
459 if (*event == NULL)
460 return CL_OUT_OF_HOST_MEMORY;
461
462 (*event)->context = context;
463 (*event)->queue = command_queue;
464
465 /* user events have a NULL command queue, don't retain it */
466 if (command_queue)
467 POname (clRetainCommandQueue) (command_queue);
468 else
469 POname (clRetainContext) (context);
470
471 (*event)->command_type = command_type;
472 (*event)->id = POCL_ATOMIC_INC (event_id_counter);
473 (*event)->num_buffers = num_buffers;
474 if (num_buffers > 0)
475 {
476 (*event)->mem_objs = (cl_mem *)malloc (num_buffers * sizeof (cl_mem));
477 memcpy ((*event)->mem_objs, buffers, num_buffers * sizeof (cl_mem));
478 }
479 (*event)->status = CL_QUEUED;
480
481 if (command_type == CL_COMMAND_USER)
482 POCL_ATOMIC_INC (uevent_c);
483 else
484 POCL_ATOMIC_INC (event_c);
485
486 POCL_MSG_PRINT_EVENTS ("Created event %p / ID %" PRIu64 " / Command %s\n",
487 (*event), (*event)->id,
488 pocl_command_to_str (command_type));
489
490 return CL_SUCCESS;
491 }
492
493 static int
pocl_create_event_sync(cl_event waiting_event,cl_event notifier_event)494 pocl_create_event_sync (cl_event waiting_event, cl_event notifier_event)
495 {
496 event_node *notify_target = NULL;
497 event_node *wait_list_item = NULL;
498
499 if (notifier_event == NULL)
500 return CL_SUCCESS;
501
502 POCL_MSG_PRINT_EVENTS ("create event sync: waiting %" PRIu64
503 " , notifier %" PRIu64 "\n",
504 waiting_event->id, notifier_event->id);
505
506 pocl_lock_events_inorder (waiting_event, notifier_event);
507
508 assert (notifier_event->pocl_refcount != 0);
509 assert (waiting_event != notifier_event);
510
511 LL_FOREACH (waiting_event->wait_list, wait_list_item)
512 {
513 if (wait_list_item->event == notifier_event)
514 {
515 POCL_MSG_PRINT_EVENTS ("Skipping event sync creation \n");
516 goto FINISH;
517 }
518 }
519
520 if (notifier_event->status == CL_COMPLETE)
521 goto FINISH;
522 notify_target = pocl_mem_manager_new_event_node();
523 wait_list_item = pocl_mem_manager_new_event_node();
524 if (!notify_target || !wait_list_item)
525 return CL_OUT_OF_HOST_MEMORY;
526
527 notify_target->event = waiting_event;
528 wait_list_item->event = notifier_event;
529 LL_PREPEND (notifier_event->notify_list, notify_target);
530 LL_PREPEND (waiting_event->wait_list, wait_list_item);
531
532 FINISH:
533 pocl_unlock_events_inorder (waiting_event, notifier_event);
534 return CL_SUCCESS;
535 }
536
537 /* preallocate the buffers on destination device.
538 * if any allocation fails, we can't run this command. */
539 static int
can_run_command(cl_device_id dev,size_t num_objs,cl_mem * objs)540 can_run_command (cl_device_id dev, size_t num_objs, cl_mem *objs)
541 {
542 size_t i;
543 int errcode;
544
545 for (i = 0; i < num_objs; ++i)
546 {
547 pocl_mem_identifier *p = &objs[i]->device_ptrs[dev->global_mem_id];
548 // skip already allocated
549 if (p->mem_ptr)
550 continue;
551
552 assert (dev->ops->alloc_mem_obj);
553 errcode = dev->ops->alloc_mem_obj (dev, objs[i], NULL);
554 if (errcode != CL_SUCCESS)
555 return CL_FALSE;
556 }
557
558 return CL_TRUE;
559 }
560
/* Allocate a command node and its associated event, wire them together,
 * and link the event to the user-supplied wait list.  Does NOT enqueue.
 *
 * cmd           : out; the new command node
 * event_p       : if non-NULL, receives the event (caller owns one ref)
 * num_events /
 * wait_list     : user-provided dependencies for the new event
 * num_buffers /
 * buffers       : buffers recorded on the event (copied by
 *                 pocl_create_event)
 *
 * Returns CL_SUCCESS or CL_OUT_OF_HOST_MEMORY / an error from
 * pocl_create_event. */
static cl_int
pocl_create_command_struct (_cl_command_node **cmd,
                            cl_command_queue command_queue,
                            cl_command_type command_type, cl_event *event_p,
                            cl_uint num_events, const cl_event *wait_list,
                            size_t num_buffers, const cl_mem *buffers)
{
  unsigned i;
  int err;
  cl_event *event = NULL;

  *cmd = pocl_mem_manager_new_command ();
  if (*cmd == NULL)
    return CL_OUT_OF_HOST_MEMORY;

  (*cmd)->type = command_type;

  event = &((*cmd)->event);
  err = pocl_create_event (event, command_queue, command_type, num_buffers,
                           buffers, command_queue->context);

  if (err != CL_SUCCESS)
    {
      /* command came from the pool allocator; POCL_MEM_FREE matches how
       * the rest of this file disposes of it on the error path */
      POCL_MEM_FREE(*cmd);
      return err;
    }
  (*event)->command_type = command_type;

  /* if host application wants this commands event
     one reference for the host and one for the runtime/driver */
  if (event_p)
    {
      POCL_MSG_PRINT_EVENTS ("event pointer provided\n");
      *event_p = *event;
      (*event)->implicit_event = 0;
      (*event)->pocl_refcount = 2;
    }
  else
    {
      (*event)->implicit_event = 1;
      (*event)->pocl_refcount = 1;
    }

  (*cmd)->device = command_queue->device;
  (*cmd)->event->command = (*cmd);

  /* Form event synchronizations based on the given wait list */
  /* NOTE(review): pocl_create_event_sync's return value (possible OOM)
   * is ignored here — confirm whether that is intentional best-effort. */
  for (i = 0; i < num_events; ++i)
    {
      cl_event wle = wait_list[i];
      pocl_create_event_sync ((*event), wle);
    }
  POCL_MSG_PRINT_EVENTS (
      "Created command struct: CMD %p (event %" PRIu64 " / %p, type: %s)\n", *cmd,
      (*event)->id, *event, pocl_command_to_str (command_type));
  return CL_SUCCESS;
}
618
619 static int
pocl_create_migration_commands(cl_device_id dev,cl_event final_event,cl_mem mem,pocl_mem_identifier * p,const char readonly,cl_command_type command_type,cl_mem_migration_flags mig_flags)620 pocl_create_migration_commands (cl_device_id dev, cl_event final_event,
621 cl_mem mem, pocl_mem_identifier *p,
622 const char readonly,
623 cl_command_type command_type,
624 cl_mem_migration_flags mig_flags)
625 {
626 int errcode = CL_SUCCESS;
627
628 cl_event ev_export = NULL, ev_import = NULL, previous_last_event = NULL,
629 last_migration_event = NULL;
630 _cl_command_node *cmd_export = NULL, *cmd_import = NULL;
631 cl_device_id ex_dev = NULL;
632 cl_command_queue ex_cq = NULL, dev_cq = NULL;
633 int can_directly_mig = 0;
634 size_t i;
635
636 /* "export" means copy buffer content from source device to mem_host_ptr;
637 *
638 * "import" means copy mem_host_ptr content to destination device,
639 * or copy directly between devices
640 *
641 * "need_hostptr" if set, increase the mem_host_ptr_refcount,
642 * to keep the mem_host_ptr backing memory around */
643 int do_import = 0, do_export = 0, do_need_hostptr = 0;
644
645 /*****************************************************************/
646
647 /* this part only:
648 * sets up the buffer content versions according to requested migration type;
649 * sets the buffer->last_event pointer to the final_event;
650 * decides what needs to be actually done (import, export) but not do it;
651 *
652 * ... so that any following command sees a correct buffer state.
653 * The actual migration commands are enqueued after. */
654 POCL_LOCK_OBJ (mem);
655
656 /* Retain the buffer for the duration of the command, except Unmaps,
657 * because corresponding Maps retain twice. */
658 if (command_type != CL_COMMAND_UNMAP_MEM_OBJECT)
659 POCL_RETAIN_OBJECT_UNLOCKED (mem);
660
661 /* save buffer's current last_event as previous last_event,
662 * then set the last_event pointer to the actual command's event
663 * (final_event).
664 *
665 * We'll need the "previous" event to properly chain events, but
666 * will release it after we've enqueued the required commands. */
667 previous_last_event = mem->last_event;
668 mem->last_event = final_event;
669
670 /* find device/gmem with latest memory version and fastest migration.
671 * ex_dev = device with latest memory _other than dev_
672 * dev_cq = default command queue for destination dev */
673 int highest_d2d_mig_priority = 0;
674 for (i = 0; i < mem->context->num_devices; ++i)
675 {
676 cl_device_id d = mem->context->devices[i];
677 cl_command_queue cq = mem->context->default_queues[i];
678 if (d == dev)
679 dev_cq = cq;
680 else if (mem->device_ptrs[d->global_mem_id].version == mem->latest_version)
681 {
682 int cur_d2d_mig_priority = 0;
683 if (d->ops->can_migrate_d2d)
684 cur_d2d_mig_priority = d->ops->can_migrate_d2d (dev, d);
685
686 // if we can directly migrate, and we found a better device, use it
687 if (cur_d2d_mig_priority > highest_d2d_mig_priority)
688 {
689 ex_dev = d;
690 ex_cq = cq;
691 highest_d2d_mig_priority = cur_d2d_mig_priority;
692 }
693
694 // if we can't migrate D2D, just use plain old through-host migration
695 if (highest_d2d_mig_priority == 0)
696 {
697 ex_dev = d;
698 ex_cq = cq;
699 }
700 }
701 }
702
703 assert (dev);
704 assert (dev_cq);
705 /* ex_dev can be NULL, or non-NULL != dev */
706 assert (ex_dev != dev);
707
708 /* if mem_host_ptr_version < latest_version, one of devices must have it;
709 *
710 * could be latest_version == mem_host_ptr_version == some p->version
711 * for some p, and so i < ndev; in that case,
712 * we leave ex_dev set since D2D is preferred migration way;
713 *
714 * otherwise must be
715 * mem_host_ptr_version == latest_version & > all p->version */
716
717 if ((mem->mem_host_ptr_version < mem->latest_version) && (p->version != mem->latest_version))
718 assert ((ex_dev != NULL) && (mem->device_ptrs[ex_dev->global_mem_id].version == mem->latest_version));
719
720 /* if ex_dev is NULL, either we have the latest or it's in mem_host_ptr */
721 if (ex_dev == NULL)
722 assert ((p->version == mem->latest_version) ||
723 (mem->mem_host_ptr_version == mem->latest_version));
724
725 /*****************************************************************/
726
727 /* buffer must be already allocated on this device's globalmem */
728 assert (p->mem_ptr != NULL);
729
730 /* we're migrating to host mem only: clEnqueueMigMemObjs() with HOST flag */
731 if (mig_flags & CL_MIGRATE_MEM_OBJECT_HOST)
732 {
733 do_import = 0;
734 do_export = 0;
735 do_need_hostptr = 1;
736 if (mem->mem_host_ptr_version < mem->latest_version)
737 {
738 mem->mem_host_ptr_version = mem->latest_version;
739 /* migrate content only if needed */
740 if ((mig_flags & CL_MIGRATE_MEM_OBJECT_CONTENT_UNDEFINED) == 0)
741 {
742 /* Could be that destination dev has the latest version,
743 * we still need to migrate to host mem */
744 if (ex_dev == NULL)
745 {
746 ex_dev = dev; ex_cq = dev_cq;
747 }
748 do_export = 1;
749 POCL_RETAIN_OBJECT_UNLOCKED (mem);
750 }
751 }
752
753 goto FINISH_VER_SETUP;
754 }
755
756 /* otherwise, we're migrating to a device memory. */
757 /* check if we can migrate to the device associated with command_queue
758 * without incurring the overhead of migrating their contents */
759 if (mig_flags & CL_MIGRATE_MEM_OBJECT_CONTENT_UNDEFINED)
760 p->version = mem->latest_version;
761
762 /* if we don't need to migrate, skip to end */
763 if (p->version >= mem->latest_version)
764 {
765 do_import = 0;
766 do_export = 0;
767 goto FINISH_VER_SETUP;
768 }
769
770 can_directly_mig = highest_d2d_mig_priority > 0;
771
772 /* if mem_host_ptr is outdated AND the devices can't migrate
773 * between each other, we need an export command */
774 if ((mem->mem_host_ptr_version != mem->latest_version)
775 && (can_directly_mig == 0))
776 {
777 /* we need two migration commands; one on the "source" device's hidden
778 * queue, and one on the destination device. */
779 do_import = 1;
780 do_export = 1;
781 do_need_hostptr = 1;
782
783 /* because the two migrate commands will clRelease the buffer */
784 POCL_RETAIN_OBJECT_UNLOCKED (mem);
785 POCL_RETAIN_OBJECT_UNLOCKED (mem);
786 mem->mem_host_ptr_version = mem->latest_version;
787 p->version = mem->latest_version;
788 }
789 /* otherwise either:
790 * 1) mem_host_ptr is latest, and we need to migrate mem-host-ptr to device, or
791 * 2) mem_host_ptr is not latest, but devices can migrate directly between each other,
792 * For both cases we only need one migration command on the destination device. */
793 else
794 {
795 do_import = 1;
796 do_export = 0;
797 do_need_hostptr = 1;
798
799 /* because the corresponding migrate command will clRelease the buffer */
800 POCL_RETAIN_OBJECT_UNLOCKED (mem);
801 p->version = mem->latest_version;
802 }
803
804 FINISH_VER_SETUP:
805 /* if the command is a write-use, increase the version. */
806 if (!readonly)
807 {
808 ++p->version;
809 mem->latest_version = p->version;
810 }
811
812 if (do_need_hostptr)
813 {
814 /* increase refcount the two mig commands */
815 if (do_export)
816 ++mem->mem_host_ptr_refcount;
817 if (do_import)
818 ++mem->mem_host_ptr_refcount;
819
820 /* allocate mem_host_ptr here if needed... */
821 if (mem->mem_host_ptr == NULL)
822 {
823 size_t align = max (mem->context->min_buffer_alignment, 16);
824 mem->mem_host_ptr = pocl_aligned_malloc (align, mem->size);
825 assert ((mem->mem_host_ptr != NULL)
826 && "Cannot allocate backing memory for mem_host_ptr!\n");
827 }
828 }
829
830 POCL_UNLOCK_OBJ (mem);
831
832 /*****************************************************************/
833
834 /* enqueue a command for export.
835 * Put the previous last event into its waitlist. */
836 if (do_export)
837 {
838 assert (ex_cq);
839 assert (ex_dev);
840 errcode = pocl_create_command_struct (
841 &cmd_export, ex_cq, CL_COMMAND_MIGRATE_MEM_OBJECTS,
842 &ev_export, // event_p
843 (previous_last_event ? 1 : 0),
844 (previous_last_event ? &previous_last_event : NULL), // waitlist
845 1, &mem // buffer list
846 );
847 assert (errcode == CL_SUCCESS);
848 if (do_need_hostptr)
849 ev_export->release_mem_host_ptr_after = 1;
850
851 cmd_export->command.migrate.mem_id
852 = &mem->device_ptrs[ex_dev->global_mem_id];
853 cmd_export->command.migrate.type = ENQUEUE_MIGRATE_TYPE_D2H;
854
855 pocl_command_enqueue (ex_cq, cmd_export);
856
857 last_migration_event = ev_export;
858 }
859
860 /* enqueue a command for import.
861 * Put either the previous last event, or export ev, into its waitlist. */
862 if (do_import)
863 {
864 /* the import command must depend on (wait for) either the export
865 * command, or the buffer's previous last event. Can be NULL if there's
866 * no last event or export command */
867 cl_event import_wait_ev = (ev_export ? ev_export : previous_last_event);
868
869 errcode = pocl_create_command_struct (
870 &cmd_import, dev_cq, CL_COMMAND_MIGRATE_MEM_OBJECTS,
871 &ev_import, // event_p
872 (import_wait_ev ? 1 : 0),
873 (import_wait_ev ? &import_wait_ev : NULL), // waitlist
874 1, &mem // buffer list
875 );
876 assert (errcode == CL_SUCCESS);
877 if (do_need_hostptr)
878 ev_import->release_mem_host_ptr_after = 1;
879
880 if (can_directly_mig)
881 {
882 cmd_import->command.migrate.type = ENQUEUE_MIGRATE_TYPE_D2D;
883 cmd_import->command.migrate.src_device = ex_dev;
884 cmd_import->command.migrate.src_id
885 = &mem->device_ptrs[ex_dev->global_mem_id];
886 cmd_import->command.migrate.dst_id
887 = &mem->device_ptrs[dev->global_mem_id];
888 }
889 else
890 {
891 cmd_import->command.migrate.type = ENQUEUE_MIGRATE_TYPE_H2D;
892 cmd_import->command.migrate.mem_id
893 = &mem->device_ptrs[dev->global_mem_id];
894 }
895
896 pocl_command_enqueue (dev_cq, cmd_import);
897
898 /* because explicit event */
899 if (ev_export)
900 POname (clReleaseEvent) (ev_export);
901
902 last_migration_event = ev_import;
903 }
904
905 /* we don't need it anymore. */
906 if (previous_last_event)
907 POname (clReleaseEvent (previous_last_event));
908
909 /* the final event must depend on the export/import commands */
910 if (last_migration_event)
911 {
912 pocl_create_event_sync (final_event, last_migration_event);
913 /* if the event itself only reads from the buffer,
914 * set the last buffer event to last_mig_event,
915 * instead of the actual command event;
916 * this avoids unnecessary waits e.g on kernels
917 * which only read from buffers */
918 if (readonly)
919 {
920 POCL_LOCK_OBJ (mem);
921 mem->last_event = last_migration_event;
922 POCL_UNLOCK_OBJ (mem);
923 POname (clReleaseEvent) (final_event);
924 }
925 else /* because explicit event */
926 POname (clReleaseEvent) (last_migration_event);
927 }
928
929 return CL_SUCCESS;
930 }
931
/* Common implementation behind pocl_create_command and
 * pocl_create_command_migrate: builds the command node + event, then sets
 * up implicit buffer migrations so the command's buffers are up to date
 * on the queue's device.
 *
 * buffers/readonly_flags may be reordered and deduplicated in place
 * (see sort_and_uniq).  Returns CL_SUCCESS or an error code. */
static cl_int
pocl_create_command_full (_cl_command_node **cmd,
                          cl_command_queue command_queue,
                          cl_command_type command_type, cl_event *event_p,
                          cl_uint num_events, const cl_event *wait_list,
                          size_t num_buffers, cl_mem *buffers,
                          char *readonly_flags,
                          cl_mem_migration_flags mig_flags)
{
  cl_device_id dev = pocl_real_dev (command_queue->device);
  int err = CL_SUCCESS;
  size_t i;

  POCL_RETURN_ERROR_ON ((dev->available == CL_FALSE), CL_INVALID_DEVICE,
                        "device is not available\n");

  if (num_buffers >= 1)
    {
      assert (buffers);
      assert (readonly_flags);

      /* collapse duplicate buffers so each gets at most one migration */
      if (num_buffers > 1)
        sort_and_uniq (buffers, readonly_flags, &num_buffers);

      /* preallocate device-side storage; fail early if impossible */
      if (can_run_command (dev, num_buffers, buffers) == CL_FALSE)
        return CL_OUT_OF_RESOURCES;
    }

  /* waitlist here only contains the user-provided events.
   * migration events are added to waitlist later */
  err = pocl_create_command_struct (cmd, command_queue, command_type, event_p,
                                    num_events, wait_list, num_buffers,
                                    buffers);
  if (err)
    return err;
  cl_event final_event = (*cmd)->event;

  /* retain once for every buffer; this is because we set every buffer's
   * "last event" to this, and then some next command enqueue
   * (or clReleaseMemObject) will release it.
   */
  POCL_LOCK_OBJ (final_event);
  final_event->pocl_refcount += num_buffers;
  POCL_UNLOCK_OBJ (final_event);

  /* NOTE(review): pocl_create_migration_commands currently always returns
   * CL_SUCCESS, so its result is not checked here. */
  for (i = 0; i < num_buffers; ++i)
    {
      pocl_create_migration_commands (
          dev, final_event, buffers[i],
          &buffers[i]->device_ptrs[dev->global_mem_id], readonly_flags[i],
          command_type, mig_flags);
    }

  return err;
}
987
988 cl_int
pocl_create_command_migrate(_cl_command_node ** cmd,cl_command_queue command_queue,cl_mem_migration_flags flags,cl_event * event_p,cl_uint num_events,const cl_event * wait_list,size_t num_buffers,cl_mem * buffers,char * readonly_flags)989 pocl_create_command_migrate (_cl_command_node **cmd,
990 cl_command_queue command_queue,
991 cl_mem_migration_flags flags, cl_event *event_p,
992 cl_uint num_events, const cl_event *wait_list,
993 size_t num_buffers, cl_mem *buffers,
994 char *readonly_flags)
995 {
996 return pocl_create_command_full (
997 cmd, command_queue, CL_COMMAND_MIGRATE_MEM_OBJECTS, event_p, num_events,
998 wait_list, num_buffers, buffers, readonly_flags, flags);
999 }
1000
1001 cl_int
pocl_create_command(_cl_command_node ** cmd,cl_command_queue command_queue,cl_command_type command_type,cl_event * event_p,cl_uint num_events,const cl_event * wait_list,size_t num_buffers,cl_mem * buffers,char * readonly_flags)1002 pocl_create_command (_cl_command_node **cmd, cl_command_queue command_queue,
1003 cl_command_type command_type, cl_event *event_p,
1004 cl_uint num_events, const cl_event *wait_list,
1005 size_t num_buffers, cl_mem *buffers, char *readonly_flags)
1006 {
1007 return pocl_create_command_full (cmd, command_queue, command_type, event_p,
1008 num_events, wait_list, num_buffers, buffers,
1009 readonly_flags, 0);
1010 }
1011
1012 /* call with node->event UNLOCKED */
void pocl_command_enqueue (cl_command_queue command_queue,
                          _cl_command_node *node)
{
  /* Append a command node to its queue, wiring up event dependencies
     (in-order chaining, barrier fan-in), then submit it to the device.
     Fix: the in-order-queue synchronization block was duplicated, which
     added the same event dependency twice; perform it once, paired with
     the out-of-order barrier handling via else-if. */
  cl_event event;

  POCL_LOCK_OBJ (command_queue);

  ++command_queue->command_count;

  /* in case of in-order queue, synchronize to previously enqueued command
     if available */
  if (!(command_queue->properties & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE))
    {
      POCL_MSG_PRINT_EVENTS ("In-order Q; adding event syncs\n");
      if (command_queue->last_event.event)
        {
          pocl_create_event_sync (node->event,
                                  command_queue->last_event.event);
        }
    }
  /* Command queue is out-of-order queue. If command type is a barrier, then
     synchronize to all previously enqueued commands to make sure they are
     executed before the barrier. */
  else if ((node->type == CL_COMMAND_BARRIER
            || node->type == CL_COMMAND_MARKER)
           && node->command.barrier.has_wait_list == 0)
    {
      POCL_MSG_PRINT_EVENTS ("Barrier; adding event syncs\n");
      DL_FOREACH (command_queue->events, event)
        {
          pocl_create_event_sync (node->event, event);
        }
    }

  if (node->type == CL_COMMAND_BARRIER)
    command_queue->barrier = node->event;
  else
    {
      /* Any non-barrier command must wait for a pending barrier. */
      if (command_queue->barrier)
        {
          pocl_create_event_sync (node->event, command_queue->barrier);
        }
    }
  DL_APPEND (command_queue->events, node->event);

  POCL_MSG_PRINT_EVENTS ("Pushed Event %" PRIu64 " to CQ %" PRIu64 ".\n",
                         node->event->id, command_queue->id);
  command_queue->last_event.event = node->event;
  POCL_UNLOCK_OBJ (command_queue);

  POCL_LOCK_OBJ (node->event);
  assert (node->event->status == CL_QUEUED);
  assert (command_queue == node->event->queue);
  pocl_update_event_queued (node->event);
  command_queue->device->ops->submit (node, command_queue);
  /* node->event is unlocked by device_ops->submit */
}
1081
1082 int
pocl_alloc_or_retain_mem_host_ptr(cl_mem mem)1083 pocl_alloc_or_retain_mem_host_ptr (cl_mem mem)
1084 {
1085 if (mem->mem_host_ptr == NULL)
1086 {
1087 size_t align = max (mem->context->min_buffer_alignment, 16);
1088 mem->mem_host_ptr = pocl_aligned_malloc (align, mem->size);
1089 if (mem->mem_host_ptr == NULL)
1090 return -1;
1091 mem->mem_host_ptr_version = 0;
1092 mem->mem_host_ptr_refcount = 0;
1093 }
1094 ++mem->mem_host_ptr_refcount;
1095 return 0;
1096 }
1097
1098 int
pocl_release_mem_host_ptr(cl_mem mem)1099 pocl_release_mem_host_ptr (cl_mem mem)
1100 {
1101 assert (mem->mem_host_ptr_refcount > 0);
1102 --mem->mem_host_ptr_refcount;
1103 if (mem->mem_host_ptr_refcount == 0 && mem->mem_host_ptr != NULL)
1104 {
1105 pocl_aligned_free (mem->mem_host_ptr);
1106 mem->mem_host_ptr = NULL;
1107 mem->mem_host_ptr_version = 0;
1108 }
1109 return 0;
1110 }
1111
1112 /* call (and return) with node->event locked */
1113 void
pocl_command_push(_cl_command_node * node,_cl_command_node ** ready_list,_cl_command_node ** pending_list)1114 pocl_command_push (_cl_command_node *node,
1115 _cl_command_node **ready_list,
1116 _cl_command_node **pending_list)
1117 {
1118 assert (node != NULL);
1119
1120 /* If the last command inserted is a barrier,
1121 command is necessary not ready */
1122
1123 if ((*ready_list) != NULL && (*ready_list)->prev
1124 && (*ready_list)->prev->type == CL_COMMAND_BARRIER)
1125 {
1126 CDL_PREPEND ((*pending_list), node);
1127 return;
1128 }
1129 if (pocl_command_is_ready(node->event))
1130 {
1131 pocl_update_event_submitted (node->event);
1132 CDL_PREPEND ((*ready_list), node);
1133 }
1134 else
1135 {
1136 CDL_PREPEND ((*pending_list), node);
1137 }
1138 }
1139
void
pocl_unmap_command_finished (cl_device_id dev, pocl_mem_identifier *mem_id,
                             cl_mem mem, mem_mapping_t *map)
{
  /* Finalize a completed unmap: release the device's mapping pointer,
     unlink the mapping record from the buffer and free it.
     The statement order (free ptr -> unlink -> decrement -> free record)
     is deliberate; everything happens under the cl_mem lock. */
  POCL_LOCK_OBJ (mem);
  assert (map->unmap_requested > 0);
  dev->ops->free_mapping_ptr (dev->data, mem_id, mem, map);
  DL_DELETE (mem->mappings, map);
  mem->map_count--;
  POCL_MEM_FREE (map);
  POCL_UNLOCK_OBJ (mem);
}
1152
1153 void
pocl_unmap_command_finished2(cl_event event,_cl_command_t * cmd)1154 pocl_unmap_command_finished2 (cl_event event, _cl_command_t *cmd)
1155 {
1156 cl_device_id dev = event->queue->device;
1157 pocl_mem_identifier *mem_id = NULL;
1158 cl_mem mem = NULL;
1159 mem = event->mem_objs[0];
1160 mem_id = &mem->device_ptrs[dev->global_mem_id];
1161 pocl_unmap_command_finished (dev, mem_id, mem, cmd->unmap.mapping);
1162 }
1163
1164 void
pocl_cl_mem_inherit_flags(cl_mem mem,cl_mem from_buffer,cl_mem_flags flags)1165 pocl_cl_mem_inherit_flags (cl_mem mem, cl_mem from_buffer, cl_mem_flags flags)
1166 {
1167 if ((flags & CL_MEM_READ_WRITE) | (flags & CL_MEM_READ_ONLY)
1168 | (flags & CL_MEM_WRITE_ONLY))
1169 {
1170 mem->flags = (flags & CL_MEM_READ_WRITE) | (flags & CL_MEM_READ_ONLY)
1171 | (flags & CL_MEM_WRITE_ONLY);
1172 }
1173 else
1174 {
1175 mem->flags = (from_buffer->flags & CL_MEM_READ_WRITE)
1176 | (from_buffer->flags & CL_MEM_READ_ONLY)
1177 | (from_buffer->flags & CL_MEM_WRITE_ONLY);
1178 }
1179
1180 if ((flags & CL_MEM_HOST_NO_ACCESS) | (flags & CL_MEM_HOST_READ_ONLY)
1181 | (flags & CL_MEM_HOST_WRITE_ONLY))
1182 {
1183 mem->flags = mem->flags | ((flags & CL_MEM_HOST_NO_ACCESS)
1184 | (flags & CL_MEM_HOST_READ_ONLY)
1185 | (flags & CL_MEM_HOST_WRITE_ONLY));
1186 }
1187 else
1188 {
1189 mem->flags
1190 = mem->flags | ((from_buffer->flags & CL_MEM_HOST_NO_ACCESS)
1191 | (from_buffer->flags & CL_MEM_HOST_READ_ONLY)
1192 | (from_buffer->flags & CL_MEM_HOST_WRITE_ONLY));
1193 }
1194
1195 mem->flags = mem->flags | (from_buffer->flags & CL_MEM_USE_HOST_PTR)
1196 | (from_buffer->flags & CL_MEM_ALLOC_HOST_PTR)
1197 | (from_buffer->flags & CL_MEM_COPY_HOST_PTR);
1198 }
1199
int pocl_buffer_boundcheck (cl_mem buffer, size_t offset, size_t size)
{
  /* Validate that the byte range [offset, offset+size) fits in 'buffer'.
     Returns CL_SUCCESS or CL_INVALID_VALUE. */
  const size_t total = buffer->size;

  POCL_RETURN_ERROR_ON ((offset > total), CL_INVALID_VALUE,
                        "offset(%zu) > buffer->size(%zu)\n", offset, total);
  POCL_RETURN_ERROR_ON ((size > total), CL_INVALID_VALUE,
                        "size(%zu) > buffer->size(%zu)\n", size, total);
  POCL_RETURN_ERROR_ON ((offset + size > total), CL_INVALID_VALUE,
                        "offset + size (%zu) > buffer->size(%zu)\n",
                        (offset + size), total);
  return CL_SUCCESS;
}
1211
int pocl_buffer_boundcheck_3d (const size_t buffer_size,
                               const size_t *origin,
                               const size_t *region,
                               size_t *row_pitch,
                               size_t *slice_pitch,
                               const char *prefix)
{
  /* Validate a 3D (origin, region) transfer against a linear buffer and
     compute the effective pitches, writing them back through the pointer
     arguments. Returns CL_SUCCESS or CL_INVALID_VALUE. */
  size_t rp = *row_pitch;
  size_t sp = *slice_pitch;

  /* CL_INVALID_VALUE if row_pitch is not 0 and is less than region[0]. */
  POCL_RETURN_ERROR_ON ((rp != 0 && rp < region[0]), CL_INVALID_VALUE,
                        "%srow_pitch is not 0 and is less than region[0]\n",
                        prefix);
  if (rp == 0)
    rp = region[0];

  /* CL_INVALID_VALUE if slice_pitch is not 0 and is less than
     region[1] * row_pitch, or is not a multiple of row_pitch. */
  POCL_RETURN_ERROR_ON ((sp != 0 && sp < (region[1] * rp)), CL_INVALID_VALUE,
                        "%sslice_pitch is not 0 and is less than "
                        "region[1] * %srow_pitch\n", prefix, prefix);
  POCL_RETURN_ERROR_ON ((sp != 0 && (sp % rp != 0)), CL_INVALID_VALUE,
                        "%sslice_pitch is not 0 and is not a multiple "
                        "of %srow_pitch\n", prefix, prefix);
  if (sp == 0)
    sp = region[1] * rp;

  *row_pitch = rp;
  *slice_pitch = sp;

  /* First and last byte the transfer would touch. */
  const size_t first_byte = origin[2] * sp + origin[1] * rp + origin[0];
  const size_t last_byte = origin[0] + region[0] - 1
                           + rp * (origin[1] + region[1] - 1)
                           + sp * (origin[2] + region[2] - 1);

  POCL_RETURN_ERROR_ON ((first_byte > buffer_size), CL_INVALID_VALUE,
                        "%sorigin is outside the %sbuffer", prefix, prefix);
  POCL_RETURN_ERROR_ON ((last_byte >= buffer_size), CL_INVALID_VALUE,
                        "%sorigin+region is outside the %sbuffer", prefix,
                        prefix);
  return CL_SUCCESS;
}
1258
1259
1260
int pocl_buffers_boundcheck (cl_mem src_buffer,
                             cl_mem dst_buffer,
                             size_t src_offset,
                             size_t dst_offset,
                             size_t size)
{
  /* Validate that 'size' bytes starting at the given offsets fit inside
     both the source and the destination buffer. */
  const size_t src_size = src_buffer->size;
  const size_t dst_size = dst_buffer->size;

  POCL_RETURN_ERROR_ON ((src_offset > src_size), CL_INVALID_VALUE,
    "src_offset(%zu) > src_buffer->size(%zu)", src_offset, src_size);
  POCL_RETURN_ERROR_ON ((size > src_size), CL_INVALID_VALUE,
    "size(%zu) > src_buffer->size(%zu)", size, src_size);
  POCL_RETURN_ERROR_ON ((src_offset + size > src_size), CL_INVALID_VALUE,
    "src_offset + size (%zu) > src_buffer->size(%zu)", (src_offset + size),
    src_size);

  POCL_RETURN_ERROR_ON ((dst_offset > dst_size), CL_INVALID_VALUE,
    "dst_offset(%zu) > dst_buffer->size(%zu)", dst_offset, dst_size);
  POCL_RETURN_ERROR_ON ((size > dst_size), CL_INVALID_VALUE,
    "size(%zu) > dst_buffer->size(%zu)", size, dst_size);
  POCL_RETURN_ERROR_ON ((dst_offset + size > dst_size), CL_INVALID_VALUE,
    "dst_offset + size (%zu) > dst_buffer->size(%zu)", (dst_offset + size),
    dst_size);
  return CL_SUCCESS;
}
1281
/* Check whether a copy of 'size' bytes between the given offsets would
 * overlap, for the same buffer or for two sub-buffers sharing a parent.
 * Returns CL_MEM_COPY_OVERLAP on overlap, CL_SUCCESS otherwise. */
int pocl_buffers_overlap(cl_mem src_buffer,
                         cl_mem dst_buffer,
                         size_t src_offset,
                         size_t dst_offset,
                         size_t size) {
  /* The regions overlap if src_offset ≤ to dst_offset ≤ to src_offset + size - 1,
   * or if dst_offset ≤ to src_offset ≤ to dst_offset + size - 1.
   */
  if (src_buffer == dst_buffer) {
    POCL_RETURN_ERROR_ON(((src_offset <= dst_offset) && (dst_offset <=
      (src_offset + size - 1))), CL_MEM_COPY_OVERLAP, "dst_offset lies inside \
the src region and the src_buffer == dst_buffer");
    POCL_RETURN_ERROR_ON(((dst_offset <= src_offset) && (src_offset <=
      (dst_offset + size - 1))), CL_MEM_COPY_OVERLAP, "src_offset lies inside \
the dst region and the src_buffer == dst_buffer");
  }

  /* sub buffers overlap check: translate both offsets into the shared
     parent buffer's coordinate space before comparing. */
  if (src_buffer->parent && dst_buffer->parent &&
      (src_buffer->parent == dst_buffer->parent)) {
      src_offset = src_buffer->origin + src_offset;
      dst_offset = dst_buffer->origin + dst_offset;

      POCL_RETURN_ERROR_ON (((src_offset <= dst_offset)
                             && (dst_offset <= (src_offset + size - 1))),
                            CL_MEM_COPY_OVERLAP, "dst_offset lies inside \
the src region and src_buffer + dst_buffer are subbuffers of the same buffer");
      POCL_RETURN_ERROR_ON (((dst_offset <= src_offset)
                             && (src_offset <= (dst_offset + size - 1))),
                            CL_MEM_COPY_OVERLAP, "src_offset lies inside \
the dst region and src_buffer + dst_buffer are subbuffers of the same buffer");

  }

  return CL_SUCCESS;
}
1318
1319 /*
1320 * Copyright (c) 2011 The Khronos Group Inc.
1321 *
1322 * Permission is hereby granted, free of charge, to any person obtaining a copy of this
1323 * software and /or associated documentation files (the "Materials "), to deal in the Materials
1324 * without restriction, including without limitation the rights to use, copy, modify, merge,
1325 * publish, distribute, sublicense, and/or sell copies of the Materials, and to permit persons to
1326 * whom the Materials are furnished to do so, subject to
1327 * the following conditions:
1328 *
1329 * The above copyright notice and this permission notice shall be included
1330 * in all copies or substantial portions of the Materials.
1331 *
1332 * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
1333 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
1334 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
1335 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
1336 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
1337 * OUT OF OR IN CONNECTION WITH THE MATERIALS OR THE USE OR OTHER DEALINGS IN
1338 * THE MATERIALS.
1339 */
1340
int
check_copy_overlap (const size_t src_offset[3],
                    const size_t dst_offset[3],
                    const size_t region[3],
                    const size_t row_pitch, const size_t slice_pitch)
{
  /* Returns nonzero if a rectangular copy's source and destination regions
     overlap; from the Khronos sample implementation (see license above). */
  int overlap = 1;
  unsigned dim;

  /* Axis-aligned box intersection test in element coordinates. */
  for (dim = 0; dim < 3; ++dim)
    {
      const size_t s_lo = src_offset[dim];
      const size_t s_hi = src_offset[dim] + region[dim];
      const size_t d_lo = dst_offset[dim];
      const size_t d_hi = dst_offset[dim] + region[dim];
      overlap = overlap && (s_lo < d_hi) && (s_hi > d_lo);
    }

  /* Linearized byte ranges of the two transfers. */
  const size_t extent
      = region[2] * slice_pitch + region[1] * row_pitch + region[0];
  const size_t dst_start
      = dst_offset[2] * slice_pitch + dst_offset[1] * row_pitch + dst_offset[0];
  const size_t dst_end = dst_start + extent;
  const size_t src_start
      = src_offset[2] * slice_pitch + src_offset[1] * row_pitch + src_offset[0];
  const size_t src_end = src_start + extent;

  if (!overlap)
    {
      /* Even when the boxes do not intersect, rows that run past the row
         pitch can make the linear byte ranges collide. */
      size_t delta_src_x = (src_offset[0] + region[0] > row_pitch)
                               ? src_offset[0] + region[0] - row_pitch
                               : 0;
      size_t delta_dst_x = (dst_offset[0] + region[0] > row_pitch)
                               ? dst_offset[0] + region[0] - row_pitch
                               : 0;
      if ((delta_src_x > 0 && delta_src_x > dst_offset[0])
          || (delta_dst_x > 0 && delta_dst_x > src_offset[0]))
        {
          if ((src_start <= dst_start && dst_start < src_end)
              || (dst_start <= src_start && src_start < dst_end))
            overlap = 1;
        }

      if (region[2] > 1)
        {
          /* Same wrap test in Y against the slice height (src and dst
             heights are identical: both are slice_pitch / row_pitch). */
          size_t height = slice_pitch / row_pitch;

          size_t delta_src_y = (src_offset[1] + region[1] > height)
                                   ? src_offset[1] + region[1] - height
                                   : 0;
          size_t delta_dst_y = (dst_offset[1] + region[1] > height)
                                   ? dst_offset[1] + region[1] - height
                                   : 0;

          if ((delta_src_y > 0 && delta_src_y > dst_offset[1])
              || (delta_dst_y > 0 && delta_dst_y > src_offset[1]))
            {
              if ((src_start <= dst_start && dst_start < src_end)
                  || (dst_start <= src_start && src_start < dst_end))
                overlap = 1;
            }
        }
    }

  return overlap;
}
1408
1409 /* For a subdevice parameter, return the actual device it belongs to. */
1410 cl_device_id
pocl_real_dev(const cl_device_id dev)1411 pocl_real_dev (const cl_device_id dev)
1412 {
1413 cl_device_id ret = dev;
1414 while (ret->parent_device)
1415 ret = ret->parent_device;
1416 return ret;
1417 }
1418
1419 /* Make a list of unique devices. If any device is a subdevice,
1420 * replace with parent, then remove duplicate parents. */
cl_device_id *
pocl_unique_device_list (const cl_device_id *in, cl_uint num, cl_uint *real)
{
  /* Map each device to its root device and drop duplicates by swapping in
     the last element (order is not preserved). The unique count is
     returned through 'real'; the caller frees the returned array. */
  cl_uint count = num;
  cl_device_id *out = (cl_device_id *)calloc (num, sizeof (cl_device_id));
  if (out == NULL)
    return NULL;

  cl_uint idx;
  for (idx = 0; idx < num; ++idx)
    out[idx] = (in[idx] != NULL) ? pocl_real_dev (in[idx]) : NULL;

  cl_uint cur = 1;
  while (cur < count)
    {
      cl_uint probe = 0;
      while (probe < cur)
        {
          if (out[probe] == out[cur])
            {
              /* Swap-remove the earlier duplicate; re-check the swapped-in
                 element on the next iteration (probe not advanced). */
              out[probe] = out[--count];
              out[count] = NULL;
            }
          else
            {
              ++probe;
            }
        }
      ++cur;
    }

  *real = count;
  return out;
}
1453
1454 int
pocl_device_supports_builtin_kernel(cl_device_id dev,const char * kernel_name)1455 pocl_device_supports_builtin_kernel (cl_device_id dev, const char *kernel_name)
1456 {
1457 if (kernel_name == NULL)
1458 return 0;
1459
1460 if (dev->builtin_kernel_list == NULL)
1461 return 0;
1462
1463 char *temp = strdup (dev->builtin_kernel_list);
1464 char *token;
1465 char *rest = temp;
1466
1467 while ((token = strtok_r (rest, ";", &rest)))
1468 {
1469 if (strcmp (token, kernel_name) == 0)
1470 {
1471 free (temp);
1472 return 1;
1473 }
1474 }
1475
1476 free (temp);
1477 return 0;
1478 }
1479
1480 static void
image_format_union(const cl_image_format * dev_formats,cl_uint num_dev_formats,cl_image_format ** context_formats,cl_uint * num_context_formats)1481 image_format_union (const cl_image_format *dev_formats,
1482 cl_uint num_dev_formats,
1483 cl_image_format **context_formats,
1484 cl_uint *num_context_formats)
1485 {
1486 if ((dev_formats == NULL) || (num_dev_formats == 0))
1487 return;
1488
1489 if ((*num_context_formats == 0) || (*context_formats == NULL))
1490 {
1491 // alloc & copy
1492 *context_formats = (cl_image_format *)malloc (sizeof (cl_image_format)
1493 * num_dev_formats);
1494 memcpy (*context_formats, dev_formats,
1495 sizeof (cl_image_format) * num_dev_formats);
1496 *num_context_formats = num_dev_formats;
1497 }
1498 else
1499 {
1500 // realloc & merge
1501 cl_uint i, j;
1502 cl_uint ncf = *num_context_formats;
1503 size_t size = sizeof (cl_image_format) * (num_dev_formats + ncf);
1504 cl_image_format *ctf
1505 = (cl_image_format *)realloc (*context_formats, size);
1506 assert (ctf);
1507 for (i = 0; i < num_dev_formats; ++i)
1508 {
1509 for (j = 0; j < ncf; ++j)
1510 if (memcmp (ctf + j, dev_formats + i, sizeof (cl_image_format))
1511 == 0)
1512 break;
1513 if (j < ncf)
1514 {
1515 // format already in context, skip
1516 continue;
1517 }
1518 else
1519 {
1520 memcpy (ctf + ncf, dev_formats + i, sizeof (cl_image_format));
1521 ++ncf;
1522 }
1523 }
1524 *context_formats = ctf;
1525 *num_context_formats = ncf;
1526 }
1527 }
1528
1529 /* Setup certain info about context that comes up later in API calls */
void
pocl_setup_context (cl_context context)
{
  unsigned i, j;
  int err;
  /* Start with the first device's alignment; reduced below to the minimum
     over all devices in the context. */
  size_t alignment = context->devices[0]->mem_base_addr_align;
  context->max_mem_alloc_size = 0;
  context->svm_allocdev = NULL;
  assert (context->default_queues);

  memset (context->image_formats, 0, sizeof (void *) * NUM_OPENCL_IMAGE_TYPES);
  memset (context->num_image_formats, 0,
          sizeof (cl_uint) * NUM_OPENCL_IMAGE_TYPES);

  for(i=0; i<context->num_devices; i++)
    {
      cl_device_id dev = context->devices[i];
      /* Pick the device with the highest SVM allocation priority as the
         context's SVM allocator. */
      if (dev->svm_allocation_priority > 0)
        {
          if (context->svm_allocdev == NULL
              || context->svm_allocdev->svm_allocation_priority
                     < dev->svm_allocation_priority)
            {
              context->svm_allocdev = dev;
            }
        }

      if (dev->mem_base_addr_align < alignment)
        alignment = dev->mem_base_addr_align;

      /* Context-wide max allocation is the largest any device supports. */
      if (dev->max_mem_alloc_size
          > context->max_mem_alloc_size)
        context->max_mem_alloc_size =
            dev->max_mem_alloc_size;

      /* Merge the device's image formats into the context-wide union. */
      if (dev->image_support == CL_TRUE)
        {
          for (j = 0; j < NUM_OPENCL_IMAGE_TYPES; ++j)
            image_format_union (
                dev->image_formats[j],
                dev->num_image_formats[j],
                &context->image_formats[j], &context->num_image_formats[j]);
        }

      /* Hidden out-of-order profiling queue used internally per device. */
      context->default_queues[i] = POname (clCreateCommandQueue) (
          context, dev,
          (CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | CL_QUEUE_HIDDEN
           | CL_QUEUE_PROFILING_ENABLE),
          &err);
      assert (err == CL_SUCCESS);
      assert (context->default_queues[i]);
    }

  assert (alignment > 0);
  context->min_buffer_alignment = alignment;
}
1586
1587 int
pocl_check_event_wait_list(cl_command_queue command_queue,cl_uint num_events_in_wait_list,const cl_event * event_wait_list)1588 pocl_check_event_wait_list (cl_command_queue command_queue,
1589 cl_uint num_events_in_wait_list,
1590 const cl_event *event_wait_list)
1591 {
1592 POCL_RETURN_ERROR_COND (
1593 (event_wait_list == NULL && num_events_in_wait_list > 0),
1594 CL_INVALID_EVENT_WAIT_LIST);
1595
1596 POCL_RETURN_ERROR_COND (
1597 (event_wait_list != NULL && num_events_in_wait_list == 0),
1598 CL_INVALID_EVENT_WAIT_LIST);
1599
1600 if (event_wait_list)
1601 {
1602 unsigned i;
1603 for (i = 0; i < num_events_in_wait_list; i++)
1604 {
1605 POCL_RETURN_ERROR_COND ((!IS_CL_OBJECT_VALID (event_wait_list[i])),
1606 CL_INVALID_EVENT_WAIT_LIST);
1607 POCL_RETURN_ERROR_COND (
1608 (event_wait_list[i]->context != command_queue->context),
1609 CL_INVALID_CONTEXT);
1610 }
1611 }
1612
1613 return CL_SUCCESS;
1614 }
1615
const char*
pocl_status_to_str (int status)
{
  /* Map an OpenCL execution status (CL_COMPLETE=0 .. CL_QUEUED=3) to a
     printable name. The caller must pass a valid status value. */
  static const char *const status_names[4]
      = { "complete", "running", "submitted", "queued" };
  return status_names[status];
}
1626
1627 void
pocl_abort_on_pthread_error(int status,unsigned line,const char * func)1628 pocl_abort_on_pthread_error (int status, unsigned line, const char *func)
1629 {
1630 if (status != 0)
1631 {
1632 POCL_MSG_PRINT2 (HSA, func, line, "Error from pthread call:\n");
1633 POCL_ABORT ("%s\n", strerror (status));
1634 }
1635 }
1636
1637 /* Convert a command type to its representation string
1638 */
1639 const char *
pocl_command_to_str(cl_command_type cmd)1640 pocl_command_to_str (cl_command_type cmd)
1641 {
1642 switch (cmd)
1643 {
1644 case CL_COMMAND_NDRANGE_KERNEL:
1645 return "ndrange_kernel";
1646 case CL_COMMAND_TASK:
1647 return "task_kernel";
1648 case CL_COMMAND_NATIVE_KERNEL:
1649 return "native_kernel";
1650 case CL_COMMAND_READ_BUFFER:
1651 return "read_buffer";
1652 case CL_COMMAND_WRITE_BUFFER:
1653 return "write_buffer";
1654 case CL_COMMAND_COPY_BUFFER:
1655 return "copy_buffer";
1656 case CL_COMMAND_READ_IMAGE:
1657 return "read_image";
1658 case CL_COMMAND_WRITE_IMAGE:
1659 return "write_image";
1660 case CL_COMMAND_COPY_IMAGE:
1661 return "copy_image";
1662 case CL_COMMAND_COPY_IMAGE_TO_BUFFER:
1663 return "copy_image_to_buffer";
1664 case CL_COMMAND_COPY_BUFFER_TO_IMAGE:
1665 return "copy_buffer_to_image";
1666 case CL_COMMAND_MAP_BUFFER:
1667 return "map_buffer";
1668 case CL_COMMAND_MAP_IMAGE:
1669 return "map_image";
1670 case CL_COMMAND_UNMAP_MEM_OBJECT:
1671 return "unmap_mem_object";
1672 case CL_COMMAND_MARKER:
1673 return "marker";
1674 case CL_COMMAND_ACQUIRE_GL_OBJECTS:
1675 return "acquire_gl_objects";
1676 case CL_COMMAND_RELEASE_GL_OBJECTS:
1677 return "release_gl_objects";
1678 case CL_COMMAND_READ_BUFFER_RECT:
1679 return "read_buffer_rect";
1680 case CL_COMMAND_WRITE_BUFFER_RECT:
1681 return "write_buffer_rect";
1682 case CL_COMMAND_COPY_BUFFER_RECT:
1683 return "copy_buffer_rect";
1684 case CL_COMMAND_USER:
1685 return "user";
1686 case CL_COMMAND_BARRIER:
1687 return "barrier";
1688 case CL_COMMAND_MIGRATE_MEM_OBJECTS:
1689 return "migrate_mem_objects";
1690 case CL_COMMAND_FILL_BUFFER:
1691 return "fill_buffer";
1692 case CL_COMMAND_FILL_IMAGE:
1693 return "fill_image";
1694 case CL_COMMAND_SVM_FREE:
1695 return "svm_free";
1696 case CL_COMMAND_SVM_MEMCPY:
1697 return "svm_memcpy";
1698 case CL_COMMAND_SVM_MEMFILL:
1699 return "svm_memfill";
1700 case CL_COMMAND_SVM_MAP:
1701 return "svm_map";
1702 case CL_COMMAND_SVM_UNMAP:
1703 return "svm_unmap";
1704 }
1705
1706 return "unknown";
1707 }
1708
1709 /*
1710 * This replaces a simple system(), because:
1711 *
1712 * 1) system() was causing issues (gpu lockups) with HSA when
1713 * compiling code (via compile_parallel_bc_to_brig)
1714 * with OpenCL 2.0 atomics (like CalcPie from AMD SDK).
1715 * The reason of lockups is unknown (yet).
1716 *
1717 * 2) system() uses fork() which copies page table maps, and runs
1718 * out of AS when pocl has already allocated huge buffers in memory.
1719 * this happened in llvm_codegen()
1720 *
1721 * vfork() does not copy pagetables.
1722 */
int
pocl_run_command (char *const *args)
{
  /* Spawn args[0] with argument vector 'args' and wait for it to finish.
     Returns the child's exit status, the terminating signal number, or
     EXIT_FAILURE on spawn/wait errors. See the comment above for why
     vfork() is preferred over system()/fork(). */
  POCL_MSG_PRINT_INFO ("Launching: %s\n", args[0]);
#ifdef HAVE_VFORK
  pid_t p = vfork ();
#elif defined(HAVE_FORK)
  pid_t p = fork ();
#elif _WIN32
  /* Windows path: no fork; create the process and wait synchronously,
     returning before the POSIX code below. */
  STARTUPINFO si;
  ZeroMemory(&si, sizeof(si));
  si.cb = sizeof(si);
  PROCESS_INFORMATION pi;
  ZeroMemory(&pi, sizeof(pi));
  DWORD dwProcessFlags = 0;
  char * cmd = strdup(args[0]);
  int p = CreateProcess(NULL, cmd, NULL, NULL, 1, dwProcessFlags, NULL, NULL, &si, &pi) != 0;
  if (!p)
    return EXIT_FAILURE;
  DWORD waitRc = WaitForSingleObject(pi.hProcess, INFINITE);
  if (waitRc == WAIT_FAILED)
    return EXIT_FAILURE;
  DWORD exit_code = 0;
  p = GetExitCodeProcess(pi.hProcess, &exit_code) != 0;
  if (!p)
    return EXIT_FAILURE;
  return exit_code;
#else
#error Must have fork() or vfork() system calls for HSA
#endif
  if (p == 0)
    {
      /* Child: replace the image; on success execv does not return. */
      return execv (args[0], args);
    }
  else
    {
      if (p < 0)
        return EXIT_FAILURE;
      int status;
      if (waitpid (p, &status, 0) < 0)
        POCL_ABORT ("pocl: waitpid() failed.\n");
      if (WIFEXITED (status))
        return WEXITSTATUS (status);
      else if (WIFSIGNALED (status))
        return WTERMSIG (status);
      else
        return EXIT_FAILURE;
    }
}
1772
1773 // event locked
1774 void
pocl_update_event_queued(cl_event event)1775 pocl_update_event_queued (cl_event event)
1776 {
1777 assert (event != NULL);
1778
1779 event->status = CL_QUEUED;
1780 cl_command_queue cq = event->queue;
1781 if ((cq->properties & CL_QUEUE_PROFILING_ENABLE)
1782 && (cq->device->has_own_timer == 0))
1783 event->time_queue = pocl_gettimemono_ns ();
1784
1785 POCL_MSG_PRINT_EVENTS ("Event queued: %" PRIu64 "\n", event->id);
1786
1787 if (cq->device->ops->update_event)
1788 cq->device->ops->update_event (cq->device, event);
1789 pocl_event_updated (event, CL_QUEUED);
1790 }
1791
1792 // event locked
1793 void
pocl_update_event_submitted(cl_event event)1794 pocl_update_event_submitted (cl_event event)
1795 {
1796 assert (event != NULL);
1797 assert (event->status == CL_QUEUED);
1798
1799 cl_command_queue cq = event->queue;
1800 event->status = CL_SUBMITTED;
1801 if ((cq->properties & CL_QUEUE_PROFILING_ENABLE)
1802 && (cq->device->has_own_timer == 0))
1803 event->time_submit = pocl_gettimemono_ns ();
1804
1805 POCL_MSG_PRINT_EVENTS ("Event submitted: %" PRIu64 "\n", event->id);
1806
1807 if (cq->device->ops->update_event)
1808 cq->device->ops->update_event (cq->device, event);
1809 pocl_event_updated (event, CL_SUBMITTED);
1810 }
1811
1812 void
pocl_update_event_running_unlocked(cl_event event)1813 pocl_update_event_running_unlocked (cl_event event)
1814 {
1815 assert (event != NULL);
1816 assert (event->status == CL_SUBMITTED);
1817
1818 cl_command_queue cq = event->queue;
1819 event->status = CL_RUNNING;
1820 if ((cq->properties & CL_QUEUE_PROFILING_ENABLE)
1821 && (cq->device->has_own_timer == 0))
1822 event->time_start = pocl_gettimemono_ns ();
1823
1824 POCL_MSG_PRINT_EVENTS ("Event running: %" PRIu64 "\n", event->id);
1825
1826 if (cq->device->ops->update_event)
1827 cq->device->ops->update_event (cq->device, event);
1828 pocl_event_updated (event, CL_RUNNING);
1829 }
1830
void
pocl_update_event_running (cl_event event)
{
  /* Locking wrapper around pocl_update_event_running_unlocked(). */
  POCL_LOCK_OBJ (event);
  pocl_update_event_running_unlocked (event);
  POCL_UNLOCK_OBJ (event);
}
1838
1839 // status can be complete or failed (<0)
void
pocl_update_event_finished_msg (cl_int status, const char *func, unsigned line,
                                cl_event event, const char *msg)
{
  /* Final transition of an event to CL_COMPLETE or a failure status (<0):
     records the end timestamp, detaches the event from its queue, notifies
     the device, fires callbacks and releases per-command references.
     Called with the event UNLOCKED. */
  assert (event != NULL);
  assert (event->queue != NULL);
  assert (event->status > CL_COMPLETE);

  cl_command_queue cq = event->queue;
  /* Lock order: command queue first, then event. */
  POCL_LOCK_OBJ (cq);
  POCL_LOCK_OBJ (event);
  if ((cq->properties & CL_QUEUE_PROFILING_ENABLE)
      && (cq->device->has_own_timer == 0))
    event->time_end = pocl_gettimemono_ns ();

  struct pocl_device_ops *ops = cq->device->ops;
  event->status = status;
  if (cq->device->ops->update_event)
    ops->update_event (cq->device, event);

  if (status == CL_COMPLETE)
    POCL_MSG_PRINT_EVENTS ("%s: Command complete, event %" PRIu64 "\n",
                           cq->device->short_name, event->id);
  else
    POCL_MSG_PRINT_EVENTS ("%s: Command FAILED, event %" PRIu64 "\n",
                           cq->device->short_name, event->id);

  /* Detach the event from the queue's bookkeeping. */
  assert (cq->command_count > 0);
  --cq->command_count;
  if (cq->barrier == event)
    cq->barrier = NULL;
  if (cq->last_event.event == event)
    cq->last_event.event = NULL;
  DL_DELETE (cq->events, event);

  if (ops->notify_cmdq_finished && (cq->command_count == 0))
    ops->notify_cmdq_finished (cq);

  if (ops->notify_event_finished)
    ops->notify_event_finished (event);

  POCL_UNLOCK_OBJ (cq);
  /* note that we must unlock the CmqQ before calling pocl_event_updated,
   * because it calls event callbacks, which can have calls to
   * clEnqueueSomething() */
  pocl_event_updated (event, status);
  POCL_UNLOCK_OBJ (event);
  ops->broadcast (event);

#ifdef POCL_DEBUG_MESSAGES
  if (msg != NULL)
    {
      pocl_debug_print_duration (
          func, line, msg, (uint64_t) (event->time_end - event->time_start));
    }
#endif

  /* Drop the per-command references taken on the involved buffers. */
  size_t i;
  for (i = 0; i < event->num_buffers; ++i)
    {
      cl_mem mem = event->mem_objs[i];
      if (event->release_mem_host_ptr_after)
        {
          POCL_LOCK_OBJ (mem);
          pocl_release_mem_host_ptr (mem);
          POCL_UNLOCK_OBJ (mem);
        }
      POname (clReleaseMemObject) (mem);
    }
  POCL_MEM_FREE (event->mem_objs);

  POname (clReleaseEvent) (event);
}
1913
void
pocl_update_event_failed (cl_event event)
{
  /* Called with the event lock HELD: drop it first because
     pocl_update_event_finished_msg() must take the command-queue lock
     before the event lock (lock ordering), then restore it for the
     caller. */
  POCL_UNLOCK_OBJ (event);
  pocl_update_event_finished_msg (CL_FAILED, NULL, 0, event, NULL);
  POCL_LOCK_OBJ (event);
}
1921
void
pocl_update_event_complete_msg (const char *func, unsigned line,
                                cl_event event, const char *msg)
{
  /* Shorthand for finishing an event with CL_COMPLETE, forwarding the
     caller's location and debug message for duration printing. */
  pocl_update_event_finished_msg (CL_COMPLETE, func, line, event, msg);
}
1928
/*
 * float -> half / half -> float conversion.
 *
 * Branchless bit-manipulation scheme: comparisons produce 0 / -1
 * masks that select between the normal, subnormal and inf/NaN paths
 * without branching.  The commented-out expressions document how each
 * precomputed literal is derived from the IEEE-754 field layouts.
 */

static int const shift = 13;     /* mantissa width difference: 23 - 10 */
static int const shiftSign = 16; /* sign bit position difference: 31 - 15 */

static int32_t const infN = 0x7F800000; /* flt32 infinity */
static int32_t const maxN = 0x477FE000; /* max flt16 normal as a flt32 */
static int32_t const minN = 0x38800000; /* min flt16 normal as a flt32 */
/* NOTE(review): 0x80000000 does not fit in int32_t; the conversion is
 * implementation-defined (INT32_MIN on mainstream compilers). */
static int32_t const signN = 0x80000000; /* flt32 sign bit */

/* static int32_t const infC = infN >> shift;
 * static int32_t const infC = 0x3FC00;
 * static int32_t const nanN = (infC + 1) << shift; // minimum flt16 nan as a flt32
 */
static int32_t const nanN = 0x7f802000;
/* static int32_t const maxC = maxN >> shift; */
static int32_t const maxC = 0x23bff;
/* static int32_t const minC = minN >> shift;
 * static int32_t const minC = 0x1c400;
 */
/* BUGFIX: this was 0x40000 (bit 18), which can never match a
 * zero-extended 16-bit half value, so half_to_float silently dropped
 * the sign of every negative input.  The flt16 sign bit of the
 * zero-extended half is bit 15. */
static int32_t const signC = 0x8000; /* flt16 sign bit */

static int32_t const mulN = 0x52000000; /* (1 << 23) / minN */
static int32_t const mulC = 0x33800000; /* minN / (1 << (23 - shift)) */

static int32_t const subC = 0x003FF; /* max flt32 subnormal down shifted */
static int32_t const norC = 0x00400; /* min flt32 normal down shifted */

/* static int32_t const maxD = infC - maxC - 1; */
static int32_t const maxD = 0x1c000;
/* static int32_t const minD = minC - subC - 1; */
static int32_t const minD = 0x1c000;

/* Type punning between a float and its 32-bit representation via a
 * union, which is well-defined in C (unlike pointer casts). */
typedef union
{
  float f;
  int32_t si;
  uint32_t ui;
} H2F_Bits;

/**
 * Convert an IEEE-754 binary16 value (given as its raw bit pattern)
 * to float.  Handles normals, subnormals, zeroes, infinities and NaNs.
 */
float
half_to_float (uint16_t value)
{
  H2F_Bits v;
  v.ui = value;
  /* Split off the sign bit; unsigned arithmetic keeps the later shift
     into bit 31 well-defined. */
  uint32_t sign = v.ui & (uint32_t)signC;
  v.ui ^= sign;
  sign <<= shiftSign;
  /* Normals (exponent field nonzero, i.e. > subC): rebias exponent. */
  v.si ^= ((v.si + minD) ^ v.si) & -(v.si > subC);
  /* Inf/NaN (above the max normal pattern): add the remaining bias so
     the flt32 exponent becomes all-ones after the shift. */
  v.si ^= ((v.si + maxD) ^ v.si) & -(v.si > maxC);
  /* Subnormal path: scale the raw mantissa with a float multiply. */
  H2F_Bits s;
  s.si = mulC;
  s.f *= v.si;
  int32_t mask = -(norC > v.si); /* all-ones iff input was subnormal/zero */
  v.si <<= shift;
  v.si ^= (s.si ^ v.si) & mask; /* select the subnormal result if masked */
  v.ui |= sign;
  return v.f;
}
1991
/**
 * Convert a float to IEEE-754 binary16 (returned as its raw bit
 * pattern) using the branchless mask-select scheme; see the constants
 * above.  Truncates the low 13 mantissa bits (round toward zero).
 */
uint16_t
float_to_half (float value)
{
  H2F_Bits v, s;
  v.f = value;
  uint32_t sign = v.si & signN; /* isolate the flt32 sign bit */
  v.si ^= sign;                 /* work on the magnitude only */
  sign >>= shiftSign;           /* move the sign to the flt16 position */
  s.si = mulN;
  /* Subnormal path: scale by mulN, then truncate to an integer.
     NOTE(review): for normal-range inputs this float->int32 conversion
     overflows (UB per C11 6.3.1.4); it "works" on common targets only
     because the bogus lane is masked out below -- confirm/clean up. */
  s.si = s.f * v.f;
  /* Select the scaled value for inputs below the min flt16 normal. */
  v.si ^= (s.si ^ v.si) & -(minN > v.si);
  /* Magnitudes above the max flt16 normal but finite become infinity. */
  v.si ^= (infN ^ v.si) & -((infN > v.si) & (v.si > maxN));
  /* flt32 NaNs map to a canonical flt16 NaN pattern. */
  v.si ^= (nanN ^ v.si) & -((nanN > v.si) & (v.si > infN));
  v.ui >>= shift; /* drop the extra 13 mantissa bits */
  /* Rebias the exponent: normals/inf/NaN first, then the boundary. */
  v.si ^= ((v.si - maxD) ^ v.si) & -(v.si > maxC);
  v.si ^= ((v.si - minD) ^ v.si) & -(v.si > subC);
  return v.ui | sign;
}
2010
/* SPIR-V magic number: first word of every SPIR-V module. */
#define SPIRV_MAGIC 0x07230203U
/* First word of an OpCapability instruction (word count 2, opcode 17). */
#define OpCapab 0x00020011
/* OpCapability operand "Kernel": capability used by OpenCL SPIR-V. */
#define KernelExecModel 0x6

/**
 * Return nonzero if the given binary is a SPIR-V module that declares
 * the OpenCL "Kernel" capability.
 *
 * Scans the OpCapability instructions that immediately follow the
 * 5-word module header; logs an error and returns 0 for SPIR-V that is
 * not OpenCL-flavored, and returns 0 silently for non-SPIR-V input.
 *
 * NOTE(review): assumes 'bitcode' is 4-byte aligned -- confirm callers.
 */
int
bitcode_is_spirv_kernel (const char *bitcode, size_t size)
{
  const uint32_t *bc32 = (const uint32_t *)bitcode;
  size_t num_words = size / sizeof (uint32_t);
  size_t location = 0;

  /* Need at least the 5-word header.  Check BEFORE dereferencing:
     previously the magic word was read before the size check, an
     out-of-bounds read for inputs shorter than 4 bytes. */
  if (size < 20)
    return 0;

  /* SPIR-V words are little-endian on disk; htole32 is an involution,
     so it also serves as le32toh here. */
  uint32_t header_magic = htole32 (bc32[location++]);
  if (header_magic != SPIRV_MAGIC)
    return 0;

  /* Skip version, generator, bound, schema. */
  location += 4;

  int is_opencl = 0;
  uint32_t instruction, value;
  do
    {
      /* Stop at the end of the buffer instead of reading past it
         (the original loop was unbounded on truncated modules). */
      if (location + 2 > num_words)
        break;
      instruction = htole32 (bc32[location++]);
      value = htole32 (bc32[location++]);
      if (value == KernelExecModel)
        is_opencl = 1;
    }
  while (instruction == OpCapab);

  /* SPIR-V but not OpenCL-type. */
  if (!is_opencl)
    {
      POCL_MSG_ERR ("SPIR-V binary provided, but is not using Kernel mode. "
                    "Pocl can't process this binary.\n");
    }

  return is_opencl;
}
2050