1 /* pocl-hsa.c - driver for HSA supported devices.
2 
3    Copyright (c) 2015-2020 Pekka Jääskeläinen
4                  2015 Charles Chen <ccchen@pllab.cs.nthu.edu.tw>
5                       Shao-chung Wang <scwang@pllab.cs.nthu.edu.tw>
6                  2015-2018 Michal Babej <michal.babej@tut.fi>
7 
8    Short snippets borrowed from the MatrixMultiplication example in
9    the HSA runtime library sources (c) 2014 HSA Foundation Inc.
10 
11    Permission is hereby granted, free of charge, to any person obtaining a copy
12    of this software and associated documentation files (the "Software"), to
13    deal in the Software without restriction, including without limitation the
14    rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
15    sell copies of the Software, and to permit persons to whom the Software is
16    furnished to do so, subject to the following conditions:
17 
18    The above copyright notice and this permission notice shall be included in
19    all copies or substantial portions of the Software.
20 
21    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
22    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
23    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
24    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
25    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
26    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
27    IN THE SOFTWARE.
28 */
29 /* Some example code snippets copied verbatim from vector_copy.c of
30  * HSA-Runtime-AMD: */
31 /* Copyright 2014 HSA Foundation Inc.  All Rights Reserved.
32  *
33  * HSAF is granting you permission to use this software and documentation (if
34  * any) (collectively, the "Materials") pursuant to the terms and conditions
35  * of the Software License Agreement included with the Materials.  If you do
36  * not have a copy of the Software License Agreement, contact the  HSA
37  * Foundation for a copy. Redistribution and use in source and binary forms,
38  * with or without modification, are permitted provided that the following
39  * conditions are met:
40  * 1. Redistributions of source code must retain the above copyright
41  *    notice, this list of conditions and the following disclaimer.
42  * 2. Redistributions in binary form must reproduce the above copyright
43  *    notice, this list of conditions and the following disclaimer in the
44  *    documentation and/or other materials provided with the distribution
45  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
46  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
47  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
48  * CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
49  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
50  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
51  * WITH THE SOFTWARE.
52  */
53 
54 #ifndef _BSD_SOURCE
55 #define _BSD_SOURCE
56 #endif
57 
58 #ifndef _DEFAULT_SOURCE
59 #define _DEFAULT_SOURCE
60 #endif
61 
62 
63 #include "hsa.h"
64 #include "hsa_ext_finalize.h"
65 #include "hsa_ext_image.h"
66 
67 #include "config.h"
68 #include "config2.h"
69 
70 #if defined(HAVE_HSA_EXT_AMD_H) && AMD_HSA == 1
71 
72 #include "hsa_ext_amd.h"
73 
74 #endif
75 
76 #include "pocl-hsa.h"
77 #include "common.h"
78 #include "common_driver.h"
79 #include "devices.h"
80 #include "pocl-hsa.h"
81 #include "pocl_cache.h"
82 #include "pocl_context.h"
83 #include "pocl_file_util.h"
84 #include "pocl_llvm.h"
85 #include "pocl_mem_management.h"
86 #include "pocl_spir.h"
87 #include "pocl_local_size.h"
88 #include "pocl_util.h"
89 
90 #include <assert.h>
91 #include <limits.h>
92 #include <pthread.h>
93 #include <stdlib.h>
94 #include <string.h>
95 
96 #ifndef _MSC_VER
97 #  include <sys/wait.h>
98 #  include <sys/time.h>
99 #  include <sys/types.h>
100 #  include <unistd.h>
101 #else
102 #  include "vccompat.hpp"
103 #endif
104 
105 #define max(a,b) (((a) > (b)) ? (a) : (b))
106 
107 /* TODO: The kernel cache is never shrunk. We need a hook that is called back
108    when clReleaseKernel is called to get a safe point where to release the
109    kernel entry from the inmemory cache. */
110 #define HSA_KERNEL_CACHE_SIZE 4096
111 #define COMMAND_LIST_SIZE 4096
112 #define EVENT_LIST_SIZE 511
113 
114 typedef struct pocl_hsa_event_data_s {
115   /* Address of the space where this kernel launch's arguments were stored. */
116   void *kernargs;
117   /* The location of the pocl context struct in the Agent's global mem. */
118   void *context;
119   pthread_cond_t event_cond;
120 } pocl_hsa_event_data_t;
121 
122 /* Simple statically-sized kernel data cache */
123 /* for caching kernel dispatch data, binaries etc */
124 typedef struct pocl_hsa_kernel_cache_s {
125   cl_kernel kernel;
126 
127   /* use kernel hash as key */
128   pocl_kernel_hash_t kernel_hash;
129 
130   hsa_executable_t hsa_exe;
131   uint64_t code_handle;
132 
133   uint32_t private_size;
134   uint32_t static_group_size;
135   uint32_t args_segment_size;
136 
137   /* For native non-SPMD targets, we cache work-group functions specialized
138      to specific work-group sizes. */
139   uint64_t local_x;
140   uint64_t local_y;
141   uint64_t local_z;
142 
143   /* If global offset must be zero for this WG function. */
144   int goffs_zero;
145 
146   /* Maximum grid dimension this WG function works with. */
147   size_t max_grid_dim_width;
148 
149 } pocl_hsa_kernel_cache_t;
150 
151 /* data for driver pthread */
152 typedef struct pocl_hsa_device_pthread_data_s {
153   /* list of running commands and their signals*/
154   cl_event running_events[EVENT_LIST_SIZE];
155   hsa_signal_t running_signals[EVENT_LIST_SIZE+1];
156   size_t running_list_size;
157 
158   /* Queue list (for pushing work to the agent);
159    * multiple queues per device */
160   hsa_queue_t **queues;
161   size_t num_queues, last_queue;
162 } pocl_hsa_device_pthread_data_t;
163 
164 typedef struct pocl_hsa_device_data_s {
165   /* The parent device struct. */
166   cl_device_id device;
167   /* The HSA kernel agent controlled by the device driver instance. */
168   hsa_agent_t agent;
169   hsa_profile_t agent_profile;
170 
171   /* mem regions */
172   hsa_region_t global_region, kernarg_region, group_region;
173 
174   /* Per-program data cache to simplify program compiling stage */
175   pocl_hsa_kernel_cache_t kernel_cache[HSA_KERNEL_CACHE_SIZE];
176   unsigned kernel_cache_lastptr;
177 
178   /* kernel signal wait timeout hint, in HSA runtime units */
179   uint64_t timeout;
180   /* length of a timestamp unit expressed in nanoseconds */
181   double timestamp_unit;
182   /* see pocl_hsa_init for details */
183   size_t hw_schedulers;
184 
185   /* list of submitted commands waiting to run later */
186   cl_event wait_list[COMMAND_LIST_SIZE];
187   size_t wait_list_size;
188 
189   /* list of commands ready to run */
190   cl_event ready_list[COMMAND_LIST_SIZE];
191   size_t ready_list_size;
192 
193   /* list manipulation mutex */
194   pthread_mutex_t list_mutex;
195 
196   /* used by host thread to notify driver pthread when events change status */
197   hsa_signal_t nudge_driver_thread;
198 
199   /* device pthread */
200   pthread_t driver_pthread_id;
201 
202   /* device pthread data */
203   pocl_hsa_device_pthread_data_t driver_data;
204 
205   /* exit signal */
206   volatile int exit_driver_thread;
207 
208   /* if agent supports async handlers*/
209   int have_wait_any;
210 
211   /* compilation lock */
212   pocl_lock_t pocl_hsa_compilation_lock;
213 
214   /* printf buffer */
215   void *printf_buffer;
216   uint32_t *printf_write_pos;
217 
218 } pocl_hsa_device_data_t;
219 
220 void pocl_hsa_compile_kernel_hsail (_cl_command_node *cmd, cl_kernel kernel,
221                                     cl_device_id device, int specialize);
222 
223 void pocl_hsa_compile_kernel_native (_cl_command_node *cmd, cl_kernel kernel,
224                                      cl_device_id device, int specialize);
225 
226 static void*
227 pocl_hsa_malloc(pocl_global_mem_t *mem, size_t size, hsa_region_t r);
228 
229 void
pocl_hsa_init_device_ops(struct pocl_device_ops * ops)230 pocl_hsa_init_device_ops(struct pocl_device_ops *ops)
231 {
232   /* TODO: more descriptive name from HSA probing the device */
233   ops->device_name = "hsa";
234   ops->probe = pocl_hsa_probe;
235   ops->uninit = pocl_hsa_uninit;
236   ops->reinit = pocl_hsa_reinit;
237   ops->init = pocl_hsa_init;
238   ops->alloc_mem_obj = pocl_hsa_alloc_mem_obj;
239   ops->free = pocl_hsa_free;
240   ops->run = NULL;
241 
242   ops->read = pocl_driver_read;
243   ops->read_rect = pocl_driver_read_rect;
244   ops->write = pocl_driver_write;
245   ops->write_rect = pocl_driver_write_rect;
246   ops->map_mem = pocl_driver_map_mem;
247   ops->unmap_mem = pocl_driver_unmap_mem;
248   ops->get_mapping_ptr = pocl_driver_get_mapping_ptr;
249   ops->free_mapping_ptr = pocl_driver_free_mapping_ptr;
250   ops->memfill = pocl_driver_memfill;
251   ops->copy = pocl_hsa_copy;
252   ops->copy_rect = pocl_driver_copy_rect;
253   ops->compute_local_size = pocl_default_local_size_optimizer;
254 
255   ops->get_device_info_ext = NULL;
256 
257   ops->svm_free = pocl_hsa_svm_free;
258   ops->svm_alloc = pocl_hsa_svm_alloc;
259   ops->svm_copy = pocl_hsa_svm_copy;
260   ops->svm_fill = pocl_basic_svm_fill;
261   ops->svm_register = pocl_hsa_svm_register;
262   ops->svm_unregister = pocl_hsa_svm_unregister;
263 
264   // new driver api (out-of-order)
265   ops->submit = pocl_hsa_submit;
266   ops->join = pocl_hsa_join;
267   ops->flush = pocl_hsa_flush;
268   ops->notify = pocl_hsa_notify;
269   ops->broadcast = pocl_hsa_broadcast;
270   ops->wait_event = pocl_hsa_wait_event;
271 
272   ops->build_source = pocl_driver_build_source;
273   ops->link_program = pocl_driver_link_program;
274   ops->build_binary = pocl_driver_build_binary;
275   ops->free_program = pocl_driver_free_program;
276   ops->setup_metadata = pocl_driver_setup_metadata;
277   ops->supports_binary = pocl_driver_supports_binary;
278   ops->build_poclbinary = pocl_driver_build_poclbinary;
279 #if HSAIL_ENABLED
280   ops->compile_kernel = pocl_hsa_compile_kernel_hsail;
281 #else
282   ops->compile_kernel = pocl_hsa_compile_kernel_native;
283 #endif
284 
285   ops->update_event = pocl_hsa_update_event;
286   ops->notify_event_finished = pocl_hsa_notify_event_finished;
287   ops->free_event_data = pocl_hsa_free_event_data;
288   ops->init_target_machine = NULL;
289   ops->wait_event = pocl_hsa_wait_event;
290   ops->build_hash = pocl_hsa_build_hash;
291   ops->init_build = pocl_hsa_init_build;
292 }
293 
294 #define MAX_HSA_AGENTS 16
295 
296 static void
pocl_hsa_abort_on_hsa_error(hsa_status_t status,unsigned line,const char * func,const char * code)297 pocl_hsa_abort_on_hsa_error(hsa_status_t status,
298                             unsigned line,
299                             const char* func,
300                             const char* code)
301 {
302   const char* str;
303   if (status != HSA_STATUS_SUCCESS)
304     {
305       hsa_status_string(status, &str);
306       POCL_MSG_PRINT2(HSA, func, line, "Error from HSA Runtime call:\n");
307       POCL_ABORT ("%s\n", str);
308     }
309 }
310 
311 
312 #define HSA_CHECK(code) pocl_hsa_abort_on_hsa_error(code,         \
313                                                     __LINE__,     \
314                                                     __FUNCTION__, \
315                                                     #code);
316 
317 
318 static hsa_agent_t hsa_agents[MAX_HSA_AGENTS];
319 static unsigned found_hsa_agents = 0;
320 
321 static hsa_status_t
pocl_hsa_get_agents_callback(hsa_agent_t agent,void * data)322 pocl_hsa_get_agents_callback(hsa_agent_t agent, void *data)
323 {
324   hsa_device_type_t type;
325   HSA_CHECK(hsa_agent_get_info (agent, HSA_AGENT_INFO_DEVICE, &type));
326 
327   hsa_agent_feature_t features;
328   HSA_CHECK(hsa_agent_get_info(agent, HSA_AGENT_INFO_FEATURE, &features));
329   if (features != HSA_AGENT_FEATURE_KERNEL_DISPATCH)
330     {
331       return HSA_STATUS_SUCCESS;
332     }
333 
334   hsa_agents[found_hsa_agents++] = agent;
335   return HSA_STATUS_SUCCESS;
336 }
337 
338 /*
339  * Sets up the memory regions in pocl_hsa_device_data for a device
340  */
341 static
342 hsa_status_t
setup_agent_memory_regions_callback(hsa_region_t region,void * data)343 setup_agent_memory_regions_callback(hsa_region_t region, void* data)
344 {
345   pocl_hsa_device_data_t* d = (pocl_hsa_device_data_t*)data;
346 
347   hsa_region_segment_t segment;
348   hsa_region_global_flag_t flags;
349   HSA_CHECK(hsa_region_get_info(region, HSA_REGION_INFO_SEGMENT, &segment));
350 
351   if (segment == HSA_REGION_SEGMENT_GLOBAL)
352     {
353       d->global_region = region;
354       HSA_CHECK(hsa_region_get_info(region, HSA_REGION_INFO_GLOBAL_FLAGS,
355                                     &flags));
356       if (flags & HSA_REGION_GLOBAL_FLAG_KERNARG)
357         d->kernarg_region = region;
358     }
359 
360   if (segment == HSA_REGION_SEGMENT_GROUP)
361     d->group_region = region;
362 
363   return HSA_STATUS_SUCCESS;
364 }
365 
366 /* HSA unsupported device features are hard coded in a known Agent
367    list and detected by the advertised agent name string. */
368 #define HSA_NUM_KNOWN_HSA_AGENTS 2
369 
370 static const char *default_native_final_linkage_flags[] =
371   {"-nostartfiles", HOST_LD_FLAGS_ARRAY, NULL};
372 
373 static const char *phsa_native_device_aux_funcs[] =
374   {"_pocl_run_all_wgs", "_pocl_finish_all_wgs", "_pocl_spawn_wg", NULL};
375 
376 #define AMD_VENDOR_ID 0x1002
377 
378 static struct _cl_device_id supported_hsa_devices[HSA_NUM_KNOWN_HSA_AGENTS]
379     = { [0] = { .long_name = "Spectre",
380                 .llvm_cpu = (HSAIL_ENABLED ? NULL : "kaveri"),
381                 .llvm_target_triplet
382                 = (HSAIL_ENABLED ? "hsail64" : "amdgcn--amdhsa"),
383                 .spmd = CL_TRUE,
384                 .autolocals_to_args = POCL_AUTOLOCALS_TO_ARGS_NEVER,
385                 .device_alloca_locals = CL_FALSE,
386                 .context_as_id = SPIR_ADDRESS_SPACE_GLOBAL,
387                 .args_as_id = SPIR_ADDRESS_SPACE_GLOBAL,
388                 .has_64bit_long = 1,
389                 .vendor_id = AMD_VENDOR_ID,
390                 .global_mem_cache_type = CL_READ_WRITE_CACHE,
391                 .max_constant_buffer_size = 65536,
392                 .local_mem_type = CL_LOCAL,
393                 .endian_little = CL_TRUE,
394                 .extensions = HSA_DEVICE_EXTENSIONS,
395                 .device_side_printf = !HSAIL_ENABLED,
396                 .printf_buffer_size = PRINTF_BUFFER_SIZE * 1024,
397                 .preferred_wg_size_multiple = 64, // wavefront size on Kaveri
398                 .preferred_vector_width_char = 4,
399                 .preferred_vector_width_short = 2,
400                 .preferred_vector_width_int = 1,
401                 .preferred_vector_width_long = 1,
402                 .preferred_vector_width_float = 1,
403                 .preferred_vector_width_double = 1,
404                 .native_vector_width_char = 4,
405                 .native_vector_width_short = 2,
406                 .native_vector_width_int = 1,
407                 .native_vector_width_long = 1,
408                 .native_vector_width_float = 1,
409                 .native_vector_width_double = 1 },
410         [1] = { .long_name = "phsa generic CPU agent",
411                 .llvm_cpu = NULL,
412                 .llvm_target_triplet = (HSAIL_ENABLED ? "hsail64" : NULL),
413                 .spmd = CL_FALSE,
414                 .autolocals_to_args
415                 = (HSAIL_ENABLED ? POCL_AUTOLOCALS_TO_ARGS_NEVER
416                                  : POCL_AUTOLOCALS_TO_ARGS_ALWAYS),
417                 .device_alloca_locals = CL_TRUE,
418                 .context_as_id = SPIR_ADDRESS_SPACE_GLOBAL,
419                 .args_as_id = SPIR_ADDRESS_SPACE_GLOBAL,
420                 .has_64bit_long = 1,
421                 .vendor_id = 0xffff,
422                 .global_mem_cache_type = CL_READ_WRITE_CACHE,
423                 .max_constant_buffer_size = 65536,
424                 .local_mem_type = CL_LOCAL,
425                 .endian_little = !(WORDS_BIGENDIAN),
426                 .extensions = HSA_DEVICE_EXTENSIONS,
427                 .device_side_printf = !HSAIL_ENABLED,
428                 .printf_buffer_size = PRINTF_BUFFER_SIZE * 1024,
429                 .preferred_wg_size_multiple = 1,
430                 /* We want to exploit the widest vector types in HSAIL
431                    for the CPUs assuming they have some sort of SIMD ISE
432                    which the finalizer than can more readily utilize.  */
433                 .preferred_vector_width_char = 16,
434                 .preferred_vector_width_short = 16,
435                 .preferred_vector_width_int = 16,
436                 .preferred_vector_width_long = 16,
437                 .preferred_vector_width_float = 16,
438                 .preferred_vector_width_double = 16,
439                 .native_vector_width_char = 16,
440                 .native_vector_width_short = 16,
441                 .native_vector_width_int = 16,
442                 .native_vector_width_long = 16,
443                 .native_vector_width_float = 16,
444                 .native_vector_width_double = 16,
445                 .final_linkage_flags = default_native_final_linkage_flags,
446                 .device_aux_functions
447                 = (HSAIL_ENABLED ? NULL : phsa_native_device_aux_funcs) } };
448 
449 char *
pocl_hsa_build_hash(cl_device_id device)450 pocl_hsa_build_hash (cl_device_id device)
451 {
452   char* res = calloc(1000, sizeof(char));
453   snprintf(res, 1000, "HSA-%s-%s", device->llvm_target_triplet, device->long_name);
454   return res;
455 }
456 
457 // Detect the HSA device and populate its properties to the device
458 // struct.
459 static void
get_hsa_device_features(char * dev_name,struct _cl_device_id * dev)460 get_hsa_device_features(char* dev_name, struct _cl_device_id* dev)
461 {
462 
463 #define COPY_ATTR(ATTR) dev->ATTR = supported_hsa_devices[i].ATTR
464 #define COPY_VECWIDTH(ATTR) \
465      dev->preferred_vector_width_ ## ATTR = \
466          supported_hsa_devices[i].preferred_vector_width_ ## ATTR; \
467      dev->native_vector_width_ ## ATTR = \
468          supported_hsa_devices[i].native_vector_width_ ## ATTR;
469 
470   int found = 0;
471   unsigned i;
472   for(i = 0; i < HSA_NUM_KNOWN_HSA_AGENTS; i++)
473     {
474       if (strcmp(dev_name, supported_hsa_devices[i].long_name) == 0)
475         {
476 	  COPY_ATTR (llvm_cpu);
477 	  COPY_ATTR (llvm_target_triplet);
478 	  COPY_ATTR (spmd);
479 	  COPY_ATTR (autolocals_to_args);
480           COPY_ATTR (device_alloca_locals);
481           COPY_ATTR (context_as_id);
482           COPY_ATTR (args_as_id);
483           if (!HSAIL_ENABLED)
484             {
485               /* TODO: Add a CMake variable or HSA description string
486                  autodetection to control these. */
487               if (dev->llvm_cpu == NULL)
488                 dev->llvm_cpu = get_llvm_cpu_name ();
489               if (dev->llvm_target_triplet == NULL)
490                 dev->llvm_target_triplet = OCL_KERNEL_TARGET;
491               dev->arg_buffer_launcher = CL_TRUE;
492             }
493           COPY_ATTR (has_64bit_long);
494           COPY_ATTR (vendor_id);
495           COPY_ATTR (global_mem_cache_type);
496           COPY_ATTR (max_constant_buffer_size);
497           COPY_ATTR (local_mem_type);
498           COPY_ATTR (endian_little);
499           COPY_ATTR (preferred_wg_size_multiple);
500           COPY_ATTR (extensions);
501 	  COPY_ATTR (final_linkage_flags);
502 	  COPY_ATTR (device_aux_functions);
503 	  COPY_ATTR (device_side_printf);
504 	  COPY_ATTR (printf_buffer_size);
505           COPY_VECWIDTH (char);
506           COPY_VECWIDTH (short);
507           COPY_VECWIDTH (int);
508           COPY_VECWIDTH (long);
509           COPY_VECWIDTH (float);
510           COPY_VECWIDTH (double);
511           found = 1;
512           break;
513         }
514     }
515   if (!found)
516     {
517       POCL_MSG_PRINT_INFO("pocl-hsa: found unknown HSA devices '%s'.\n",
518 			  dev_name);
519       POCL_ABORT ("We found a device for which we don't have device "
520                   "OpenCL attribute information (compute unit count, "
521                   "constant buffer size etc), and there's no way to get all "
522                   "the required info from HSA API. Please create a "
523                   "new entry with the information in supported_hsa_devices, "
524                   "and send a note/patch to pocl developers. Thanks!\n");
525     }
526 }
527 
528 unsigned int
pocl_hsa_probe(struct pocl_device_ops * ops)529 pocl_hsa_probe (struct pocl_device_ops *ops)
530 {
531   int env_count = pocl_device_get_env_count (ops->device_name);
532 
533   POCL_MSG_PRINT_INFO ("pocl-hsa: found %d env devices with %s.\n", env_count,
534                        ops->device_name);
535 
536   /* No hsa env specified, the user did not request for HSA agents. */
537   if (env_count <= 0)
538     return 0;
539 
540   HSA_CHECK (hsa_init ());
541 
542   HSA_CHECK (hsa_iterate_agents (pocl_hsa_get_agents_callback, NULL));
543 
544   POCL_MSG_PRINT_INFO ("pocl-hsa: found %d agents.\n", found_hsa_agents);
545 
546   return (int)found_hsa_agents;
547 }
548 
549 static void
hsa_queue_callback(hsa_status_t status,hsa_queue_t * q,void * data)550 hsa_queue_callback (hsa_status_t status, hsa_queue_t *q, void *data)
551 {
552   HSA_CHECK (status);
553 }
554 
555 /* driver pthread prototype */
556 void *pocl_hsa_driver_pthread (void *cldev);
557 
558 /* (Re)initialize the device data. dev is the device driver part, count is
559    the number of devices of this type initialized so far. */
560 static pocl_hsa_device_data_t *
init_dev_data(cl_device_id dev,int count)561 init_dev_data (cl_device_id dev, int count)
562 {
563   pocl_hsa_device_data_t *d
564       = (pocl_hsa_device_data_t *)calloc (1, sizeof (pocl_hsa_device_data_t));
565 
566   dev->data = d;
567   d->device = dev;
568   d->hw_schedulers = 3;
569   POCL_INIT_LOCK (d->pocl_hsa_compilation_lock);
570 
571   /* Before the first HSA device, re-init the runtime. */
572   if (count == 0 && found_hsa_agents == 0)
573     {
574       HSA_CHECK (hsa_init ());
575       HSA_CHECK (hsa_iterate_agents (pocl_hsa_get_agents_callback, NULL));
576     }
577   d->agent.handle = hsa_agents[count].handle;
578 
579   HSA_CHECK (hsa_agent_iterate_regions (
580       d->agent, setup_agent_memory_regions_callback, d));
581   bool boolarg = 0;
582   HSA_CHECK (hsa_region_get_info (
583       d->global_region, HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED, &boolarg));
584   assert (boolarg != 0);
585 
586   pocl_reinit_system_memory ();
587 
588   HSA_CHECK (hsa_signal_create (1, 1, &d->agent, &d->nudge_driver_thread));
589   d->exit_driver_thread = 0;
590   pthread_mutexattr_t mattr;
591   PTHREAD_CHECK (pthread_mutexattr_init (&mattr));
592   PTHREAD_CHECK (pthread_mutexattr_settype (&mattr, PTHREAD_MUTEX_ERRORCHECK));
593   PTHREAD_CHECK (pthread_mutex_init (&d->list_mutex, &mattr));
594 
595   uint64_t hsa_freq;
596   HSA_CHECK (
597       hsa_system_get_info (HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, &hsa_freq));
598   d->timeout = hsa_freq; // 1 second in hsa units
599   d->timestamp_unit = (1000000000.0 / (double)hsa_freq);
600   POCL_MSG_PRINT_INFO ("HSA timestamp frequency: %" PRIu64 "\n", hsa_freq);
601   POCL_MSG_PRINT_INFO ("HSA timeout: %" PRIu64 "\n", d->timeout);
602   POCL_MSG_PRINT_INFO ("HSA timestamp unit: %g\n", d->timestamp_unit);
603 
604 #if AMD_HSA == 1
605   /* TODO check at runtime */
606   d->have_wait_any = 1;
607 #endif
608 
609 #if AMD_HSA == 1
610   if (dev->vendor_id == AMD_VENDOR_ID)
611     {
612       char booltest = 0;
613       HSA_CHECK (hsa_region_get_info (
614           d->global_region, HSA_AMD_REGION_INFO_HOST_ACCESSIBLE, &booltest));
615       assert (booltest != 0);
616     }
617 #endif
618 
619   size_t sizearg;
620   HSA_CHECK (hsa_region_get_info (d->global_region,
621                                   HSA_REGION_INFO_ALLOC_MAX_SIZE, &sizearg));
622   dev->max_mem_alloc_size = sizearg;
623 
624   /* For some reason, the global region size returned is 128 Terabytes...
625    * for now, use the max alloc size, it seems to be a much more reasonable
626    * value.
627    * HSA_CHECK(hsa_region_get_info(d->global_region, HSA_REGION_INFO_SIZE,
628    *                               &sizearg));
629    */
630   HSA_CHECK (
631       hsa_region_get_info (d->global_region, HSA_REGION_INFO_SIZE, &sizearg));
632   dev->global_mem_size = sizearg;
633   if (dev->global_mem_size > 16 * 1024 * 1024 * (uint64_t)1024)
634     dev->global_mem_size = dev->max_mem_alloc_size;
635 
636   HSA_CHECK (
637       hsa_region_get_info (d->group_region, HSA_REGION_INFO_SIZE, &sizearg));
638   dev->local_mem_size = sizearg;
639 
640   HSA_CHECK (hsa_region_get_info (
641       d->global_region, HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT, &sizearg));
642   dev->mem_base_addr_align = max (sizearg, MAX_EXTENDED_ALIGNMENT);
643 
644   HSA_CHECK (hsa_agent_get_info (d->agent, HSA_AGENT_INFO_PROFILE,
645                                  &d->agent_profile));
646 
647   if (d->agent_profile == HSA_PROFILE_FULL)
648     pocl_setup_device_for_system_memory (dev);
649 
650   dev->profile = "FULL_PROFILE";
651   dev->has_own_timer = CL_TRUE;
652 
653   dev->compiler_available = CL_TRUE;
654   dev->linker_available = CL_TRUE;
655 
656   dev->profiling_timer_resolution = (size_t) (d->timestamp_unit) || 1;
657 
658   if (dev->device_side_printf)
659     {
660       d->printf_buffer = pocl_hsa_malloc
661         (dev->global_memory, dev->printf_buffer_size, d->global_region);
662 
663       d->printf_write_pos = pocl_hsa_malloc
664         (dev->global_memory, sizeof (size_t), d->global_region);
665     }
666 
667   d->exit_driver_thread = 0;
668   PTHREAD_CHECK (pthread_create (&d->driver_pthread_id, NULL,
669                                  &pocl_hsa_driver_pthread, dev));
670 
671   return d;
672 }
673 
674 cl_int
pocl_hsa_init(unsigned j,cl_device_id dev,const char * parameters)675 pocl_hsa_init (unsigned j, cl_device_id dev, const char *parameters)
676 {
677   pocl_init_default_device_infos (dev);
678 
679   SETUP_DEVICE_CL_VERSION(HSA_DEVICE_CL_VERSION_MAJOR,
680                           HSA_DEVICE_CL_VERSION_MINOR)
681 
682   dev->spmd = CL_TRUE;
683   dev->arg_buffer_launcher = CL_FALSE;
684   dev->autolocals_to_args = POCL_AUTOLOCALS_TO_ARGS_NEVER;
685   dev->device_alloca_locals = CL_FALSE;
686 
687   dev->local_as_id = SPIR_ADDRESS_SPACE_LOCAL;
688   dev->constant_as_id = SPIR_ADDRESS_SPACE_CONSTANT;
689 
690   assert (found_hsa_agents > 0);
691   assert (j < found_hsa_agents);
692   dev->data = (void*)(uintptr_t)j;
693   hsa_agent_t agent = hsa_agents[j];
694 
695   uint32_t cache_sizes[4];
696   HSA_CHECK(hsa_agent_get_info (agent, HSA_AGENT_INFO_CACHE_SIZE,
697                         &cache_sizes));
698   // The only nonzero value on Kaveri is the first (L1)
699   dev->global_mem_cache_size = cache_sizes[0];
700 
701   dev->short_name = dev->long_name = (char*)malloc (64*sizeof(char));
702   HSA_CHECK(hsa_agent_get_info (agent, HSA_AGENT_INFO_NAME, dev->long_name));
703   get_hsa_device_features (dev->long_name, dev);
704 
705   dev->type = CL_DEVICE_TYPE_GPU;
706 
707   // Enable when it's actually implemented AND if supported by
708   // the target agent (check with hsa_agent_extension_supported).
709   dev->image_support = CL_FALSE;
710 
711   dev->single_fp_config = CL_FP_ROUND_TO_NEAREST | CL_FP_ROUND_TO_ZERO
712       | CL_FP_ROUND_TO_INF | CL_FP_FMA | CL_FP_INF_NAN;
713   dev->double_fp_config = CL_FP_ROUND_TO_NEAREST | CL_FP_ROUND_TO_ZERO
714                           | CL_FP_ROUND_TO_INF | CL_FP_FMA | CL_FP_INF_NAN;
715 
716   hsa_machine_model_t model;
717   HSA_CHECK (hsa_agent_get_info (agent, HSA_AGENT_INFO_MACHINE_MODEL, &model));
718   dev->address_bits = (model == HSA_MACHINE_MODEL_LARGE) ? 64 : 32;
719 
720   uint16_t wg_sizes[3];
721   HSA_CHECK (
722       hsa_agent_get_info (agent, HSA_AGENT_INFO_WORKGROUP_MAX_DIM, &wg_sizes));
723 
724   int max_wg = pocl_get_int_option ("POCL_MAX_WORK_GROUP_SIZE", 0);
725   if (max_wg > 0)
726     {
727       wg_sizes[0] = min (wg_sizes[0], max_wg);
728       wg_sizes[1] = min (wg_sizes[1], max_wg);
729       wg_sizes[2] = min (wg_sizes[2], max_wg);
730     }
731 
732   dev->max_work_item_sizes[0] = wg_sizes[0];
733   dev->max_work_item_sizes[1] = wg_sizes[1];
734   dev->max_work_item_sizes[2] = wg_sizes[2];
735 
736   /* Specialize WG functions for grid dimensions of width <= USHRT_MAX. */
737   dev->grid_width_specialization_limit = USHRT_MAX;
738 
739   HSA_CHECK(hsa_agent_get_info
740     (agent, HSA_AGENT_INFO_WORKGROUP_MAX_SIZE, &dev->max_work_group_size));
741 
742   if (max_wg > 0)
743     dev->max_work_group_size = max_wg;
744 
745   /* Assume a small maximum work-group size indicates also the desire to
746      maximally utilize it. */
747   if (max_wg <= 256)
748     dev->ops->compute_local_size = pocl_wg_utilization_maximizer;
749 
750   if (AMD_HSA && dev->vendor_id == AMD_VENDOR_ID)
751     {
752 #if AMD_HSA == 1
753       uint32_t temp;
754       HSA_CHECK (hsa_agent_get_info (agent, HSA_AMD_AGENT_INFO_CACHELINE_SIZE,
755                                      &temp));
756       dev->global_mem_cacheline_size = temp;
757 
758       HSA_CHECK (hsa_agent_get_info (
759           agent, HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, &temp));
760       dev->max_compute_units = temp;
761 
762       HSA_CHECK (hsa_agent_get_info (
763           agent, HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY, &temp));
764       dev->max_clock_frequency = temp;
765 #endif
766     }
767   else
768     {
769       /* Could not use AMD extensions to find out CU/frequency of the device.
770 	 Using dummy values. */
771       dev->global_mem_cacheline_size = 64;
772       dev->max_compute_units = 4;
773       dev->max_clock_frequency = 700;
774     }
775 
776   HSA_CHECK(hsa_agent_get_info
777     (agent, HSA_AGENT_INFO_WORKGROUP_MAX_SIZE, &dev->max_work_group_size));
778 
779   /* Image features. */
780   if (dev->image_support == CL_TRUE)
781     {
782       hsa_dim3_t image_size;
783       HSA_CHECK (hsa_agent_get_info (
784           agent, HSA_EXT_AGENT_INFO_IMAGE_1D_MAX_ELEMENTS, &image_size));
785       dev->image_max_buffer_size = image_size.x;
786       HSA_CHECK (hsa_agent_get_info (
787           agent, HSA_EXT_AGENT_INFO_IMAGE_2D_MAX_ELEMENTS, &image_size));
788       dev->image2d_max_height = image_size.x;
789       dev->image2d_max_width = image_size.y;
790       HSA_CHECK (hsa_agent_get_info (
791           agent, HSA_EXT_AGENT_INFO_IMAGE_3D_MAX_ELEMENTS, &image_size));
792       dev->image3d_max_height = image_size.x;
793       dev->image3d_max_width = image_size.y;
794       dev->image3d_max_depth = image_size.z;
795       // is this directly the product of the dimensions?
796       //stat = hsa_agent_get_info(agent, ??, &dev->image_max_array_size);
797       HSA_CHECK (hsa_agent_get_info (agent,
798                                      HSA_EXT_AGENT_INFO_MAX_IMAGE_RD_HANDLES,
799                                      &dev->max_read_image_args));
800       HSA_CHECK (hsa_agent_get_info (agent,
801                                      HSA_EXT_AGENT_INFO_MAX_IMAGE_RORW_HANDLES,
802                                      &dev->max_read_write_image_args));
803       dev->max_write_image_args = dev->max_read_write_image_args;
804       HSA_CHECK (hsa_agent_get_info (
805           agent, HSA_EXT_AGENT_INFO_MAX_SAMPLER_HANDLERS, &dev->max_samplers));
806     }
807 
808   dev->svm_allocation_priority = 2;
809   /* OpenCL 2.0 properties */
810   dev->svm_caps = CL_DEVICE_SVM_COARSE_GRAIN_BUFFER
811                   | CL_DEVICE_SVM_FINE_GRAIN_BUFFER
812                   | CL_DEVICE_SVM_ATOMICS;
813   /* This is from clinfo output ran on AMD Catalyst drivers */
814   dev->max_events = 1024;
815   dev->max_queues = 1;
816   dev->max_pipe_args = 16;
817   dev->max_pipe_active_res = 16;
818   dev->max_pipe_packet_size = 1024 * 1024;
819   dev->dev_queue_pref_size = 256 * 1024;
820   dev->dev_queue_max_size = 512 * 1024;
821   dev->on_dev_queue_props
822       = CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | CL_QUEUE_PROFILING_ENABLE;
823   dev->on_host_queue_props = CL_QUEUE_PROFILING_ENABLE;
824 
825   pocl_hsa_device_data_t *d = init_dev_data (dev, j);
826 
827   /* 0 is the host memory shared with all drivers that use it */
828   if (d->agent_profile == HSA_PROFILE_FULL)
829     dev->global_mem_id = 0;
830 
831   return CL_SUCCESS;
832 }
833 
834 
835 static void*
pocl_hsa_malloc(pocl_global_mem_t * mem,size_t size,hsa_region_t r)836 pocl_hsa_malloc (pocl_global_mem_t *mem, size_t size, hsa_region_t r)
837 {
838   void *b = NULL;
839 
840   if (hsa_memory_allocate(r, size, &b) != HSA_STATUS_SUCCESS)
841     {
842       POCL_MSG_PRINT_INFO ("hsa_memory_allocate failed");
843       return NULL;
844     }
845 
846   if (b)
847     POCL_MSG_PRINT_INFO("HSA malloc'ed : size %" PRIuS " @ %p\n", size, b);
848 
849   /* TODO: Due to lack of align parameter to the HSA allocation function, we
850      should align the buffer here ourselves.  For now, let's just hope that
851      the called HSA implementation wide aligns (currently to 128).  */
852   if ((uint64_t)b % MAX_EXTENDED_ALIGNMENT > 0)
853     POCL_MSG_WARN("HSA runtime returned a buffer with smaller alignment "
854 		  "than %d", MAX_EXTENDED_ALIGNMENT);
855 
856   return b;
857 }
858 
859 void
pocl_hsa_copy(void * data,pocl_mem_identifier * dst_mem_id,cl_mem dst_buf,pocl_mem_identifier * src_mem_id,cl_mem src_buf,size_t dst_offset,size_t src_offset,size_t size)860 pocl_hsa_copy (void *data,
861                pocl_mem_identifier * dst_mem_id,
862                cl_mem dst_buf,
863                pocl_mem_identifier * src_mem_id,
864                cl_mem src_buf,
865                size_t dst_offset,
866                size_t src_offset,
867                size_t size)
868 {
869   void *__restrict__ dst_ptr = dst_mem_id->mem_ptr;
870   void *__restrict__ src_ptr = src_mem_id->mem_ptr;
871   if ((src_ptr + src_offset) == (dst_ptr + dst_offset))
872     return;
873   HSA_CHECK (
874       hsa_memory_copy (dst_ptr + dst_offset, src_ptr + src_offset, size));
875 }
876 
877 cl_int
pocl_hsa_alloc_mem_obj(cl_device_id device,cl_mem mem_obj,void * host_ptr)878 pocl_hsa_alloc_mem_obj (cl_device_id device, cl_mem mem_obj, void *host_ptr)
879 {
880   /* if we share global memory with CPU, let the CPU driver allocate it */
881   if (device->global_mem_id == 0)
882     return pocl_basic_alloc_mem_obj (device, mem_obj, host_ptr);
883 
884   /* ... otherwise allocate it via HSA. */
885   pocl_mem_identifier *p = &mem_obj->device_ptrs[device->global_mem_id];
886   pocl_global_mem_t *gmem = device->global_memory;
887   pocl_hsa_device_data_t* d = device->data;
888   void *b = pocl_hsa_malloc (gmem, mem_obj->size, d->global_region);
889   p->mem_ptr = b;
890   p->version = 0;
891 
892   if (b == NULL)
893     return CL_MEM_OBJECT_ALLOCATION_FAILURE;
894   else
895     return CL_SUCCESS;
896 }
897 
898 void
pocl_hsa_free(cl_device_id device,cl_mem memobj)899 pocl_hsa_free (cl_device_id device, cl_mem memobj)
900 {
901   /* if we share global memory with CPU, let the CPU driver free it */
902   if (device->global_mem_id == 0)
903     return pocl_basic_free (device, memobj);
904 
905   /* ... otherwise free it via HSA. */
906   cl_mem_flags flags = memobj->flags;
907   pocl_mem_identifier *p = &memobj->device_ptrs[device->global_mem_id];
908   hsa_memory_free(p->mem_ptr);
909   p->mem_ptr = NULL;
910   p->version = 0;
911 }
912 
pocl_hsa_svm_register(cl_device_id dev,void * host_ptr,size_t size)913 void pocl_hsa_svm_register (cl_device_id dev, void *host_ptr, size_t size)
914 {
915   POCL_MSG_PRINT_HSA ("hsa_memory_register()\n");
916   hsa_memory_register(host_ptr, size);
917 }
918 
pocl_hsa_svm_unregister(cl_device_id dev,void * host_ptr,size_t size)919 void pocl_hsa_svm_unregister (cl_device_id dev, void *host_ptr, size_t size)
920 {
921   POCL_MSG_PRINT_HSA ("hsa_memory_deregister()\n");
922   hsa_memory_deregister(host_ptr, size);
923 }
924 
925 
926 static void
setup_kernel_args(pocl_hsa_device_data_t * d,_cl_command_node * cmd,pocl_hsa_event_data_t * event_data,size_t max_args_size,uint32_t * total_group_size)927 setup_kernel_args (pocl_hsa_device_data_t *d,
928                    _cl_command_node *cmd,
929                    pocl_hsa_event_data_t *event_data,
930                    size_t max_args_size,
931                    uint32_t *total_group_size)
932 {
933   char *write_pos = event_data->kernargs;
934   const char *last_pos = event_data->kernargs + max_args_size;
935   cl_kernel kernel = cmd->command.run.kernel;
936   pocl_kernel_metadata_t *meta = kernel->meta;
937 
938   POCL_MSG_PRINT_INFO ("setup_kernel_args for %s\n",
939 		       cmd->command.run.kernel->name);
940 #define CHECK_AND_ALIGN_SPACE(DSIZE)                         \
941   do {                                                       \
942     if (write_pos + (DSIZE) > last_pos)                      \
943       POCL_ABORT("pocl-hsa: too many kernel arguments!\n");  \
944     unsigned unaligned = (intptr_t)write_pos % DSIZE;        \
945     if (unaligned > 0) write_pos += (DSIZE - unaligned);     \
946   } while (0)
947 
948   size_t i;
949   for (i = 0; i < meta->num_args + meta->num_locals; ++i)
950     {
951       struct pocl_argument *al = &(cmd->command.run.arguments[i]);
952 
953       if (i >= meta->num_args || ARG_IS_LOCAL (meta->arg_info[i]))
954         {
955 	  size_t buf_size = ARG_IS_LOCAL (meta->arg_info[i]) ?
956 	    al->size : meta->local_sizes[i - meta->num_args];
957           if (d->device->device_alloca_locals)
958             {
959               /* Local buffers are allocated in the device side work-group
960                  launcher. Let's pass only the sizes of the local args in
961                  the arg buffer. */
962               assert (sizeof (size_t) == 8);
963               CHECK_AND_ALIGN_SPACE (sizeof (size_t));
964               memcpy (write_pos, &buf_size, sizeof (size_t));
965               write_pos += sizeof (size_t);
966             }
967           else if (HSAIL_ENABLED)
968             {
969               CHECK_AND_ALIGN_SPACE (sizeof (uint32_t));
970               memcpy (write_pos, total_group_size, sizeof (uint32_t));
971               *total_group_size += (uint32_t)buf_size;
972               write_pos += sizeof (uint32_t);
973             }
974           else
975             assert (0 && "Unsupported local mem allocation scheme.");
976         }
977       else if (meta->arg_info[i].type == POCL_ARG_TYPE_POINTER)
978         {
979           CHECK_AND_ALIGN_SPACE(sizeof (uint64_t));
980           /* Assuming the pointers are 64b (or actually the same as in
981              host) due to HSA. TODO: the 32b profile. */
982 
983           if (al->value == NULL)
984             {
985               uint64_t temp = 0;
986               memcpy (write_pos, &temp, sizeof (uint64_t));
987             }
988           else
989             {
990               uint64_t dev_ptr = 0;
991               if (al->is_svm)
992                 dev_ptr = (uint64_t) (*(void **)al->value);
993               else
994                 {
995                   cl_mem m = *(cl_mem *)al->value;
996                   dev_ptr
997                       = (uint64_t)m->device_ptrs[cmd->device->dev_id].mem_ptr;
998                   if (m->flags & CL_MEM_USE_HOST_PTR
999                       && d->agent_profile == HSA_PROFILE_BASE)
1000                     {
1001                       POCL_MSG_PRINT_INFO (
1002                           "HSA: Copy HOST_PTR allocated %lu byte buffer "
1003                           "from %p to %lx due to having a BASE profile "
1004                           "agent.\n",
1005                           m->size, m->mem_host_ptr, dev_ptr);
1006                       hsa_memory_copy ((void *)dev_ptr, m->mem_host_ptr,
1007                                        m->size);
1008                     }
1009                 }
1010 
1011               dev_ptr += al->offset;
1012               memcpy (write_pos, &dev_ptr, sizeof (uint64_t));
1013             }
1014           POCL_MSG_PRINT_INFO (
1015               "arg %lu (global ptr) written to %lx val %lx arg offs %d\n", i,
1016               (uint64_t)write_pos, *(uint64_t *)write_pos,
1017               (int)(write_pos - (char*)event_data->kernargs));
1018           write_pos += sizeof (uint64_t);
1019         }
1020       else if (meta->arg_info[i].type == POCL_ARG_TYPE_IMAGE)
1021         {
1022           POCL_ABORT_UNIMPLEMENTED("pocl-hsa: image arguments"
1023                                    " not implemented.\n");
1024         }
1025       else if (meta->arg_info[i].type == POCL_ARG_TYPE_SAMPLER)
1026         {
1027           POCL_ABORT_UNIMPLEMENTED("pocl-hsa: sampler arguments"
1028                                    " not implemented.\n");
1029         }
1030       else
1031         {
1032           // Scalars.
1033           CHECK_AND_ALIGN_SPACE (al->size);
1034           memcpy (write_pos, al->value, al->size);
1035           POCL_MSG_PRINT_INFO (
1036               "arg %lu (scalar) written to %lx val %x offs %d\n", i,
1037               (uint64_t)write_pos, *(uint32_t *)al->value,
1038               (int)(write_pos - (char*)event_data->kernargs));
1039           write_pos += al->size;
1040         }
1041     }
1042 
1043   CHECK_AND_ALIGN_SPACE(sizeof (uint64_t));
1044 
1045   /* Copy the context object to HSA allocated global memory to ensure Base
1046      profile agents can access it. */
1047 
1048   event_data->context = pocl_hsa_malloc
1049     (d->device->global_memory, POCL_CONTEXT_SIZE (d->device->address_bits),
1050      d->global_region);
1051 
1052   if (d->device->address_bits == 64)
1053     memcpy (event_data->context, &cmd->command.run.pc, sizeof (struct pocl_context));
1054   else
1055     POCL_CONTEXT_COPY64TO32 (event_data->context, &cmd->command.run.pc);
1056 
1057   memcpy (write_pos, &event_data->context, sizeof (event_data->context));
1058   POCL_MSG_PRINT_INFO ("A %d-bit context object was written at %p offs %d\n",
1059                        d->device->address_bits, event_data->context,
1060                        (int)(write_pos - (char*)event_data->kernargs));
1061   write_pos += sizeof (uint64_t);
1062 
1063   /* MUST TODO: free the local buffers after finishing the kernel in case of
1064      host side allocation.
1065    */
1066 }
1067 
1068 static int
compile_parallel_bc_to_brig(char * brigfile,_cl_command_node * cmd,int specialize)1069 compile_parallel_bc_to_brig (char *brigfile, _cl_command_node *cmd,
1070                              int specialize)
1071 {
1072   int error;
1073   char hsailfile[POCL_FILENAME_LENGTH];
1074   char parallel_bc_path[POCL_FILENAME_LENGTH];
1075   _cl_command_run *run_cmd = &cmd->command.run;
1076 
1077   pocl_cache_work_group_function_path (parallel_bc_path,
1078                                        run_cmd->kernel->program, cmd->device_i,
1079                                        run_cmd->kernel, cmd, specialize);
1080 
1081   strcpy (brigfile, parallel_bc_path);
1082   strncat (brigfile, ".brig", POCL_FILENAME_LENGTH-1);
1083   strcpy (hsailfile, parallel_bc_path);
1084   strncat (hsailfile, ".hsail", POCL_FILENAME_LENGTH-1);
1085 
1086   if (pocl_exists (brigfile))
1087     POCL_MSG_PRINT_INFO("pocl-hsa: using existing BRIG file: \n%s\n",
1088                         brigfile);
1089   else
1090     {
1091       // TODO call llvm via c++ interface like pocl_llvm_codegen()
1092       POCL_MSG_PRINT_HSA ("BRIG file not found,"
1093                           " compiling parallel.bc to brig file: \n%s\n",
1094                           parallel_bc_path);
1095 
1096 
1097       char* args1[] = { LLVM_LLC, "-O2", "-march=hsail64", "-filetype=asm",
1098                         "-o", hsailfile, parallel_bc_path, NULL };
1099       if ((error = pocl_run_command (args1)))
1100         {
1101           POCL_MSG_PRINT_HSA ("llc exit status %i\n", error);
1102           return error;
1103         }
1104 
1105       char* args2[] = { HSAIL_ASM, "-o", brigfile, hsailfile, NULL };
1106       if ((error = pocl_run_command (args2)))
1107         {
1108           POCL_MSG_PRINT_HSA ("HSAILasm exit status %i\n", error);
1109           return error;
1110         }
1111     }
1112 
1113   return 0;
1114 }
1115 
1116 static pocl_hsa_kernel_cache_t *
pocl_hsa_find_mem_cached_kernel(pocl_hsa_device_data_t * d,_cl_command_run * cmd)1117 pocl_hsa_find_mem_cached_kernel (pocl_hsa_device_data_t *d,
1118                                  _cl_command_run *cmd)
1119 {
1120   size_t i;
1121   for (i = 0; i < HSA_KERNEL_CACHE_SIZE; i++)
1122     {
1123       if (((d->kernel_cache[i].kernel == NULL)
1124            || (memcmp (d->kernel_cache[i].kernel_hash, cmd->hash,
1125                        sizeof (pocl_kernel_hash_t))
1126                != 0)))
1127         continue;
1128 
1129       if (d->kernel_cache[i].local_x == cmd->pc.local_size[0]
1130           && d->kernel_cache[i].local_y == cmd->pc.local_size[1]
1131           && d->kernel_cache[i].local_z == cmd->pc.local_size[2]
1132           && (!d->kernel_cache[i].goffs_zero
1133               || (cmd->pc.global_offset[0] == 0
1134                   && cmd->pc.global_offset[1] == 0
1135                   && cmd->pc.global_offset[2] == 0))
1136           && pocl_cmd_max_grid_dim_width (cmd)
1137                  <= d->kernel_cache[i].max_grid_dim_width)
1138         return &d->kernel_cache[i];
1139     }
1140   return NULL;
1141 }
1142 
1143 void
pocl_hsa_compile_kernel_native(_cl_command_node * cmd,cl_kernel kernel,cl_device_id device,int specialize)1144 pocl_hsa_compile_kernel_native (_cl_command_node *cmd, cl_kernel kernel,
1145                                 cl_device_id device, int specialize)
1146 {
1147   pocl_hsa_device_data_t *d = (pocl_hsa_device_data_t*)device->data;
1148 
1149   _cl_command_run *run_cmd = &cmd->command.run;
1150 
1151   POCL_LOCK (d->pocl_hsa_compilation_lock);
1152   assert (cmd->command.run.kernel == kernel);
1153   char *binary_fn = pocl_check_kernel_disk_cache (cmd, specialize);
1154   if (pocl_hsa_find_mem_cached_kernel (d, &cmd->command.run) != NULL)
1155     {
1156         POCL_MSG_PRINT_INFO("built kernel found in mem cache\n");
1157         POCL_UNLOCK (d->pocl_hsa_compilation_lock);
1158         return;
1159     }
1160 
1161   POCL_MSG_PRINT_INFO ("pocl-hsa: loading native binary from file %s.\n",
1162                        binary_fn);
1163 
1164   uint64_t elf_size;
1165   FILE *elf_file;
1166   elf_file = fopen(binary_fn, "rb");
1167   if (elf_file == NULL)
1168     POCL_ABORT ("pocl-hsa: could not get the file size of the native "
1169                 "binary\n");
1170 
1171   /* This assumes phsa-runtime's deserialization input format
1172      which stores the following data: */
1173   uint32_t metadata_size =
1174     sizeof (uint64_t) /* The ELF bin size. ELF bin follows. */ +
1175     sizeof (hsa_isa_t) +
1176     sizeof (hsa_default_float_rounding_mode_t) + sizeof (hsa_profile_t) +
1177     sizeof (hsa_machine_model_t);
1178 
1179   /* TODO: Use HSA's deserialization interface to store the final binary
1180      to disk so we don't need to wrap it here and fix to phsa's format.  */
1181   fseek (elf_file, 0, SEEK_END);
1182   elf_size = ftell (elf_file);
1183   fseek (elf_file, 0, SEEK_SET);
1184 
1185   uint64_t blob_size = metadata_size + elf_size;
1186 
1187   uint8_t *blob = malloc (blob_size);
1188   uint8_t *wpos = blob;
1189 
1190   memcpy (wpos, &elf_size, sizeof (elf_size));
1191   wpos += sizeof (elf_size);
1192 
1193   uint64_t read_size;
1194   if (fread (wpos, 1, elf_size, elf_file) != elf_size)
1195     POCL_ABORT("pocl-hsa: could not read the native ELF binary.\n");
1196   fclose (elf_file);
1197 
1198   POCL_MSG_PRINT_INFO("pocl-hsa: native binary size: %lu.\n", elf_size);
1199 
1200   wpos += elf_size;
1201 
1202   /* Assume the rest of the HSA properties are OK as zero. */
1203   memset (wpos, 0, metadata_size - sizeof (uint64_t));
1204 
1205   hsa_executable_t exe;
1206   hsa_code_object_t obj;
1207 
1208   HSA_CHECK (hsa_executable_create (d->agent_profile,
1209                                     HSA_EXECUTABLE_STATE_UNFROZEN, "", &exe));
1210 
1211   HSA_CHECK(hsa_code_object_deserialize (blob, blob_size, "", &obj));
1212 
1213   HSA_CHECK(hsa_executable_load_code_object (exe, d->agent, obj, ""));
1214 
1215   HSA_CHECK(hsa_executable_freeze (exe, NULL));
1216 
1217   free (blob);
1218 
1219   int i = d->kernel_cache_lastptr;
1220   if (i < HSA_KERNEL_CACHE_SIZE)
1221     {
1222       d->kernel_cache[i].kernel = kernel;
1223       memcpy (d->kernel_cache[i].kernel_hash, cmd->command.run.hash,
1224               sizeof (pocl_kernel_hash_t));
1225       d->kernel_cache[i].local_x = run_cmd->pc.local_size[0];
1226       d->kernel_cache[i].local_y = run_cmd->pc.local_size[1];
1227       d->kernel_cache[i].local_z = run_cmd->pc.local_size[2];
1228       d->kernel_cache[i].goffs_zero = run_cmd->pc.global_offset[0] == 0
1229                                       && run_cmd->pc.global_offset[1] == 0
1230                                       && run_cmd->pc.global_offset[2] == 0;
1231 
1232       size_t max_grid_width = pocl_cmd_max_grid_dim_width (&cmd->command.run);
1233       d->kernel_cache[i].max_grid_dim_width
1234           = max_grid_width > device->grid_width_specialization_limit
1235                 ? SIZE_MAX
1236                 : device->grid_width_specialization_limit;
1237       d->kernel_cache[i].hsa_exe.handle = exe.handle;
1238       d->kernel_cache_lastptr++;
1239     }
1240   else
1241     POCL_ABORT ("kernel cache full\n");
1242 
1243   hsa_executable_symbol_t kernel_symbol;
1244 
1245   const char *launcher_name_tmpl = "phsa_kernel.%s_grid_launcher";
1246   size_t launcher_name_length =
1247     strlen (kernel->name) + strlen (launcher_name_tmpl) + 1;
1248   char *symbol_name = malloc (launcher_name_length);
1249 
1250   snprintf (symbol_name, launcher_name_length, launcher_name_tmpl,
1251             kernel->name);
1252 
1253   POCL_MSG_PRINT_INFO ("pocl-hsa: getting kernel symbol %s.\n", symbol_name);
1254 
1255   HSA_CHECK (hsa_executable_get_symbol (exe, NULL, symbol_name, d->agent, 0,
1256                                         &kernel_symbol));
1257   free (symbol_name);
1258 
1259   hsa_symbol_kind_t symtype;
1260   HSA_CHECK (hsa_executable_symbol_get_info (
1261       kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_TYPE, &symtype));
1262   if (symtype != HSA_SYMBOL_KIND_KERNEL)
1263     POCL_ABORT ("pocl-hsa: the kernel function symbol resolves "
1264                 "to something else than a function\n");
1265 
1266   uint64_t code_handle;
1267   HSA_CHECK(hsa_executable_symbol_get_info
1268     (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &code_handle));
1269 
1270   d->kernel_cache[i].code_handle = code_handle;
1271 
1272   /* Group and private memory allocation is done via pocl, HSA runtime
1273      should not mind these.  */
1274   d->kernel_cache[i].static_group_size = 0;
1275   d->kernel_cache[i].private_size = 0;
1276   d->kernel_cache[i].args_segment_size = 2048;
1277 
1278   POCL_UNLOCK (d->pocl_hsa_compilation_lock);
1279   POCL_MSG_PRINT_INFO("pocl-hsa: native kernel compilation for phsa "
1280 		      "finished\n");
1281 }
1282 
1283 void
pocl_hsa_compile_kernel_hsail(_cl_command_node * cmd,cl_kernel kernel,cl_device_id device,int specialize)1284 pocl_hsa_compile_kernel_hsail (_cl_command_node *cmd, cl_kernel kernel,
1285                                cl_device_id device, int specialize)
1286 {
1287   char brigfile[POCL_FILENAME_LENGTH];
1288   char *brig_blob;
1289 
1290   pocl_hsa_device_data_t *d = (pocl_hsa_device_data_t*)device->data;
1291 
1292   hsa_executable_t final_obj;
1293 
1294   _cl_command_run *run_cmd = &cmd->command.run;
1295 
1296   POCL_LOCK (d->pocl_hsa_compilation_lock);
1297 
1298   int error = pocl_llvm_generate_workgroup_function (cmd->device_i, device,
1299                                                      kernel, cmd, specialize);
1300   if (error)
1301     {
1302       POCL_MSG_PRINT_GENERAL ("HSA: pocl_llvm_generate_workgroup_function()"
1303                               " failed for kernel %s\n", kernel->name);
1304       assert (error == 0);
1305     }
1306 
1307   unsigned i;
1308   if (pocl_hsa_find_mem_cached_kernel (d, run_cmd) != NULL)
1309     {
1310         POCL_MSG_PRINT_INFO("built kernel found in mem cache\n");
1311         POCL_UNLOCK (d->pocl_hsa_compilation_lock);
1312         return;
1313     }
1314 
1315   if (compile_parallel_bc_to_brig (brigfile, cmd, specialize))
1316     POCL_ABORT("Compiling LLVM IR -> HSAIL -> BRIG failed.\n");
1317 
1318   POCL_MSG_PRINT_HSA ("loading binary from file %s.\n", brigfile);
1319   uint64_t filesize = 0;
1320   int read = pocl_read_file(brigfile, &brig_blob, &filesize);
1321 
1322   if (read != 0)
1323     POCL_ABORT("pocl-hsa: could not read the binary.\n");
1324 
1325   POCL_MSG_PRINT_HSA ("BRIG binary size: %lu.\n", filesize);
1326 
1327   hsa_ext_module_t hsa_module = (hsa_ext_module_t)brig_blob;
1328 
1329   hsa_ext_program_t hsa_program;
1330   memset (&hsa_program, 0, sizeof (hsa_ext_program_t));
1331 
1332   HSA_CHECK(hsa_ext_program_create
1333     (HSA_MACHINE_MODEL_LARGE, HSA_PROFILE_FULL,
1334      HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT, NULL,
1335      &hsa_program));
1336 
1337   HSA_CHECK(hsa_ext_program_add_module (hsa_program, hsa_module));
1338 
1339   hsa_isa_t isa;
1340   HSA_CHECK(hsa_agent_get_info (d->agent, HSA_AGENT_INFO_ISA, &isa));
1341 
1342   hsa_ext_control_directives_t control_directives;
1343   memset (&control_directives, 0, sizeof (hsa_ext_control_directives_t));
1344 
1345   hsa_code_object_t code_object;
1346   HSA_CHECK(hsa_ext_program_finalize
1347     (hsa_program, isa, 0, control_directives, "",
1348      HSA_CODE_OBJECT_TYPE_PROGRAM, &code_object));
1349 
1350   HSA_CHECK(hsa_executable_create (d->agent_profile,
1351                                   HSA_EXECUTABLE_STATE_UNFROZEN,
1352                                   "", &final_obj));
1353 
1354   HSA_CHECK(hsa_executable_load_code_object (final_obj, d->agent,
1355                                             code_object, ""));
1356 
1357   HSA_CHECK(hsa_executable_freeze (final_obj, NULL));
1358 
1359   HSA_CHECK(hsa_ext_program_destroy(hsa_program));
1360 
1361   free (brig_blob);
1362 
1363   i = d->kernel_cache_lastptr;
1364   if (i < HSA_KERNEL_CACHE_SIZE)
1365     {
1366       d->kernel_cache[i].kernel = kernel;
1367       memcpy (d->kernel_cache[i].kernel_hash, cmd->command.run.hash,
1368               sizeof (pocl_kernel_hash_t));
1369       d->kernel_cache[i].hsa_exe.handle = final_obj.handle;
1370       d->kernel_cache_lastptr++;
1371     }
1372   else
1373     POCL_ABORT ("kernel cache full\n");
1374 
1375   hsa_executable_symbol_t kernel_symbol;
1376 
1377   size_t kernel_name_length = strlen (kernel->name);
1378   char *symbol = malloc (kernel_name_length + 2);
1379   symbol[0] = '&';
1380   symbol[1] = '\0';
1381 
1382   strncat (symbol, kernel->name, kernel_name_length);
1383 
1384   POCL_MSG_PRINT_HSA ("getting kernel symbol %s.\n", symbol);
1385 
1386   HSA_CHECK(hsa_executable_get_symbol
1387     (final_obj, NULL, symbol, d->agent, 0, &kernel_symbol));
1388 
1389   free(symbol);
1390 
1391   hsa_symbol_kind_t symtype;
1392   HSA_CHECK(hsa_executable_symbol_get_info
1393     (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_TYPE, &symtype));
1394   if(symtype != HSA_SYMBOL_KIND_KERNEL)
1395     POCL_ABORT ("pocl-hsa: the kernel function symbol resolves "
1396                 "to something else than a function\n");
1397 
1398   uint64_t code_handle;
1399   HSA_CHECK(hsa_executable_symbol_get_info
1400     (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &code_handle));
1401 
1402   d->kernel_cache[i].code_handle = code_handle;
1403 
1404   HSA_CHECK(hsa_executable_symbol_get_info (
1405        kernel_symbol,
1406        HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE,
1407        &d->kernel_cache[i].static_group_size));
1408 
1409   HSA_CHECK(hsa_executable_symbol_get_info (
1410        kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE,
1411        &d->kernel_cache[i].private_size));
1412 
1413   HSA_CHECK(hsa_executable_symbol_get_info (
1414        kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE,
1415        &d->kernel_cache[i].args_segment_size));
1416 
1417   POCL_UNLOCK (d->pocl_hsa_compilation_lock);
1418 }
1419 
1420 cl_int
pocl_hsa_uninit(unsigned j,cl_device_id device)1421 pocl_hsa_uninit (unsigned j, cl_device_id device)
1422 {
1423   assert (found_hsa_agents > 0);
1424   pocl_hsa_device_data_t *d = (pocl_hsa_device_data_t *)device->data;
1425 
1426   if (d->driver_pthread_id)
1427     {
1428       POCL_MSG_PRINT_INFO ("waiting for HSA device pthread"
1429                            " to finish its work...\n");
1430       d->exit_driver_thread = 1;
1431       void* ptr;
1432       PTHREAD_CHECK(pthread_join(d->driver_pthread_id, &ptr));
1433       POCL_MSG_PRINT_INFO ("....done.\n");
1434     }
1435 
1436   if (device->device_side_printf)
1437     {
1438       hsa_memory_free (d->printf_buffer);
1439       hsa_memory_free (d->printf_write_pos);
1440     }
1441 
1442   unsigned i;
1443   for (i = 0; i < HSA_KERNEL_CACHE_SIZE; i++)
1444     if (d->kernel_cache[i].kernel)
1445       {
1446         HSA_CHECK (hsa_executable_destroy (d->kernel_cache[i].hsa_exe));
1447       }
1448 
1449   // TODO: destroy the executables that didn't fit to the kernel
1450   // cache. Also code objects are not destroyed at the moment.
1451   hsa_signal_destroy(d->nudge_driver_thread);
1452 
1453   PTHREAD_CHECK(pthread_mutex_destroy(&d->list_mutex));
1454 
1455   POCL_DESTROY_LOCK (d->pocl_hsa_compilation_lock);
1456 
1457   POCL_MEM_FREE(d);
1458   device->data = NULL;
1459 
1460   // after last device, call HSA runtime shutdown
1461   if (j == (found_hsa_agents - 1))
1462     {
1463       HSA_CHECK (hsa_shut_down ());
1464       found_hsa_agents = 0;
1465     }
1466 
1467   return CL_SUCCESS;
1468 }
1469 
1470 cl_int
pocl_hsa_reinit(unsigned j,cl_device_id device)1471 pocl_hsa_reinit (unsigned j, cl_device_id device)
1472 {
1473   assert (device->data == NULL);
1474   device->data = init_dev_data (device, j);
1475   return CL_SUCCESS;
1476 }
1477 
1478 cl_ulong
pocl_hsa_get_timer_value(void * data)1479 pocl_hsa_get_timer_value(void *data)
1480 {
1481   uint64_t hsa_ts;
1482   HSA_CHECK(hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP, &hsa_ts));
1483   cl_ulong res = (cl_ulong)(hsa_ts *
1484                             ((pocl_hsa_device_data_t*)data)->timestamp_unit);
1485   return res;
1486 }
1487 
1488 #define PN_ADD(array, p) \
1489   do { \
1490     if (array##_size > COMMAND_LIST_SIZE) \
1491       POCL_ABORT("array overload\n"); \
1492     array[array##_size++] = p; \
1493   } \
1494   while (0)
1495 
1496 #define PN_REMOVE(array, index) \
1497   do { \
1498     assert(array##_size > 0); \
1499     array[index] = array[--array##_size]; \
1500     array[array##_size] = NULL; \
1501   } \
1502   while (0)
1503 
1504 void
pocl_hsa_submit(_cl_command_node * node,cl_command_queue cq)1505 pocl_hsa_submit (_cl_command_node *node, cl_command_queue cq)
1506 {
1507   cl_device_id device = node->device;
1508   pocl_hsa_device_data_t *d = device->data;
1509   unsigned added_to_readylist = 0;
1510 
1511   PTHREAD_CHECK (pthread_mutex_lock (&d->list_mutex));
1512 
1513   node->ready = 1;
1514   if (pocl_command_is_ready (node->event))
1515     {
1516       pocl_update_event_submitted (node->event);
1517       PN_ADD(d->ready_list, node->event);
1518       added_to_readylist = 1;
1519     }
1520   else
1521     PN_ADD(d->wait_list, node->event);
1522 
1523   POCL_MSG_PRINT_INFO("After Event %" PRIu64 " submit: WL : %li, RL: %li\n",
1524                       node->event->id, d->wait_list_size, d->ready_list_size);
1525 
1526   POCL_UNLOCK_OBJ (node->event);
1527 
1528   PTHREAD_CHECK(pthread_mutex_unlock(&d->list_mutex));
1529 
1530   if (added_to_readylist)
1531     hsa_signal_subtract_relaxed(d->nudge_driver_thread, 1);
1532 
1533 }
1534 
1535 void
pocl_hsa_join(cl_device_id device,cl_command_queue cq)1536 pocl_hsa_join (cl_device_id device, cl_command_queue cq)
1537 {
1538   POCL_LOCK_OBJ (cq);
1539   if (cq->command_count == 0)
1540     {
1541       POCL_UNLOCK_OBJ (cq);
1542       POCL_MSG_PRINT_HSA ("device->join: empty queue\n");
1543       return;
1544     }
1545   cl_event event = cq->last_event.event;
1546   assert(event);
1547   POCL_LOCK_OBJ (event);
1548   POCL_RETAIN_OBJECT_UNLOCKED (event);
1549   POCL_UNLOCK_OBJ (cq);
1550 
1551   POCL_MSG_PRINT_HSA ("device->join on event %" PRIu64 "\n", event->id);
1552 
1553   if (event->status <= CL_COMPLETE)
1554     {
1555       POCL_MSG_PRINT_HSA ("device->join: last event (%" PRIu64 ") in queue"
1556                           " exists, but is complete\n", event->id);
1557       goto RETURN;
1558     }
1559 
1560   while (event->status > CL_COMPLETE)
1561     {
1562       pocl_hsa_event_data_t *e_d = (pocl_hsa_event_data_t *)event->data;
1563       PTHREAD_CHECK (pthread_cond_wait (&e_d->event_cond, &event->pocl_lock));
1564     }
1565   POCL_MSG_PRINT_HSA ("device->join on event %" PRIu64 " finished"
1566                       " with status: %i\n", event->id, event->status);
1567 
1568 RETURN:
1569   assert (event->status <= CL_COMPLETE);
1570   POCL_UNLOCK_OBJ (event);
1571 
1572   POname (clReleaseEvent) (event);
1573 }
1574 
1575 void
pocl_hsa_flush(cl_device_id device,cl_command_queue cq)1576 pocl_hsa_flush (cl_device_id device, cl_command_queue cq)
1577 {
1578   pocl_hsa_device_data_t *d = (pocl_hsa_device_data_t *)device->data;
1579   hsa_signal_subtract_relaxed(d->nudge_driver_thread, 1);
1580 }
1581 
1582 void
pocl_hsa_notify(cl_device_id device,cl_event event,cl_event finished)1583 pocl_hsa_notify (cl_device_id device, cl_event event, cl_event finished)
1584 {
1585   pocl_hsa_device_data_t *d = device->data;
1586   _cl_command_node *node = event->command;
1587   int added_to_readylist = 0;
1588   POCL_MSG_PRINT_HSA ("notify on event %" PRIu64 " \n", event->id);
1589 
1590   if (finished->status < CL_COMPLETE)
1591     {
1592       pocl_update_event_failed (event);
1593       return;
1594     }
1595 
1596   if (!node->ready)
1597     return;
1598 
1599   if (pocl_command_is_ready (event))
1600     {
1601       if (event->status == CL_QUEUED)
1602         {
1603           pocl_update_event_submitted (event);
1604           PTHREAD_CHECK(pthread_mutex_lock(&d->list_mutex));
1605 
1606           size_t i = 0;
1607           for(i = 0; i < d->wait_list_size; i++)
1608             if (d->wait_list[i] == event)
1609               break;
1610           if (i < d->wait_list_size)
1611             {
1612               POCL_MSG_PRINT_INFO("event %" PRIu64 " wait_list -> ready_list\n",
1613                                   event->id);
1614               PN_ADD(d->ready_list, event);
1615               PN_REMOVE(d->wait_list, i);
1616             }
1617           else
1618             POCL_ABORT("cant move event %" PRIu64 " from waitlist to"
1619                        " readylist - not found in waitlist\n", event->id);
1620           added_to_readylist = 1;
1621           PTHREAD_CHECK(pthread_mutex_unlock(&d->list_mutex));
1622         }
1623       else
1624         POCL_MSG_WARN ("node->ready was 1 but event %" PRIu64 " is"
1625                        " not queued: status %i!\n",
1626                        event->id, event->status);
1627     }
1628 
1629   if (added_to_readylist)
1630     hsa_signal_subtract_relaxed(d->nudge_driver_thread, 1);
1631 }
1632 
1633 void
pocl_hsa_broadcast(cl_event event)1634 pocl_hsa_broadcast (cl_event event)
1635 {
1636   POCL_MSG_PRINT_HSA ("broadcasting\n");
1637   pocl_broadcast(event);
1638 }
1639 
1640 void
pocl_hsa_wait_event(cl_device_id device,cl_event event)1641 pocl_hsa_wait_event(cl_device_id device, cl_event event)
1642 {
1643   POCL_MSG_PRINT_HSA ("device->wait_event on event %" PRIu64 "\n", event->id);
1644   POCL_LOCK_OBJ (event);
1645   if (event->status <= CL_COMPLETE)
1646     {
1647       POCL_MSG_PRINT_HSA ("device->wain_event: last event"
1648                           " (%" PRIu64 ") in queue exists, but is complete\n",
1649                           event->id);
1650       POCL_UNLOCK_OBJ(event);
1651       return;
1652     }
1653   while (event->status > CL_COMPLETE)
1654     {
1655       pocl_hsa_event_data_t *e_d = (pocl_hsa_event_data_t *)event->data;
1656       PTHREAD_CHECK (
1657           pthread_cond_wait (&(e_d->event_cond), &event->pocl_lock));
1658     }
1659   POCL_UNLOCK_OBJ (event);
1660 
1661   POCL_MSG_PRINT_INFO("event wait finished with status: %i\n", event->status);
1662   assert (event->status <= CL_COMPLETE);
1663 }
1664 
1665 /* DRIVER PTHREAD part */
1666 
1667 #if AMD_HSA == 1
1668 /* this is array of "less than 1" conditions for signals,
1669  * passed to hsa_amd_signal_wait_any() as a readonly argument */
1670 static hsa_signal_value_t signal_ones_array[EVENT_LIST_SIZE + 1];
1671 static hsa_signal_condition_t less_than_sigcond_array[EVENT_LIST_SIZE + 1];
1672 static int signal_array_initialized = 0;
1673 #endif
1674 
1675 static void
pocl_hsa_launch(pocl_hsa_device_data_t * d,cl_event event)1676 pocl_hsa_launch (pocl_hsa_device_data_t *d, cl_event event)
1677 {
1678   POCL_LOCK_OBJ (event);
1679   _cl_command_node *cmd = event->command;
1680   _cl_command_run *run_cmd = &cmd->command.run;
1681   cl_kernel kernel = cmd->command.run.kernel;
1682   struct pocl_context *pc = &cmd->command.run.pc;
1683   hsa_kernel_dispatch_packet_t *kernel_packet;
1684   pocl_hsa_device_pthread_data_t* dd = &d->driver_data;
1685   pocl_hsa_event_data_t *event_data = (pocl_hsa_event_data_t *)event->data;
1686 
1687   unsigned i;
1688   pocl_hsa_kernel_cache_t *cached_data
1689       = pocl_hsa_find_mem_cached_kernel (d, run_cmd);
1690   assert (cached_data);
1691 
1692   HSA_CHECK(hsa_memory_allocate (d->kernarg_region,
1693 				 cached_data->args_segment_size,
1694 				 &event_data->kernargs));
1695 
1696   dd->last_queue = (dd->last_queue + 1) % dd->num_queues;
1697   hsa_queue_t* last_queue = dd->queues[dd->last_queue];
1698   const uint64_t queue_mask = last_queue->size - 1;
1699 
1700   uint64_t packet_id = hsa_queue_add_write_index_relaxed (last_queue, 1);
1701   while ((packet_id - hsa_queue_load_read_index_acquire (last_queue))
1702          >= last_queue->size)
1703     {
1704       /* device queue is full. TODO this isn't the optimal solution */
1705       POCL_MSG_WARN("pocl-hsa: queue %" PRIuS " overloaded\n", dd->last_queue);
1706       usleep(2000);
1707     }
1708 
1709   kernel_packet =
1710       &(((hsa_kernel_dispatch_packet_t*)(last_queue->base_address))
1711         [packet_id & queue_mask]);
1712 
1713   if (!HSAIL_ENABLED && !d->device->spmd)
1714     {
1715       /* For non-SPMD machines with native compilation, we produce a multi-WI
1716 	 WG function with pocl and launch it via the HSA runtime like it was
1717 	 a single-WI WG. */
1718       kernel_packet->workgroup_size_x = kernel_packet->workgroup_size_y =
1719 	kernel_packet->workgroup_size_z = 1;
1720     }
1721   else
1722     {
1723       /* Otherwise let the target processor take care of the SPMD grid
1724          execution. */
1725       kernel_packet->workgroup_size_x = run_cmd->pc.local_size[0];
1726       kernel_packet->workgroup_size_y = run_cmd->pc.local_size[1];
1727       kernel_packet->workgroup_size_z = run_cmd->pc.local_size[2];
1728     }
1729 
1730   if (d->device->device_side_printf)
1731     {
1732       pc->printf_buffer = d->printf_buffer;
1733       pc->printf_buffer_capacity = d->device->printf_buffer_size;
1734       bzero (d->printf_write_pos, sizeof (size_t));
1735       pc->printf_buffer_position = d->printf_write_pos;
1736     }
1737 
1738   /* TODO: Dynamic WG sizes. */
1739 
1740   /* For SPMD devices we let the processor (HSA runtime) control the
1741      grid execution unless we are using our own WG launcher that
1742      uses the context struct. */
1743   if (!d->device->spmd || d->device->arg_buffer_launcher)
1744     {
1745       pc->local_size[0] = run_cmd->pc.local_size[0];
1746       pc->local_size[1] = run_cmd->pc.local_size[1];
1747       pc->local_size[2] = run_cmd->pc.local_size[2];
1748     }
1749 
1750   kernel_packet->grid_size_x = kernel_packet->grid_size_y
1751     = kernel_packet->grid_size_z = 1;
1752   kernel_packet->grid_size_x =
1753     pc->num_groups[0] * kernel_packet->workgroup_size_x;
1754   kernel_packet->grid_size_y =
1755     pc->num_groups[1] * kernel_packet->workgroup_size_y;
1756   kernel_packet->grid_size_z =
1757     pc->num_groups[2] * kernel_packet->workgroup_size_z;
1758 
1759   kernel_packet->kernel_object = cached_data->code_handle;
1760   kernel_packet->private_segment_size = cached_data->private_size;
1761   uint32_t total_group_size = cached_data->static_group_size;
1762 
1763   HSA_CHECK (
1764       hsa_signal_create (1, 1, &d->agent, &kernel_packet->completion_signal));
1765 
1766   setup_kernel_args (d, cmd, event_data,
1767                      cached_data->args_segment_size, &total_group_size);
1768 
1769   kernel_packet->group_segment_size = total_group_size;
1770 
1771   POCL_MSG_PRINT_HSA ("kernel's total group size: %u\n",
1772                       total_group_size);
1773   POCL_MSG_PRINT_INFO ("kernel command grid size %u x %u x %u\n",
1774                        kernel_packet->grid_size_x, kernel_packet->grid_size_y,
1775                        kernel_packet->grid_size_z);
1776 
1777 
1778   if (total_group_size > cmd->device->local_mem_size)
1779     POCL_ABORT ("pocl-hsa: required local memory > device local memory!\n");
1780 
1781   kernel_packet->kernarg_address = event_data->kernargs;
1782 
1783   typedef union {
1784     uint32_t header_setup;
1785     struct {
1786       uint16_t header;
1787       uint16_t setup;
1788     } a;
1789   } hsa_header_union_t;
1790 
1791   hsa_header_union_t h;
1792   h.a.header = (uint16_t)HSA_FENCE_SCOPE_SYSTEM
1793     << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
1794   h.a.header |= (uint16_t)HSA_FENCE_SCOPE_SYSTEM
1795     << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
1796   h.a.header |= (uint16_t)HSA_PACKET_TYPE_KERNEL_DISPATCH
1797                 << HSA_PACKET_HEADER_TYPE;
1798   h.a.setup = (uint16_t)cmd->command.run.pc.work_dim
1799               << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
1800   __atomic_store_n ((uint32_t *)(&kernel_packet->header), h.header_setup,
1801                     __ATOMIC_RELEASE);
1802 
1803   /* ring the doorbell to start execution */
1804   hsa_signal_store_relaxed (last_queue->doorbell_signal, packet_id);
1805 
1806   if (dd->running_list_size > EVENT_LIST_SIZE)
1807     POCL_ABORT("running events list too big\n");
1808   else
1809     {
1810       dd->running_events[dd->running_list_size] = event;
1811       dd->running_signals[dd->running_list_size++].handle
1812         = kernel_packet->completion_signal.handle;
1813     }
1814 
1815   pocl_update_event_running_unlocked (event);
1816   POCL_UNLOCK_OBJ (event);
1817 }
1818 
1819 static void
pocl_hsa_ndrange_event_finished(pocl_hsa_device_data_t * d,size_t i)1820 pocl_hsa_ndrange_event_finished (pocl_hsa_device_data_t *d, size_t i)
1821 {
1822   pocl_hsa_device_pthread_data_t* dd = &d->driver_data;
1823 
1824   assert(i < dd->running_list_size);
1825   cl_event event = dd->running_events[i];
1826   _cl_command_node *node = event->command;
1827 
1828   POCL_LOCK_OBJ (event);
1829   pocl_hsa_event_data_t *event_data = (pocl_hsa_event_data_t *)event->data;
1830 
1831   POCL_MSG_PRINT_INFO("event %" PRIu64 " finished, removing from running_list\n",
1832                       event->id);
1833   dd->running_events[i] = dd->running_events[--dd->running_list_size];
1834 
1835 #if AMD_HSA == 1
1836   /* TODO Times are reported as ticks in the domain of the HSA system clock. */
1837   hsa_amd_profiling_dispatch_time_t t;
1838   HSA_CHECK(hsa_amd_profiling_get_dispatch_time(d->agent,
1839                                                 dd->running_signals[i], &t));
1840   uint64_t j = t.end - t.start;
1841   pocl_debug_print_duration (__func__, __LINE__,
1842                              "HSA NDrange Kernel (HSA clock)", j);
1843 #endif
1844 
1845   hsa_signal_destroy (dd->running_signals[i]);
1846   dd->running_signals[i] = dd->running_signals[dd->running_list_size];
1847 
1848   hsa_memory_free (event_data->kernargs);
1849   hsa_memory_free (event_data->context);
1850 
1851   POCL_UNLOCK_OBJ (event);
1852 
1853   if (d->device->device_side_printf && *d->printf_write_pos > 0)
1854     {
1855       write (STDOUT_FILENO, d->printf_buffer, *d->printf_write_pos);
1856       bzero (d->printf_write_pos, sizeof (size_t));
1857     }
1858 
1859   POCL_UPDATE_EVENT_COMPLETE (event);
1860 
1861   pocl_ndrange_node_cleanup (node);
1862   pocl_mem_manager_free_command (node);
1863 }
1864 
1865 static void
check_running_signals(pocl_hsa_device_data_t * d)1866 check_running_signals (pocl_hsa_device_data_t *d)
1867 {
1868   unsigned i;
1869   pocl_hsa_device_pthread_data_t *dd = &d->driver_data;
1870   for (i = 0; i < dd->running_list_size; i++)
1871     {
1872       if (hsa_signal_load_acquire (dd->running_signals[i]) < 1)
1873         {
1874           pocl_hsa_ndrange_event_finished (d, i);
1875         }
1876     }
1877 }
1878 
1879 static int
pocl_hsa_run_ready_commands(pocl_hsa_device_data_t * d)1880 pocl_hsa_run_ready_commands (pocl_hsa_device_data_t *d)
1881 {
1882   check_running_signals (d);
1883   int enqueued_ndrange = 0;
1884 
1885   PTHREAD_CHECK (pthread_mutex_lock (&d->list_mutex));
1886   while (d->ready_list_size)
1887     {
1888       cl_event e = d->ready_list[0];
1889       PN_REMOVE (d->ready_list, 0);
1890       PTHREAD_CHECK (pthread_mutex_unlock (&d->list_mutex));
1891       if (e->command->type == CL_COMMAND_NDRANGE_KERNEL)
1892         {
1893           d->device->ops->compile_kernel (
1894               e->command, e->command->command.run.kernel, e->queue->device, 1);
1895           pocl_hsa_launch (d, e);
1896           enqueued_ndrange = 1;
1897           POCL_MSG_PRINT_INFO ("NDrange event %" PRIu64 " launched, remove"
1898                                " from readylist\n", e->id);
1899         }
1900       else
1901         {
1902           POCL_MSG_PRINT_INFO ("running non-NDrange event %" PRIu64 ","
1903                                " remove from readylist\n", e->id);
1904           pocl_exec_command (e->command);
1905         }
1906       check_running_signals (d);
1907       PTHREAD_CHECK (pthread_mutex_lock (&d->list_mutex));
1908     }
1909   PTHREAD_CHECK (pthread_mutex_unlock (&d->list_mutex));
1910   return enqueued_ndrange;
1911 }
1912 
1913 void *
pocl_hsa_driver_pthread(void * cldev)1914 pocl_hsa_driver_pthread (void *cldev)
1915 {
1916   size_t i;
1917 #if AMD_HSA == 1
1918   if (!signal_array_initialized)
1919     {
1920       signal_array_initialized = 1;
1921       for (i = 0; i < (EVENT_LIST_SIZE+1); i++)
1922         {
1923           signal_ones_array[i] = 1;
1924           less_than_sigcond_array[i] = HSA_SIGNAL_CONDITION_LT;
1925         }
1926     }
1927 #endif
1928 
1929   cl_device_id device = (cl_device_id)cldev;
1930   POCL_RETAIN_OBJECT_UNLOCKED (device);
1931   pocl_hsa_device_data_t *d = device->data;
1932   pocl_hsa_device_pthread_data_t *dd = &d->driver_data;
1933 
1934   /* timeout counter, resets with each new queued kernel to 1/8, then
1935    * exponentially increases by 40% up to about 3/4 of d->timeout.
1936    * disabled for now */
1937 #if 0
1938   uint64_t kernel_timeout_ns = d->timeout >> 3;
1939 #endif
1940 
1941   dd->running_list_size = 0;
1942   dd->last_queue = 0;
1943   dd->num_queues = d->hw_schedulers;  // TODO this is somewhat arbitrary.
1944   POCL_MSG_PRINT_HSA ("Queues: %" PRIuS "\n", dd->num_queues);
1945 
1946   dd->queues = (hsa_queue_t **) calloc (dd->num_queues, sizeof(hsa_queue_t*));
1947 
1948   uint32_t queue_min_size, queue_max_size;
1949   HSA_CHECK(hsa_agent_get_info(d->agent, HSA_AGENT_INFO_QUEUE_MIN_SIZE,
1950                                &queue_min_size));
1951   HSA_CHECK(hsa_agent_get_info(d->agent, HSA_AGENT_INFO_QUEUE_MAX_SIZE,
1952                                &queue_max_size));
1953 
1954   uint32_t queue_size = 1 << ((__builtin_ctz(queue_min_size)
1955                                + __builtin_ctz(queue_max_size)) / 2);
1956   POCL_MSG_PRINT_HSA ("queue size: %" PRIu32 "\n", queue_size);
1957 
1958   for (i = 0; i < dd->num_queues; i++)
1959     {
1960       HSA_CHECK (hsa_queue_create (d->agent, queue_size, HSA_QUEUE_TYPE_SINGLE,
1961                                    hsa_queue_callback, device, -1, -1,
1962                                    &dd->queues[i]));
1963 #if AMD_HSA == 1
1964       HSA_CHECK (hsa_amd_profiling_set_profiler_enabled (dd->queues[i], 1));
1965 #endif
1966     }
1967 
1968   while (1)
1969     {
1970       /* reset the signal. Disabled for now; see below */
1971 #if 0
1972       if (pocl_hsa_run_ready_commands (d))
1973         kernel_timeout_ns = d->timeout >> 3;
1974 #else
1975       pocl_hsa_run_ready_commands (d);
1976 #endif
1977       if (d->exit_driver_thread)
1978         goto EXIT_PTHREAD;
1979 
1980         // wait for anything to happen or timeout
1981 #if AMD_HSA == 1
1982       // FIXME: An ABA race condition here. If there was (another) submit after
1983       // the previous wait returned, but before this reset, we miss the
1984       // notification decrement and get stuck if there are no further submits
1985       // to decrement the 1.
1986       hsa_signal_store_release (d->nudge_driver_thread, 1);
1987 
1988       if (d->have_wait_any)
1989         {
1990           dd->running_signals[dd->running_list_size].handle =
1991               d->nudge_driver_thread.handle;
1992           hsa_amd_signal_wait_any(dd->running_list_size+1,
1993                                   dd->running_signals,
1994                                   less_than_sigcond_array, signal_ones_array,
1995                                   d->timeout, HSA_WAIT_STATE_BLOCKED, NULL);
1996           dd->running_signals[dd->running_list_size].handle = 0;
1997         }
1998       else
1999         {
2000 #endif
2001 #if 0
2002           if (kernel_timeout_ns < (d->timeout >> 1))
2003             kernel_timeout_ns = (kernel_timeout_ns * 22937UL) >> 14;
2004 	  // See the above comment. Busy wait for now until a proper
2005 	  // synchronization fix is in place.
2006           hsa_signal_wait_acquire (d->nudge_driver_thread,
2007 				   HSA_SIGNAL_CONDITION_LT, 1,
2008 				   kernel_timeout_ns, HSA_WAIT_STATE_BLOCKED);
2009 #endif
2010 
2011 #if AMD_HSA == 1
2012         }
2013 #endif
2014 
2015       if (d->exit_driver_thread)
2016         goto EXIT_PTHREAD;
2017     }
2018 
2019 EXIT_PTHREAD:
2020   /* TODO wait for commands to finish... */
2021   POCL_MSG_PRINT_HSA ("driver pthread exiting, still "
2022                       "running evts: %" PRIuS "\n",
2023                       dd->running_list_size);
2024   assert(dd->running_list_size == 0);
2025 
2026   for (i = 0; i < dd->num_queues; i++)
2027     HSA_CHECK (hsa_queue_destroy (dd->queues[i]));
2028   POCL_MEM_FREE (dd->queues);
2029 
2030   POname (clReleaseDevice) (device);
2031 
2032   pthread_exit (NULL);
2033 }
2034 
2035 void
pocl_hsa_notify_event_finished(cl_event event)2036 pocl_hsa_notify_event_finished (cl_event event)
2037 {
2038   pocl_hsa_event_data_t *e_d = (pocl_hsa_event_data_t *)event->data;
2039   pthread_cond_broadcast (&e_d->event_cond);
2040 }
2041 
2042 void
pocl_hsa_update_event(cl_device_id device,cl_event event)2043 pocl_hsa_update_event (cl_device_id device, cl_event event)
2044 {
2045   pocl_hsa_event_data_t *e_d = NULL;
2046 
2047   if (event->data == NULL && event->status == CL_QUEUED)
2048     {
2049       pocl_hsa_event_data_t *e_d
2050           = (pocl_hsa_event_data_t *)malloc (sizeof (pocl_hsa_event_data_t));
2051       assert (e_d);
2052       pthread_cond_init (&e_d->event_cond, NULL);
2053       event->data = (void *)e_d;
2054     }
2055   else
2056     {
2057       e_d = event->data;
2058     }
2059 
2060   switch (event->status)
2061     {
2062     case CL_QUEUED:
2063       if (event->queue->properties & CL_QUEUE_PROFILING_ENABLE)
2064         event->time_queue = pocl_hsa_get_timer_value (device->data);
2065       break;
2066     case CL_SUBMITTED:
2067       if (event->queue->properties & CL_QUEUE_PROFILING_ENABLE)
2068         event->time_submit = pocl_hsa_get_timer_value (device->data);
2069       break;
2070     case CL_RUNNING:
2071       if (event->queue->properties & CL_QUEUE_PROFILING_ENABLE)
2072         event->time_start = pocl_hsa_get_timer_value (device->data);
2073       break;
2074     case CL_FAILED:
2075     case CL_COMPLETE:
2076       if (event->queue->properties & CL_QUEUE_PROFILING_ENABLE)
2077         event->time_end = pocl_hsa_get_timer_value (device->data);
2078       break;
2079     }
2080 }
2081 
pocl_hsa_free_event_data(cl_event event)2082 void pocl_hsa_free_event_data (cl_event event)
2083 {
2084   assert(event->data != NULL);
2085   free(event->data);
2086   event->data = NULL;
2087 }
2088 
2089 /****** SVM callbacks *****/
2090 
2091 void
pocl_hsa_svm_free(cl_device_id dev,void * svm_ptr)2092 pocl_hsa_svm_free (cl_device_id dev, void *svm_ptr)
2093 {
2094   /* TODO we should somehow figure out the size argument
2095    * and call pocl_free_global_mem */
2096   HSA_CHECK (hsa_memory_free (svm_ptr));
2097 }
2098 
2099 void *
pocl_hsa_svm_alloc(cl_device_id dev,cl_svm_mem_flags flags,size_t size)2100 pocl_hsa_svm_alloc (cl_device_id dev, cl_svm_mem_flags flags, size_t size)
2101 {
2102   if ((flags & CL_MEM_SVM_ATOMICS)
2103       && ((dev->svm_caps & CL_DEVICE_SVM_ATOMICS) == 0))
2104     {
2105       POCL_MSG_ERR ("This device doesn't support SVM Atomics\n");
2106       return NULL;
2107     }
2108 
2109   if ((flags & CL_MEM_SVM_FINE_GRAIN_BUFFER)
2110        && ((dev->svm_caps & CL_DEVICE_SVM_FINE_GRAIN_BUFFER) == 0))
2111     {
2112       POCL_MSG_ERR ("This device doesn't support SVM Fine grained Buffer\n");
2113       return NULL;
2114     }
2115 
2116   pocl_hsa_device_data_t *d = (pocl_hsa_device_data_t *)dev->data;
2117   void *b = NULL;
2118   HSA_CHECK (hsa_memory_allocate (d->global_region, size, &b));
2119   return b;
2120 }
2121 
2122 void
pocl_hsa_svm_copy(cl_device_id dev,void * __restrict__ dst,const void * __restrict__ src,size_t size)2123 pocl_hsa_svm_copy (cl_device_id dev, void *__restrict__ dst,
2124                    const void *__restrict__ src, size_t size)
2125 {
2126   HSA_CHECK (hsa_memory_copy (dst, src, size));
2127 }
2128 
2129 char*
pocl_hsa_init_build(void * data)2130 pocl_hsa_init_build (void *data)
2131 {
2132   if (!((pocl_hsa_device_data_t*)data)->device->device_side_printf)
2133     return strdup ("-DC99_PRINTF");
2134   else
2135     return NULL;
2136 }
2137