1 /* pocl-hsa.c - driver for HSA supported devices.
2
3 Copyright (c) 2015-2020 Pekka Jääskeläinen
4 2015 Charles Chen <ccchen@pllab.cs.nthu.edu.tw>
5 Shao-chung Wang <scwang@pllab.cs.nthu.edu.tw>
6 2015-2018 Michal Babej <michal.babej@tut.fi>
7
8 Short snippets borrowed from the MatrixMultiplication example in
9 the HSA runtime library sources (c) 2014 HSA Foundation Inc.
10
11 Permission is hereby granted, free of charge, to any person obtaining a copy
12 of this software and associated documentation files (the "Software"), to
13 deal in the Software without restriction, including without limitation the
14 rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
15 sell copies of the Software, and to permit persons to whom the Software is
16 furnished to do so, subject to the following conditions:
17
18 The above copyright notice and this permission notice shall be included in
19 all copies or substantial portions of the Software.
20
21 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
22 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
23 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
24 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
25 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
26 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
27 IN THE SOFTWARE.
28 */
29 /* Some example code snippets copied verbatim from vector_copy.c of
30 * HSA-Runtime-AMD: */
31 /* Copyright 2014 HSA Foundation Inc. All Rights Reserved.
32 *
33 * HSAF is granting you permission to use this software and documentation (if
34 * any) (collectively, the "Materials") pursuant to the terms and conditions
35 * of the Software License Agreement included with the Materials. If you do
36 * not have a copy of the Software License Agreement, contact the HSA
37 * Foundation for a copy. Redistribution and use in source and binary forms,
38 * with or without modification, are permitted provided that the following
39 * conditions are met:
40 * 1. Redistributions of source code must retain the above copyright
41 * notice, this list of conditions and the following disclaimer.
42 * 2. Redistributions in binary form must reproduce the above copyright
43 * notice, this list of conditions and the following disclaimer in the
44 * documentation and/or other materials provided with the distribution
45 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
46 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
47 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
48 * CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
49 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
50 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
51 * WITH THE SOFTWARE.
52 */
53
54 #ifndef _BSD_SOURCE
55 #define _BSD_SOURCE
56 #endif
57
58 #ifndef _DEFAULT_SOURCE
59 #define _DEFAULT_SOURCE
60 #endif
61
62
63 #include "hsa.h"
64 #include "hsa_ext_finalize.h"
65 #include "hsa_ext_image.h"
66
67 #include "config.h"
68 #include "config2.h"
69
70 #if defined(HAVE_HSA_EXT_AMD_H) && AMD_HSA == 1
71
72 #include "hsa_ext_amd.h"
73
74 #endif
75
76 #include "pocl-hsa.h"
77 #include "common.h"
78 #include "common_driver.h"
79 #include "devices.h"
80 #include "pocl-hsa.h"
81 #include "pocl_cache.h"
82 #include "pocl_context.h"
83 #include "pocl_file_util.h"
84 #include "pocl_llvm.h"
85 #include "pocl_mem_management.h"
86 #include "pocl_spir.h"
87 #include "pocl_local_size.h"
88 #include "pocl_util.h"
89
90 #include <assert.h>
91 #include <limits.h>
92 #include <pthread.h>
93 #include <stdlib.h>
94 #include <string.h>
95
96 #ifndef _MSC_VER
97 # include <sys/wait.h>
98 # include <sys/time.h>
99 # include <sys/types.h>
100 # include <unistd.h>
101 #else
102 # include "vccompat.hpp"
103 #endif
104
105 #define max(a,b) (((a) > (b)) ? (a) : (b))
106
107 /* TODO: The kernel cache is never shrunk. We need a hook that is called back
108 when clReleaseKernel is called to get a safe point where to release the
109 kernel entry from the inmemory cache. */
110 #define HSA_KERNEL_CACHE_SIZE 4096
111 #define COMMAND_LIST_SIZE 4096
112 #define EVENT_LIST_SIZE 511
113
/* Per-event driver data attached to kernel-launch events. */
typedef struct pocl_hsa_event_data_s {
  /* Address of the space where this kernel launch's arguments were stored. */
  void *kernargs;
  /* The location of the pocl context struct in the Agent's global mem. */
  void *context;
  /* Condition variable for event status changes; presumably waited on by
     pocl_hsa_wait_event and signalled on status updates — confirm against
     the event-handling code elsewhere in this file. */
  pthread_cond_t event_cond;
} pocl_hsa_event_data_t;
121
/* Simple statically-sized kernel data cache */
/* for caching kernel dispatch data, binaries etc */
typedef struct pocl_hsa_kernel_cache_s {
  /* The kernel this entry caches data for. */
  cl_kernel kernel;

  /* use kernel hash as key */
  pocl_kernel_hash_t kernel_hash;

  /* Finalized HSA executable and the code object handle used to
     dispatch this kernel. */
  hsa_executable_t hsa_exe;
  uint64_t code_handle;

  /* Segment sizes reported by the finalizer for dispatch-packet setup. */
  uint32_t private_size;
  uint32_t static_group_size;
  uint32_t args_segment_size;

  /* For native non-SPMD targets, we cache work-group functions specialized
     to specific work-group sizes. */
  uint64_t local_x;
  uint64_t local_y;
  uint64_t local_z;

  /* If global offset must be zero for this WG function. */
  int goffs_zero;

  /* Maximum grid dimension this WG function works with. */
  size_t max_grid_dim_width;

} pocl_hsa_kernel_cache_t;
150
/* data for driver pthread */
typedef struct pocl_hsa_device_pthread_data_s {
  /* list of running commands and their signals; note the signal array has
     one extra slot — presumably for the nudge/wakeup signal appended when
     waiting on all of them — TODO confirm in the pthread loop. */
  cl_event running_events[EVENT_LIST_SIZE];
  hsa_signal_t running_signals[EVENT_LIST_SIZE+1];
  size_t running_list_size;

  /* Queue list (for pushing work to the agent);
   * multiple queues per device */
  hsa_queue_t **queues;
  /* Number of queues allocated, and the index last used (for simple
     round-robin style selection). */
  size_t num_queues, last_queue;
} pocl_hsa_device_pthread_data_t;
163
/* Per-device driver state for one HSA kernel agent. */
typedef struct pocl_hsa_device_data_s {
  /* The parent device struct. */
  cl_device_id device;
  /* The HSA kernel agent controlled by the device driver instance. */
  hsa_agent_t agent;
  hsa_profile_t agent_profile;

  /* mem regions: global memory, kernarg allocation region and the
     group (local) memory region discovered via
     setup_agent_memory_regions_callback. */
  hsa_region_t global_region, kernarg_region, group_region;

  /* Per-program data cache to simplify program compiling stage */
  pocl_hsa_kernel_cache_t kernel_cache[HSA_KERNEL_CACHE_SIZE];
  /* Next free slot in kernel_cache. */
  unsigned kernel_cache_lastptr;

  /* kernel signal wait timeout hint, in HSA runtime units */
  uint64_t timeout;
  /* length of a timestamp unit expressed in nanoseconds */
  double timestamp_unit;
  /* see pocl_hsa_init for details */
  size_t hw_schedulers;

  /* list of submitted commands waiting to run later */
  cl_event wait_list[COMMAND_LIST_SIZE];
  size_t wait_list_size;

  /* list of commands ready to run */
  cl_event ready_list[COMMAND_LIST_SIZE];
  size_t ready_list_size;

  /* list manipulation mutex (created as PTHREAD_MUTEX_ERRORCHECK in
     init_dev_data; guards wait_list/ready_list). */
  pthread_mutex_t list_mutex;

  /* used by host thread to notify driver pthread when events change status */
  hsa_signal_t nudge_driver_thread;

  /* device pthread */
  pthread_t driver_pthread_id;

  /* device pthread data */
  pocl_hsa_device_pthread_data_t driver_data;

  /* exit signal; set by the host thread, polled by the driver pthread.
     NOTE(review): plain 'volatile int' is not a C11 synchronization
     primitive — consider an atomic, but kept as-is here. */
  volatile int exit_driver_thread;

  /* if agent supports async handlers*/
  int have_wait_any;

  /* compilation lock */
  pocl_lock_t pocl_hsa_compilation_lock;

  /* printf buffer for device-side printf (allocated only when
     device_side_printf is enabled). */
  void *printf_buffer;
  uint32_t *printf_write_pos;

} pocl_hsa_device_data_t;
219
220 void pocl_hsa_compile_kernel_hsail (_cl_command_node *cmd, cl_kernel kernel,
221 cl_device_id device, int specialize);
222
223 void pocl_hsa_compile_kernel_native (_cl_command_node *cmd, cl_kernel kernel,
224 cl_device_id device, int specialize);
225
226 static void*
227 pocl_hsa_malloc(pocl_global_mem_t *mem, size_t size, hsa_region_t r);
228
229 void
pocl_hsa_init_device_ops(struct pocl_device_ops * ops)230 pocl_hsa_init_device_ops(struct pocl_device_ops *ops)
231 {
232 /* TODO: more descriptive name from HSA probing the device */
233 ops->device_name = "hsa";
234 ops->probe = pocl_hsa_probe;
235 ops->uninit = pocl_hsa_uninit;
236 ops->reinit = pocl_hsa_reinit;
237 ops->init = pocl_hsa_init;
238 ops->alloc_mem_obj = pocl_hsa_alloc_mem_obj;
239 ops->free = pocl_hsa_free;
240 ops->run = NULL;
241
242 ops->read = pocl_driver_read;
243 ops->read_rect = pocl_driver_read_rect;
244 ops->write = pocl_driver_write;
245 ops->write_rect = pocl_driver_write_rect;
246 ops->map_mem = pocl_driver_map_mem;
247 ops->unmap_mem = pocl_driver_unmap_mem;
248 ops->get_mapping_ptr = pocl_driver_get_mapping_ptr;
249 ops->free_mapping_ptr = pocl_driver_free_mapping_ptr;
250 ops->memfill = pocl_driver_memfill;
251 ops->copy = pocl_hsa_copy;
252 ops->copy_rect = pocl_driver_copy_rect;
253 ops->compute_local_size = pocl_default_local_size_optimizer;
254
255 ops->get_device_info_ext = NULL;
256
257 ops->svm_free = pocl_hsa_svm_free;
258 ops->svm_alloc = pocl_hsa_svm_alloc;
259 ops->svm_copy = pocl_hsa_svm_copy;
260 ops->svm_fill = pocl_basic_svm_fill;
261 ops->svm_register = pocl_hsa_svm_register;
262 ops->svm_unregister = pocl_hsa_svm_unregister;
263
264 // new driver api (out-of-order)
265 ops->submit = pocl_hsa_submit;
266 ops->join = pocl_hsa_join;
267 ops->flush = pocl_hsa_flush;
268 ops->notify = pocl_hsa_notify;
269 ops->broadcast = pocl_hsa_broadcast;
270 ops->wait_event = pocl_hsa_wait_event;
271
272 ops->build_source = pocl_driver_build_source;
273 ops->link_program = pocl_driver_link_program;
274 ops->build_binary = pocl_driver_build_binary;
275 ops->free_program = pocl_driver_free_program;
276 ops->setup_metadata = pocl_driver_setup_metadata;
277 ops->supports_binary = pocl_driver_supports_binary;
278 ops->build_poclbinary = pocl_driver_build_poclbinary;
279 #if HSAIL_ENABLED
280 ops->compile_kernel = pocl_hsa_compile_kernel_hsail;
281 #else
282 ops->compile_kernel = pocl_hsa_compile_kernel_native;
283 #endif
284
285 ops->update_event = pocl_hsa_update_event;
286 ops->notify_event_finished = pocl_hsa_notify_event_finished;
287 ops->free_event_data = pocl_hsa_free_event_data;
288 ops->init_target_machine = NULL;
289 ops->wait_event = pocl_hsa_wait_event;
290 ops->build_hash = pocl_hsa_build_hash;
291 ops->init_build = pocl_hsa_init_build;
292 }
293
294 #define MAX_HSA_AGENTS 16
295
296 static void
pocl_hsa_abort_on_hsa_error(hsa_status_t status,unsigned line,const char * func,const char * code)297 pocl_hsa_abort_on_hsa_error(hsa_status_t status,
298 unsigned line,
299 const char* func,
300 const char* code)
301 {
302 const char* str;
303 if (status != HSA_STATUS_SUCCESS)
304 {
305 hsa_status_string(status, &str);
306 POCL_MSG_PRINT2(HSA, func, line, "Error from HSA Runtime call:\n");
307 POCL_ABORT ("%s\n", str);
308 }
309 }
310
311
312 #define HSA_CHECK(code) pocl_hsa_abort_on_hsa_error(code, \
313 __LINE__, \
314 __FUNCTION__, \
315 #code);
316
317
318 static hsa_agent_t hsa_agents[MAX_HSA_AGENTS];
319 static unsigned found_hsa_agents = 0;
320
321 static hsa_status_t
pocl_hsa_get_agents_callback(hsa_agent_t agent,void * data)322 pocl_hsa_get_agents_callback(hsa_agent_t agent, void *data)
323 {
324 hsa_device_type_t type;
325 HSA_CHECK(hsa_agent_get_info (agent, HSA_AGENT_INFO_DEVICE, &type));
326
327 hsa_agent_feature_t features;
328 HSA_CHECK(hsa_agent_get_info(agent, HSA_AGENT_INFO_FEATURE, &features));
329 if (features != HSA_AGENT_FEATURE_KERNEL_DISPATCH)
330 {
331 return HSA_STATUS_SUCCESS;
332 }
333
334 hsa_agents[found_hsa_agents++] = agent;
335 return HSA_STATUS_SUCCESS;
336 }
337
338 /*
339 * Sets up the memory regions in pocl_hsa_device_data for a device
340 */
341 static
342 hsa_status_t
setup_agent_memory_regions_callback(hsa_region_t region,void * data)343 setup_agent_memory_regions_callback(hsa_region_t region, void* data)
344 {
345 pocl_hsa_device_data_t* d = (pocl_hsa_device_data_t*)data;
346
347 hsa_region_segment_t segment;
348 hsa_region_global_flag_t flags;
349 HSA_CHECK(hsa_region_get_info(region, HSA_REGION_INFO_SEGMENT, &segment));
350
351 if (segment == HSA_REGION_SEGMENT_GLOBAL)
352 {
353 d->global_region = region;
354 HSA_CHECK(hsa_region_get_info(region, HSA_REGION_INFO_GLOBAL_FLAGS,
355 &flags));
356 if (flags & HSA_REGION_GLOBAL_FLAG_KERNARG)
357 d->kernarg_region = region;
358 }
359
360 if (segment == HSA_REGION_SEGMENT_GROUP)
361 d->group_region = region;
362
363 return HSA_STATUS_SUCCESS;
364 }
365
366 /* HSA unsupported device features are hard coded in a known Agent
367 list and detected by the advertised agent name string. */
368 #define HSA_NUM_KNOWN_HSA_AGENTS 2
369
370 static const char *default_native_final_linkage_flags[] =
371 {"-nostartfiles", HOST_LD_FLAGS_ARRAY, NULL};
372
373 static const char *phsa_native_device_aux_funcs[] =
374 {"_pocl_run_all_wgs", "_pocl_finish_all_wgs", "_pocl_spawn_wg", NULL};
375
376 #define AMD_VENDOR_ID 0x1002
377
/* Static OpenCL attribute tables for the known HSA agents, keyed by the
   agent's advertised name string (see get_hsa_device_features).  The HSA
   API cannot report all required OpenCL device properties, so they are
   hard coded here per known agent. */
static struct _cl_device_id supported_hsa_devices[HSA_NUM_KNOWN_HSA_AGENTS]
    = { /* AMD Kaveri APU GPU agent. */
        [0] = { .long_name = "Spectre",
                .llvm_cpu = (HSAIL_ENABLED ? NULL : "kaveri"),
                .llvm_target_triplet
                = (HSAIL_ENABLED ? "hsail64" : "amdgcn--amdhsa"),
                .spmd = CL_TRUE,
                .autolocals_to_args = POCL_AUTOLOCALS_TO_ARGS_NEVER,
                .device_alloca_locals = CL_FALSE,
                .context_as_id = SPIR_ADDRESS_SPACE_GLOBAL,
                .args_as_id = SPIR_ADDRESS_SPACE_GLOBAL,
                .has_64bit_long = 1,
                .vendor_id = AMD_VENDOR_ID,
                .global_mem_cache_type = CL_READ_WRITE_CACHE,
                .max_constant_buffer_size = 65536,
                .local_mem_type = CL_LOCAL,
                .endian_little = CL_TRUE,
                .extensions = HSA_DEVICE_EXTENSIONS,
                .device_side_printf = !HSAIL_ENABLED,
                .printf_buffer_size = PRINTF_BUFFER_SIZE * 1024,
                .preferred_wg_size_multiple = 64, // wavefront size on Kaveri
                .preferred_vector_width_char = 4,
                .preferred_vector_width_short = 2,
                .preferred_vector_width_int = 1,
                .preferred_vector_width_long = 1,
                .preferred_vector_width_float = 1,
                .preferred_vector_width_double = 1,
                .native_vector_width_char = 4,
                .native_vector_width_short = 2,
                .native_vector_width_int = 1,
                .native_vector_width_long = 1,
                .native_vector_width_float = 1,
                .native_vector_width_double = 1 },
        /* phsa (portable HSA) CPU agent; non-SPMD, uses work-group
           functions compiled for the host. */
        [1] = { .long_name = "phsa generic CPU agent",
                .llvm_cpu = NULL,
                .llvm_target_triplet = (HSAIL_ENABLED ? "hsail64" : NULL),
                .spmd = CL_FALSE,
                .autolocals_to_args
                = (HSAIL_ENABLED ? POCL_AUTOLOCALS_TO_ARGS_NEVER
                                 : POCL_AUTOLOCALS_TO_ARGS_ALWAYS),
                .device_alloca_locals = CL_TRUE,
                .context_as_id = SPIR_ADDRESS_SPACE_GLOBAL,
                .args_as_id = SPIR_ADDRESS_SPACE_GLOBAL,
                .has_64bit_long = 1,
                .vendor_id = 0xffff,
                .global_mem_cache_type = CL_READ_WRITE_CACHE,
                .max_constant_buffer_size = 65536,
                .local_mem_type = CL_LOCAL,
                .endian_little = !(WORDS_BIGENDIAN),
                .extensions = HSA_DEVICE_EXTENSIONS,
                .device_side_printf = !HSAIL_ENABLED,
                .printf_buffer_size = PRINTF_BUFFER_SIZE * 1024,
                .preferred_wg_size_multiple = 1,
                /* We want to exploit the widest vector types in HSAIL
                   for the CPUs assuming they have some sort of SIMD ISE
                   which the finalizer than can more readily utilize.  */
                .preferred_vector_width_char = 16,
                .preferred_vector_width_short = 16,
                .preferred_vector_width_int = 16,
                .preferred_vector_width_long = 16,
                .preferred_vector_width_float = 16,
                .preferred_vector_width_double = 16,
                .native_vector_width_char = 16,
                .native_vector_width_short = 16,
                .native_vector_width_int = 16,
                .native_vector_width_long = 16,
                .native_vector_width_float = 16,
                .native_vector_width_double = 16,
                .final_linkage_flags = default_native_final_linkage_flags,
                .device_aux_functions
                = (HSAIL_ENABLED ? NULL : phsa_native_device_aux_funcs) } };
448
449 char *
pocl_hsa_build_hash(cl_device_id device)450 pocl_hsa_build_hash (cl_device_id device)
451 {
452 char* res = calloc(1000, sizeof(char));
453 snprintf(res, 1000, "HSA-%s-%s", device->llvm_target_triplet, device->long_name);
454 return res;
455 }
456
457 // Detect the HSA device and populate its properties to the device
458 // struct.
// Detect the HSA device and populate its properties to the device
// struct.  Matches the agent's advertised name against the
// supported_hsa_devices table and copies the hard-coded OpenCL
// attributes over; aborts on an unknown agent since the missing
// attributes cannot be queried through the HSA API.
static void
get_hsa_device_features(char* dev_name, struct _cl_device_id* dev)
{

/* Copy one attribute / one preferred+native vector-width pair from the
   matched supported_hsa_devices entry into 'dev'. */
#define COPY_ATTR(ATTR) dev->ATTR = supported_hsa_devices[i].ATTR
#define COPY_VECWIDTH(ATTR) \
  dev->preferred_vector_width_ ## ATTR = \
      supported_hsa_devices[i].preferred_vector_width_ ## ATTR; \
  dev->native_vector_width_ ## ATTR = \
      supported_hsa_devices[i].native_vector_width_ ## ATTR;

  int found = 0;
  unsigned i;
  for(i = 0; i < HSA_NUM_KNOWN_HSA_AGENTS; i++)
    {
      /* The long_name acts as the lookup key for the known-agent table. */
      if (strcmp(dev_name, supported_hsa_devices[i].long_name) == 0)
        {
          COPY_ATTR (llvm_cpu);
          COPY_ATTR (llvm_target_triplet);
          COPY_ATTR (spmd);
          COPY_ATTR (autolocals_to_args);
          COPY_ATTR (device_alloca_locals);
          COPY_ATTR (context_as_id);
          COPY_ATTR (args_as_id);
          if (!HSAIL_ENABLED)
            {
              /* TODO: Add a CMake variable or HSA description string
                 autodetection to control these. */
              /* Native (non-HSAIL) compilation: fall back to the host
                 compiler target when the table gives no triple/CPU. */
              if (dev->llvm_cpu == NULL)
                dev->llvm_cpu = get_llvm_cpu_name ();
              if (dev->llvm_target_triplet == NULL)
                dev->llvm_target_triplet = OCL_KERNEL_TARGET;
              dev->arg_buffer_launcher = CL_TRUE;
            }
          COPY_ATTR (has_64bit_long);
          COPY_ATTR (vendor_id);
          COPY_ATTR (global_mem_cache_type);
          COPY_ATTR (max_constant_buffer_size);
          COPY_ATTR (local_mem_type);
          COPY_ATTR (endian_little);
          COPY_ATTR (preferred_wg_size_multiple);
          COPY_ATTR (extensions);
          COPY_ATTR (final_linkage_flags);
          COPY_ATTR (device_aux_functions);
          COPY_ATTR (device_side_printf);
          COPY_ATTR (printf_buffer_size);
          COPY_VECWIDTH (char);
          COPY_VECWIDTH (short);
          COPY_VECWIDTH (int);
          COPY_VECWIDTH (long);
          COPY_VECWIDTH (float);
          COPY_VECWIDTH (double);
          found = 1;
          break;
        }
    }
  if (!found)
    {
      POCL_MSG_PRINT_INFO("pocl-hsa: found unknown HSA devices '%s'.\n",
                          dev_name);
      POCL_ABORT ("We found a device for which we don't have device "
                  "OpenCL attribute information (compute unit count, "
                  "constant buffer size etc), and there's no way to get all "
                  "the required info from HSA API. Please create a "
                  "new entry with the information in supported_hsa_devices, "
                  "and send a note/patch to pocl developers. Thanks!\n");
    }
}
527
528 unsigned int
pocl_hsa_probe(struct pocl_device_ops * ops)529 pocl_hsa_probe (struct pocl_device_ops *ops)
530 {
531 int env_count = pocl_device_get_env_count (ops->device_name);
532
533 POCL_MSG_PRINT_INFO ("pocl-hsa: found %d env devices with %s.\n", env_count,
534 ops->device_name);
535
536 /* No hsa env specified, the user did not request for HSA agents. */
537 if (env_count <= 0)
538 return 0;
539
540 HSA_CHECK (hsa_init ());
541
542 HSA_CHECK (hsa_iterate_agents (pocl_hsa_get_agents_callback, NULL));
543
544 POCL_MSG_PRINT_INFO ("pocl-hsa: found %d agents.\n", found_hsa_agents);
545
546 return (int)found_hsa_agents;
547 }
548
/* Asynchronous error callback installed on HSA queues; any non-success
   status aborts the process via HSA_CHECK. */
static void
hsa_queue_callback (hsa_status_t status, hsa_queue_t *q, void *data)
{
  HSA_CHECK (status);
}
554
555 /* driver pthread prototype */
556 void *pocl_hsa_driver_pthread (void *cldev);
557
558 /* (Re)initialize the device data. dev is the device driver part, count is
559 the number of devices of this type initialized so far. */
560 static pocl_hsa_device_data_t *
init_dev_data(cl_device_id dev,int count)561 init_dev_data (cl_device_id dev, int count)
562 {
563 pocl_hsa_device_data_t *d
564 = (pocl_hsa_device_data_t *)calloc (1, sizeof (pocl_hsa_device_data_t));
565
566 dev->data = d;
567 d->device = dev;
568 d->hw_schedulers = 3;
569 POCL_INIT_LOCK (d->pocl_hsa_compilation_lock);
570
571 /* Before the first HSA device, re-init the runtime. */
572 if (count == 0 && found_hsa_agents == 0)
573 {
574 HSA_CHECK (hsa_init ());
575 HSA_CHECK (hsa_iterate_agents (pocl_hsa_get_agents_callback, NULL));
576 }
577 d->agent.handle = hsa_agents[count].handle;
578
579 HSA_CHECK (hsa_agent_iterate_regions (
580 d->agent, setup_agent_memory_regions_callback, d));
581 bool boolarg = 0;
582 HSA_CHECK (hsa_region_get_info (
583 d->global_region, HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED, &boolarg));
584 assert (boolarg != 0);
585
586 pocl_reinit_system_memory ();
587
588 HSA_CHECK (hsa_signal_create (1, 1, &d->agent, &d->nudge_driver_thread));
589 d->exit_driver_thread = 0;
590 pthread_mutexattr_t mattr;
591 PTHREAD_CHECK (pthread_mutexattr_init (&mattr));
592 PTHREAD_CHECK (pthread_mutexattr_settype (&mattr, PTHREAD_MUTEX_ERRORCHECK));
593 PTHREAD_CHECK (pthread_mutex_init (&d->list_mutex, &mattr));
594
595 uint64_t hsa_freq;
596 HSA_CHECK (
597 hsa_system_get_info (HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, &hsa_freq));
598 d->timeout = hsa_freq; // 1 second in hsa units
599 d->timestamp_unit = (1000000000.0 / (double)hsa_freq);
600 POCL_MSG_PRINT_INFO ("HSA timestamp frequency: %" PRIu64 "\n", hsa_freq);
601 POCL_MSG_PRINT_INFO ("HSA timeout: %" PRIu64 "\n", d->timeout);
602 POCL_MSG_PRINT_INFO ("HSA timestamp unit: %g\n", d->timestamp_unit);
603
604 #if AMD_HSA == 1
605 /* TODO check at runtime */
606 d->have_wait_any = 1;
607 #endif
608
609 #if AMD_HSA == 1
610 if (dev->vendor_id == AMD_VENDOR_ID)
611 {
612 char booltest = 0;
613 HSA_CHECK (hsa_region_get_info (
614 d->global_region, HSA_AMD_REGION_INFO_HOST_ACCESSIBLE, &booltest));
615 assert (booltest != 0);
616 }
617 #endif
618
619 size_t sizearg;
620 HSA_CHECK (hsa_region_get_info (d->global_region,
621 HSA_REGION_INFO_ALLOC_MAX_SIZE, &sizearg));
622 dev->max_mem_alloc_size = sizearg;
623
624 /* For some reason, the global region size returned is 128 Terabytes...
625 * for now, use the max alloc size, it seems to be a much more reasonable
626 * value.
627 * HSA_CHECK(hsa_region_get_info(d->global_region, HSA_REGION_INFO_SIZE,
628 * &sizearg));
629 */
630 HSA_CHECK (
631 hsa_region_get_info (d->global_region, HSA_REGION_INFO_SIZE, &sizearg));
632 dev->global_mem_size = sizearg;
633 if (dev->global_mem_size > 16 * 1024 * 1024 * (uint64_t)1024)
634 dev->global_mem_size = dev->max_mem_alloc_size;
635
636 HSA_CHECK (
637 hsa_region_get_info (d->group_region, HSA_REGION_INFO_SIZE, &sizearg));
638 dev->local_mem_size = sizearg;
639
640 HSA_CHECK (hsa_region_get_info (
641 d->global_region, HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT, &sizearg));
642 dev->mem_base_addr_align = max (sizearg, MAX_EXTENDED_ALIGNMENT);
643
644 HSA_CHECK (hsa_agent_get_info (d->agent, HSA_AGENT_INFO_PROFILE,
645 &d->agent_profile));
646
647 if (d->agent_profile == HSA_PROFILE_FULL)
648 pocl_setup_device_for_system_memory (dev);
649
650 dev->profile = "FULL_PROFILE";
651 dev->has_own_timer = CL_TRUE;
652
653 dev->compiler_available = CL_TRUE;
654 dev->linker_available = CL_TRUE;
655
656 dev->profiling_timer_resolution = (size_t) (d->timestamp_unit) || 1;
657
658 if (dev->device_side_printf)
659 {
660 d->printf_buffer = pocl_hsa_malloc
661 (dev->global_memory, dev->printf_buffer_size, d->global_region);
662
663 d->printf_write_pos = pocl_hsa_malloc
664 (dev->global_memory, sizeof (size_t), d->global_region);
665 }
666
667 d->exit_driver_thread = 0;
668 PTHREAD_CHECK (pthread_create (&d->driver_pthread_id, NULL,
669 &pocl_hsa_driver_pthread, dev));
670
671 return d;
672 }
673
674 cl_int
pocl_hsa_init(unsigned j,cl_device_id dev,const char * parameters)675 pocl_hsa_init (unsigned j, cl_device_id dev, const char *parameters)
676 {
677 pocl_init_default_device_infos (dev);
678
679 SETUP_DEVICE_CL_VERSION(HSA_DEVICE_CL_VERSION_MAJOR,
680 HSA_DEVICE_CL_VERSION_MINOR)
681
682 dev->spmd = CL_TRUE;
683 dev->arg_buffer_launcher = CL_FALSE;
684 dev->autolocals_to_args = POCL_AUTOLOCALS_TO_ARGS_NEVER;
685 dev->device_alloca_locals = CL_FALSE;
686
687 dev->local_as_id = SPIR_ADDRESS_SPACE_LOCAL;
688 dev->constant_as_id = SPIR_ADDRESS_SPACE_CONSTANT;
689
690 assert (found_hsa_agents > 0);
691 assert (j < found_hsa_agents);
692 dev->data = (void*)(uintptr_t)j;
693 hsa_agent_t agent = hsa_agents[j];
694
695 uint32_t cache_sizes[4];
696 HSA_CHECK(hsa_agent_get_info (agent, HSA_AGENT_INFO_CACHE_SIZE,
697 &cache_sizes));
698 // The only nonzero value on Kaveri is the first (L1)
699 dev->global_mem_cache_size = cache_sizes[0];
700
701 dev->short_name = dev->long_name = (char*)malloc (64*sizeof(char));
702 HSA_CHECK(hsa_agent_get_info (agent, HSA_AGENT_INFO_NAME, dev->long_name));
703 get_hsa_device_features (dev->long_name, dev);
704
705 dev->type = CL_DEVICE_TYPE_GPU;
706
707 // Enable when it's actually implemented AND if supported by
708 // the target agent (check with hsa_agent_extension_supported).
709 dev->image_support = CL_FALSE;
710
711 dev->single_fp_config = CL_FP_ROUND_TO_NEAREST | CL_FP_ROUND_TO_ZERO
712 | CL_FP_ROUND_TO_INF | CL_FP_FMA | CL_FP_INF_NAN;
713 dev->double_fp_config = CL_FP_ROUND_TO_NEAREST | CL_FP_ROUND_TO_ZERO
714 | CL_FP_ROUND_TO_INF | CL_FP_FMA | CL_FP_INF_NAN;
715
716 hsa_machine_model_t model;
717 HSA_CHECK (hsa_agent_get_info (agent, HSA_AGENT_INFO_MACHINE_MODEL, &model));
718 dev->address_bits = (model == HSA_MACHINE_MODEL_LARGE) ? 64 : 32;
719
720 uint16_t wg_sizes[3];
721 HSA_CHECK (
722 hsa_agent_get_info (agent, HSA_AGENT_INFO_WORKGROUP_MAX_DIM, &wg_sizes));
723
724 int max_wg = pocl_get_int_option ("POCL_MAX_WORK_GROUP_SIZE", 0);
725 if (max_wg > 0)
726 {
727 wg_sizes[0] = min (wg_sizes[0], max_wg);
728 wg_sizes[1] = min (wg_sizes[1], max_wg);
729 wg_sizes[2] = min (wg_sizes[2], max_wg);
730 }
731
732 dev->max_work_item_sizes[0] = wg_sizes[0];
733 dev->max_work_item_sizes[1] = wg_sizes[1];
734 dev->max_work_item_sizes[2] = wg_sizes[2];
735
736 /* Specialize WG functions for grid dimensions of width <= USHRT_MAX. */
737 dev->grid_width_specialization_limit = USHRT_MAX;
738
739 HSA_CHECK(hsa_agent_get_info
740 (agent, HSA_AGENT_INFO_WORKGROUP_MAX_SIZE, &dev->max_work_group_size));
741
742 if (max_wg > 0)
743 dev->max_work_group_size = max_wg;
744
745 /* Assume a small maximum work-group size indicates also the desire to
746 maximally utilize it. */
747 if (max_wg <= 256)
748 dev->ops->compute_local_size = pocl_wg_utilization_maximizer;
749
750 if (AMD_HSA && dev->vendor_id == AMD_VENDOR_ID)
751 {
752 #if AMD_HSA == 1
753 uint32_t temp;
754 HSA_CHECK (hsa_agent_get_info (agent, HSA_AMD_AGENT_INFO_CACHELINE_SIZE,
755 &temp));
756 dev->global_mem_cacheline_size = temp;
757
758 HSA_CHECK (hsa_agent_get_info (
759 agent, HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, &temp));
760 dev->max_compute_units = temp;
761
762 HSA_CHECK (hsa_agent_get_info (
763 agent, HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY, &temp));
764 dev->max_clock_frequency = temp;
765 #endif
766 }
767 else
768 {
769 /* Could not use AMD extensions to find out CU/frequency of the device.
770 Using dummy values. */
771 dev->global_mem_cacheline_size = 64;
772 dev->max_compute_units = 4;
773 dev->max_clock_frequency = 700;
774 }
775
776 HSA_CHECK(hsa_agent_get_info
777 (agent, HSA_AGENT_INFO_WORKGROUP_MAX_SIZE, &dev->max_work_group_size));
778
779 /* Image features. */
780 if (dev->image_support == CL_TRUE)
781 {
782 hsa_dim3_t image_size;
783 HSA_CHECK (hsa_agent_get_info (
784 agent, HSA_EXT_AGENT_INFO_IMAGE_1D_MAX_ELEMENTS, &image_size));
785 dev->image_max_buffer_size = image_size.x;
786 HSA_CHECK (hsa_agent_get_info (
787 agent, HSA_EXT_AGENT_INFO_IMAGE_2D_MAX_ELEMENTS, &image_size));
788 dev->image2d_max_height = image_size.x;
789 dev->image2d_max_width = image_size.y;
790 HSA_CHECK (hsa_agent_get_info (
791 agent, HSA_EXT_AGENT_INFO_IMAGE_3D_MAX_ELEMENTS, &image_size));
792 dev->image3d_max_height = image_size.x;
793 dev->image3d_max_width = image_size.y;
794 dev->image3d_max_depth = image_size.z;
795 // is this directly the product of the dimensions?
796 //stat = hsa_agent_get_info(agent, ??, &dev->image_max_array_size);
797 HSA_CHECK (hsa_agent_get_info (agent,
798 HSA_EXT_AGENT_INFO_MAX_IMAGE_RD_HANDLES,
799 &dev->max_read_image_args));
800 HSA_CHECK (hsa_agent_get_info (agent,
801 HSA_EXT_AGENT_INFO_MAX_IMAGE_RORW_HANDLES,
802 &dev->max_read_write_image_args));
803 dev->max_write_image_args = dev->max_read_write_image_args;
804 HSA_CHECK (hsa_agent_get_info (
805 agent, HSA_EXT_AGENT_INFO_MAX_SAMPLER_HANDLERS, &dev->max_samplers));
806 }
807
808 dev->svm_allocation_priority = 2;
809 /* OpenCL 2.0 properties */
810 dev->svm_caps = CL_DEVICE_SVM_COARSE_GRAIN_BUFFER
811 | CL_DEVICE_SVM_FINE_GRAIN_BUFFER
812 | CL_DEVICE_SVM_ATOMICS;
813 /* This is from clinfo output ran on AMD Catalyst drivers */
814 dev->max_events = 1024;
815 dev->max_queues = 1;
816 dev->max_pipe_args = 16;
817 dev->max_pipe_active_res = 16;
818 dev->max_pipe_packet_size = 1024 * 1024;
819 dev->dev_queue_pref_size = 256 * 1024;
820 dev->dev_queue_max_size = 512 * 1024;
821 dev->on_dev_queue_props
822 = CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | CL_QUEUE_PROFILING_ENABLE;
823 dev->on_host_queue_props = CL_QUEUE_PROFILING_ENABLE;
824
825 pocl_hsa_device_data_t *d = init_dev_data (dev, j);
826
827 /* 0 is the host memory shared with all drivers that use it */
828 if (d->agent_profile == HSA_PROFILE_FULL)
829 dev->global_mem_id = 0;
830
831 return CL_SUCCESS;
832 }
833
834
835 static void*
pocl_hsa_malloc(pocl_global_mem_t * mem,size_t size,hsa_region_t r)836 pocl_hsa_malloc (pocl_global_mem_t *mem, size_t size, hsa_region_t r)
837 {
838 void *b = NULL;
839
840 if (hsa_memory_allocate(r, size, &b) != HSA_STATUS_SUCCESS)
841 {
842 POCL_MSG_PRINT_INFO ("hsa_memory_allocate failed");
843 return NULL;
844 }
845
846 if (b)
847 POCL_MSG_PRINT_INFO("HSA malloc'ed : size %" PRIuS " @ %p\n", size, b);
848
849 /* TODO: Due to lack of align parameter to the HSA allocation function, we
850 should align the buffer here ourselves. For now, let's just hope that
851 the called HSA implementation wide aligns (currently to 128). */
852 if ((uint64_t)b % MAX_EXTENDED_ALIGNMENT > 0)
853 POCL_MSG_WARN("HSA runtime returned a buffer with smaller alignment "
854 "than %d", MAX_EXTENDED_ALIGNMENT);
855
856 return b;
857 }
858
859 void
pocl_hsa_copy(void * data,pocl_mem_identifier * dst_mem_id,cl_mem dst_buf,pocl_mem_identifier * src_mem_id,cl_mem src_buf,size_t dst_offset,size_t src_offset,size_t size)860 pocl_hsa_copy (void *data,
861 pocl_mem_identifier * dst_mem_id,
862 cl_mem dst_buf,
863 pocl_mem_identifier * src_mem_id,
864 cl_mem src_buf,
865 size_t dst_offset,
866 size_t src_offset,
867 size_t size)
868 {
869 void *__restrict__ dst_ptr = dst_mem_id->mem_ptr;
870 void *__restrict__ src_ptr = src_mem_id->mem_ptr;
871 if ((src_ptr + src_offset) == (dst_ptr + dst_offset))
872 return;
873 HSA_CHECK (
874 hsa_memory_copy (dst_ptr + dst_offset, src_ptr + src_offset, size));
875 }
876
877 cl_int
pocl_hsa_alloc_mem_obj(cl_device_id device,cl_mem mem_obj,void * host_ptr)878 pocl_hsa_alloc_mem_obj (cl_device_id device, cl_mem mem_obj, void *host_ptr)
879 {
880 /* if we share global memory with CPU, let the CPU driver allocate it */
881 if (device->global_mem_id == 0)
882 return pocl_basic_alloc_mem_obj (device, mem_obj, host_ptr);
883
884 /* ... otherwise allocate it via HSA. */
885 pocl_mem_identifier *p = &mem_obj->device_ptrs[device->global_mem_id];
886 pocl_global_mem_t *gmem = device->global_memory;
887 pocl_hsa_device_data_t* d = device->data;
888 void *b = pocl_hsa_malloc (gmem, mem_obj->size, d->global_region);
889 p->mem_ptr = b;
890 p->version = 0;
891
892 if (b == NULL)
893 return CL_MEM_OBJECT_ALLOCATION_FAILURE;
894 else
895 return CL_SUCCESS;
896 }
897
898 void
pocl_hsa_free(cl_device_id device,cl_mem memobj)899 pocl_hsa_free (cl_device_id device, cl_mem memobj)
900 {
901 /* if we share global memory with CPU, let the CPU driver free it */
902 if (device->global_mem_id == 0)
903 return pocl_basic_free (device, memobj);
904
905 /* ... otherwise free it via HSA. */
906 cl_mem_flags flags = memobj->flags;
907 pocl_mem_identifier *p = &memobj->device_ptrs[device->global_mem_id];
908 hsa_memory_free(p->mem_ptr);
909 p->mem_ptr = NULL;
910 p->version = 0;
911 }
912
/* Registers a host allocation with the HSA runtime so the agents can
   access it (SVM support). */
void
pocl_hsa_svm_register (cl_device_id dev, void *host_ptr, size_t size)
{
  POCL_MSG_PRINT_HSA ("hsa_memory_register()\n");
  hsa_memory_register (host_ptr, size);
}
918
/* Undoes pocl_hsa_svm_register(): deregisters a host allocation from
   the HSA runtime. */
void
pocl_hsa_svm_unregister (cl_device_id dev, void *host_ptr, size_t size)
{
  POCL_MSG_PRINT_HSA ("hsa_memory_deregister()\n");
  hsa_memory_deregister (host_ptr, size);
}
924
925
/* Serializes the arguments of the run command into the kernarg buffer.

   Walks the kernel's explicit arguments plus its automatic locals and
   writes them, naturally aligned, into event_data->kernargs:
   local buffers as sizes (device-side allocation) or as running group
   segment offsets (HSAIL), global pointers as 64-bit device addresses,
   scalars verbatim.  Finally allocates a pocl context struct from the
   HSA global region and appends its address as the last implicit arg.

   d                -- device driver state (alloc scheme, memory regions)
   cmd              -- the NDRange command whose arguments to serialize
   event_data       -- destination kernarg buffer; its 'context' field
                       is allocated here (freed when the event finishes)
   max_args_size    -- capacity of the kernarg buffer in bytes
   total_group_size -- in/out: group segment bytes, grown here by the
                       sizes of HSAIL-style local buffers */
static void
setup_kernel_args (pocl_hsa_device_data_t *d,
                   _cl_command_node *cmd,
                   pocl_hsa_event_data_t *event_data,
                   size_t max_args_size,
                   uint32_t *total_group_size)
{
  char *write_pos = event_data->kernargs;
  const char *last_pos = event_data->kernargs + max_args_size;
  cl_kernel kernel = cmd->command.run.kernel;
  pocl_kernel_metadata_t *meta = kernel->meta;

  POCL_MSG_PRINT_INFO ("setup_kernel_args for %s\n",
                       cmd->command.run.kernel->name);
/* Abort if DSIZE more bytes would overrun the kernarg buffer, then
   round write_pos up to the next DSIZE-aligned address. */
#define CHECK_AND_ALIGN_SPACE(DSIZE)                          \
  do {                                                        \
    if (write_pos + (DSIZE) > last_pos)                       \
      POCL_ABORT("pocl-hsa: too many kernel arguments!\n");   \
    unsigned unaligned = (intptr_t)write_pos % DSIZE;         \
    if (unaligned > 0) write_pos += (DSIZE - unaligned);      \
  } while (0)

  size_t i;
  for (i = 0; i < meta->num_args + meta->num_locals; ++i)
    {
      struct pocl_argument *al = &(cmd->command.run.arguments[i]);

      /* Indices >= num_args are the kernel's automatic local buffers. */
      if (i >= meta->num_args || ARG_IS_LOCAL (meta->arg_info[i]))
        {
          size_t buf_size = ARG_IS_LOCAL (meta->arg_info[i]) ?
            al->size : meta->local_sizes[i - meta->num_args];
          if (d->device->device_alloca_locals)
            {
              /* Local buffers are allocated in the device side work-group
                 launcher. Let's pass only the sizes of the local args in
                 the arg buffer. */
              assert (sizeof (size_t) == 8);
              CHECK_AND_ALIGN_SPACE (sizeof (size_t));
              memcpy (write_pos, &buf_size, sizeof (size_t));
              write_pos += sizeof (size_t);
            }
          else if (HSAIL_ENABLED)
            {
              /* HSAIL: pass the buffer's byte offset inside the group
                 segment and reserve room for it by growing the total
                 group size. */
              CHECK_AND_ALIGN_SPACE (sizeof (uint32_t));
              memcpy (write_pos, total_group_size, sizeof (uint32_t));
              *total_group_size += (uint32_t)buf_size;
              write_pos += sizeof (uint32_t);
            }
          else
            assert (0 && "Unsupported local mem allocation scheme.");
        }
      else if (meta->arg_info[i].type == POCL_ARG_TYPE_POINTER)
        {
          CHECK_AND_ALIGN_SPACE(sizeof (uint64_t));
          /* Assuming the pointers are 64b (or actually the same as in
             host) due to HSA. TODO: the 32b profile. */

          if (al->value == NULL)
            {
              /* NULL buffer argument: write a zero device pointer. */
              uint64_t temp = 0;
              memcpy (write_pos, &temp, sizeof (uint64_t));
            }
          else
            {
              uint64_t dev_ptr = 0;
              if (al->is_svm)
                dev_ptr = (uint64_t) (*(void **)al->value);
              else
                {
                  cl_mem m = *(cl_mem *)al->value;
                  dev_ptr
                      = (uint64_t)m->device_ptrs[cmd->device->dev_id].mem_ptr;
                  if (m->flags & CL_MEM_USE_HOST_PTR
                      && d->agent_profile == HSA_PROFILE_BASE)
                    {
                      /* A Base profile agent cannot be assumed to access
                         the host allocation directly; copy the host data
                         into the device buffer first. */
                      POCL_MSG_PRINT_INFO (
                          "HSA: Copy HOST_PTR allocated %lu byte buffer "
                          "from %p to %lx due to having a BASE profile "
                          "agent.\n",
                          m->size, m->mem_host_ptr, dev_ptr);
                      hsa_memory_copy ((void *)dev_ptr, m->mem_host_ptr,
                                       m->size);
                    }
                }

              /* Honor the sub-buffer/argument offset. */
              dev_ptr += al->offset;
              memcpy (write_pos, &dev_ptr, sizeof (uint64_t));
            }
          POCL_MSG_PRINT_INFO (
              "arg %lu (global ptr) written to %lx val %lx arg offs %d\n", i,
              (uint64_t)write_pos, *(uint64_t *)write_pos,
              (int)(write_pos - (char*)event_data->kernargs));
          write_pos += sizeof (uint64_t);
        }
      else if (meta->arg_info[i].type == POCL_ARG_TYPE_IMAGE)
        {
          POCL_ABORT_UNIMPLEMENTED("pocl-hsa: image arguments"
                                   " not implemented.\n");
        }
      else if (meta->arg_info[i].type == POCL_ARG_TYPE_SAMPLER)
        {
          POCL_ABORT_UNIMPLEMENTED("pocl-hsa: sampler arguments"
                                   " not implemented.\n");
        }
      else
        {
          // Scalars.
          CHECK_AND_ALIGN_SPACE (al->size);
          memcpy (write_pos, al->value, al->size);
          POCL_MSG_PRINT_INFO (
              "arg %lu (scalar) written to %lx val %x offs %d\n", i,
              (uint64_t)write_pos, *(uint32_t *)al->value,
              (int)(write_pos - (char*)event_data->kernargs));
          write_pos += al->size;
        }
    }

  CHECK_AND_ALIGN_SPACE(sizeof (uint64_t));

  /* Copy the context object to HSA allocated global memory to ensure Base
     profile agents can access it. */

  event_data->context = pocl_hsa_malloc
    (d->device->global_memory, POCL_CONTEXT_SIZE (d->device->address_bits),
     d->global_region);

  if (d->device->address_bits == 64)
    memcpy (event_data->context, &cmd->command.run.pc, sizeof (struct pocl_context));
  else
    POCL_CONTEXT_COPY64TO32 (event_data->context, &cmd->command.run.pc);

  /* The context pointer is appended as the last implicit argument. */
  memcpy (write_pos, &event_data->context, sizeof (event_data->context));
  POCL_MSG_PRINT_INFO ("A %d-bit context object was written at %p offs %d\n",
                       d->device->address_bits, event_data->context,
                       (int)(write_pos - (char*)event_data->kernargs));
  write_pos += sizeof (uint64_t);

  /* MUST TODO: free the local buffers after finishing the kernel in case of
     host side allocation.
   */
}
1067
1068 static int
compile_parallel_bc_to_brig(char * brigfile,_cl_command_node * cmd,int specialize)1069 compile_parallel_bc_to_brig (char *brigfile, _cl_command_node *cmd,
1070 int specialize)
1071 {
1072 int error;
1073 char hsailfile[POCL_FILENAME_LENGTH];
1074 char parallel_bc_path[POCL_FILENAME_LENGTH];
1075 _cl_command_run *run_cmd = &cmd->command.run;
1076
1077 pocl_cache_work_group_function_path (parallel_bc_path,
1078 run_cmd->kernel->program, cmd->device_i,
1079 run_cmd->kernel, cmd, specialize);
1080
1081 strcpy (brigfile, parallel_bc_path);
1082 strncat (brigfile, ".brig", POCL_FILENAME_LENGTH-1);
1083 strcpy (hsailfile, parallel_bc_path);
1084 strncat (hsailfile, ".hsail", POCL_FILENAME_LENGTH-1);
1085
1086 if (pocl_exists (brigfile))
1087 POCL_MSG_PRINT_INFO("pocl-hsa: using existing BRIG file: \n%s\n",
1088 brigfile);
1089 else
1090 {
1091 // TODO call llvm via c++ interface like pocl_llvm_codegen()
1092 POCL_MSG_PRINT_HSA ("BRIG file not found,"
1093 " compiling parallel.bc to brig file: \n%s\n",
1094 parallel_bc_path);
1095
1096
1097 char* args1[] = { LLVM_LLC, "-O2", "-march=hsail64", "-filetype=asm",
1098 "-o", hsailfile, parallel_bc_path, NULL };
1099 if ((error = pocl_run_command (args1)))
1100 {
1101 POCL_MSG_PRINT_HSA ("llc exit status %i\n", error);
1102 return error;
1103 }
1104
1105 char* args2[] = { HSAIL_ASM, "-o", brigfile, hsailfile, NULL };
1106 if ((error = pocl_run_command (args2)))
1107 {
1108 POCL_MSG_PRINT_HSA ("HSAILasm exit status %i\n", error);
1109 return error;
1110 }
1111 }
1112
1113 return 0;
1114 }
1115
1116 static pocl_hsa_kernel_cache_t *
pocl_hsa_find_mem_cached_kernel(pocl_hsa_device_data_t * d,_cl_command_run * cmd)1117 pocl_hsa_find_mem_cached_kernel (pocl_hsa_device_data_t *d,
1118 _cl_command_run *cmd)
1119 {
1120 size_t i;
1121 for (i = 0; i < HSA_KERNEL_CACHE_SIZE; i++)
1122 {
1123 if (((d->kernel_cache[i].kernel == NULL)
1124 || (memcmp (d->kernel_cache[i].kernel_hash, cmd->hash,
1125 sizeof (pocl_kernel_hash_t))
1126 != 0)))
1127 continue;
1128
1129 if (d->kernel_cache[i].local_x == cmd->pc.local_size[0]
1130 && d->kernel_cache[i].local_y == cmd->pc.local_size[1]
1131 && d->kernel_cache[i].local_z == cmd->pc.local_size[2]
1132 && (!d->kernel_cache[i].goffs_zero
1133 || (cmd->pc.global_offset[0] == 0
1134 && cmd->pc.global_offset[1] == 0
1135 && cmd->pc.global_offset[2] == 0))
1136 && pocl_cmd_max_grid_dim_width (cmd)
1137 <= d->kernel_cache[i].max_grid_dim_width)
1138 return &d->kernel_cache[i];
1139 }
1140 return NULL;
1141 }
1142
1143 void
pocl_hsa_compile_kernel_native(_cl_command_node * cmd,cl_kernel kernel,cl_device_id device,int specialize)1144 pocl_hsa_compile_kernel_native (_cl_command_node *cmd, cl_kernel kernel,
1145 cl_device_id device, int specialize)
1146 {
1147 pocl_hsa_device_data_t *d = (pocl_hsa_device_data_t*)device->data;
1148
1149 _cl_command_run *run_cmd = &cmd->command.run;
1150
1151 POCL_LOCK (d->pocl_hsa_compilation_lock);
1152 assert (cmd->command.run.kernel == kernel);
1153 char *binary_fn = pocl_check_kernel_disk_cache (cmd, specialize);
1154 if (pocl_hsa_find_mem_cached_kernel (d, &cmd->command.run) != NULL)
1155 {
1156 POCL_MSG_PRINT_INFO("built kernel found in mem cache\n");
1157 POCL_UNLOCK (d->pocl_hsa_compilation_lock);
1158 return;
1159 }
1160
1161 POCL_MSG_PRINT_INFO ("pocl-hsa: loading native binary from file %s.\n",
1162 binary_fn);
1163
1164 uint64_t elf_size;
1165 FILE *elf_file;
1166 elf_file = fopen(binary_fn, "rb");
1167 if (elf_file == NULL)
1168 POCL_ABORT ("pocl-hsa: could not get the file size of the native "
1169 "binary\n");
1170
1171 /* This assumes phsa-runtime's deserialization input format
1172 which stores the following data: */
1173 uint32_t metadata_size =
1174 sizeof (uint64_t) /* The ELF bin size. ELF bin follows. */ +
1175 sizeof (hsa_isa_t) +
1176 sizeof (hsa_default_float_rounding_mode_t) + sizeof (hsa_profile_t) +
1177 sizeof (hsa_machine_model_t);
1178
1179 /* TODO: Use HSA's deserialization interface to store the final binary
1180 to disk so we don't need to wrap it here and fix to phsa's format. */
1181 fseek (elf_file, 0, SEEK_END);
1182 elf_size = ftell (elf_file);
1183 fseek (elf_file, 0, SEEK_SET);
1184
1185 uint64_t blob_size = metadata_size + elf_size;
1186
1187 uint8_t *blob = malloc (blob_size);
1188 uint8_t *wpos = blob;
1189
1190 memcpy (wpos, &elf_size, sizeof (elf_size));
1191 wpos += sizeof (elf_size);
1192
1193 uint64_t read_size;
1194 if (fread (wpos, 1, elf_size, elf_file) != elf_size)
1195 POCL_ABORT("pocl-hsa: could not read the native ELF binary.\n");
1196 fclose (elf_file);
1197
1198 POCL_MSG_PRINT_INFO("pocl-hsa: native binary size: %lu.\n", elf_size);
1199
1200 wpos += elf_size;
1201
1202 /* Assume the rest of the HSA properties are OK as zero. */
1203 memset (wpos, 0, metadata_size - sizeof (uint64_t));
1204
1205 hsa_executable_t exe;
1206 hsa_code_object_t obj;
1207
1208 HSA_CHECK (hsa_executable_create (d->agent_profile,
1209 HSA_EXECUTABLE_STATE_UNFROZEN, "", &exe));
1210
1211 HSA_CHECK(hsa_code_object_deserialize (blob, blob_size, "", &obj));
1212
1213 HSA_CHECK(hsa_executable_load_code_object (exe, d->agent, obj, ""));
1214
1215 HSA_CHECK(hsa_executable_freeze (exe, NULL));
1216
1217 free (blob);
1218
1219 int i = d->kernel_cache_lastptr;
1220 if (i < HSA_KERNEL_CACHE_SIZE)
1221 {
1222 d->kernel_cache[i].kernel = kernel;
1223 memcpy (d->kernel_cache[i].kernel_hash, cmd->command.run.hash,
1224 sizeof (pocl_kernel_hash_t));
1225 d->kernel_cache[i].local_x = run_cmd->pc.local_size[0];
1226 d->kernel_cache[i].local_y = run_cmd->pc.local_size[1];
1227 d->kernel_cache[i].local_z = run_cmd->pc.local_size[2];
1228 d->kernel_cache[i].goffs_zero = run_cmd->pc.global_offset[0] == 0
1229 && run_cmd->pc.global_offset[1] == 0
1230 && run_cmd->pc.global_offset[2] == 0;
1231
1232 size_t max_grid_width = pocl_cmd_max_grid_dim_width (&cmd->command.run);
1233 d->kernel_cache[i].max_grid_dim_width
1234 = max_grid_width > device->grid_width_specialization_limit
1235 ? SIZE_MAX
1236 : device->grid_width_specialization_limit;
1237 d->kernel_cache[i].hsa_exe.handle = exe.handle;
1238 d->kernel_cache_lastptr++;
1239 }
1240 else
1241 POCL_ABORT ("kernel cache full\n");
1242
1243 hsa_executable_symbol_t kernel_symbol;
1244
1245 const char *launcher_name_tmpl = "phsa_kernel.%s_grid_launcher";
1246 size_t launcher_name_length =
1247 strlen (kernel->name) + strlen (launcher_name_tmpl) + 1;
1248 char *symbol_name = malloc (launcher_name_length);
1249
1250 snprintf (symbol_name, launcher_name_length, launcher_name_tmpl,
1251 kernel->name);
1252
1253 POCL_MSG_PRINT_INFO ("pocl-hsa: getting kernel symbol %s.\n", symbol_name);
1254
1255 HSA_CHECK (hsa_executable_get_symbol (exe, NULL, symbol_name, d->agent, 0,
1256 &kernel_symbol));
1257 free (symbol_name);
1258
1259 hsa_symbol_kind_t symtype;
1260 HSA_CHECK (hsa_executable_symbol_get_info (
1261 kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_TYPE, &symtype));
1262 if (symtype != HSA_SYMBOL_KIND_KERNEL)
1263 POCL_ABORT ("pocl-hsa: the kernel function symbol resolves "
1264 "to something else than a function\n");
1265
1266 uint64_t code_handle;
1267 HSA_CHECK(hsa_executable_symbol_get_info
1268 (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &code_handle));
1269
1270 d->kernel_cache[i].code_handle = code_handle;
1271
1272 /* Group and private memory allocation is done via pocl, HSA runtime
1273 should not mind these. */
1274 d->kernel_cache[i].static_group_size = 0;
1275 d->kernel_cache[i].private_size = 0;
1276 d->kernel_cache[i].args_segment_size = 2048;
1277
1278 POCL_UNLOCK (d->pocl_hsa_compilation_lock);
1279 POCL_MSG_PRINT_INFO("pocl-hsa: native kernel compilation for phsa "
1280 "finished\n");
1281 }
1282
/* Compiles 'kernel' for an HSAIL-capable agent: generates the work-group
   function with LLVM, translates it to BRIG (compile_parallel_bc_to_brig),
   finalizes the BRIG into a frozen HSA executable and stores the
   executable with its symbol resource info in the in-memory kernel
   cache.  Aborts on unrecoverable errors. */
void
pocl_hsa_compile_kernel_hsail (_cl_command_node *cmd, cl_kernel kernel,
                               cl_device_id device, int specialize)
{
  char brigfile[POCL_FILENAME_LENGTH];
  char *brig_blob;

  pocl_hsa_device_data_t *d = (pocl_hsa_device_data_t*)device->data;

  hsa_executable_t final_obj;

  _cl_command_run *run_cmd = &cmd->command.run;

  /* Serializes compilations and protects the kernel cache. */
  POCL_LOCK (d->pocl_hsa_compilation_lock);

  int error = pocl_llvm_generate_workgroup_function (cmd->device_i, device,
                                                     kernel, cmd, specialize);
  if (error)
    {
      POCL_MSG_PRINT_GENERAL ("HSA: pocl_llvm_generate_workgroup_function()"
                              " failed for kernel %s\n", kernel->name);
      assert (error == 0);
    }

  unsigned i;
  if (pocl_hsa_find_mem_cached_kernel (d, run_cmd) != NULL)
    {
      POCL_MSG_PRINT_INFO("built kernel found in mem cache\n");
      POCL_UNLOCK (d->pocl_hsa_compilation_lock);
      return;
    }

  if (compile_parallel_bc_to_brig (brigfile, cmd, specialize))
    POCL_ABORT("Compiling LLVM IR -> HSAIL -> BRIG failed.\n");

  POCL_MSG_PRINT_HSA ("loading binary from file %s.\n", brigfile);
  uint64_t filesize = 0;
  int read = pocl_read_file(brigfile, &brig_blob, &filesize);

  if (read != 0)
    POCL_ABORT("pocl-hsa: could not read the binary.\n");

  POCL_MSG_PRINT_HSA ("BRIG binary size: %lu.\n", filesize);

  hsa_ext_module_t hsa_module = (hsa_ext_module_t)brig_blob;

  hsa_ext_program_t hsa_program;
  memset (&hsa_program, 0, sizeof (hsa_ext_program_t));

  /* Build an HSA program out of the BRIG module and finalize it into a
     code object for this agent's ISA. */
  HSA_CHECK(hsa_ext_program_create
            (HSA_MACHINE_MODEL_LARGE, HSA_PROFILE_FULL,
             HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT, NULL,
             &hsa_program));

  HSA_CHECK(hsa_ext_program_add_module (hsa_program, hsa_module));

  hsa_isa_t isa;
  HSA_CHECK(hsa_agent_get_info (d->agent, HSA_AGENT_INFO_ISA, &isa));

  hsa_ext_control_directives_t control_directives;
  memset (&control_directives, 0, sizeof (hsa_ext_control_directives_t));

  hsa_code_object_t code_object;
  HSA_CHECK(hsa_ext_program_finalize
            (hsa_program, isa, 0, control_directives, "",
             HSA_CODE_OBJECT_TYPE_PROGRAM, &code_object));

  HSA_CHECK(hsa_executable_create (d->agent_profile,
                                   HSA_EXECUTABLE_STATE_UNFROZEN,
                                   "", &final_obj));

  HSA_CHECK(hsa_executable_load_code_object (final_obj, d->agent,
                                             code_object, ""));

  HSA_CHECK(hsa_executable_freeze (final_obj, NULL));

  /* The program wrapper is no longer needed after finalization. */
  HSA_CHECK(hsa_ext_program_destroy(hsa_program));

  free (brig_blob);

  /* Store the executable in the first free kernel cache slot. */
  i = d->kernel_cache_lastptr;
  if (i < HSA_KERNEL_CACHE_SIZE)
    {
      d->kernel_cache[i].kernel = kernel;
      memcpy (d->kernel_cache[i].kernel_hash, cmd->command.run.hash,
              sizeof (pocl_kernel_hash_t));
      d->kernel_cache[i].hsa_exe.handle = final_obj.handle;
      d->kernel_cache_lastptr++;
    }
  else
    POCL_ABORT ("kernel cache full\n");

  hsa_executable_symbol_t kernel_symbol;

  /* The HSAIL kernel symbol is the kernel name prefixed with '&'. */
  size_t kernel_name_length = strlen (kernel->name);
  char *symbol = malloc (kernel_name_length + 2);
  symbol[0] = '&';
  symbol[1] = '\0';

  strncat (symbol, kernel->name, kernel_name_length);

  POCL_MSG_PRINT_HSA ("getting kernel symbol %s.\n", symbol);

  HSA_CHECK(hsa_executable_get_symbol
            (final_obj, NULL, symbol, d->agent, 0, &kernel_symbol));

  free(symbol);

  hsa_symbol_kind_t symtype;
  HSA_CHECK(hsa_executable_symbol_get_info
            (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_TYPE, &symtype));
  if(symtype != HSA_SYMBOL_KIND_KERNEL)
    POCL_ABORT ("pocl-hsa: the kernel function symbol resolves "
                "to something else than a function\n");

  uint64_t code_handle;
  HSA_CHECK(hsa_executable_symbol_get_info
    (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &code_handle));

  d->kernel_cache[i].code_handle = code_handle;

  /* Record the kernel's segment sizes for dispatch-time use. */
  HSA_CHECK(hsa_executable_symbol_get_info (
      kernel_symbol,
      HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE,
      &d->kernel_cache[i].static_group_size));

  HSA_CHECK(hsa_executable_symbol_get_info (
      kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE,
      &d->kernel_cache[i].private_size));

  HSA_CHECK(hsa_executable_symbol_get_info (
      kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE,
      &d->kernel_cache[i].args_segment_size));

  POCL_UNLOCK (d->pocl_hsa_compilation_lock);
}
1419
/* Tears down the per-device driver state: stops the driver pthread,
   releases the printf buffers, destroys the cached executables, the
   nudge signal and the locks, and shuts the HSA runtime down after the
   last device ('j' is the device index).  Always returns CL_SUCCESS. */
cl_int
pocl_hsa_uninit (unsigned j, cl_device_id device)
{
  assert (found_hsa_agents > 0);
  pocl_hsa_device_data_t *d = (pocl_hsa_device_data_t *)device->data;

  /* Stop the driver thread first: it must not touch the data freed
     below. */
  if (d->driver_pthread_id)
    {
      POCL_MSG_PRINT_INFO ("waiting for HSA device pthread"
                          " to finish its work...\n");
      d->exit_driver_thread = 1;
      void* ptr;
      PTHREAD_CHECK(pthread_join(d->driver_pthread_id, &ptr));
      POCL_MSG_PRINT_INFO ("....done.\n");
    }

  if (device->device_side_printf)
    {
      hsa_memory_free (d->printf_buffer);
      hsa_memory_free (d->printf_write_pos);
    }

  /* Destroy the executables held by the in-memory kernel cache. */
  unsigned i;
  for (i = 0; i < HSA_KERNEL_CACHE_SIZE; i++)
    if (d->kernel_cache[i].kernel)
      {
        HSA_CHECK (hsa_executable_destroy (d->kernel_cache[i].hsa_exe));
      }

  // TODO: destroy the executables that didn't fit to the kernel
  // cache. Also code objects are not destroyed at the moment.
  hsa_signal_destroy(d->nudge_driver_thread);

  PTHREAD_CHECK(pthread_mutex_destroy(&d->list_mutex));

  POCL_DESTROY_LOCK (d->pocl_hsa_compilation_lock);

  POCL_MEM_FREE(d);
  device->data = NULL;

  // after last device, call HSA runtime shutdown
  if (j == (found_hsa_agents - 1))
    {
      HSA_CHECK (hsa_shut_down ());
      found_hsa_agents = 0;
    }

  return CL_SUCCESS;
}
1469
1470 cl_int
pocl_hsa_reinit(unsigned j,cl_device_id device)1471 pocl_hsa_reinit (unsigned j, cl_device_id device)
1472 {
1473 assert (device->data == NULL);
1474 device->data = init_dev_data (device, j);
1475 return CL_SUCCESS;
1476 }
1477
1478 cl_ulong
pocl_hsa_get_timer_value(void * data)1479 pocl_hsa_get_timer_value(void *data)
1480 {
1481 uint64_t hsa_ts;
1482 HSA_CHECK(hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP, &hsa_ts));
1483 cl_ulong res = (cl_ulong)(hsa_ts *
1484 ((pocl_hsa_device_data_t*)data)->timestamp_unit);
1485 return res;
1486 }
1487
/* Appends event 'p' to one of the driver's event lists; aborts when the
   list is full.  Fix: the bound check must be '>=' -- with the original
   '>' a store at index COMMAND_LIST_SIZE was still performed when the
   list already held COMMAND_LIST_SIZE entries, writing one element past
   the end (assuming the arrays hold COMMAND_LIST_SIZE elements -- the
   declarations are outside this file chunk; '>=' is safe either way). */
#define PN_ADD(array, p) \
  do { \
    if (array##_size >= COMMAND_LIST_SIZE) \
      POCL_ABORT("array overload\n"); \
    array[array##_size++] = p; \
  } \
  while (0)

/* Removes the element at 'index' by swapping in the last element;
   does not preserve list order. */
#define PN_REMOVE(array, index) \
  do { \
    assert(array##_size > 0); \
    array[index] = array[--array##_size]; \
    array[array##_size] = NULL; \
  } \
  while (0)
1503
/* Queues a command node on the device.  When all of the event's
   dependencies are already satisfied the event goes straight onto the
   driver thread's ready list and the thread is nudged; otherwise it is
   parked on the wait list until pocl_hsa_notify() moves it over.
   NOTE(review): node->event is unlocked here without a visible lock, so
   it presumably arrives locked from the caller -- confirm against the
   pocl core's submit convention. */
void
pocl_hsa_submit (_cl_command_node *node, cl_command_queue cq)
{
  cl_device_id device = node->device;
  pocl_hsa_device_data_t *d = device->data;
  unsigned added_to_readylist = 0;

  /* The wait/ready lists are shared with the driver pthread. */
  PTHREAD_CHECK (pthread_mutex_lock (&d->list_mutex));

  node->ready = 1;
  if (pocl_command_is_ready (node->event))
    {
      pocl_update_event_submitted (node->event);
      PN_ADD(d->ready_list, node->event);
      added_to_readylist = 1;
    }
  else
    PN_ADD(d->wait_list, node->event);

  POCL_MSG_PRINT_INFO("After Event %" PRIu64 " submit: WL : %li, RL: %li\n",
                      node->event->id, d->wait_list_size, d->ready_list_size);

  POCL_UNLOCK_OBJ (node->event);

  PTHREAD_CHECK(pthread_mutex_unlock(&d->list_mutex));

  /* Wake the driver thread so it picks the command up. */
  if (added_to_readylist)
    hsa_signal_subtract_relaxed(d->nudge_driver_thread, 1);

}
1534
/* Blocks until every command enqueued to 'cq' has completed, by waiting
   on the queue's last event with its per-event condition variable. */
void
pocl_hsa_join (cl_device_id device, cl_command_queue cq)
{
  POCL_LOCK_OBJ (cq);
  if (cq->command_count == 0)
    {
      POCL_UNLOCK_OBJ (cq);
      POCL_MSG_PRINT_HSA ("device->join: empty queue\n");
      return;
    }
  /* Retain the event so it cannot be destroyed while we wait on it;
     the queue lock is released before blocking. */
  cl_event event = cq->last_event.event;
  assert(event);
  POCL_LOCK_OBJ (event);
  POCL_RETAIN_OBJECT_UNLOCKED (event);
  POCL_UNLOCK_OBJ (cq);

  POCL_MSG_PRINT_HSA ("device->join on event %" PRIu64 "\n", event->id);

  if (event->status <= CL_COMPLETE)
    {
      POCL_MSG_PRINT_HSA ("device->join: last event (%" PRIu64 ") in queue"
                          " exists, but is complete\n", event->id);
      goto RETURN;
    }

  /* pthread_cond_wait atomically releases the event lock while
     sleeping, letting the driver thread update the status. */
  while (event->status > CL_COMPLETE)
    {
      pocl_hsa_event_data_t *e_d = (pocl_hsa_event_data_t *)event->data;
      PTHREAD_CHECK (pthread_cond_wait (&e_d->event_cond, &event->pocl_lock));
    }
  POCL_MSG_PRINT_HSA ("device->join on event %" PRIu64 " finished"
                      " with status: %i\n", event->id, event->status);

RETURN:
  assert (event->status <= CL_COMPLETE);
  POCL_UNLOCK_OBJ (event);

  POname (clReleaseEvent) (event);
}
1574
1575 void
pocl_hsa_flush(cl_device_id device,cl_command_queue cq)1576 pocl_hsa_flush (cl_device_id device, cl_command_queue cq)
1577 {
1578 pocl_hsa_device_data_t *d = (pocl_hsa_device_data_t *)device->data;
1579 hsa_signal_subtract_relaxed(d->nudge_driver_thread, 1);
1580 }
1581
/* Callback invoked when the dependency 'finished' of 'event' completes.
   Propagates a failed dependency; otherwise, once the last dependency
   is satisfied, moves the event from the driver thread's wait list to
   the ready list and nudges the thread. */
void
pocl_hsa_notify (cl_device_id device, cl_event event, cl_event finished)
{
  pocl_hsa_device_data_t *d = device->data;
  _cl_command_node *node = event->command;
  int added_to_readylist = 0;
  POCL_MSG_PRINT_HSA ("notify on event %" PRIu64 " \n", event->id);

  /* A failed dependency fails this event as well. */
  if (finished->status < CL_COMPLETE)
    {
      pocl_update_event_failed (event);
      return;
    }

  if (!node->ready)
    return;

  if (pocl_command_is_ready (event))
    {
      if (event->status == CL_QUEUED)
        {
          pocl_update_event_submitted (event);
          PTHREAD_CHECK(pthread_mutex_lock(&d->list_mutex));

          /* Locate the event on the wait list and move it over. */
          size_t i = 0;
          for(i = 0; i < d->wait_list_size; i++)
            if (d->wait_list[i] == event)
              break;
          if (i < d->wait_list_size)
            {
              POCL_MSG_PRINT_INFO("event %" PRIu64 " wait_list -> ready_list\n",
                                  event->id);
              PN_ADD(d->ready_list, event);
              PN_REMOVE(d->wait_list, i);
            }
          else
            POCL_ABORT("cant move event %" PRIu64 " from waitlist to"
                       " readylist - not found in waitlist\n", event->id);
          added_to_readylist = 1;
          PTHREAD_CHECK(pthread_mutex_unlock(&d->list_mutex));
        }
      else
        POCL_MSG_WARN ("node->ready was 1 but event %" PRIu64 " is"
                       " not queued: status %i!\n",
                       event->id, event->status);
    }

  /* Wake the driver thread outside of the list lock. */
  if (added_to_readylist)
    hsa_signal_subtract_relaxed(d->nudge_driver_thread, 1);
}
1632
1633 void
pocl_hsa_broadcast(cl_event event)1634 pocl_hsa_broadcast (cl_event event)
1635 {
1636 POCL_MSG_PRINT_HSA ("broadcasting\n");
1637 pocl_broadcast(event);
1638 }
1639
1640 void
pocl_hsa_wait_event(cl_device_id device,cl_event event)1641 pocl_hsa_wait_event(cl_device_id device, cl_event event)
1642 {
1643 POCL_MSG_PRINT_HSA ("device->wait_event on event %" PRIu64 "\n", event->id);
1644 POCL_LOCK_OBJ (event);
1645 if (event->status <= CL_COMPLETE)
1646 {
1647 POCL_MSG_PRINT_HSA ("device->wain_event: last event"
1648 " (%" PRIu64 ") in queue exists, but is complete\n",
1649 event->id);
1650 POCL_UNLOCK_OBJ(event);
1651 return;
1652 }
1653 while (event->status > CL_COMPLETE)
1654 {
1655 pocl_hsa_event_data_t *e_d = (pocl_hsa_event_data_t *)event->data;
1656 PTHREAD_CHECK (
1657 pthread_cond_wait (&(e_d->event_cond), &event->pocl_lock));
1658 }
1659 POCL_UNLOCK_OBJ (event);
1660
1661 POCL_MSG_PRINT_INFO("event wait finished with status: %i\n", event->status);
1662 assert (event->status <= CL_COMPLETE);
1663 }
1664
1665 /* DRIVER PTHREAD part */
1666
#if AMD_HSA == 1
/* Read-only argument arrays for hsa_amd_signal_wait_any(): one
 * "value is less than 1" wait condition per possible tracked signal.
 * 'signal_array_initialized' tracks whether they have been filled in;
 * presumably done once by the driver thread setup -- the code is
 * outside this chunk. */
static hsa_signal_value_t signal_ones_array[EVENT_LIST_SIZE + 1];
static hsa_signal_condition_t less_than_sigcond_array[EVENT_LIST_SIZE + 1];
static int signal_array_initialized = 0;
#endif
1674
/* Dispatches a ready NDRange command to one of the device's HSA queues:
   allocates the kernarg buffer, reserves an AQL dispatch packet, fills
   in the grid/work-group dimensions and kernel arguments, publishes the
   packet, rings the doorbell and records the event with its completion
   signal on the running list.  Called by the driver pthread. */
static void
pocl_hsa_launch (pocl_hsa_device_data_t *d, cl_event event)
{
  POCL_LOCK_OBJ (event);
  _cl_command_node *cmd = event->command;
  _cl_command_run *run_cmd = &cmd->command.run;
  cl_kernel kernel = cmd->command.run.kernel;
  struct pocl_context *pc = &cmd->command.run.pc;
  hsa_kernel_dispatch_packet_t *kernel_packet;
  pocl_hsa_device_pthread_data_t* dd = &d->driver_data;
  pocl_hsa_event_data_t *event_data = (pocl_hsa_event_data_t *)event->data;

  /* The kernel must already be compiled and in the in-memory cache. */
  unsigned i;
  pocl_hsa_kernel_cache_t *cached_data
      = pocl_hsa_find_mem_cached_kernel (d, run_cmd);
  assert (cached_data);

  /* Kernarg buffer for this dispatch; freed when the event finishes. */
  HSA_CHECK(hsa_memory_allocate (d->kernarg_region,
                                 cached_data->args_segment_size,
                                 &event_data->kernargs));

  /* Round-robin over the device's queues. */
  dd->last_queue = (dd->last_queue + 1) % dd->num_queues;
  hsa_queue_t* last_queue = dd->queues[dd->last_queue];
  const uint64_t queue_mask = last_queue->size - 1;

  /* Reserve a packet slot; spin while the ring buffer is full. */
  uint64_t packet_id = hsa_queue_add_write_index_relaxed (last_queue, 1);
  while ((packet_id - hsa_queue_load_read_index_acquire (last_queue))
         >= last_queue->size)
    {
      /* device queue is full. TODO this isn't the optimal solution */
      POCL_MSG_WARN("pocl-hsa: queue %" PRIuS " overloaded\n", dd->last_queue);
      usleep(2000);
    }

  kernel_packet =
      &(((hsa_kernel_dispatch_packet_t*)(last_queue->base_address))
        [packet_id & queue_mask]);

  if (!HSAIL_ENABLED && !d->device->spmd)
    {
      /* For non-SPMD machines with native compilation, we produce a multi-WI
         WG function with pocl and launch it via the HSA runtime like it was
         a single-WI WG. */
      kernel_packet->workgroup_size_x = kernel_packet->workgroup_size_y =
        kernel_packet->workgroup_size_z = 1;
    }
  else
    {
      /* Otherwise let the target processor take care of the SPMD grid
         execution. */
      kernel_packet->workgroup_size_x = run_cmd->pc.local_size[0];
      kernel_packet->workgroup_size_y = run_cmd->pc.local_size[1];
      kernel_packet->workgroup_size_z = run_cmd->pc.local_size[2];
    }

  /* Point the context at the device-side printf buffer and reset its
     write position. */
  if (d->device->device_side_printf)
    {
      pc->printf_buffer = d->printf_buffer;
      pc->printf_buffer_capacity = d->device->printf_buffer_size;
      bzero (d->printf_write_pos, sizeof (size_t));
      pc->printf_buffer_position = d->printf_write_pos;
    }

  /* TODO: Dynamic WG sizes. */

  /* For SPMD devices we let the processor (HSA runtime) control the
     grid execution unless we are using our own WG launcher that
     uses the context struct. */
  if (!d->device->spmd || d->device->arg_buffer_launcher)
    {
      pc->local_size[0] = run_cmd->pc.local_size[0];
      pc->local_size[1] = run_cmd->pc.local_size[1];
      pc->local_size[2] = run_cmd->pc.local_size[2];
    }

  kernel_packet->grid_size_x = kernel_packet->grid_size_y
      = kernel_packet->grid_size_z = 1;
  kernel_packet->grid_size_x =
      pc->num_groups[0] * kernel_packet->workgroup_size_x;
  kernel_packet->grid_size_y =
      pc->num_groups[1] * kernel_packet->workgroup_size_y;
  kernel_packet->grid_size_z =
      pc->num_groups[2] * kernel_packet->workgroup_size_z;

  kernel_packet->kernel_object = cached_data->code_handle;
  kernel_packet->private_segment_size = cached_data->private_size;
  uint32_t total_group_size = cached_data->static_group_size;

  /* Completion signal: starts at 1; the driver thread treats a value
     below 1 as "finished" (see check_running_signals). */
  HSA_CHECK (
      hsa_signal_create (1, 1, &d->agent, &kernel_packet->completion_signal));

  setup_kernel_args (d, cmd, event_data,
                     cached_data->args_segment_size, &total_group_size);

  kernel_packet->group_segment_size = total_group_size;

  POCL_MSG_PRINT_HSA ("kernel's total group size: %u\n",
                      total_group_size);
  POCL_MSG_PRINT_INFO ("kernel command grid size %u x %u x %u\n",
                       kernel_packet->grid_size_x, kernel_packet->grid_size_y,
                       kernel_packet->grid_size_z);


  if (total_group_size > cmd->device->local_mem_size)
    POCL_ABORT ("pocl-hsa: required local memory > device local memory!\n");

  kernel_packet->kernarg_address = event_data->kernargs;

  /* Lets the 16-bit header and 16-bit setup fields be written with a
     single 32-bit atomic store. */
  typedef union {
    uint32_t header_setup;
    struct {
      uint16_t header;
      uint16_t setup;
    } a;
  } hsa_header_union_t;

  hsa_header_union_t h;
  h.a.header = (uint16_t)HSA_FENCE_SCOPE_SYSTEM
               << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
  h.a.header |= (uint16_t)HSA_FENCE_SCOPE_SYSTEM
                << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
  h.a.header |= (uint16_t)HSA_PACKET_TYPE_KERNEL_DISPATCH
                << HSA_PACKET_HEADER_TYPE;
  h.a.setup = (uint16_t)cmd->command.run.pc.work_dim
              << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
  /* Release store publishes the fully written packet to the packet
     processor before the doorbell ring below. */
  __atomic_store_n ((uint32_t *)(&kernel_packet->header), h.header_setup,
                    __ATOMIC_RELEASE);

  /* ring the doorbell to start execution */
  hsa_signal_store_relaxed (last_queue->doorbell_signal, packet_id);

  /* Track the dispatch so check_running_signals() can retire it. */
  if (dd->running_list_size > EVENT_LIST_SIZE)
    POCL_ABORT("running events list too big\n");
  else
    {
      dd->running_events[dd->running_list_size] = event;
      dd->running_signals[dd->running_list_size++].handle
        = kernel_packet->completion_signal.handle;
    }

  pocl_update_event_running_unlocked (event);
  POCL_UNLOCK_OBJ (event);
}
1818
/* Retires the completed NDRange dispatch at index 'i' of the running
   list: removes it by swapping in the last list entry, destroys its
   completion signal, frees its kernarg and context buffers, flushes the
   device-side printf output and marks the event complete. */
static void
pocl_hsa_ndrange_event_finished (pocl_hsa_device_data_t *d, size_t i)
{
  pocl_hsa_device_pthread_data_t* dd = &d->driver_data;

  assert(i < dd->running_list_size);
  cl_event event = dd->running_events[i];
  _cl_command_node *node = event->command;

  POCL_LOCK_OBJ (event);
  pocl_hsa_event_data_t *event_data = (pocl_hsa_event_data_t *)event->data;

  POCL_MSG_PRINT_INFO("event %" PRIu64 " finished, removing from running_list\n",
                      event->id);
  dd->running_events[i] = dd->running_events[--dd->running_list_size];

#if AMD_HSA == 1
  /* TODO Times are reported as ticks in the domain of the HSA system clock. */
  hsa_amd_profiling_dispatch_time_t t;
  HSA_CHECK(hsa_amd_profiling_get_dispatch_time(d->agent,
                                                dd->running_signals[i], &t));
  uint64_t j = t.end - t.start;
  pocl_debug_print_duration (__func__, __LINE__,
                             "HSA NDrange Kernel (HSA clock)", j);
#endif

  /* Keep the signal list in sync with the event list (same swap). */
  hsa_signal_destroy (dd->running_signals[i]);
  dd->running_signals[i] = dd->running_signals[dd->running_list_size];

  hsa_memory_free (event_data->kernargs);
  hsa_memory_free (event_data->context);

  POCL_UNLOCK_OBJ (event);

  /* Flush any device-side printf output the kernel produced. */
  if (d->device->device_side_printf && *d->printf_write_pos > 0)
    {
      write (STDOUT_FILENO, d->printf_buffer, *d->printf_write_pos);
      bzero (d->printf_write_pos, sizeof (size_t));
    }

  POCL_UPDATE_EVENT_COMPLETE (event);

  pocl_ndrange_node_cleanup (node);
  pocl_mem_manager_free_command (node);
}
1864
1865 static void
check_running_signals(pocl_hsa_device_data_t * d)1866 check_running_signals (pocl_hsa_device_data_t *d)
1867 {
1868 unsigned i;
1869 pocl_hsa_device_pthread_data_t *dd = &d->driver_data;
1870 for (i = 0; i < dd->running_list_size; i++)
1871 {
1872 if (hsa_signal_load_acquire (dd->running_signals[i]) < 1)
1873 {
1874 pocl_hsa_ndrange_event_finished (d, i);
1875 }
1876 }
1877 }
1878
/* Drains the device's ready list: NDRange kernels are compiled and
   launched asynchronously via pocl_hsa_launch(); all other command
   types are executed synchronously with pocl_exec_command().
   Returns 1 if at least one NDRange kernel was enqueued, 0 otherwise
   (the caller uses this to reset its wait timeout). */
static int
pocl_hsa_run_ready_commands (pocl_hsa_device_data_t *d)
{
  check_running_signals (d);
  int enqueued_ndrange = 0;

  PTHREAD_CHECK (pthread_mutex_lock (&d->list_mutex));
  while (d->ready_list_size)
    {
      cl_event e = d->ready_list[0];
      PN_REMOVE (d->ready_list, 0);
      /* Drop the list lock while processing the command so host threads
         can keep pushing newly ready events concurrently; the list is
         re-examined from the front after re-acquiring the lock. */
      PTHREAD_CHECK (pthread_mutex_unlock (&d->list_mutex));
      if (e->command->type == CL_COMMAND_NDRANGE_KERNEL)
        {
          d->device->ops->compile_kernel (
            e->command, e->command->command.run.kernel, e->queue->device, 1);
          pocl_hsa_launch (d, e);
          enqueued_ndrange = 1;
          POCL_MSG_PRINT_INFO ("NDrange event %" PRIu64 " launched, remove"
                               " from readylist\n", e->id);
        }
      else
        {
          POCL_MSG_PRINT_INFO ("running non-NDrange event %" PRIu64 ","
                               " remove from readylist\n", e->id);
          pocl_exec_command (e->command);
        }
      /* Retire any kernels that completed while we were busy. */
      check_running_signals (d);
      PTHREAD_CHECK (pthread_mutex_lock (&d->list_mutex));
    }
  PTHREAD_CHECK (pthread_mutex_unlock (&d->list_mutex));
  return enqueued_ndrange;
}
1912
/* Main loop of the per-device driver thread: creates the HSA queues,
   then repeatedly drains the ready list, retires finished kernels and
   blocks until either a completion signal fires or the host nudges it,
   until exit_driver_thread is set.  CLDEV is the cl_device_id this
   thread serves; the thread holds a retain reference on it for its
   whole lifetime. */
void *
pocl_hsa_driver_pthread (void *cldev)
{
  size_t i;
#if AMD_HSA == 1
  /* One-time init of the constant comparison arrays used by
     hsa_amd_signal_wait_any() below.  NOTE(review): this flag is not
     synchronized; presumably only one driver thread races here at
     startup — confirm if multiple HSA devices are supported. */
  if (!signal_array_initialized)
    {
      signal_array_initialized = 1;
      for (i = 0; i < (EVENT_LIST_SIZE+1); i++)
        {
          signal_ones_array[i] = 1;
          less_than_sigcond_array[i] = HSA_SIGNAL_CONDITION_LT;
        }
    }
#endif

  cl_device_id device = (cl_device_id)cldev;
  POCL_RETAIN_OBJECT_UNLOCKED (device);
  pocl_hsa_device_data_t *d = device->data;
  pocl_hsa_device_pthread_data_t *dd = &d->driver_data;

  /* timeout counter, resets with each new queued kernel to 1/8, then
   * exponentially increases by 40% up to about 3/4 of d->timeout.
   * disabled for now */
#if 0
  uint64_t kernel_timeout_ns = d->timeout >> 3;
#endif

  dd->running_list_size = 0;
  dd->last_queue = 0;
  dd->num_queues = d->hw_schedulers;  // TODO this is somewhat arbitrary.
  POCL_MSG_PRINT_HSA ("Queues: %" PRIuS "\n", dd->num_queues);

  dd->queues = (hsa_queue_t **) calloc (dd->num_queues, sizeof(hsa_queue_t*));

  uint32_t queue_min_size, queue_max_size;
  HSA_CHECK(hsa_agent_get_info(d->agent, HSA_AGENT_INFO_QUEUE_MIN_SIZE,
                               &queue_min_size));
  HSA_CHECK(hsa_agent_get_info(d->agent, HSA_AGENT_INFO_QUEUE_MAX_SIZE,
                               &queue_max_size));

  /* Pick a power-of-two queue size halfway (in log2) between the
     agent's minimum and maximum supported sizes. */
  uint32_t queue_size = 1 << ((__builtin_ctz(queue_min_size)
                               + __builtin_ctz(queue_max_size)) / 2);
  POCL_MSG_PRINT_HSA ("queue size: %" PRIu32 "\n", queue_size);

  for (i = 0; i < dd->num_queues; i++)
    {
      HSA_CHECK (hsa_queue_create (d->agent, queue_size, HSA_QUEUE_TYPE_SINGLE,
                                   hsa_queue_callback, device, -1, -1,
                                   &dd->queues[i]));
#if AMD_HSA == 1
      HSA_CHECK (hsa_amd_profiling_set_profiler_enabled (dd->queues[i], 1));
#endif
    }

  while (1)
    {
      /* reset the signal. Disabled for now; see below */
#if 0
      if (pocl_hsa_run_ready_commands (d))
        kernel_timeout_ns = d->timeout >> 3;
#else
      pocl_hsa_run_ready_commands (d);
#endif
      if (d->exit_driver_thread)
        goto EXIT_PTHREAD;

      // wait for anything to happen or timeout
#if AMD_HSA == 1
      // FIXME: An ABA race condition here. If there was (another) submit after
      // the previous wait returned, but before this reset, we miss the
      // notification decrement and get stuck if there are no further submits
      // to decrement the 1.
      hsa_signal_store_release (d->nudge_driver_thread, 1);

      if (d->have_wait_any)
        {
          /* Append the host-nudge signal after the completion signals
             so a single wait covers both kernel completions and new
             submissions. */
          dd->running_signals[dd->running_list_size].handle =
            d->nudge_driver_thread.handle;
          hsa_amd_signal_wait_any(dd->running_list_size+1,
                                  dd->running_signals,
                                  less_than_sigcond_array, signal_ones_array,
                                  d->timeout, HSA_WAIT_STATE_BLOCKED, NULL);
          dd->running_signals[dd->running_list_size].handle = 0;
        }
      else
        {
#endif
#if 0
          if (kernel_timeout_ns < (d->timeout >> 1))
            kernel_timeout_ns = (kernel_timeout_ns * 22937UL) >> 14;
          // See the above comment. Busy wait for now until a proper
          // synchronization fix is in place.
          hsa_signal_wait_acquire (d->nudge_driver_thread,
                                   HSA_SIGNAL_CONDITION_LT, 1,
                                   kernel_timeout_ns, HSA_WAIT_STATE_BLOCKED);
#endif

#if AMD_HSA == 1
        }
#endif

      if (d->exit_driver_thread)
        goto EXIT_PTHREAD;
    }

EXIT_PTHREAD:
  /* TODO wait for commands to finish... */
  POCL_MSG_PRINT_HSA ("driver pthread exiting, still "
                      "running evts: %" PRIuS "\n",
                      dd->running_list_size);
  assert(dd->running_list_size == 0);

  for (i = 0; i < dd->num_queues; i++)
    HSA_CHECK (hsa_queue_destroy (dd->queues[i]));
  POCL_MEM_FREE (dd->queues);

  POname (clReleaseDevice) (device);

  pthread_exit (NULL);
}
2034
2035 void
pocl_hsa_notify_event_finished(cl_event event)2036 pocl_hsa_notify_event_finished (cl_event event)
2037 {
2038 pocl_hsa_event_data_t *e_d = (pocl_hsa_event_data_t *)event->data;
2039 pthread_cond_broadcast (&e_d->event_cond);
2040 }
2041
2042 void
pocl_hsa_update_event(cl_device_id device,cl_event event)2043 pocl_hsa_update_event (cl_device_id device, cl_event event)
2044 {
2045 pocl_hsa_event_data_t *e_d = NULL;
2046
2047 if (event->data == NULL && event->status == CL_QUEUED)
2048 {
2049 pocl_hsa_event_data_t *e_d
2050 = (pocl_hsa_event_data_t *)malloc (sizeof (pocl_hsa_event_data_t));
2051 assert (e_d);
2052 pthread_cond_init (&e_d->event_cond, NULL);
2053 event->data = (void *)e_d;
2054 }
2055 else
2056 {
2057 e_d = event->data;
2058 }
2059
2060 switch (event->status)
2061 {
2062 case CL_QUEUED:
2063 if (event->queue->properties & CL_QUEUE_PROFILING_ENABLE)
2064 event->time_queue = pocl_hsa_get_timer_value (device->data);
2065 break;
2066 case CL_SUBMITTED:
2067 if (event->queue->properties & CL_QUEUE_PROFILING_ENABLE)
2068 event->time_submit = pocl_hsa_get_timer_value (device->data);
2069 break;
2070 case CL_RUNNING:
2071 if (event->queue->properties & CL_QUEUE_PROFILING_ENABLE)
2072 event->time_start = pocl_hsa_get_timer_value (device->data);
2073 break;
2074 case CL_FAILED:
2075 case CL_COMPLETE:
2076 if (event->queue->properties & CL_QUEUE_PROFILING_ENABLE)
2077 event->time_end = pocl_hsa_get_timer_value (device->data);
2078 break;
2079 }
2080 }
2081
pocl_hsa_free_event_data(cl_event event)2082 void pocl_hsa_free_event_data (cl_event event)
2083 {
2084 assert(event->data != NULL);
2085 free(event->data);
2086 event->data = NULL;
2087 }
2088
2089 /****** SVM callbacks *****/
2090
/* SVM free callback: releases an allocation made by
   pocl_hsa_svm_alloc().  DEV is unused; the HSA runtime resolves the
   owning region from the pointer itself. */
void
pocl_hsa_svm_free (cl_device_id dev, void *svm_ptr)
{
  /* TODO we should somehow figure out the size argument
   * and call pocl_free_global_mem */
  HSA_CHECK (hsa_memory_free (svm_ptr));
}
2098
2099 void *
pocl_hsa_svm_alloc(cl_device_id dev,cl_svm_mem_flags flags,size_t size)2100 pocl_hsa_svm_alloc (cl_device_id dev, cl_svm_mem_flags flags, size_t size)
2101 {
2102 if ((flags & CL_MEM_SVM_ATOMICS)
2103 && ((dev->svm_caps & CL_DEVICE_SVM_ATOMICS) == 0))
2104 {
2105 POCL_MSG_ERR ("This device doesn't support SVM Atomics\n");
2106 return NULL;
2107 }
2108
2109 if ((flags & CL_MEM_SVM_FINE_GRAIN_BUFFER)
2110 && ((dev->svm_caps & CL_DEVICE_SVM_FINE_GRAIN_BUFFER) == 0))
2111 {
2112 POCL_MSG_ERR ("This device doesn't support SVM Fine grained Buffer\n");
2113 return NULL;
2114 }
2115
2116 pocl_hsa_device_data_t *d = (pocl_hsa_device_data_t *)dev->data;
2117 void *b = NULL;
2118 HSA_CHECK (hsa_memory_allocate (d->global_region, size, &b));
2119 return b;
2120 }
2121
/* SVM copy callback: copies SIZE bytes from SRC to DST through the HSA
   runtime, which handles agent-accessible allocations.  DEV is unused;
   the runtime determines the involved agents from the pointers. */
void
pocl_hsa_svm_copy (cl_device_id dev, void *__restrict__ dst,
                   const void *__restrict__ src, size_t size)
{
  HSA_CHECK (hsa_memory_copy (dst, src, size));
}
2128
2129 char*
pocl_hsa_init_build(void * data)2130 pocl_hsa_init_build (void *data)
2131 {
2132 if (!((pocl_hsa_device_data_t*)data)->device->device_side_printf)
2133 return strdup ("-DC99_PRINTF");
2134 else
2135 return NULL;
2136 }
2137