1 /* Run a stand-alone AMD GCN kernel.
2 
3    Copyright 2017 Mentor Graphics Corporation
4    Copyright (C) 2018-2021 Free Software Foundation, Inc.
5 
6    This program is free software: you can redistribute it and/or modify
7    it under the terms of the GNU General Public License as published by
8    the Free Software Foundation, either version 3 of the License, or
9    (at your option) any later version.
10 
11    This program is distributed in the hope that it will be useful,
12    but WITHOUT ANY WARRANTY; without even the implied warranty of
13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14    GNU General Public License for more details.
15 
16    You should have received a copy of the GNU General Public License
17    along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
18 
19 /* This program will run a compiled stand-alone GCN kernel on a GPU.
20 
21    The kernel entry point's signature must use a standard main signature:
22 
23      int main(int argc, char **argv)
24 */
25 
26 #include <stdint.h>
27 #include <stdbool.h>
28 #include <stdlib.h>
29 #include <malloc.h>
30 #include <stdio.h>
31 #include <string.h>
32 #include <dlfcn.h>
33 #include <unistd.h>
34 #include <elf.h>
35 #include <signal.h>
36 
37 #include "hsa.h"
38 
39 #ifndef HSA_RUNTIME_LIB
40 #define HSA_RUNTIME_LIB "libhsa-runtime64.so.1"
41 #endif
42 
43 #ifndef VERSION_STRING
44 #define VERSION_STRING "(version unknown)"
45 #endif
46 
47 bool debug = false;
48 
49 hsa_agent_t device = { 0 };
50 hsa_queue_t *queue = NULL;
51 uint64_t init_array_kernel = 0;
52 uint64_t fini_array_kernel = 0;
53 uint64_t main_kernel = 0;
54 hsa_executable_t executable = { 0 };
55 
56 hsa_region_t kernargs_region = { 0 };
57 hsa_region_t heap_region = { 0 };
58 uint32_t kernarg_segment_size = 0;
59 uint32_t group_segment_size = 0;
60 uint32_t private_segment_size = 0;
61 
62 static void
usage(const char * progname)63 usage (const char *progname)
64 {
65   printf ("Usage: %s [options] kernel [kernel-args]\n\n"
66 	  "Options:\n"
67 	  "  --help\n"
68 	  "  --version\n"
69 	  "  --debug\n", progname);
70 }
71 
72 static void
version(const char * progname)73 version (const char *progname)
74 {
75   printf ("%s " VERSION_STRING "\n", progname);
76 }
77 
78 /* As an HSA runtime is dlopened, following structure defines the necessary
79    function pointers.
80    Code adapted from libgomp.  */
81 
82 struct hsa_runtime_fn_info
83 {
84   /* HSA runtime.  */
85   hsa_status_t (*hsa_status_string_fn) (hsa_status_t status,
86 					const char **status_string);
87   hsa_status_t (*hsa_agent_get_info_fn) (hsa_agent_t agent,
88 					 hsa_agent_info_t attribute,
89 					 void *value);
90   hsa_status_t (*hsa_init_fn) (void);
91   hsa_status_t (*hsa_iterate_agents_fn)
92     (hsa_status_t (*callback) (hsa_agent_t agent, void *data), void *data);
93   hsa_status_t (*hsa_region_get_info_fn) (hsa_region_t region,
94 					  hsa_region_info_t attribute,
95 					  void *value);
96   hsa_status_t (*hsa_queue_create_fn)
97     (hsa_agent_t agent, uint32_t size, hsa_queue_type_t type,
98      void (*callback) (hsa_status_t status, hsa_queue_t *source, void *data),
99      void *data, uint32_t private_segment_size,
100      uint32_t group_segment_size, hsa_queue_t **queue);
101   hsa_status_t (*hsa_agent_iterate_regions_fn)
102     (hsa_agent_t agent,
103      hsa_status_t (*callback) (hsa_region_t region, void *data), void *data);
104   hsa_status_t (*hsa_executable_destroy_fn) (hsa_executable_t executable);
105   hsa_status_t (*hsa_executable_create_fn)
106     (hsa_profile_t profile, hsa_executable_state_t executable_state,
107      const char *options, hsa_executable_t *executable);
108   hsa_status_t (*hsa_executable_global_variable_define_fn)
109     (hsa_executable_t executable, const char *variable_name, void *address);
110   hsa_status_t (*hsa_executable_load_code_object_fn)
111     (hsa_executable_t executable, hsa_agent_t agent,
112      hsa_code_object_t code_object, const char *options);
113   hsa_status_t (*hsa_executable_freeze_fn) (hsa_executable_t executable,
114 					    const char *options);
115   hsa_status_t (*hsa_signal_create_fn) (hsa_signal_value_t initial_value,
116 					uint32_t num_consumers,
117 					const hsa_agent_t *consumers,
118 					hsa_signal_t *signal);
119   hsa_status_t (*hsa_memory_allocate_fn) (hsa_region_t region, size_t size,
120 					  void **ptr);
121   hsa_status_t (*hsa_memory_assign_agent_fn) (void *ptr, hsa_agent_t agent,
122 					      hsa_access_permission_t access);
123   hsa_status_t (*hsa_memory_copy_fn) (void *dst, const void *src,
124 				      size_t size);
125   hsa_status_t (*hsa_memory_free_fn) (void *ptr);
126   hsa_status_t (*hsa_signal_destroy_fn) (hsa_signal_t signal);
127   hsa_status_t (*hsa_executable_get_symbol_fn)
128     (hsa_executable_t executable, const char *module_name,
129      const char *symbol_name, hsa_agent_t agent, int32_t call_convention,
130      hsa_executable_symbol_t *symbol);
131   hsa_status_t (*hsa_executable_symbol_get_info_fn)
132     (hsa_executable_symbol_t executable_symbol,
133      hsa_executable_symbol_info_t attribute, void *value);
134   void (*hsa_signal_store_relaxed_fn) (hsa_signal_t signal,
135 				       hsa_signal_value_t value);
136   hsa_signal_value_t (*hsa_signal_wait_acquire_fn)
137     (hsa_signal_t signal, hsa_signal_condition_t condition,
138      hsa_signal_value_t compare_value, uint64_t timeout_hint,
139      hsa_wait_state_t wait_state_hint);
140   hsa_signal_value_t (*hsa_signal_wait_relaxed_fn)
141     (hsa_signal_t signal, hsa_signal_condition_t condition,
142      hsa_signal_value_t compare_value, uint64_t timeout_hint,
143      hsa_wait_state_t wait_state_hint);
144   hsa_status_t (*hsa_queue_destroy_fn) (hsa_queue_t *queue);
145   hsa_status_t (*hsa_code_object_deserialize_fn)
146     (void *serialized_code_object, size_t serialized_code_object_size,
147      const char *options, hsa_code_object_t *code_object);
148   uint64_t (*hsa_queue_load_write_index_relaxed_fn)
149     (const hsa_queue_t *queue);
150   void (*hsa_queue_store_write_index_relaxed_fn)
151     (const hsa_queue_t *queue, uint64_t value);
152   hsa_status_t (*hsa_shut_down_fn) ();
153 };
154 
155 /* HSA runtime functions that are initialized in init_hsa_context.
156    Code adapted from libgomp.  */
157 
158 static struct hsa_runtime_fn_info hsa_fns;
159 
160 #define DLSYM_FN(function)					 \
161   *(void**)(&hsa_fns.function##_fn) = dlsym (handle, #function); \
162   if (hsa_fns.function##_fn == NULL)				 \
163     goto fail;
164 
165 static void
init_hsa_runtime_functions(void)166 init_hsa_runtime_functions (void)
167 {
168   void *handle = dlopen (HSA_RUNTIME_LIB, RTLD_LAZY);
169   if (handle == NULL)
170     {
171       fprintf (stderr,
172 	       "The HSA runtime is required to run GCN kernels on hardware.\n"
173 	       "%s: File not found or could not be opened\n",
174 	       HSA_RUNTIME_LIB);
175       exit (1);
176     }
177 
178   DLSYM_FN (hsa_status_string)
179   DLSYM_FN (hsa_agent_get_info)
180   DLSYM_FN (hsa_init)
181   DLSYM_FN (hsa_iterate_agents)
182   DLSYM_FN (hsa_region_get_info)
183   DLSYM_FN (hsa_queue_create)
184   DLSYM_FN (hsa_agent_iterate_regions)
185   DLSYM_FN (hsa_executable_destroy)
186   DLSYM_FN (hsa_executable_create)
187   DLSYM_FN (hsa_executable_global_variable_define)
188   DLSYM_FN (hsa_executable_load_code_object)
189   DLSYM_FN (hsa_executable_freeze)
190   DLSYM_FN (hsa_signal_create)
191   DLSYM_FN (hsa_memory_allocate)
192   DLSYM_FN (hsa_memory_assign_agent)
193   DLSYM_FN (hsa_memory_copy)
194   DLSYM_FN (hsa_memory_free)
195   DLSYM_FN (hsa_signal_destroy)
196   DLSYM_FN (hsa_executable_get_symbol)
197   DLSYM_FN (hsa_executable_symbol_get_info)
198   DLSYM_FN (hsa_signal_wait_acquire)
199   DLSYM_FN (hsa_signal_wait_relaxed)
200   DLSYM_FN (hsa_signal_store_relaxed)
201   DLSYM_FN (hsa_queue_destroy)
202   DLSYM_FN (hsa_code_object_deserialize)
203   DLSYM_FN (hsa_queue_load_write_index_relaxed)
204   DLSYM_FN (hsa_queue_store_write_index_relaxed)
205   DLSYM_FN (hsa_shut_down)
206 
207   return;
208 
209 fail:
210   fprintf (stderr, "Failed to find HSA functions in " HSA_RUNTIME_LIB "\n");
211   exit (1);
212 }
213 
214 #undef DLSYM_FN
215 
216 /* Report a fatal error STR together with the HSA error corresponding to
217    STATUS and terminate execution of the current process.  */
218 
219 static void
hsa_fatal(const char * str,hsa_status_t status)220 hsa_fatal (const char *str, hsa_status_t status)
221 {
222   const char *hsa_error_msg;
223   hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);
224   fprintf (stderr, "%s: FAILED\nHSA Runtime message: %s\n", str,
225 	   hsa_error_msg);
226   exit (1);
227 }
228 
229 /* Helper macros to ensure we check the return values from the HSA Runtime.
230    These just keep the rest of the code a bit cleaner.  */
231 
232 #define XHSA_CMP(FN, CMP, MSG)		   \
233   do {					   \
234     hsa_status_t status = (FN);		   \
235     if (!(CMP))				   \
236       hsa_fatal ((MSG), status);	   \
237     else if (debug)			   \
238       fprintf (stderr, "%s: OK\n", (MSG)); \
239   } while (0)
240 #define XHSA(FN, MSG) XHSA_CMP(FN, status == HSA_STATUS_SUCCESS, MSG)
241 
242 /* Callback of hsa_iterate_agents.
243    Called once for each available device, and returns "break" when a
244    suitable one has been found.  */
245 
246 static hsa_status_t
get_gpu_agent(hsa_agent_t agent,void * data)247 get_gpu_agent (hsa_agent_t agent, void *data __attribute__ ((unused)))
248 {
249   hsa_device_type_t device_type;
250   XHSA (hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_DEVICE,
251 				       &device_type),
252 	"Get agent type");
253 
254   /* Select only GPU devices.  */
255   /* TODO: support selecting from multiple GPUs.  */
256   if (HSA_DEVICE_TYPE_GPU == device_type)
257     {
258       device = agent;
259       return HSA_STATUS_INFO_BREAK;
260     }
261 
262   /* The device was not suitable.  */
263   return HSA_STATUS_SUCCESS;
264 }
265 
266 /* Callback of hsa_iterate_regions.
267    Called once for each available memory region, and returns "break" when a
268    suitable one has been found.  */
269 
270 static hsa_status_t
get_memory_region(hsa_region_t region,hsa_region_t * retval,hsa_region_global_flag_t kind)271 get_memory_region (hsa_region_t region, hsa_region_t *retval,
272 		   hsa_region_global_flag_t kind)
273 {
274   /* Reject non-global regions.  */
275   hsa_region_segment_t segment;
276   hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SEGMENT, &segment);
277   if (HSA_REGION_SEGMENT_GLOBAL != segment)
278     return HSA_STATUS_SUCCESS;
279 
280   /* Find a region with the KERNARG flag set.  */
281   hsa_region_global_flag_t flags;
282   hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_GLOBAL_FLAGS,
283 				  &flags);
284   if (flags & kind)
285     {
286       *retval = region;
287       return HSA_STATUS_INFO_BREAK;
288     }
289 
290   /* The region was not suitable.  */
291   return HSA_STATUS_SUCCESS;
292 }
293 
294 static hsa_status_t
get_kernarg_region(hsa_region_t region,void * data)295 get_kernarg_region (hsa_region_t region, void *data __attribute__((unused)))
296 {
297   return get_memory_region (region, &kernargs_region,
298 			    HSA_REGION_GLOBAL_FLAG_KERNARG);
299 }
300 
301 static hsa_status_t
get_heap_region(hsa_region_t region,void * data)302 get_heap_region (hsa_region_t region, void *data __attribute__((unused)))
303 {
304   return get_memory_region (region, &heap_region,
305 			    HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED);
306 }
307 
308 /* Initialize the HSA Runtime library and GPU device.  */
309 
310 static void
init_device()311 init_device ()
312 {
313   /* Load the shared library and find the API functions.  */
314   init_hsa_runtime_functions ();
315 
316   /* Initialize the HSA Runtime.  */
317   XHSA (hsa_fns.hsa_init_fn (),
318 	"Initialize run-time");
319 
320   /* Select a suitable device.
321      The call-back function, get_gpu_agent, does the selection.  */
322   XHSA_CMP (hsa_fns.hsa_iterate_agents_fn (get_gpu_agent, NULL),
323 	    status == HSA_STATUS_SUCCESS || status == HSA_STATUS_INFO_BREAK,
324 	    "Find a device");
325 
326   /* Initialize the queue used for launching kernels.  */
327   uint32_t queue_size = 0;
328   XHSA (hsa_fns.hsa_agent_get_info_fn (device, HSA_AGENT_INFO_QUEUE_MAX_SIZE,
329 				       &queue_size),
330 	"Find max queue size");
331   XHSA (hsa_fns.hsa_queue_create_fn (device, queue_size,
332 				     HSA_QUEUE_TYPE_SINGLE, NULL,
333 				     NULL, UINT32_MAX, UINT32_MAX, &queue),
334 	"Set up a device queue");
335 
336   /* Select a memory region for the kernel arguments.
337      The call-back function, get_kernarg_region, does the selection.  */
338   XHSA_CMP (hsa_fns.hsa_agent_iterate_regions_fn (device, get_kernarg_region,
339 						  NULL),
340 	    status == HSA_STATUS_SUCCESS || status == HSA_STATUS_INFO_BREAK,
341 	    "Locate kernargs memory");
342 
343   /* Select a memory region for the kernel heap.
344      The call-back function, get_heap_region, does the selection.  */
345   XHSA_CMP (hsa_fns.hsa_agent_iterate_regions_fn (device, get_heap_region,
346 						  NULL),
347 	    status == HSA_STATUS_SUCCESS || status == HSA_STATUS_INFO_BREAK,
348 	    "Locate device memory");
349 }
350 
351 
352 /* Read a whole input file.
353    Code copied from mkoffload. */
354 
355 static char *
read_file(const char * filename,size_t * plen)356 read_file (const char *filename, size_t *plen)
357 {
358   size_t alloc = 16384;
359   size_t base = 0;
360   char *buffer;
361 
362   FILE *stream = fopen (filename, "rb");
363   if (!stream)
364     {
365       perror (filename);
366       exit (1);
367     }
368 
369   if (!fseek (stream, 0, SEEK_END))
370     {
371       /* Get the file size.  */
372       long s = ftell (stream);
373       if (s >= 0)
374 	alloc = s + 100;
375       fseek (stream, 0, SEEK_SET);
376     }
377   buffer = malloc (alloc);
378 
379   for (;;)
380     {
381       size_t n = fread (buffer + base, 1, alloc - base - 1, stream);
382 
383       if (!n)
384 	break;
385       base += n;
386       if (base + 1 == alloc)
387 	{
388 	  alloc *= 2;
389 	  buffer = realloc (buffer, alloc);
390 	}
391     }
392   buffer[base] = 0;
393   *plen = base;
394 
395   fclose (stream);
396 
397   return buffer;
398 }
399 
400 /* Read a HSA Code Object (HSACO) from file, and load it into the device.  */
401 
402 static void
load_image(const char * filename)403 load_image (const char *filename)
404 {
405   size_t image_size;
406   Elf64_Ehdr *image = (void *) read_file (filename, &image_size);
407 
408   /* An "executable" consists of one or more code objects.  */
409   XHSA (hsa_fns.hsa_executable_create_fn (HSA_PROFILE_FULL,
410 					  HSA_EXECUTABLE_STATE_UNFROZEN, "",
411 					  &executable),
412 	"Initialize GCN executable");
413 
414   /* Add the HSACO to the executable.  */
415   hsa_code_object_t co = { 0 };
416   XHSA (hsa_fns.hsa_code_object_deserialize_fn (image, image_size, NULL, &co),
417 	"Deserialize GCN code object");
418   XHSA (hsa_fns.hsa_executable_load_code_object_fn (executable, device, co,
419 						    ""),
420 	"Load GCN code object");
421 
422   /* We're done modifying he executable.  */
423   XHSA (hsa_fns.hsa_executable_freeze_fn (executable, ""),
424 	"Freeze GCN executable");
425 
426   /* Locate the "_init_array" function, and read the kernel's properties.  */
427   hsa_executable_symbol_t symbol;
428   XHSA (hsa_fns.hsa_executable_get_symbol_fn (executable, NULL,
429 					      "_init_array.kd", device, 0,
430 					      &symbol),
431 	"Find '_init_array' function");
432   XHSA (hsa_fns.hsa_executable_symbol_get_info_fn
433 	    (symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT,
434 	     &init_array_kernel),
435 	"Extract '_init_array' kernel object kernel object");
436 
437   /* Locate the "_fini_array" function, and read the kernel's properties.  */
438   XHSA (hsa_fns.hsa_executable_get_symbol_fn (executable, NULL,
439 					      "_fini_array.kd", device, 0,
440 					      &symbol),
441 	"Find '_fini_array' function");
442   XHSA (hsa_fns.hsa_executable_symbol_get_info_fn
443 	    (symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT,
444 	     &fini_array_kernel),
445 	"Extract '_fini_array' kernel object kernel object");
446 
447   /* Locate the "main" function, and read the kernel's properties.  */
448   XHSA (hsa_fns.hsa_executable_get_symbol_fn (executable, NULL, "main.kd",
449 					      device, 0, &symbol),
450 	"Find 'main' function");
451   XHSA (hsa_fns.hsa_executable_symbol_get_info_fn
452 	    (symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &main_kernel),
453 	"Extract 'main' kernel object");
454   XHSA (hsa_fns.hsa_executable_symbol_get_info_fn
455 	    (symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE,
456 	     &kernarg_segment_size),
457 	"Extract kernarg segment size");
458   XHSA (hsa_fns.hsa_executable_symbol_get_info_fn
459 	    (symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE,
460 	     &group_segment_size),
461 	"Extract group segment size");
462   XHSA (hsa_fns.hsa_executable_symbol_get_info_fn
463 	    (symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE,
464 	     &private_segment_size),
465 	"Extract private segment size");
466 }
467 
468 /* Allocate some device memory from the kernargs region.
469    The returned address will be 32-bit (with excess zeroed on 64-bit host),
470    and accessible via the same address on both host and target (via
471    __flat_scalar GCN address space).  */
472 
473 static void *
device_malloc(size_t size,hsa_region_t region)474 device_malloc (size_t size, hsa_region_t region)
475 {
476   void *result;
477   XHSA (hsa_fns.hsa_memory_allocate_fn (region, size, &result),
478 	"Allocate device memory");
479   return result;
480 }
481 
482 /* These are the device pointers that will be transferred to the target.
483    The HSA Runtime points the kernargs register here.
484    They correspond to function signature:
485        int main (int argc, char *argv[], int *return_value)
486    The compiler expects this, for kernel functions, and will
487    automatically assign the exit value to *return_value.  */
488 struct kernargs
489 {
490   /* Kernargs.  */
491   int32_t argc;
492   int64_t argv;
493   int64_t out_ptr;
494   int64_t heap_ptr;
495 
496   /* Output data.  */
497   struct output
498   {
499     int return_value;
500     unsigned int next_output;
501     struct printf_data
502     {
503       int written;
504       char msg[128];
505       int type;
506       union
507       {
508 	int64_t ivalue;
509 	double dvalue;
510 	char text[128];
511       };
512     } queue[1024];
513     unsigned int consumed;
514   } output_data;
515 };
516 
517 struct heap
518 {
519   int64_t size;
520   char data[0];
521 } heap;
522 
523 /* Print any console output from the kernel.
524    We print all entries from "consumed" to the next entry without a "written"
525    flag, or "next_output" is reached.  The buffer is circular, but the
526    indices are absolute.  It is assumed the kernel will stop writing data
527    if "next_output" wraps (becomes smaller than "consumed").  */
528 void
gomp_print_output(struct kernargs * kernargs,bool final)529 gomp_print_output (struct kernargs *kernargs, bool final)
530 {
531   unsigned int limit = (sizeof (kernargs->output_data.queue)
532 			/ sizeof (kernargs->output_data.queue[0]));
533 
534   unsigned int from = __atomic_load_n (&kernargs->output_data.consumed,
535 				       __ATOMIC_ACQUIRE);
536   unsigned int to = kernargs->output_data.next_output;
537 
538   if (from > to)
539     {
540       /* Overflow.  */
541       if (final)
542 	printf ("GCN print buffer overflowed.\n");
543       return;
544     }
545 
546   unsigned int i;
547   for (i = from; i < to; i++)
548     {
549       struct printf_data *data = &kernargs->output_data.queue[i%limit];
550 
551       if (!data->written && !final)
552 	break;
553 
554       switch (data->type)
555 	{
556 	case 0:
557 	  printf ("%.128s%ld\n", data->msg, data->ivalue);
558 	  break;
559 	case 1:
560 	  printf ("%.128s%f\n", data->msg, data->dvalue);
561 	  break;
562 	case 2:
563 	  printf ("%.128s%.128s\n", data->msg, data->text);
564 	  break;
565 	case 3:
566 	  printf ("%.128s%.128s", data->msg, data->text);
567 	  break;
568 	default:
569 	  printf ("GCN print buffer error!\n");
570 	  break;
571 	}
572 
573       data->written = 0;
574       __atomic_store_n (&kernargs->output_data.consumed, i+1,
575 			__ATOMIC_RELEASE);
576     }
577   fflush (stdout);
578 }
579 
580 /* Execute an already-loaded kernel on the device.  */
581 
582 static void
run(uint64_t kernel,void * kernargs)583 run (uint64_t kernel, void *kernargs)
584 {
585   /* A "signal" is used to launch and monitor the kernel.  */
586   hsa_signal_t signal;
587   XHSA (hsa_fns.hsa_signal_create_fn (1, 0, NULL, &signal),
588 	"Create signal");
589 
590   /* Configure for a single-worker kernel.  */
591   uint64_t index = hsa_fns.hsa_queue_load_write_index_relaxed_fn (queue);
592   const uint32_t queueMask = queue->size - 1;
593   hsa_kernel_dispatch_packet_t *dispatch_packet =
594     &(((hsa_kernel_dispatch_packet_t *) (queue->base_address))[index &
595 							       queueMask]);
596   dispatch_packet->setup |= 3 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
597   dispatch_packet->workgroup_size_x = (uint16_t) 1;
598   dispatch_packet->workgroup_size_y = (uint16_t) 64;
599   dispatch_packet->workgroup_size_z = (uint16_t) 1;
600   dispatch_packet->grid_size_x = 1;
601   dispatch_packet->grid_size_y = 64;
602   dispatch_packet->grid_size_z = 1;
603   dispatch_packet->completion_signal = signal;
604   dispatch_packet->kernel_object = kernel;
605   dispatch_packet->kernarg_address = (void *) kernargs;
606   dispatch_packet->private_segment_size = private_segment_size;
607   dispatch_packet->group_segment_size = group_segment_size;
608 
609   uint16_t header = 0;
610   header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
611   header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
612   header |= HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
613 
614   __atomic_store_n ((uint32_t *) dispatch_packet,
615 		    header | (dispatch_packet->setup << 16),
616 		    __ATOMIC_RELEASE);
617 
618   if (debug)
619     fprintf (stderr, "Launch kernel\n");
620 
621   hsa_fns.hsa_queue_store_write_index_relaxed_fn (queue, index + 1);
622   hsa_fns.hsa_signal_store_relaxed_fn (queue->doorbell_signal, index);
623   /* Kernel running ......  */
624   while (hsa_fns.hsa_signal_wait_relaxed_fn (signal, HSA_SIGNAL_CONDITION_LT,
625 					     1, 1000000,
626 					     HSA_WAIT_STATE_ACTIVE) != 0)
627     {
628       usleep (10000);
629       gomp_print_output (kernargs, false);
630     }
631 
632   gomp_print_output (kernargs, true);
633 
634   if (debug)
635     fprintf (stderr, "Kernel exited\n");
636 
637   XHSA (hsa_fns.hsa_signal_destroy_fn (signal),
638 	"Clean up signal");
639 }
640 
641 int
main(int argc,char * argv[])642 main (int argc, char *argv[])
643 {
644   int kernel_arg = 0;
645   for (int i = 1; i < argc; i++)
646     {
647       if (!strcmp (argv[i], "--help"))
648 	{
649 	  usage (argv[0]);
650 	  return 0;
651 	}
652       else if (!strcmp (argv[i], "--version"))
653 	{
654 	  version (argv[0]);
655 	  return 0;
656 	}
657       else if (!strcmp (argv[i], "--debug"))
658 	debug = true;
659       else if (argv[i][0] == '-')
660 	{
661 	  usage (argv[0]);
662 	  return 1;
663 	}
664       else
665 	{
666 	  kernel_arg = i;
667 	  break;
668 	}
669     }
670 
671   if (!kernel_arg)
672     {
673       /* No kernel arguments were found.  */
674       usage (argv[0]);
675       return 1;
676     }
677 
678   /* The remaining arguments are for the GCN kernel.  */
679   int kernel_argc = argc - kernel_arg;
680   char **kernel_argv = &argv[kernel_arg];
681 
682   init_device ();
683   load_image (kernel_argv[0]);
684 
685   /* Calculate size of function parameters + argv data.  */
686   size_t args_size = 0;
687   for (int i = 0; i < kernel_argc; i++)
688     args_size += strlen (kernel_argv[i]) + 1;
689 
690   /* Allocate device memory for both function parameters and the argv
691      data.  */
692   struct kernargs *kernargs = device_malloc (sizeof (*kernargs),
693 					     kernargs_region);
694   struct argdata
695   {
696     int64_t argv_data[kernel_argc];
697     char strings[args_size];
698   } *args = device_malloc (sizeof (struct argdata), kernargs_region);
699 
700   size_t heap_size = 10 * 1024 * 1024;	/* 10MB.  */
701   struct heap *heap = device_malloc (heap_size, heap_region);
702   XHSA (hsa_fns.hsa_memory_assign_agent_fn (heap, device,
703 					    HSA_ACCESS_PERMISSION_RW),
704 	"Assign heap to device agent");
705 
706   /* Write the data to the target.  */
707   kernargs->argc = kernel_argc;
708   kernargs->argv = (int64_t) args->argv_data;
709   kernargs->out_ptr = (int64_t) &kernargs->output_data;
710   kernargs->output_data.return_value = 0xcafe0000; /* Default return value. */
711   kernargs->output_data.next_output = 0;
712   for (unsigned i = 0; i < (sizeof (kernargs->output_data.queue)
713 			    / sizeof (kernargs->output_data.queue[0])); i++)
714     kernargs->output_data.queue[i].written = 0;
715   kernargs->output_data.consumed = 0;
716   int offset = 0;
717   for (int i = 0; i < kernel_argc; i++)
718     {
719       size_t arg_len = strlen (kernel_argv[i]) + 1;
720       args->argv_data[i] = (int64_t) &args->strings[offset];
721       memcpy (&args->strings[offset], kernel_argv[i], arg_len + 1);
722       offset += arg_len;
723     }
724   kernargs->heap_ptr = (int64_t) heap;
725   hsa_fns.hsa_memory_copy_fn (&heap->size, &heap_size, sizeof (heap_size));
726 
727   /* Run constructors on the GPU.  */
728   run (init_array_kernel, kernargs);
729 
730   /* Run the kernel on the GPU.  */
731   run (main_kernel, kernargs);
732   unsigned int return_value =
733     (unsigned int) kernargs->output_data.return_value;
734 
735   /* Run destructors on the GPU.  */
736   run (fini_array_kernel, kernargs);
737 
738   unsigned int upper = (return_value & ~0xffff) >> 16;
739   if (upper == 0xcafe)
740     {
741       printf ("Kernel exit value was never set\n");
742       return_value = 0xff;
743     }
744   else if (upper == 0xffff)
745     ; /* Set by exit.  */
746   else if (upper == 0)
747     ; /* Set by return from main.  */
748   else
749     printf ("Possible kernel exit value corruption, 2 most significant bytes "
750 	    "aren't 0xffff, 0xcafe, or 0: 0x%x\n", return_value);
751 
752   if (upper == 0xffff)
753     {
754       unsigned int signal = (return_value >> 8) & 0xff;
755       if (signal == SIGABRT)
756 	printf ("Kernel aborted\n");
757       else if (signal != 0)
758 	printf ("Kernel received unkown signal\n");
759     }
760 
761   if (debug)
762     printf ("Kernel exit value: %d\n", return_value & 0xff);
763 
764   /* Clean shut down.  */
765   XHSA (hsa_fns.hsa_memory_free_fn (kernargs),
766 	"Clean up device memory");
767   XHSA (hsa_fns.hsa_executable_destroy_fn (executable),
768 	"Clean up GCN executable");
769   XHSA (hsa_fns.hsa_queue_destroy_fn (queue),
770 	"Clean up device queue");
771   XHSA (hsa_fns.hsa_shut_down_fn (),
772 	"Shut down run-time");
773 
774   return return_value & 0xff;
775 }
776