1 /* Run a stand-alone AMD GCN kernel.
2 
3    Copyright 2017 Mentor Graphics Corporation
4    Copyright (C) 2018-2020 Free Software Foundation, Inc.
5 
6    This program is free software: you can redistribute it and/or modify
7    it under the terms of the GNU General Public License as published by
8    the Free Software Foundation, either version 3 of the License, or
9    (at your option) any later version.
10 
11    This program is distributed in the hope that it will be useful,
12    but WITHOUT ANY WARRANTY; without even the implied warranty of
13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14    GNU General Public License for more details.
15 
16    You should have received a copy of the GNU General Public License
17    along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
18 
19 /* This program will run a compiled stand-alone GCN kernel on a GPU.
20 
21    The kernel entry point's signature must use a standard main signature:
22 
23      int main(int argc, char **argv)
24 */
25 
26 #include <stdint.h>
27 #include <stdbool.h>
28 #include <stdlib.h>
29 #include <malloc.h>
30 #include <stdio.h>
31 #include <string.h>
32 #include <dlfcn.h>
33 #include <unistd.h>
34 #include <elf.h>
35 #include <signal.h>
36 
37 /* These probably won't be in elf.h for a while.  */
38 #ifndef R_AMDGPU_NONE
39 #define R_AMDGPU_NONE		0
40 #define R_AMDGPU_ABS32_LO	1	/* (S + A) & 0xFFFFFFFF  */
41 #define R_AMDGPU_ABS32_HI	2	/* (S + A) >> 32  */
42 #define R_AMDGPU_ABS64		3	/* S + A  */
43 #define R_AMDGPU_REL32		4	/* S + A - P  */
44 #define R_AMDGPU_REL64		5	/* S + A - P  */
45 #define R_AMDGPU_ABS32		6	/* S + A  */
46 #define R_AMDGPU_GOTPCREL	7	/* G + GOT + A - P  */
47 #define R_AMDGPU_GOTPCREL32_LO	8	/* (G + GOT + A - P) & 0xFFFFFFFF  */
48 #define R_AMDGPU_GOTPCREL32_HI	9	/* (G + GOT + A - P) >> 32  */
49 #define R_AMDGPU_REL32_LO	10	/* (S + A - P) & 0xFFFFFFFF  */
50 #define R_AMDGPU_REL32_HI	11	/* (S + A - P) >> 32  */
51 #define reserved		12
52 #define R_AMDGPU_RELATIVE64	13	/* B + A  */
53 #endif
54 
55 #include "hsa.h"
56 
57 #ifndef HSA_RUNTIME_LIB
58 #define HSA_RUNTIME_LIB "libhsa-runtime64.so"
59 #endif
60 
61 #ifndef VERSION_STRING
62 #define VERSION_STRING "(version unknown)"
63 #endif
64 
65 bool debug = false;
66 
67 hsa_agent_t device = { 0 };
68 hsa_queue_t *queue = NULL;
69 uint64_t init_array_kernel = 0;
70 uint64_t fini_array_kernel = 0;
71 uint64_t main_kernel = 0;
72 hsa_executable_t executable = { 0 };
73 
74 hsa_region_t kernargs_region = { 0 };
75 hsa_region_t heap_region = { 0 };
76 uint32_t kernarg_segment_size = 0;
77 uint32_t group_segment_size = 0;
78 uint32_t private_segment_size = 0;
79 
80 static void
usage(const char * progname)81 usage (const char *progname)
82 {
83   printf ("Usage: %s [options] kernel [kernel-args]\n\n"
84 	  "Options:\n"
85 	  "  --help\n"
86 	  "  --version\n"
87 	  "  --debug\n", progname);
88 }
89 
90 static void
version(const char * progname)91 version (const char *progname)
92 {
93   printf ("%s " VERSION_STRING "\n", progname);
94 }
95 
96 /* As an HSA runtime is dlopened, following structure defines the necessary
97    function pointers.
98    Code adapted from libgomp.  */
99 
100 struct hsa_runtime_fn_info
101 {
102   /* HSA runtime.  */
103   hsa_status_t (*hsa_status_string_fn) (hsa_status_t status,
104 					const char **status_string);
105   hsa_status_t (*hsa_agent_get_info_fn) (hsa_agent_t agent,
106 					 hsa_agent_info_t attribute,
107 					 void *value);
108   hsa_status_t (*hsa_init_fn) (void);
109   hsa_status_t (*hsa_iterate_agents_fn)
110     (hsa_status_t (*callback) (hsa_agent_t agent, void *data), void *data);
111   hsa_status_t (*hsa_region_get_info_fn) (hsa_region_t region,
112 					  hsa_region_info_t attribute,
113 					  void *value);
114   hsa_status_t (*hsa_queue_create_fn)
115     (hsa_agent_t agent, uint32_t size, hsa_queue_type_t type,
116      void (*callback) (hsa_status_t status, hsa_queue_t *source, void *data),
117      void *data, uint32_t private_segment_size,
118      uint32_t group_segment_size, hsa_queue_t **queue);
119   hsa_status_t (*hsa_agent_iterate_regions_fn)
120     (hsa_agent_t agent,
121      hsa_status_t (*callback) (hsa_region_t region, void *data), void *data);
122   hsa_status_t (*hsa_executable_destroy_fn) (hsa_executable_t executable);
123   hsa_status_t (*hsa_executable_create_fn)
124     (hsa_profile_t profile, hsa_executable_state_t executable_state,
125      const char *options, hsa_executable_t *executable);
126   hsa_status_t (*hsa_executable_global_variable_define_fn)
127     (hsa_executable_t executable, const char *variable_name, void *address);
128   hsa_status_t (*hsa_executable_load_code_object_fn)
129     (hsa_executable_t executable, hsa_agent_t agent,
130      hsa_code_object_t code_object, const char *options);
131   hsa_status_t (*hsa_executable_freeze_fn) (hsa_executable_t executable,
132 					    const char *options);
133   hsa_status_t (*hsa_signal_create_fn) (hsa_signal_value_t initial_value,
134 					uint32_t num_consumers,
135 					const hsa_agent_t *consumers,
136 					hsa_signal_t *signal);
137   hsa_status_t (*hsa_memory_allocate_fn) (hsa_region_t region, size_t size,
138 					  void **ptr);
139   hsa_status_t (*hsa_memory_assign_agent_fn) (void *ptr, hsa_agent_t agent,
140 					      hsa_access_permission_t access);
141   hsa_status_t (*hsa_memory_copy_fn) (void *dst, const void *src,
142 				      size_t size);
143   hsa_status_t (*hsa_memory_free_fn) (void *ptr);
144   hsa_status_t (*hsa_signal_destroy_fn) (hsa_signal_t signal);
145   hsa_status_t (*hsa_executable_get_symbol_fn)
146     (hsa_executable_t executable, const char *module_name,
147      const char *symbol_name, hsa_agent_t agent, int32_t call_convention,
148      hsa_executable_symbol_t *symbol);
149   hsa_status_t (*hsa_executable_symbol_get_info_fn)
150     (hsa_executable_symbol_t executable_symbol,
151      hsa_executable_symbol_info_t attribute, void *value);
152   void (*hsa_signal_store_relaxed_fn) (hsa_signal_t signal,
153 				       hsa_signal_value_t value);
154   hsa_signal_value_t (*hsa_signal_wait_acquire_fn)
155     (hsa_signal_t signal, hsa_signal_condition_t condition,
156      hsa_signal_value_t compare_value, uint64_t timeout_hint,
157      hsa_wait_state_t wait_state_hint);
158   hsa_signal_value_t (*hsa_signal_wait_relaxed_fn)
159     (hsa_signal_t signal, hsa_signal_condition_t condition,
160      hsa_signal_value_t compare_value, uint64_t timeout_hint,
161      hsa_wait_state_t wait_state_hint);
162   hsa_status_t (*hsa_queue_destroy_fn) (hsa_queue_t *queue);
163   hsa_status_t (*hsa_code_object_deserialize_fn)
164     (void *serialized_code_object, size_t serialized_code_object_size,
165      const char *options, hsa_code_object_t *code_object);
166   uint64_t (*hsa_queue_load_write_index_relaxed_fn)
167     (const hsa_queue_t *queue);
168   void (*hsa_queue_store_write_index_relaxed_fn)
169     (const hsa_queue_t *queue, uint64_t value);
170   hsa_status_t (*hsa_shut_down_fn) ();
171 };
172 
173 /* HSA runtime functions that are initialized in init_hsa_context.
174    Code adapted from libgomp.  */
175 
176 static struct hsa_runtime_fn_info hsa_fns;
177 
178 #define DLSYM_FN(function)					 \
179   *(void**)(&hsa_fns.function##_fn) = dlsym (handle, #function); \
180   if (hsa_fns.function##_fn == NULL)				 \
181     goto fail;
182 
183 static void
init_hsa_runtime_functions(void)184 init_hsa_runtime_functions (void)
185 {
186   void *handle = dlopen (HSA_RUNTIME_LIB, RTLD_LAZY);
187   if (handle == NULL)
188     {
189       fprintf (stderr,
190 	       "The HSA runtime is required to run GCN kernels on hardware.\n"
191 	       "%s: File not found or could not be opened\n",
192 	       HSA_RUNTIME_LIB);
193       exit (1);
194     }
195 
196   DLSYM_FN (hsa_status_string)
197   DLSYM_FN (hsa_agent_get_info)
198   DLSYM_FN (hsa_init)
199   DLSYM_FN (hsa_iterate_agents)
200   DLSYM_FN (hsa_region_get_info)
201   DLSYM_FN (hsa_queue_create)
202   DLSYM_FN (hsa_agent_iterate_regions)
203   DLSYM_FN (hsa_executable_destroy)
204   DLSYM_FN (hsa_executable_create)
205   DLSYM_FN (hsa_executable_global_variable_define)
206   DLSYM_FN (hsa_executable_load_code_object)
207   DLSYM_FN (hsa_executable_freeze)
208   DLSYM_FN (hsa_signal_create)
209   DLSYM_FN (hsa_memory_allocate)
210   DLSYM_FN (hsa_memory_assign_agent)
211   DLSYM_FN (hsa_memory_copy)
212   DLSYM_FN (hsa_memory_free)
213   DLSYM_FN (hsa_signal_destroy)
214   DLSYM_FN (hsa_executable_get_symbol)
215   DLSYM_FN (hsa_executable_symbol_get_info)
216   DLSYM_FN (hsa_signal_wait_acquire)
217   DLSYM_FN (hsa_signal_wait_relaxed)
218   DLSYM_FN (hsa_signal_store_relaxed)
219   DLSYM_FN (hsa_queue_destroy)
220   DLSYM_FN (hsa_code_object_deserialize)
221   DLSYM_FN (hsa_queue_load_write_index_relaxed)
222   DLSYM_FN (hsa_queue_store_write_index_relaxed)
223   DLSYM_FN (hsa_shut_down)
224 
225   return;
226 
227 fail:
228   fprintf (stderr, "Failed to find HSA functions in " HSA_RUNTIME_LIB "\n");
229   exit (1);
230 }
231 
232 #undef DLSYM_FN
233 
234 /* Report a fatal error STR together with the HSA error corresponding to
235    STATUS and terminate execution of the current process.  */
236 
237 static void
hsa_fatal(const char * str,hsa_status_t status)238 hsa_fatal (const char *str, hsa_status_t status)
239 {
240   const char *hsa_error_msg;
241   hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);
242   fprintf (stderr, "%s: FAILED\nHSA Runtime message: %s\n", str,
243 	   hsa_error_msg);
244   exit (1);
245 }
246 
247 /* Helper macros to ensure we check the return values from the HSA Runtime.
248    These just keep the rest of the code a bit cleaner.  */
249 
250 #define XHSA_CMP(FN, CMP, MSG)		   \
251   do {					   \
252     hsa_status_t status = (FN);		   \
253     if (!(CMP))				   \
254       hsa_fatal ((MSG), status);	   \
255     else if (debug)			   \
256       fprintf (stderr, "%s: OK\n", (MSG)); \
257   } while (0)
258 #define XHSA(FN, MSG) XHSA_CMP(FN, status == HSA_STATUS_SUCCESS, MSG)
259 
260 /* Callback of hsa_iterate_agents.
261    Called once for each available device, and returns "break" when a
262    suitable one has been found.  */
263 
264 static hsa_status_t
get_gpu_agent(hsa_agent_t agent,void * data)265 get_gpu_agent (hsa_agent_t agent, void *data __attribute__ ((unused)))
266 {
267   hsa_device_type_t device_type;
268   XHSA (hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_DEVICE,
269 				       &device_type),
270 	"Get agent type");
271 
272   /* Select only GPU devices.  */
273   /* TODO: support selecting from multiple GPUs.  */
274   if (HSA_DEVICE_TYPE_GPU == device_type)
275     {
276       device = agent;
277       return HSA_STATUS_INFO_BREAK;
278     }
279 
280   /* The device was not suitable.  */
281   return HSA_STATUS_SUCCESS;
282 }
283 
284 /* Callback of hsa_iterate_regions.
285    Called once for each available memory region, and returns "break" when a
286    suitable one has been found.  */
287 
288 static hsa_status_t
get_memory_region(hsa_region_t region,hsa_region_t * retval,hsa_region_global_flag_t kind)289 get_memory_region (hsa_region_t region, hsa_region_t *retval,
290 		   hsa_region_global_flag_t kind)
291 {
292   /* Reject non-global regions.  */
293   hsa_region_segment_t segment;
294   hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SEGMENT, &segment);
295   if (HSA_REGION_SEGMENT_GLOBAL != segment)
296     return HSA_STATUS_SUCCESS;
297 
298   /* Find a region with the KERNARG flag set.  */
299   hsa_region_global_flag_t flags;
300   hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_GLOBAL_FLAGS,
301 				  &flags);
302   if (flags & kind)
303     {
304       *retval = region;
305       return HSA_STATUS_INFO_BREAK;
306     }
307 
308   /* The region was not suitable.  */
309   return HSA_STATUS_SUCCESS;
310 }
311 
312 static hsa_status_t
get_kernarg_region(hsa_region_t region,void * data)313 get_kernarg_region (hsa_region_t region, void *data __attribute__((unused)))
314 {
315   return get_memory_region (region, &kernargs_region,
316 			    HSA_REGION_GLOBAL_FLAG_KERNARG);
317 }
318 
319 static hsa_status_t
get_heap_region(hsa_region_t region,void * data)320 get_heap_region (hsa_region_t region, void *data __attribute__((unused)))
321 {
322   return get_memory_region (region, &heap_region,
323 			    HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED);
324 }
325 
326 /* Initialize the HSA Runtime library and GPU device.  */
327 
328 static void
init_device()329 init_device ()
330 {
331   /* Load the shared library and find the API functions.  */
332   init_hsa_runtime_functions ();
333 
334   /* Initialize the HSA Runtime.  */
335   XHSA (hsa_fns.hsa_init_fn (),
336 	"Initialize run-time");
337 
338   /* Select a suitable device.
339      The call-back function, get_gpu_agent, does the selection.  */
340   XHSA_CMP (hsa_fns.hsa_iterate_agents_fn (get_gpu_agent, NULL),
341 	    status == HSA_STATUS_SUCCESS || status == HSA_STATUS_INFO_BREAK,
342 	    "Find a device");
343 
344   /* Initialize the queue used for launching kernels.  */
345   uint32_t queue_size = 0;
346   XHSA (hsa_fns.hsa_agent_get_info_fn (device, HSA_AGENT_INFO_QUEUE_MAX_SIZE,
347 				       &queue_size),
348 	"Find max queue size");
349   XHSA (hsa_fns.hsa_queue_create_fn (device, queue_size,
350 				     HSA_QUEUE_TYPE_SINGLE, NULL,
351 				     NULL, UINT32_MAX, UINT32_MAX, &queue),
352 	"Set up a device queue");
353 
354   /* Select a memory region for the kernel arguments.
355      The call-back function, get_kernarg_region, does the selection.  */
356   XHSA_CMP (hsa_fns.hsa_agent_iterate_regions_fn (device, get_kernarg_region,
357 						  NULL),
358 	    status == HSA_STATUS_SUCCESS || status == HSA_STATUS_INFO_BREAK,
359 	    "Locate kernargs memory");
360 
361   /* Select a memory region for the kernel heap.
362      The call-back function, get_heap_region, does the selection.  */
363   XHSA_CMP (hsa_fns.hsa_agent_iterate_regions_fn (device, get_heap_region,
364 						  NULL),
365 	    status == HSA_STATUS_SUCCESS || status == HSA_STATUS_INFO_BREAK,
366 	    "Locate device memory");
367 }
368 
369 
370 /* Read a whole input file.
371    Code copied from mkoffload. */
372 
373 static char *
read_file(const char * filename,size_t * plen)374 read_file (const char *filename, size_t *plen)
375 {
376   size_t alloc = 16384;
377   size_t base = 0;
378   char *buffer;
379 
380   FILE *stream = fopen (filename, "rb");
381   if (!stream)
382     {
383       perror (filename);
384       exit (1);
385     }
386 
387   if (!fseek (stream, 0, SEEK_END))
388     {
389       /* Get the file size.  */
390       long s = ftell (stream);
391       if (s >= 0)
392 	alloc = s + 100;
393       fseek (stream, 0, SEEK_SET);
394     }
395   buffer = malloc (alloc);
396 
397   for (;;)
398     {
399       size_t n = fread (buffer + base, 1, alloc - base - 1, stream);
400 
401       if (!n)
402 	break;
403       base += n;
404       if (base + 1 == alloc)
405 	{
406 	  alloc *= 2;
407 	  buffer = realloc (buffer, alloc);
408 	}
409     }
410   buffer[base] = 0;
411   *plen = base;
412 
413   fclose (stream);
414 
415   return buffer;
416 }
417 
418 /* Read a HSA Code Object (HSACO) from file, and load it into the device.  */
419 
420 static void
load_image(const char * filename)421 load_image (const char *filename)
422 {
423   size_t image_size;
424   Elf64_Ehdr *image = (void *) read_file (filename, &image_size);
425 
426   /* An "executable" consists of one or more code objects.  */
427   XHSA (hsa_fns.hsa_executable_create_fn (HSA_PROFILE_FULL,
428 					  HSA_EXECUTABLE_STATE_UNFROZEN, "",
429 					  &executable),
430 	"Initialize GCN executable");
431 
432   /* Hide relocations from the HSA runtime loader.
433      Keep a copy of the unmodified section headers to use later.  */
434   Elf64_Shdr *image_sections =
435     (Elf64_Shdr *) ((char *) image + image->e_shoff);
436   Elf64_Shdr *sections = malloc (sizeof (Elf64_Shdr) * image->e_shnum);
437   memcpy (sections, image_sections, sizeof (Elf64_Shdr) * image->e_shnum);
438   for (int i = image->e_shnum - 1; i >= 0; i--)
439     {
440       if (image_sections[i].sh_type == SHT_RELA
441 	  || image_sections[i].sh_type == SHT_REL)
442 	/* Change section type to something harmless.  */
443 	image_sections[i].sh_type = SHT_NOTE;
444     }
445 
446   /* Add the HSACO to the executable.  */
447   hsa_code_object_t co = { 0 };
448   XHSA (hsa_fns.hsa_code_object_deserialize_fn (image, image_size, NULL, &co),
449 	"Deserialize GCN code object");
450   XHSA (hsa_fns.hsa_executable_load_code_object_fn (executable, device, co,
451 						    ""),
452 	"Load GCN code object");
453 
454   /* We're done modifying he executable.  */
455   XHSA (hsa_fns.hsa_executable_freeze_fn (executable, ""),
456 	"Freeze GCN executable");
457 
458   /* Locate the "_init_array" function, and read the kernel's properties.  */
459   hsa_executable_symbol_t symbol;
460   XHSA (hsa_fns.hsa_executable_get_symbol_fn (executable, NULL, "_init_array",
461 					      device, 0, &symbol),
462 	"Find '_init_array' function");
463   XHSA (hsa_fns.hsa_executable_symbol_get_info_fn
464 	    (symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &init_array_kernel),
465 	"Extract '_init_array' kernel object kernel object");
466 
467   /* Locate the "_fini_array" function, and read the kernel's properties.  */
468   XHSA (hsa_fns.hsa_executable_get_symbol_fn (executable, NULL, "_fini_array",
469 					      device, 0, &symbol),
470 	"Find '_fini_array' function");
471   XHSA (hsa_fns.hsa_executable_symbol_get_info_fn
472 	    (symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &fini_array_kernel),
473 	"Extract '_fini_array' kernel object kernel object");
474 
475   /* Locate the "main" function, and read the kernel's properties.  */
476   XHSA (hsa_fns.hsa_executable_get_symbol_fn (executable, NULL, "main",
477 					      device, 0, &symbol),
478 	"Find 'main' function");
479   XHSA (hsa_fns.hsa_executable_symbol_get_info_fn
480 	    (symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &main_kernel),
481 	"Extract 'main' kernel object");
482   XHSA (hsa_fns.hsa_executable_symbol_get_info_fn
483 	    (symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE,
484 	     &kernarg_segment_size),
485 	"Extract kernarg segment size");
486   XHSA (hsa_fns.hsa_executable_symbol_get_info_fn
487 	    (symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE,
488 	     &group_segment_size),
489 	"Extract group segment size");
490   XHSA (hsa_fns.hsa_executable_symbol_get_info_fn
491 	    (symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE,
492 	     &private_segment_size),
493 	"Extract private segment size");
494 
495   /* Find main function in ELF, and calculate actual load offset.  */
496   Elf64_Addr load_offset;
497   XHSA (hsa_fns.hsa_executable_symbol_get_info_fn
498 	    (symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
499 	     &load_offset),
500 	"Extract 'main' symbol address");
501   for (int i = 0; i < image->e_shnum; i++)
502     if (sections[i].sh_type == SHT_SYMTAB)
503       {
504 	Elf64_Shdr *strtab = &sections[sections[i].sh_link];
505 	char *strings = (char *) image + strtab->sh_offset;
506 
507 	for (size_t offset = 0;
508 	     offset < sections[i].sh_size;
509 	     offset += sections[i].sh_entsize)
510 	  {
511 	    Elf64_Sym *sym = (Elf64_Sym *) ((char *) image
512 					    + sections[i].sh_offset + offset);
513 	    if (strcmp ("main", strings + sym->st_name) == 0)
514 	      {
515 		load_offset -= sym->st_value;
516 		goto found_main;
517 	      }
518 	  }
519       }
520   /* We only get here when main was not found.
521      This should never happen.  */
522   fprintf (stderr, "Error: main function not found.\n");
523   abort ();
524 found_main:;
525 
526   /* Find dynamic symbol table.  */
527   Elf64_Shdr *dynsym = NULL;
528   for (int i = 0; i < image->e_shnum; i++)
529     if (sections[i].sh_type == SHT_DYNSYM)
530       {
531 	dynsym = &sections[i];
532 	break;
533       }
534 
535   /* Fix up relocations.  */
536   for (int i = 0; i < image->e_shnum; i++)
537     {
538       if (sections[i].sh_type == SHT_RELA)
539 	for (size_t offset = 0;
540 	     offset < sections[i].sh_size;
541 	     offset += sections[i].sh_entsize)
542 	  {
543 	    Elf64_Rela *reloc = (Elf64_Rela *) ((char *) image
544 						+ sections[i].sh_offset
545 						+ offset);
546 	    Elf64_Sym *sym =
547 	      (dynsym
548 	       ? (Elf64_Sym *) ((char *) image
549 				+ dynsym->sh_offset
550 				+ (dynsym->sh_entsize
551 				   * ELF64_R_SYM (reloc->r_info))) : NULL);
552 
553 	    int64_t S = (sym ? sym->st_value : 0);
554 	    int64_t P = reloc->r_offset + load_offset;
555 	    int64_t A = reloc->r_addend;
556 	    int64_t B = load_offset;
557 	    int64_t V, size;
558 	    switch (ELF64_R_TYPE (reloc->r_info))
559 	      {
560 	      case R_AMDGPU_ABS32_LO:
561 		V = (S + A) & 0xFFFFFFFF;
562 		size = 4;
563 		break;
564 	      case R_AMDGPU_ABS32_HI:
565 		V = (S + A) >> 32;
566 		size = 4;
567 		break;
568 	      case R_AMDGPU_ABS64:
569 		V = S + A;
570 		size = 8;
571 		break;
572 	      case R_AMDGPU_REL32:
573 		V = S + A - P;
574 		size = 4;
575 		break;
576 	      case R_AMDGPU_REL64:
577 		/* FIXME
578 		   LLD seems to emit REL64 where the assembler has ABS64.
579 		   This is clearly wrong because it's not what the compiler
580 		   is expecting.  Let's assume, for now, that it's a bug.
581 		   In any case, GCN kernels are always self contained and
582 		   therefore relative relocations will have been resolved
583 		   already, so this should be a safe workaround.  */
584 		V = S + A /* - P */ ;
585 		size = 8;
586 		break;
587 	      case R_AMDGPU_ABS32:
588 		V = S + A;
589 		size = 4;
590 		break;
591 	      /* TODO R_AMDGPU_GOTPCREL */
592 	      /* TODO R_AMDGPU_GOTPCREL32_LO */
593 	      /* TODO R_AMDGPU_GOTPCREL32_HI */
594 	      case R_AMDGPU_REL32_LO:
595 		V = (S + A - P) & 0xFFFFFFFF;
596 		size = 4;
597 		break;
598 	      case R_AMDGPU_REL32_HI:
599 		V = (S + A - P) >> 32;
600 		size = 4;
601 		break;
602 	      case R_AMDGPU_RELATIVE64:
603 		V = B + A;
604 		size = 8;
605 		break;
606 	      default:
607 		fprintf (stderr, "Error: unsupported relocation type.\n");
608 		exit (1);
609 	      }
610 	    XHSA (hsa_fns.hsa_memory_copy_fn ((void *) P, &V, size),
611 		  "Fix up relocation");
612 	  }
613     }
614 }
615 
616 /* Allocate some device memory from the kernargs region.
617    The returned address will be 32-bit (with excess zeroed on 64-bit host),
618    and accessible via the same address on both host and target (via
619    __flat_scalar GCN address space).  */
620 
621 static void *
device_malloc(size_t size,hsa_region_t region)622 device_malloc (size_t size, hsa_region_t region)
623 {
624   void *result;
625   XHSA (hsa_fns.hsa_memory_allocate_fn (region, size, &result),
626 	"Allocate device memory");
627   return result;
628 }
629 
630 /* These are the device pointers that will be transferred to the target.
631    The HSA Runtime points the kernargs register here.
632    They correspond to function signature:
633        int main (int argc, char *argv[], int *return_value)
634    The compiler expects this, for kernel functions, and will
635    automatically assign the exit value to *return_value.  */
636 struct kernargs
637 {
638   /* Kernargs.  */
639   int32_t argc;
640   int64_t argv;
641   int64_t out_ptr;
642   int64_t heap_ptr;
643 
644   /* Output data.  */
645   struct output
646   {
647     int return_value;
648     unsigned int next_output;
649     struct printf_data
650     {
651       int written;
652       char msg[128];
653       int type;
654       union
655       {
656 	int64_t ivalue;
657 	double dvalue;
658 	char text[128];
659       };
660     } queue[1024];
661     unsigned int consumed;
662   } output_data;
663 };
664 
665 struct heap
666 {
667   int64_t size;
668   char data[0];
669 } heap;
670 
671 /* Print any console output from the kernel.
672    We print all entries from "consumed" to the next entry without a "written"
673    flag, or "next_output" is reached.  The buffer is circular, but the
674    indices are absolute.  It is assumed the kernel will stop writing data
675    if "next_output" wraps (becomes smaller than "consumed").  */
676 void
gomp_print_output(struct kernargs * kernargs,bool final)677 gomp_print_output (struct kernargs *kernargs, bool final)
678 {
679   unsigned int limit = (sizeof (kernargs->output_data.queue)
680 			/ sizeof (kernargs->output_data.queue[0]));
681 
682   unsigned int from = __atomic_load_n (&kernargs->output_data.consumed,
683 				       __ATOMIC_ACQUIRE);
684   unsigned int to = kernargs->output_data.next_output;
685 
686   if (from > to)
687     {
688       /* Overflow.  */
689       if (final)
690 	printf ("GCN print buffer overflowed.\n");
691       return;
692     }
693 
694   unsigned int i;
695   for (i = from; i < to; i++)
696     {
697       struct printf_data *data = &kernargs->output_data.queue[i%limit];
698 
699       if (!data->written && !final)
700 	break;
701 
702       switch (data->type)
703 	{
704 	case 0:
705 	  printf ("%.128s%ld\n", data->msg, data->ivalue);
706 	  break;
707 	case 1:
708 	  printf ("%.128s%f\n", data->msg, data->dvalue);
709 	  break;
710 	case 2:
711 	  printf ("%.128s%.128s\n", data->msg, data->text);
712 	  break;
713 	case 3:
714 	  printf ("%.128s%.128s", data->msg, data->text);
715 	  break;
716 	default:
717 	  printf ("GCN print buffer error!\n");
718 	  break;
719 	}
720 
721       data->written = 0;
722       __atomic_store_n (&kernargs->output_data.consumed, i+1,
723 			__ATOMIC_RELEASE);
724     }
725   fflush (stdout);
726 }
727 
728 /* Execute an already-loaded kernel on the device.  */
729 
730 static void
run(uint64_t kernel,void * kernargs)731 run (uint64_t kernel, void *kernargs)
732 {
733   /* A "signal" is used to launch and monitor the kernel.  */
734   hsa_signal_t signal;
735   XHSA (hsa_fns.hsa_signal_create_fn (1, 0, NULL, &signal),
736 	"Create signal");
737 
738   /* Configure for a single-worker kernel.  */
739   uint64_t index = hsa_fns.hsa_queue_load_write_index_relaxed_fn (queue);
740   const uint32_t queueMask = queue->size - 1;
741   hsa_kernel_dispatch_packet_t *dispatch_packet =
742     &(((hsa_kernel_dispatch_packet_t *) (queue->base_address))[index &
743 							       queueMask]);
744   dispatch_packet->setup |= 3 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
745   dispatch_packet->workgroup_size_x = (uint16_t) 1;
746   dispatch_packet->workgroup_size_y = (uint16_t) 64;
747   dispatch_packet->workgroup_size_z = (uint16_t) 1;
748   dispatch_packet->grid_size_x = 1;
749   dispatch_packet->grid_size_y = 64;
750   dispatch_packet->grid_size_z = 1;
751   dispatch_packet->completion_signal = signal;
752   dispatch_packet->kernel_object = kernel;
753   dispatch_packet->kernarg_address = (void *) kernargs;
754   dispatch_packet->private_segment_size = private_segment_size;
755   dispatch_packet->group_segment_size = group_segment_size;
756 
757   uint16_t header = 0;
758   header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
759   header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
760   header |= HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
761 
762   __atomic_store_n ((uint32_t *) dispatch_packet,
763 		    header | (dispatch_packet->setup << 16),
764 		    __ATOMIC_RELEASE);
765 
766   if (debug)
767     fprintf (stderr, "Launch kernel\n");
768 
769   hsa_fns.hsa_queue_store_write_index_relaxed_fn (queue, index + 1);
770   hsa_fns.hsa_signal_store_relaxed_fn (queue->doorbell_signal, index);
771   /* Kernel running ......  */
772   while (hsa_fns.hsa_signal_wait_relaxed_fn (signal, HSA_SIGNAL_CONDITION_LT,
773 					     1, 1000000,
774 					     HSA_WAIT_STATE_ACTIVE) != 0)
775     {
776       usleep (10000);
777       gomp_print_output (kernargs, false);
778     }
779 
780   gomp_print_output (kernargs, true);
781 
782   if (debug)
783     fprintf (stderr, "Kernel exited\n");
784 
785   XHSA (hsa_fns.hsa_signal_destroy_fn (signal),
786 	"Clean up signal");
787 }
788 
789 int
main(int argc,char * argv[])790 main (int argc, char *argv[])
791 {
792   int kernel_arg = 0;
793   for (int i = 1; i < argc; i++)
794     {
795       if (!strcmp (argv[i], "--help"))
796 	{
797 	  usage (argv[0]);
798 	  return 0;
799 	}
800       else if (!strcmp (argv[i], "--version"))
801 	{
802 	  version (argv[0]);
803 	  return 0;
804 	}
805       else if (!strcmp (argv[i], "--debug"))
806 	debug = true;
807       else if (argv[i][0] == '-')
808 	{
809 	  usage (argv[0]);
810 	  return 1;
811 	}
812       else
813 	{
814 	  kernel_arg = i;
815 	  break;
816 	}
817     }
818 
819   if (!kernel_arg)
820     {
821       /* No kernel arguments were found.  */
822       usage (argv[0]);
823       return 1;
824     }
825 
826   /* The remaining arguments are for the GCN kernel.  */
827   int kernel_argc = argc - kernel_arg;
828   char **kernel_argv = &argv[kernel_arg];
829 
830   init_device ();
831   load_image (kernel_argv[0]);
832 
833   /* Calculate size of function parameters + argv data.  */
834   size_t args_size = 0;
835   for (int i = 0; i < kernel_argc; i++)
836     args_size += strlen (kernel_argv[i]) + 1;
837 
838   /* Allocate device memory for both function parameters and the argv
839      data.  */
840   struct kernargs *kernargs = device_malloc (sizeof (*kernargs),
841 					     kernargs_region);
842   struct argdata
843   {
844     int64_t argv_data[kernel_argc];
845     char strings[args_size];
846   } *args = device_malloc (sizeof (struct argdata), kernargs_region);
847 
848   size_t heap_size = 10 * 1024 * 1024;	/* 10MB.  */
849   struct heap *heap = device_malloc (heap_size, heap_region);
850   XHSA (hsa_fns.hsa_memory_assign_agent_fn (heap, device,
851 					    HSA_ACCESS_PERMISSION_RW),
852 	"Assign heap to device agent");
853 
854   /* Write the data to the target.  */
855   kernargs->argc = kernel_argc;
856   kernargs->argv = (int64_t) args->argv_data;
857   kernargs->out_ptr = (int64_t) &kernargs->output_data;
858   kernargs->output_data.return_value = 0xcafe0000; /* Default return value. */
859   kernargs->output_data.next_output = 0;
860   for (unsigned i = 0; i < (sizeof (kernargs->output_data.queue)
861 			    / sizeof (kernargs->output_data.queue[0])); i++)
862     kernargs->output_data.queue[i].written = 0;
863   kernargs->output_data.consumed = 0;
864   int offset = 0;
865   for (int i = 0; i < kernel_argc; i++)
866     {
867       size_t arg_len = strlen (kernel_argv[i]) + 1;
868       args->argv_data[i] = (int64_t) &args->strings[offset];
869       memcpy (&args->strings[offset], kernel_argv[i], arg_len + 1);
870       offset += arg_len;
871     }
872   kernargs->heap_ptr = (int64_t) heap;
873   hsa_fns.hsa_memory_copy_fn (&heap->size, &heap_size, sizeof (heap_size));
874 
875   /* Run constructors on the GPU.  */
876   run (init_array_kernel, kernargs);
877 
878   /* Run the kernel on the GPU.  */
879   run (main_kernel, kernargs);
880   unsigned int return_value =
881     (unsigned int) kernargs->output_data.return_value;
882 
883   /* Run destructors on the GPU.  */
884   run (fini_array_kernel, kernargs);
885 
886   unsigned int upper = (return_value & ~0xffff) >> 16;
887   if (upper == 0xcafe)
888     {
889       printf ("Kernel exit value was never set\n");
890       return_value = 0xff;
891     }
892   else if (upper == 0xffff)
893     ; /* Set by exit.  */
894   else if (upper == 0)
895     ; /* Set by return from main.  */
896   else
897     printf ("Possible kernel exit value corruption, 2 most significant bytes "
898 	    "aren't 0xffff, 0xcafe, or 0: 0x%x\n", return_value);
899 
900   if (upper == 0xffff)
901     {
902       unsigned int signal = (return_value >> 8) & 0xff;
903       if (signal == SIGABRT)
904 	printf ("Kernel aborted\n");
905       else if (signal != 0)
906 	printf ("Kernel received unkown signal\n");
907     }
908 
909   if (debug)
910     printf ("Kernel exit value: %d\n", return_value & 0xff);
911 
912   /* Clean shut down.  */
913   XHSA (hsa_fns.hsa_memory_free_fn (kernargs),
914 	"Clean up device memory");
915   XHSA (hsa_fns.hsa_executable_destroy_fn (executable),
916 	"Clean up GCN executable");
917   XHSA (hsa_fns.hsa_queue_destroy_fn (queue),
918 	"Clean up device queue");
919   XHSA (hsa_fns.hsa_shut_down_fn (),
920 	"Shut down run-time");
921 
922   return return_value & 0xff;
923 }
924