1 /* Plugin for HSAIL execution.
2 
3    Copyright (C) 2013-2016 Free Software Foundation, Inc.
4 
5    Contributed by Martin Jambor <mjambor@suse.cz> and
6    Martin Liska <mliska@suse.cz>.
7 
8    This file is part of the GNU Offloading and Multi Processing Library
9    (libgomp).
10 
11    Libgomp is free software; you can redistribute it and/or modify it
12    under the terms of the GNU General Public License as published by
13    the Free Software Foundation; either version 3, or (at your option)
14    any later version.
15 
16    Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
17    WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
18    FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
19    more details.
20 
21    Under Section 7 of GPL version 3, you are granted additional
22    permissions described in the GCC Runtime Library Exception, version
23    3.1, as published by the Free Software Foundation.
24 
25    You should have received a copy of the GNU General Public License and
26    a copy of the GCC Runtime Library Exception along with this program;
27    see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
28    <http://www.gnu.org/licenses/>.  */
29 
30 #include <stdio.h>
31 #include <stdlib.h>
32 #include <string.h>
33 #include <pthread.h>
34 #include <hsa.h>
35 #include <hsa_ext_finalize.h>
36 #include <dlfcn.h>
37 #include "libgomp-plugin.h"
38 #include "gomp-constants.h"
39 
40 /* Keep the following GOMP prefixed structures in sync with respective parts of
41    the compiler.  */
42 
43 /* Structure describing the run-time and grid properties of an HSA kernel
44    lauch.  */
45 
46 struct GOMP_kernel_launch_attributes
47 {
48   /* Number of dimensions the workload has.  Maximum number is 3.  */
49   uint32_t ndim;
50   /* Size of the grid in the three respective dimensions.  */
51   uint32_t gdims[3];
52   /* Size of work-groups in the respective dimensions.  */
53   uint32_t wdims[3];
54 };
55 
56 /* Collection of information needed for a dispatch of a kernel from a
57    kernel.  */
58 
59 struct GOMP_hsa_kernel_dispatch
60 {
61   /* Pointer to a command queue associated with a kernel dispatch agent.  */
62   void *queue;
63   /* Pointer to reserved memory for OMP data struct copying.  */
64   void *omp_data_memory;
65   /* Pointer to a memory space used for kernel arguments passing.  */
66   void *kernarg_address;
67   /* Kernel object.  */
68   uint64_t object;
69   /* Synchronization signal used for dispatch synchronization.  */
70   uint64_t signal;
71   /* Private segment size.  */
72   uint32_t private_segment_size;
73   /* Group segment size.  */
74   uint32_t group_segment_size;
75   /* Number of children kernel dispatches.  */
76   uint64_t kernel_dispatch_count;
77   /* Debug purpose argument.  */
78   uint64_t debug;
79   /* Levels-var ICV.  */
80   uint64_t omp_level;
81   /* Kernel dispatch structures created for children kernel dispatches.  */
82   struct GOMP_hsa_kernel_dispatch **children_dispatches;
83   /* Number of threads.  */
84   uint32_t omp_num_threads;
85 };
86 
87 /* Part of the libgomp plugin interface.  Return the name of the accelerator,
88    which is "hsa".  */
89 
90 const char *
GOMP_OFFLOAD_get_name(void)91 GOMP_OFFLOAD_get_name (void)
92 {
93   return "hsa";
94 }
95 
96 /* Part of the libgomp plugin interface.  Return the specific capabilities the
97    HSA accelerator have.  */
98 
99 unsigned int
GOMP_OFFLOAD_get_caps(void)100 GOMP_OFFLOAD_get_caps (void)
101 {
102   return GOMP_OFFLOAD_CAP_SHARED_MEM | GOMP_OFFLOAD_CAP_OPENMP_400;
103 }
104 
105 /* Part of the libgomp plugin interface.  Identify as HSA accelerator.  */
106 
107 int
GOMP_OFFLOAD_get_type(void)108 GOMP_OFFLOAD_get_type (void)
109 {
110   return OFFLOAD_TARGET_TYPE_HSA;
111 }
112 
113 /* Return the libgomp version number we're compatible with.  There is
114    no requirement for cross-version compatibility.  */
115 
116 unsigned
GOMP_OFFLOAD_version(void)117 GOMP_OFFLOAD_version (void)
118 {
119   return GOMP_VERSION;
120 }
121 
122 /* Flag to decide whether print to stderr information about what is going on.
123    Set in init_debug depending on environment variables.  */
124 
125 static bool debug;
126 
127 /* Flag to decide if the runtime should suppress a possible fallback to host
128    execution.  */
129 
130 static bool suppress_host_fallback;
131 
132 /* Initialize debug and suppress_host_fallback according to the environment.  */
133 
134 static void
init_enviroment_variables(void)135 init_enviroment_variables (void)
136 {
137   if (getenv ("HSA_DEBUG"))
138     debug = true;
139   else
140     debug = false;
141 
142   if (getenv ("HSA_SUPPRESS_HOST_FALLBACK"))
143     suppress_host_fallback = true;
144   else
145     suppress_host_fallback = false;
146 }
147 
148 /* Print a logging message with PREFIX to stderr if HSA_DEBUG value
149    is set to true.  */
150 
151 #define HSA_LOG(prefix, ...) \
152   do \
153   { \
154     if (debug) \
155       { \
156 	fprintf (stderr, prefix); \
157 	fprintf (stderr, __VA_ARGS__); \
158       } \
159   } \
160   while (false);
161 
162 /* Print a debugging message to stderr.  */
163 
164 #define HSA_DEBUG(...) HSA_LOG ("HSA debug: ", __VA_ARGS__)
165 
166 /* Print a warning message to stderr.  */
167 
168 #define HSA_WARNING(...) HSA_LOG ("HSA warning: ", __VA_ARGS__)
169 
170 /* Print HSA warning STR with an HSA STATUS code.  */
171 
172 static void
hsa_warn(const char * str,hsa_status_t status)173 hsa_warn (const char *str, hsa_status_t status)
174 {
175   if (!debug)
176     return;
177 
178   const char *hsa_error;
179   hsa_status_string (status, &hsa_error);
180 
181   fprintf (stderr, "HSA warning: %s\nRuntime message: %s", str, hsa_error);
182 }
183 
184 /* Report a fatal error STR together with the HSA error corresponding to STATUS
185    and terminate execution of the current process.  */
186 
187 static void
hsa_fatal(const char * str,hsa_status_t status)188 hsa_fatal (const char *str, hsa_status_t status)
189 {
190   const char *hsa_error;
191   hsa_status_string (status, &hsa_error);
192   GOMP_PLUGIN_fatal ("HSA fatal error: %s\nRuntime message: %s", str,
193 		     hsa_error);
194 }
195 
196 struct hsa_kernel_description
197 {
198   const char *name;
199   unsigned omp_data_size;
200   bool gridified_kernel_p;
201   unsigned kernel_dependencies_count;
202   const char **kernel_dependencies;
203 };
204 
205 struct global_var_info
206 {
207   const char *name;
208   void *address;
209 };
210 
211 /* Data passed by the static initializer of a compilation unit containing BRIG
212    to GOMP_offload_register.  */
213 
214 struct brig_image_desc
215 {
216   hsa_ext_module_t brig_module;
217   const unsigned kernel_count;
218   struct hsa_kernel_description *kernel_infos;
219   const unsigned global_variable_count;
220   struct global_var_info *global_variables;
221 };
222 
223 struct agent_info;
224 
225 /* Information required to identify, finalize and run any given kernel.  */
226 
227 struct kernel_info
228 {
229   /* Name of the kernel, required to locate it within the brig module.  */
230   const char *name;
231   /* Size of memory space for OMP data.  */
232   unsigned omp_data_size;
233   /* The specific agent the kernel has been or will be finalized for and run
234      on.  */
235   struct agent_info *agent;
236   /* The specific module where the kernel takes place.  */
237   struct module_info *module;
238   /* Mutex enforcing that at most once thread ever initializes a kernel for
239      use.  A thread should have locked agent->modules_rwlock for reading before
240      acquiring it.  */
241   pthread_mutex_t init_mutex;
242   /* Flag indicating whether the kernel has been initialized and all fields
243      below it contain valid data.  */
244   bool initialized;
245   /* Flag indicating that the kernel has a problem that blocks an execution.  */
246   bool initialization_failed;
247   /* The object to be put into the dispatch queue.  */
248   uint64_t object;
249   /* Required size of kernel arguments.  */
250   uint32_t kernarg_segment_size;
251   /* Required size of group segment.  */
252   uint32_t group_segment_size;
253   /* Required size of private segment.  */
254   uint32_t private_segment_size;
255   /* List of all kernel dependencies.  */
256   const char **dependencies;
257   /* Number of dependencies.  */
258   unsigned dependencies_count;
259   /* Maximum OMP data size necessary for kernel from kernel dispatches.  */
260   unsigned max_omp_data_size;
261   /* True if the kernel is gridified.  */
262   bool gridified_kernel_p;
263 };
264 
265 /* Information about a particular brig module, its image and kernels.  */
266 
267 struct module_info
268 {
269   /* The next and previous module in the linked list of modules of an agent.  */
270   struct module_info *next, *prev;
271   /* The description with which the program has registered the image.  */
272   struct brig_image_desc *image_desc;
273 
274   /* Number of kernels in this module.  */
275   int kernel_count;
276   /* An array of kernel_info structures describing each kernel in this
277      module.  */
278   struct kernel_info kernels[];
279 };
280 
281 /* Information about shared brig library.  */
282 
283 struct brig_library_info
284 {
285   char *file_name;
286   hsa_ext_module_t image;
287 };
288 
289 /* Description of an HSA GPU agent and the program associated with it.  */
290 
291 struct agent_info
292 {
293   /* The HSA ID of the agent.  Assigned when hsa_context is initialized.  */
294   hsa_agent_t id;
295   /* Whether the agent has been initialized.  The fields below are usable only
296      if it has been.  */
297   bool initialized;
298   /* The HSA ISA of this agent.  */
299   hsa_isa_t isa;
300   /* Command queue of the agent.  */
301   hsa_queue_t *command_q;
302   /* Kernel from kernel dispatch command queue.  */
303   hsa_queue_t *kernel_dispatch_command_q;
304   /* The HSA memory region from which to allocate kernel arguments.  */
305   hsa_region_t kernarg_region;
306 
307   /* Read-write lock that protects kernels which are running or about to be run
308      from interference with loading and unloading of images.  Needs to be
309      locked for reading while a kernel is being run, and for writing if the
310      list of modules is manipulated (and thus the HSA program invalidated).  */
311   pthread_rwlock_t modules_rwlock;
312   /* The first module in a linked list of modules associated with this
313      kernel.  */
314   struct module_info *first_module;
315 
316   /* Mutex enforcing that only one thread will finalize the HSA program.  A
317      thread should have locked agent->modules_rwlock for reading before
318      acquiring it.  */
319   pthread_mutex_t prog_mutex;
320   /* Flag whether the HSA program that consists of all the modules has been
321      finalized.  */
322   bool prog_finalized;
323   /* Flag whether the program was finalized but with a failure.  */
324   bool prog_finalized_error;
325   /* HSA executable - the finalized program that is used to locate kernels.  */
326   hsa_executable_t executable;
327   /* List of BRIG libraries.  */
328   struct brig_library_info **brig_libraries;
329   /* Number of loaded shared BRIG libraries.  */
330   unsigned brig_libraries_count;
331 };
332 
333 /* Information about the whole HSA environment and all of its agents.  */
334 
335 struct hsa_context_info
336 {
337   /* Whether the structure has been initialized.  */
338   bool initialized;
339   /* Number of usable GPU HSA agents in the system.  */
340   int agent_count;
341   /* Array of agent_info structures describing the individual HSA agents.  */
342   struct agent_info *agents;
343 };
344 
345 /* Information about the whole HSA environment and all of its agents.  */
346 
347 static struct hsa_context_info hsa_context;
348 
349 /* Find kernel for an AGENT by name provided in KERNEL_NAME.  */
350 
351 static struct kernel_info *
get_kernel_for_agent(struct agent_info * agent,const char * kernel_name)352 get_kernel_for_agent (struct agent_info *agent, const char *kernel_name)
353 {
354   struct module_info *module = agent->first_module;
355 
356   while (module)
357     {
358       for (unsigned i = 0; i < module->kernel_count; i++)
359 	if (strcmp (module->kernels[i].name, kernel_name) == 0)
360 	  return &module->kernels[i];
361 
362       module = module->next;
363     }
364 
365   return NULL;
366 }
367 
368 /* Return true if the agent is a GPU and acceptable of concurrent submissions
369    from different threads.  */
370 
371 static bool
suitable_hsa_agent_p(hsa_agent_t agent)372 suitable_hsa_agent_p (hsa_agent_t agent)
373 {
374   hsa_device_type_t device_type;
375   hsa_status_t status
376     = hsa_agent_get_info (agent, HSA_AGENT_INFO_DEVICE, &device_type);
377   if (status != HSA_STATUS_SUCCESS || device_type != HSA_DEVICE_TYPE_GPU)
378     return false;
379 
380   uint32_t features = 0;
381   status = hsa_agent_get_info (agent, HSA_AGENT_INFO_FEATURE, &features);
382   if (status != HSA_STATUS_SUCCESS
383       || !(features & HSA_AGENT_FEATURE_KERNEL_DISPATCH))
384     return false;
385   hsa_queue_type_t queue_type;
386   status = hsa_agent_get_info (agent, HSA_AGENT_INFO_QUEUE_TYPE, &queue_type);
387   if (status != HSA_STATUS_SUCCESS
388       || (queue_type != HSA_QUEUE_TYPE_MULTI))
389     return false;
390 
391   return true;
392 }
393 
394 /* Callback of hsa_iterate_agents, if AGENT is a GPU device, increment
395    agent_count in hsa_context.  */
396 
397 static hsa_status_t
count_gpu_agents(hsa_agent_t agent,void * data)398 count_gpu_agents (hsa_agent_t agent, void *data __attribute__ ((unused)))
399 {
400   if (suitable_hsa_agent_p (agent))
401     hsa_context.agent_count++;
402   return HSA_STATUS_SUCCESS;
403 }
404 
405 /* Callback of hsa_iterate_agents, if AGENT is a GPU device, assign the agent
406    id to the describing structure in the hsa context.  The index of the
407    structure is pointed to by DATA, increment it afterwards.  */
408 
409 static hsa_status_t
assign_agent_ids(hsa_agent_t agent,void * data)410 assign_agent_ids (hsa_agent_t agent, void *data)
411 {
412   if (suitable_hsa_agent_p (agent))
413     {
414       int *agent_index = (int *) data;
415       hsa_context.agents[*agent_index].id = agent;
416       ++*agent_index;
417     }
418   return HSA_STATUS_SUCCESS;
419 }
420 
421 /* Initialize hsa_context if it has not already been done.  */
422 
423 static void
init_hsa_context(void)424 init_hsa_context (void)
425 {
426   hsa_status_t status;
427   int agent_index = 0;
428 
429   if (hsa_context.initialized)
430     return;
431   init_enviroment_variables ();
432   status = hsa_init ();
433   if (status != HSA_STATUS_SUCCESS)
434     hsa_fatal ("Run-time could not be initialized", status);
435   HSA_DEBUG ("HSA run-time initialized\n");
436   status = hsa_iterate_agents (count_gpu_agents, NULL);
437   if (status != HSA_STATUS_SUCCESS)
438     hsa_fatal ("HSA GPU devices could not be enumerated", status);
439   HSA_DEBUG ("There are %i HSA GPU devices.\n", hsa_context.agent_count);
440 
441   hsa_context.agents
442     = GOMP_PLUGIN_malloc_cleared (hsa_context.agent_count
443 				  * sizeof (struct agent_info));
444   status = hsa_iterate_agents (assign_agent_ids, &agent_index);
445   if (agent_index != hsa_context.agent_count)
446     GOMP_PLUGIN_fatal ("Failed to assign IDs to all HSA agents");
447   hsa_context.initialized = true;
448 }
449 
450 /* Callback of dispatch queues to report errors.  */
451 
452 static void
queue_callback(hsa_status_t status,hsa_queue_t * queue,void * data)453 queue_callback (hsa_status_t status,
454 		hsa_queue_t *queue __attribute__ ((unused)),
455 		void *data __attribute__ ((unused)))
456 {
457   hsa_fatal ("Asynchronous queue error", status);
458 }
459 
460 /* Callback of hsa_agent_iterate_regions.  Determine if a memory REGION can be
461    used for kernarg allocations and if so write it to the memory pointed to by
462    DATA and break the query.  */
463 
464 static hsa_status_t
get_kernarg_memory_region(hsa_region_t region,void * data)465 get_kernarg_memory_region (hsa_region_t region, void *data)
466 {
467   hsa_status_t status;
468   hsa_region_segment_t segment;
469 
470   status = hsa_region_get_info (region, HSA_REGION_INFO_SEGMENT, &segment);
471   if (status != HSA_STATUS_SUCCESS)
472     return status;
473   if (segment != HSA_REGION_SEGMENT_GLOBAL)
474     return HSA_STATUS_SUCCESS;
475 
476   uint32_t flags;
477   status = hsa_region_get_info (region, HSA_REGION_INFO_GLOBAL_FLAGS, &flags);
478   if (status != HSA_STATUS_SUCCESS)
479     return status;
480   if (flags & HSA_REGION_GLOBAL_FLAG_KERNARG)
481     {
482       hsa_region_t *ret = (hsa_region_t *) data;
483       *ret = region;
484       return HSA_STATUS_INFO_BREAK;
485     }
486   return HSA_STATUS_SUCCESS;
487 }
488 
489 /* Part of the libgomp plugin interface.  Return the number of HSA devices on
490    the system.  */
491 
492 int
GOMP_OFFLOAD_get_num_devices(void)493 GOMP_OFFLOAD_get_num_devices (void)
494 {
495   init_hsa_context ();
496   return hsa_context.agent_count;
497 }
498 
499 /* Part of the libgomp plugin interface.  Initialize agent number N so that it
500    can be used for computation.  */
501 
502 void
GOMP_OFFLOAD_init_device(int n)503 GOMP_OFFLOAD_init_device (int n)
504 {
505   init_hsa_context ();
506   if (n >= hsa_context.agent_count)
507     GOMP_PLUGIN_fatal ("Request to initialize non-existing HSA device %i", n);
508   struct agent_info *agent = &hsa_context.agents[n];
509 
510   if (agent->initialized)
511     return;
512 
513   if (pthread_rwlock_init (&agent->modules_rwlock, NULL))
514     GOMP_PLUGIN_fatal ("Failed to initialize an HSA agent rwlock");
515   if (pthread_mutex_init (&agent->prog_mutex, NULL))
516     GOMP_PLUGIN_fatal ("Failed to initialize an HSA agent program mutex");
517 
518   uint32_t queue_size;
519   hsa_status_t status;
520   status = hsa_agent_get_info (agent->id, HSA_AGENT_INFO_QUEUE_MAX_SIZE,
521 			       &queue_size);
522   if (status != HSA_STATUS_SUCCESS)
523     hsa_fatal ("Error requesting maximum queue size of the HSA agent", status);
524   status = hsa_agent_get_info (agent->id, HSA_AGENT_INFO_ISA, &agent->isa);
525   if (status != HSA_STATUS_SUCCESS)
526     hsa_fatal ("Error querying the ISA of the agent", status);
527   status = hsa_queue_create (agent->id, queue_size, HSA_QUEUE_TYPE_MULTI,
528 			     queue_callback, NULL, UINT32_MAX, UINT32_MAX,
529 			     &agent->command_q);
530   if (status != HSA_STATUS_SUCCESS)
531     hsa_fatal ("Error creating command queue", status);
532 
533   status = hsa_queue_create (agent->id, queue_size, HSA_QUEUE_TYPE_MULTI,
534 			     queue_callback, NULL, UINT32_MAX, UINT32_MAX,
535 			     &agent->kernel_dispatch_command_q);
536   if (status != HSA_STATUS_SUCCESS)
537     hsa_fatal ("Error creating kernel dispatch command queue", status);
538 
539   agent->kernarg_region.handle = (uint64_t) -1;
540   status = hsa_agent_iterate_regions (agent->id, get_kernarg_memory_region,
541 				      &agent->kernarg_region);
542   if (agent->kernarg_region.handle == (uint64_t) -1)
543     GOMP_PLUGIN_fatal ("Could not find suitable memory region for kernel "
544 		       "arguments");
545   HSA_DEBUG ("HSA agent initialized, queue has id %llu\n",
546 	     (long long unsigned) agent->command_q->id);
547   HSA_DEBUG ("HSA agent initialized, kernel dispatch queue has id %llu\n",
548 	     (long long unsigned) agent->kernel_dispatch_command_q->id);
549   agent->initialized = true;
550 }
551 
552 /* Verify that hsa_context has already been initialized and return the
553    agent_info structure describing device number N.  */
554 
555 static struct agent_info *
get_agent_info(int n)556 get_agent_info (int n)
557 {
558   if (!hsa_context.initialized)
559     GOMP_PLUGIN_fatal ("Attempt to use uninitialized HSA context.");
560   if (n >= hsa_context.agent_count)
561     GOMP_PLUGIN_fatal ("Request to operate on anon-existing HSA device %i", n);
562   if (!hsa_context.agents[n].initialized)
563     GOMP_PLUGIN_fatal ("Attempt to use an uninitialized HSA agent.");
564   return &hsa_context.agents[n];
565 }
566 
567 /* Insert MODULE to the linked list of modules of AGENT.  */
568 
569 static void
add_module_to_agent(struct agent_info * agent,struct module_info * module)570 add_module_to_agent (struct agent_info *agent, struct module_info *module)
571 {
572   if (agent->first_module)
573     agent->first_module->prev = module;
574   module->next = agent->first_module;
575   module->prev = NULL;
576   agent->first_module = module;
577 }
578 
579 /* Remove MODULE from the linked list of modules of AGENT.  */
580 
581 static void
remove_module_from_agent(struct agent_info * agent,struct module_info * module)582 remove_module_from_agent (struct agent_info *agent, struct module_info *module)
583 {
584   if (agent->first_module == module)
585     agent->first_module = module->next;
586   if (module->prev)
587     module->prev->next = module->next;
588   if (module->next)
589     module->next->prev = module->prev;
590 }
591 
592 /* Free the HSA program in agent and everything associated with it and set
593    agent->prog_finalized and the initialized flags of all kernels to false.  */
594 
595 static void
destroy_hsa_program(struct agent_info * agent)596 destroy_hsa_program (struct agent_info *agent)
597 {
598   if (!agent->prog_finalized || agent->prog_finalized_error)
599     return;
600 
601   hsa_status_t status;
602 
603   HSA_DEBUG ("Destroying the current HSA program.\n");
604 
605   status = hsa_executable_destroy (agent->executable);
606   if (status != HSA_STATUS_SUCCESS)
607     hsa_fatal ("Could not destroy HSA executable", status);
608 
609   struct module_info *module;
610   for (module = agent->first_module; module; module = module->next)
611     {
612       int i;
613       for (i = 0; i < module->kernel_count; i++)
614 	module->kernels[i].initialized = false;
615     }
616   agent->prog_finalized = false;
617 }
618 
619 /* Part of the libgomp plugin interface.  Load BRIG module described by struct
620    brig_image_desc in TARGET_DATA and return references to kernel descriptors
621    in TARGET_TABLE.  */
622 
623 int
GOMP_OFFLOAD_load_image(int ord,unsigned version,void * target_data,struct addr_pair ** target_table)624 GOMP_OFFLOAD_load_image (int ord, unsigned version, void *target_data,
625 			 struct addr_pair **target_table)
626 {
627   if (GOMP_VERSION_DEV (version) > GOMP_VERSION_HSA)
628     GOMP_PLUGIN_fatal ("Offload data incompatible with HSA plugin"
629 		       " (expected %u, received %u)",
630 		       GOMP_VERSION_HSA, GOMP_VERSION_DEV (version));
631 
632   struct brig_image_desc *image_desc = (struct brig_image_desc *) target_data;
633   struct agent_info *agent;
634   struct addr_pair *pair;
635   struct module_info *module;
636   struct kernel_info *kernel;
637   int kernel_count = image_desc->kernel_count;
638 
639   agent = get_agent_info (ord);
640   if (pthread_rwlock_wrlock (&agent->modules_rwlock))
641     GOMP_PLUGIN_fatal ("Unable to write-lock an HSA agent rwlock");
642   if (agent->prog_finalized)
643     destroy_hsa_program (agent);
644 
645   HSA_DEBUG ("Encountered %d kernels in an image\n", kernel_count);
646   pair = GOMP_PLUGIN_malloc (kernel_count * sizeof (struct addr_pair));
647   *target_table = pair;
648   module = (struct module_info *)
649     GOMP_PLUGIN_malloc_cleared (sizeof (struct module_info)
650 				+ kernel_count * sizeof (struct kernel_info));
651   module->image_desc = image_desc;
652   module->kernel_count = kernel_count;
653 
654   kernel = &module->kernels[0];
655 
656   /* Allocate memory for kernel dependencies.  */
657   for (unsigned i = 0; i < kernel_count; i++)
658     {
659       pair->start = (uintptr_t) kernel;
660       pair->end = (uintptr_t) (kernel + 1);
661 
662       struct hsa_kernel_description *d = &image_desc->kernel_infos[i];
663       kernel->agent = agent;
664       kernel->module = module;
665       kernel->name = d->name;
666       kernel->omp_data_size = d->omp_data_size;
667       kernel->gridified_kernel_p = d->gridified_kernel_p;
668       kernel->dependencies_count = d->kernel_dependencies_count;
669       kernel->dependencies = d->kernel_dependencies;
670       if (pthread_mutex_init (&kernel->init_mutex, NULL))
671 	GOMP_PLUGIN_fatal ("Failed to initialize an HSA kernel mutex");
672 
673       kernel++;
674       pair++;
675     }
676 
677   add_module_to_agent (agent, module);
678   if (pthread_rwlock_unlock (&agent->modules_rwlock))
679     GOMP_PLUGIN_fatal ("Unable to unlock an HSA agent rwlock");
680   return kernel_count;
681 }
682 
683 /* Add a shared BRIG library from a FILE_NAME to an AGENT.  */
684 
685 static struct brig_library_info *
add_shared_library(const char * file_name,struct agent_info * agent)686 add_shared_library (const char *file_name, struct agent_info *agent)
687 {
688   struct brig_library_info *library = NULL;
689 
690   void *f = dlopen (file_name, RTLD_NOW);
691   void *start = dlsym (f, "__brig_start");
692   void *end = dlsym (f, "__brig_end");
693 
694   if (start == NULL || end == NULL)
695     return NULL;
696 
697   unsigned size = end - start;
698   char *buf = (char *) GOMP_PLUGIN_malloc (size);
699   memcpy (buf, start, size);
700 
701   library = GOMP_PLUGIN_malloc (sizeof (struct agent_info));
702   library->file_name = (char *) GOMP_PLUGIN_malloc
703     ((strlen (file_name) + 1));
704   strcpy (library->file_name, file_name);
705   library->image = (hsa_ext_module_t) buf;
706 
707   return library;
708 }
709 
710 /* Release memory used for BRIG shared libraries that correspond
711    to an AGENT.  */
712 
713 static void
release_agent_shared_libraries(struct agent_info * agent)714 release_agent_shared_libraries (struct agent_info *agent)
715 {
716   for (unsigned i = 0; i < agent->brig_libraries_count; i++)
717     if (agent->brig_libraries[i])
718       {
719 	free (agent->brig_libraries[i]->file_name);
720 	free (agent->brig_libraries[i]->image);
721 	free (agent->brig_libraries[i]);
722       }
723 
724   free (agent->brig_libraries);
725 }
726 
727 /* Create and finalize the program consisting of all loaded modules.  */
728 
729 static void
create_and_finalize_hsa_program(struct agent_info * agent)730 create_and_finalize_hsa_program (struct agent_info *agent)
731 {
732   hsa_status_t status;
733   hsa_ext_program_t prog_handle;
734   int mi = 0;
735 
736   if (pthread_mutex_lock (&agent->prog_mutex))
737     GOMP_PLUGIN_fatal ("Could not lock an HSA agent program mutex");
738   if (agent->prog_finalized)
739     goto final;
740 
741   status = hsa_ext_program_create (HSA_MACHINE_MODEL_LARGE, HSA_PROFILE_FULL,
742 				   HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT,
743 				   NULL, &prog_handle);
744   if (status != HSA_STATUS_SUCCESS)
745     hsa_fatal ("Could not create an HSA program", status);
746 
747   HSA_DEBUG ("Created a finalized program\n");
748 
749   struct module_info *module = agent->first_module;
750   while (module)
751     {
752       status = hsa_ext_program_add_module (prog_handle,
753 					   module->image_desc->brig_module);
754       if (status != HSA_STATUS_SUCCESS)
755 	hsa_fatal ("Could not add a module to the HSA program", status);
756       module = module->next;
757       mi++;
758     }
759 
760   /* Load all shared libraries.  */
761   const char *libraries[] = { "libhsamath.so", "libhsastd.so" };
762   const unsigned libraries_count = sizeof (libraries) / sizeof (const char *);
763 
764   agent->brig_libraries_count = libraries_count;
765   agent->brig_libraries = GOMP_PLUGIN_malloc_cleared
766     (sizeof (struct brig_library_info) * libraries_count);
767 
768   for (unsigned i = 0; i < libraries_count; i++)
769     {
770       struct brig_library_info *library = add_shared_library (libraries[i],
771 							      agent);
772       if (library == NULL)
773 	{
774 	  HSA_WARNING ("Could not open a shared BRIG library: %s\n",
775 		       libraries[i]);
776 	  continue;
777 	}
778 
779       status = hsa_ext_program_add_module (prog_handle, library->image);
780       if (status != HSA_STATUS_SUCCESS)
781 	hsa_warn ("Could not add a shared BRIG library the HSA program",
782 		  status);
783       else
784 	HSA_DEBUG ("a shared BRIG library has been added to a program: %s\n",
785 		   libraries[i]);
786     }
787 
788   hsa_ext_control_directives_t control_directives;
789   memset (&control_directives, 0, sizeof (control_directives));
790   hsa_code_object_t code_object;
791   status = hsa_ext_program_finalize (prog_handle, agent->isa,
792 				     HSA_EXT_FINALIZER_CALL_CONVENTION_AUTO,
793 				     control_directives, "",
794 				     HSA_CODE_OBJECT_TYPE_PROGRAM,
795 				     &code_object);
796   if (status != HSA_STATUS_SUCCESS)
797     {
798       hsa_warn ("Finalization of the HSA program failed", status);
799       goto failure;
800     }
801 
802   HSA_DEBUG ("Finalization done\n");
803   hsa_ext_program_destroy (prog_handle);
804 
805   status
806     = hsa_executable_create (HSA_PROFILE_FULL, HSA_EXECUTABLE_STATE_UNFROZEN,
807 			     "", &agent->executable);
808   if (status != HSA_STATUS_SUCCESS)
809     hsa_fatal ("Could not create HSA executable", status);
810 
811   module = agent->first_module;
812   while (module)
813     {
814       /* Initialize all global variables declared in the module.  */
815       for (unsigned i = 0; i < module->image_desc->global_variable_count; i++)
816 	{
817 	  struct global_var_info *var;
818 	  var = &module->image_desc->global_variables[i];
819 	  status
820 	    = hsa_executable_global_variable_define (agent->executable,
821 						     var->name, var->address);
822 
823 	  HSA_DEBUG ("Defining global variable: %s, address: %p\n", var->name,
824 		     var->address);
825 
826 	  if (status != HSA_STATUS_SUCCESS)
827 	    hsa_fatal ("Could not define a global variable in the HSA program",
828 		       status);
829 	}
830 
831       module = module->next;
832     }
833 
834   status = hsa_executable_load_code_object (agent->executable, agent->id,
835 					    code_object, "");
836   if (status != HSA_STATUS_SUCCESS)
837     hsa_fatal ("Could not add a code object to the HSA executable", status);
838   status = hsa_executable_freeze (agent->executable, "");
839   if (status != HSA_STATUS_SUCCESS)
840     hsa_fatal ("Could not freeze the HSA executable", status);
841 
842   HSA_DEBUG ("Froze HSA executable with the finalized code object\n");
843 
844   /* If all goes good, jump to final.  */
845   goto final;
846 
847 failure:
848   agent->prog_finalized_error = true;
849 
850 final:
851   agent->prog_finalized = true;
852 
853   if (pthread_mutex_unlock (&agent->prog_mutex))
854     GOMP_PLUGIN_fatal ("Could not unlock an HSA agent program mutex");
855 }
856 
857 /* Create kernel dispatch data structure for given KERNEL.  */
858 
859 static struct GOMP_hsa_kernel_dispatch *
create_single_kernel_dispatch(struct kernel_info * kernel,unsigned omp_data_size)860 create_single_kernel_dispatch (struct kernel_info *kernel,
861 			       unsigned omp_data_size)
862 {
863   struct agent_info *agent = kernel->agent;
864   struct GOMP_hsa_kernel_dispatch *shadow
865     = GOMP_PLUGIN_malloc_cleared (sizeof (struct GOMP_hsa_kernel_dispatch));
866 
867   shadow->queue = agent->command_q;
868   shadow->omp_data_memory
869     = omp_data_size > 0 ? GOMP_PLUGIN_malloc (omp_data_size) : NULL;
870   unsigned dispatch_count = kernel->dependencies_count;
871   shadow->kernel_dispatch_count = dispatch_count;
872 
873   shadow->children_dispatches
874     = GOMP_PLUGIN_malloc (dispatch_count * sizeof (shadow));
875 
876   shadow->object = kernel->object;
877 
878   hsa_signal_t sync_signal;
879   hsa_status_t status = hsa_signal_create (1, 0, NULL, &sync_signal);
880   if (status != HSA_STATUS_SUCCESS)
881     hsa_fatal ("Error creating the HSA sync signal", status);
882 
883   shadow->signal = sync_signal.handle;
884   shadow->private_segment_size = kernel->private_segment_size;
885   shadow->group_segment_size = kernel->group_segment_size;
886 
887   status
888     = hsa_memory_allocate (agent->kernarg_region, kernel->kernarg_segment_size,
889 			   &shadow->kernarg_address);
890   if (status != HSA_STATUS_SUCCESS)
891     hsa_fatal ("Could not allocate memory for HSA kernel arguments", status);
892 
893   return shadow;
894 }
895 
896 /* Release data structure created for a kernel dispatch in SHADOW argument.  */
897 
898 static void
release_kernel_dispatch(struct GOMP_hsa_kernel_dispatch * shadow)899 release_kernel_dispatch (struct GOMP_hsa_kernel_dispatch *shadow)
900 {
901   HSA_DEBUG ("Released kernel dispatch: %p has value: %lu (%p)\n", shadow,
902 	     shadow->debug, (void *) shadow->debug);
903 
904   hsa_memory_free (shadow->kernarg_address);
905 
906   hsa_signal_t s;
907   s.handle = shadow->signal;
908   hsa_signal_destroy (s);
909 
910   free (shadow->omp_data_memory);
911 
912   for (unsigned i = 0; i < shadow->kernel_dispatch_count; i++)
913     release_kernel_dispatch (shadow->children_dispatches[i]);
914 
915   free (shadow->children_dispatches);
916   free (shadow);
917 }
918 
919 /* Initialize a KERNEL without its dependencies.  MAX_OMP_DATA_SIZE is used
920    to calculate maximum necessary memory for OMP data allocation.  */
921 
922 static void
init_single_kernel(struct kernel_info * kernel,unsigned * max_omp_data_size)923 init_single_kernel (struct kernel_info *kernel, unsigned *max_omp_data_size)
924 {
925   hsa_status_t status;
926   struct agent_info *agent = kernel->agent;
927   hsa_executable_symbol_t kernel_symbol;
928   status = hsa_executable_get_symbol (agent->executable, NULL, kernel->name,
929 				      agent->id, 0, &kernel_symbol);
930   if (status != HSA_STATUS_SUCCESS)
931     {
932       hsa_warn ("Could not find symbol for kernel in the code object", status);
933       goto failure;
934     }
935   HSA_DEBUG ("Located kernel %s\n", kernel->name);
936   status
937     = hsa_executable_symbol_get_info (kernel_symbol,
938 				      HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT,
939 				      &kernel->object);
940   if (status != HSA_STATUS_SUCCESS)
941     hsa_fatal ("Could not extract a kernel object from its symbol", status);
942   status = hsa_executable_symbol_get_info
943     (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE,
944      &kernel->kernarg_segment_size);
945   if (status != HSA_STATUS_SUCCESS)
946     hsa_fatal ("Could not get info about kernel argument size", status);
947   status = hsa_executable_symbol_get_info
948     (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE,
949      &kernel->group_segment_size);
950   if (status != HSA_STATUS_SUCCESS)
951     hsa_fatal ("Could not get info about kernel group segment size", status);
952   status = hsa_executable_symbol_get_info
953     (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE,
954      &kernel->private_segment_size);
955   if (status != HSA_STATUS_SUCCESS)
956     hsa_fatal ("Could not get info about kernel private segment size",
957 	       status);
958 
959   HSA_DEBUG ("Kernel structure for %s fully initialized with "
960 	     "following segment sizes: \n", kernel->name);
961   HSA_DEBUG ("  group_segment_size: %u\n",
962 	     (unsigned) kernel->group_segment_size);
963   HSA_DEBUG ("  private_segment_size: %u\n",
964 	     (unsigned) kernel->private_segment_size);
965   HSA_DEBUG ("  kernarg_segment_size: %u\n",
966 	     (unsigned) kernel->kernarg_segment_size);
967   HSA_DEBUG ("  omp_data_size: %u\n", kernel->omp_data_size);
968   HSA_DEBUG ("  gridified_kernel_p: %u\n", kernel->gridified_kernel_p);
969 
970   if (kernel->omp_data_size > *max_omp_data_size)
971     *max_omp_data_size = kernel->omp_data_size;
972 
973   for (unsigned i = 0; i < kernel->dependencies_count; i++)
974     {
975       struct kernel_info *dependency
976 	= get_kernel_for_agent (agent, kernel->dependencies[i]);
977 
978       if (dependency == NULL)
979 	{
980 	  HSA_DEBUG ("Could not find a dependency for a kernel: %s, "
981 		     "dependency name: %s\n", kernel->name,
982 		     kernel->dependencies[i]);
983 	  goto failure;
984 	}
985 
986       if (dependency->dependencies_count > 0)
987 	{
988 	  HSA_DEBUG ("HSA does not allow kernel dispatching code with "
989 		     "a depth bigger than one\n")
990 	  goto failure;
991 	}
992 
993       init_single_kernel (dependency, max_omp_data_size);
994     }
995 
996   return;
997 
998 failure:
999   kernel->initialization_failed = true;
1000 }
1001 
1002 /* Indent stream F by INDENT spaces.  */
1003 
1004 static void
indent_stream(FILE * f,unsigned indent)1005 indent_stream (FILE *f, unsigned indent)
1006 {
1007   fprintf (f, "%*s", indent, "");
1008 }
1009 
1010 /* Dump kernel DISPATCH data structure and indent it by INDENT spaces.  */
1011 
1012 static void
print_kernel_dispatch(struct GOMP_hsa_kernel_dispatch * dispatch,unsigned indent)1013 print_kernel_dispatch (struct GOMP_hsa_kernel_dispatch *dispatch, unsigned indent)
1014 {
1015   indent_stream (stderr, indent);
1016   fprintf (stderr, "this: %p\n", dispatch);
1017   indent_stream (stderr, indent);
1018   fprintf (stderr, "queue: %p\n", dispatch->queue);
1019   indent_stream (stderr, indent);
1020   fprintf (stderr, "omp_data_memory: %p\n", dispatch->omp_data_memory);
1021   indent_stream (stderr, indent);
1022   fprintf (stderr, "kernarg_address: %p\n", dispatch->kernarg_address);
1023   indent_stream (stderr, indent);
1024   fprintf (stderr, "object: %lu\n", dispatch->object);
1025   indent_stream (stderr, indent);
1026   fprintf (stderr, "signal: %lu\n", dispatch->signal);
1027   indent_stream (stderr, indent);
1028   fprintf (stderr, "private_segment_size: %u\n",
1029 	   dispatch->private_segment_size);
1030   indent_stream (stderr, indent);
1031   fprintf (stderr, "group_segment_size: %u\n",
1032 	   dispatch->group_segment_size);
1033   indent_stream (stderr, indent);
1034   fprintf (stderr, "children dispatches: %lu\n",
1035 	   dispatch->kernel_dispatch_count);
1036   indent_stream (stderr, indent);
1037   fprintf (stderr, "omp_num_threads: %u\n",
1038 	   dispatch->omp_num_threads);
1039   fprintf (stderr, "\n");
1040 
1041   for (unsigned i = 0; i < dispatch->kernel_dispatch_count; i++)
1042     print_kernel_dispatch (dispatch->children_dispatches[i], indent + 2);
1043 }
1044 
1045 /* Create kernel dispatch data structure for a KERNEL and all its
1046    dependencies.  */
1047 
1048 static struct GOMP_hsa_kernel_dispatch *
create_kernel_dispatch(struct kernel_info * kernel,unsigned omp_data_size)1049 create_kernel_dispatch (struct kernel_info *kernel, unsigned omp_data_size)
1050 {
1051   struct GOMP_hsa_kernel_dispatch *shadow
1052     = create_single_kernel_dispatch (kernel, omp_data_size);
1053   shadow->omp_num_threads = 64;
1054   shadow->debug = 0;
1055   shadow->omp_level = kernel->gridified_kernel_p ? 1 : 0;
1056 
1057   /* Create kernel dispatch data structures.  We do not allow to have
1058      a kernel dispatch with depth bigger than one.  */
1059   for (unsigned i = 0; i < kernel->dependencies_count; i++)
1060     {
1061       struct kernel_info *dependency
1062 	= get_kernel_for_agent (kernel->agent, kernel->dependencies[i]);
1063       shadow->children_dispatches[i]
1064 	= create_single_kernel_dispatch (dependency, omp_data_size);
1065       shadow->children_dispatches[i]->queue
1066 	= kernel->agent->kernel_dispatch_command_q;
1067       shadow->children_dispatches[i]->omp_level = 1;
1068     }
1069 
1070   return shadow;
1071 }
1072 
1073 /* Do all the work that is necessary before running KERNEL for the first time.
1074    The function assumes the program has been created, finalized and frozen by
1075    create_and_finalize_hsa_program.  */
1076 
1077 static void
init_kernel(struct kernel_info * kernel)1078 init_kernel (struct kernel_info *kernel)
1079 {
1080   if (pthread_mutex_lock (&kernel->init_mutex))
1081     GOMP_PLUGIN_fatal ("Could not lock an HSA kernel initialization mutex");
1082   if (kernel->initialized)
1083     {
1084       if (pthread_mutex_unlock (&kernel->init_mutex))
1085 	GOMP_PLUGIN_fatal ("Could not unlock an HSA kernel initialization "
1086 			   "mutex");
1087 
1088       return;
1089     }
1090 
1091   /* Precomputed maximum size of OMP data necessary for a kernel from kernel
1092      dispatch operation.  */
1093   init_single_kernel (kernel, &kernel->max_omp_data_size);
1094 
1095   if (!kernel->initialization_failed)
1096     HSA_DEBUG ("\n");
1097 
1098   kernel->initialized = true;
1099   if (pthread_mutex_unlock (&kernel->init_mutex))
1100     GOMP_PLUGIN_fatal ("Could not unlock an HSA kernel initialization "
1101 		       "mutex");
1102 }
1103 
1104 /* Parse the target attributes INPUT provided by the compiler and return true
1105    if we should run anything all.  If INPUT is NULL, fill DEF with default
1106    values, then store INPUT or DEF into *RESULT.  */
1107 
1108 static bool
parse_target_attributes(void ** input,struct GOMP_kernel_launch_attributes * def,struct GOMP_kernel_launch_attributes ** result)1109 parse_target_attributes (void **input,
1110 			 struct GOMP_kernel_launch_attributes *def,
1111 			 struct GOMP_kernel_launch_attributes **result)
1112 {
1113   if (!input)
1114     GOMP_PLUGIN_fatal ("No target arguments provided");
1115 
1116   bool attrs_found = false;
1117   while (*input)
1118     {
1119       uintptr_t id = (uintptr_t) *input;
1120       if ((id & GOMP_TARGET_ARG_DEVICE_MASK) == GOMP_DEVICE_HSA
1121 	  && ((id & GOMP_TARGET_ARG_ID_MASK)
1122 	      == GOMP_TARGET_ARG_HSA_KERNEL_ATTRIBUTES))
1123 	{
1124 	  input++;
1125 	  attrs_found = true;
1126 	  break;
1127 	}
1128 
1129       if (id & GOMP_TARGET_ARG_SUBSEQUENT_PARAM)
1130 	input++;
1131       input++;
1132     }
1133 
1134   if (!attrs_found)
1135     {
1136       def->ndim = 1;
1137       def->gdims[0] = 1;
1138       def->gdims[1] = 1;
1139       def->gdims[2] = 1;
1140       def->wdims[0] = 1;
1141       def->wdims[1] = 1;
1142       def->wdims[2] = 1;
1143       *result = def;
1144       HSA_DEBUG ("GOMP_OFFLOAD_run called with no launch attributes\n");
1145       return true;
1146     }
1147 
1148   struct GOMP_kernel_launch_attributes *kla;
1149   kla = (struct GOMP_kernel_launch_attributes *) *input;
1150   *result = kla;
1151   if (kla->ndim != 1)
1152     GOMP_PLUGIN_fatal ("HSA does not yet support number of dimensions "
1153 		       "different from one.");
1154   if (kla->gdims[0] == 0)
1155     return false;
1156 
1157   HSA_DEBUG ("GOMP_OFFLOAD_run called with grid size %u and group size %u\n",
1158 	     kla->gdims[0], kla->wdims[0]);
1159 
1160   return true;
1161 }
1162 
1163 /* Return true if the HSA runtime can run function FN_PTR.  */
1164 
1165 bool
GOMP_OFFLOAD_can_run(void * fn_ptr)1166 GOMP_OFFLOAD_can_run (void *fn_ptr)
1167 {
1168   struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
1169   struct agent_info *agent = kernel->agent;
1170   create_and_finalize_hsa_program (agent);
1171 
1172   if (agent->prog_finalized_error)
1173     goto failure;
1174 
1175   init_kernel (kernel);
1176   if (kernel->initialization_failed)
1177     goto failure;
1178 
1179   return true;
1180 
1181 failure:
1182   if (suppress_host_fallback)
1183     GOMP_PLUGIN_fatal ("HSA host fallback has been suppressed");
1184   HSA_DEBUG ("HSA target cannot be launched, doing a host fallback\n");
1185   return false;
1186 }
1187 
1188 /* Atomically store pair of uint16_t values (HEADER and REST) to a PACKET.  */
1189 
1190 void
packet_store_release(uint32_t * packet,uint16_t header,uint16_t rest)1191 packet_store_release (uint32_t* packet, uint16_t header, uint16_t rest)
1192 {
1193   __atomic_store_n (packet, header | (rest << 16), __ATOMIC_RELEASE);
1194 }
1195 
1196 /* Part of the libgomp plugin interface.  Run a kernel on device N and pass it
1197    an array of pointers in VARS as a parameter.  The kernel is identified by
1198    FN_PTR which must point to a kernel_info structure.  */
1199 
1200 void
GOMP_OFFLOAD_run(int n,void * fn_ptr,void * vars,void ** args)1201 GOMP_OFFLOAD_run (int n, void *fn_ptr, void *vars, void **args)
1202 {
1203   struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
1204   struct agent_info *agent = kernel->agent;
1205   struct GOMP_kernel_launch_attributes def;
1206   struct GOMP_kernel_launch_attributes *kla;
1207   if (!parse_target_attributes (args, &def, &kla))
1208     {
1209       HSA_DEBUG ("Will not run HSA kernel because the grid size is zero\n");
1210       return;
1211     }
1212   if (pthread_rwlock_rdlock (&agent->modules_rwlock))
1213     GOMP_PLUGIN_fatal ("Unable to read-lock an HSA agent rwlock");
1214 
1215   if (!agent->initialized)
1216     GOMP_PLUGIN_fatal ("Agent must be initialized");
1217 
1218   if (!kernel->initialized)
1219     GOMP_PLUGIN_fatal ("Called kernel must be initialized");
1220 
1221   struct GOMP_hsa_kernel_dispatch *shadow
1222     = create_kernel_dispatch (kernel, kernel->max_omp_data_size);
1223 
1224   if (debug)
1225     {
1226       fprintf (stderr, "\nKernel has following dependencies:\n");
1227       print_kernel_dispatch (shadow, 2);
1228     }
1229 
1230   uint64_t index = hsa_queue_add_write_index_release (agent->command_q, 1);
1231   HSA_DEBUG ("Got AQL index %llu\n", (long long int) index);
1232 
1233   /* Wait until the queue is not full before writing the packet.   */
1234   while (index - hsa_queue_load_read_index_acquire (agent->command_q)
1235 	 >= agent->command_q->size)
1236     ;
1237 
1238   hsa_kernel_dispatch_packet_t *packet;
1239   packet = ((hsa_kernel_dispatch_packet_t *) agent->command_q->base_address)
1240 	   + index % agent->command_q->size;
1241 
1242   memset (((uint8_t *) packet) + 4, 0, sizeof (*packet) - 4);
1243   packet->grid_size_x = kla->gdims[0];
1244   uint32_t wgs = kla->wdims[0];
1245   if (wgs == 0)
1246     /* TODO: Provide a default via environment.  */
1247     wgs = 64;
1248   else if (wgs > kla->gdims[0])
1249     wgs = kla->gdims[0];
1250   packet->workgroup_size_x = wgs;
1251   packet->grid_size_y = 1;
1252   packet->workgroup_size_y = 1;
1253   packet->grid_size_z = 1;
1254   packet->workgroup_size_z = 1;
1255   packet->private_segment_size = kernel->private_segment_size;
1256   packet->group_segment_size = kernel->group_segment_size;
1257   packet->kernel_object = kernel->object;
1258   packet->kernarg_address = shadow->kernarg_address;
1259   hsa_signal_t s;
1260   s.handle = shadow->signal;
1261   packet->completion_signal = s;
1262   hsa_signal_store_relaxed (s, 1);
1263   memcpy (shadow->kernarg_address, &vars, sizeof (vars));
1264 
1265   /* PR hsa/70337.  */
1266   size_t vars_size = sizeof (vars);
1267   if (kernel->kernarg_segment_size > vars_size)
1268     {
1269       if (kernel->kernarg_segment_size != vars_size
1270 	  + sizeof (struct hsa_kernel_runtime *))
1271 	GOMP_PLUGIN_fatal ("Kernel segment size has an unexpected value");
1272       memcpy (packet->kernarg_address + vars_size, &shadow,
1273 	      sizeof (struct hsa_kernel_runtime *));
1274     }
1275 
1276   HSA_DEBUG ("Copying kernel runtime pointer to kernarg_address\n");
1277 
1278   uint16_t header;
1279   header = HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
1280   header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
1281   header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
1282 
1283   HSA_DEBUG ("Going to dispatch kernel %s\n", kernel->name);
1284 
1285   packet_store_release ((uint32_t *) packet, header,
1286 			1 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS);
1287 
1288   hsa_signal_store_release (agent->command_q->doorbell_signal, index);
1289 
1290   /* TODO: GPU agents in Carrizo APUs cannot properly update L2 cache for
1291      signal wait and signal load operations on their own and we need to
1292      periodically call the hsa_signal_load_acquire on completion signals of
1293      children kernels in the CPU to make that happen.  As soon the
1294      limitation will be resolved, this workaround can be removed.  */
1295 
1296   HSA_DEBUG ("Kernel dispatched, waiting for completion\n");
1297 
1298   /* Root signal waits with 1ms timeout.  */
1299   while (hsa_signal_wait_acquire (s, HSA_SIGNAL_CONDITION_LT, 1, 1000 * 1000,
1300 				  HSA_WAIT_STATE_BLOCKED) != 0)
1301     for (unsigned i = 0; i < shadow->kernel_dispatch_count; i++)
1302       {
1303 	hsa_signal_t child_s;
1304 	child_s.handle = shadow->children_dispatches[i]->signal;
1305 
1306 	HSA_DEBUG ("Waiting for children completion signal: %lu\n",
1307 		   shadow->children_dispatches[i]->signal);
1308 	hsa_signal_load_acquire (child_s);
1309       }
1310 
1311   release_kernel_dispatch (shadow);
1312 
1313   if (pthread_rwlock_unlock (&agent->modules_rwlock))
1314     GOMP_PLUGIN_fatal ("Unable to unlock an HSA agent rwlock");
1315 }
1316 
1317 /* Information to be passed to a thread running a kernel asycnronously.  */
1318 
1319 struct async_run_info
1320 {
1321   int device;
1322   void *tgt_fn;
1323   void *tgt_vars;
1324   void **args;
1325   void *async_data;
1326 };
1327 
1328 /* Thread routine to run a kernel asynchronously.  */
1329 
1330 static void *
run_kernel_asynchronously(void * thread_arg)1331 run_kernel_asynchronously (void *thread_arg)
1332 {
1333   struct async_run_info *info = (struct async_run_info *) thread_arg;
1334   int device = info->device;
1335   void *tgt_fn = info->tgt_fn;
1336   void *tgt_vars = info->tgt_vars;
1337   void **args = info->args;
1338   void *async_data = info->async_data;
1339 
1340   free (info);
1341   GOMP_OFFLOAD_run (device, tgt_fn, tgt_vars, args);
1342   GOMP_PLUGIN_target_task_completion (async_data);
1343   return NULL;
1344 }
1345 
1346 /* Part of the libgomp plugin interface.  Run a kernel like GOMP_OFFLOAD_run
1347    does, but asynchronously and call GOMP_PLUGIN_target_task_completion when it
1348    has finished.  */
1349 
1350 void
GOMP_OFFLOAD_async_run(int device,void * tgt_fn,void * tgt_vars,void ** args,void * async_data)1351 GOMP_OFFLOAD_async_run (int device, void *tgt_fn, void *tgt_vars,
1352 			void **args, void *async_data)
1353 {
1354   pthread_t pt;
1355   struct async_run_info *info;
1356   HSA_DEBUG ("GOMP_OFFLOAD_async_run invoked\n")
1357   info = GOMP_PLUGIN_malloc (sizeof (struct async_run_info));
1358 
1359   info->device = device;
1360   info->tgt_fn = tgt_fn;
1361   info->tgt_vars = tgt_vars;
1362   info->args = args;
1363   info->async_data = async_data;
1364 
1365   int err = pthread_create (&pt, NULL, &run_kernel_asynchronously, info);
1366   if (err != 0)
1367     GOMP_PLUGIN_fatal ("HSA asynchronous thread creation failed: %s",
1368 		       strerror (err));
1369   err = pthread_detach (pt);
1370   if (err != 0)
1371     GOMP_PLUGIN_fatal ("Failed to detach a thread to run HSA kernel "
1372 		       "asynchronously: %s", strerror (err));
1373 }
1374 
1375 /* Deinitialize all information associated with MODULE and kernels within
1376    it.  */
1377 
1378 void
destroy_module(struct module_info * module)1379 destroy_module (struct module_info *module)
1380 {
1381   int i;
1382   for (i = 0; i < module->kernel_count; i++)
1383     if (pthread_mutex_destroy (&module->kernels[i].init_mutex))
1384       GOMP_PLUGIN_fatal ("Failed to destroy an HSA kernel initialization "
1385 			 "mutex");
1386 }
1387 
1388 /* Part of the libgomp plugin interface.  Unload BRIG module described by
1389    struct brig_image_desc in TARGET_DATA from agent number N.  */
1390 
1391 void
GOMP_OFFLOAD_unload_image(int n,unsigned version,void * target_data)1392 GOMP_OFFLOAD_unload_image (int n, unsigned version, void *target_data)
1393 {
1394   if (GOMP_VERSION_DEV (version) > GOMP_VERSION_HSA)
1395     GOMP_PLUGIN_fatal ("Offload data incompatible with HSA plugin"
1396 		       " (expected %u, received %u)",
1397 		       GOMP_VERSION_HSA, GOMP_VERSION_DEV (version));
1398 
1399   struct agent_info *agent;
1400   agent = get_agent_info (n);
1401   if (pthread_rwlock_wrlock (&agent->modules_rwlock))
1402     GOMP_PLUGIN_fatal ("Unable to write-lock an HSA agent rwlock");
1403 
1404   struct module_info *module = agent->first_module;
1405   while (module)
1406     {
1407       if (module->image_desc == target_data)
1408 	break;
1409       module = module->next;
1410     }
1411   if (!module)
1412     GOMP_PLUGIN_fatal ("Attempt to unload an image that has never been "
1413 		       "loaded before");
1414 
1415   remove_module_from_agent (agent, module);
1416   destroy_module (module);
1417   free (module);
1418   destroy_hsa_program (agent);
1419   if (pthread_rwlock_unlock (&agent->modules_rwlock))
1420     GOMP_PLUGIN_fatal ("Unable to unlock an HSA agent rwlock");
1421 }
1422 
1423 /* Part of the libgomp plugin interface.  Deinitialize all information and
1424    status associated with agent number N.  We do not attempt any
1425    synchronization, assuming the user and libgomp will not attempt
1426    deinitialization of a device that is in any way being used at the same
1427    time.  */
1428 
1429 void
GOMP_OFFLOAD_fini_device(int n)1430 GOMP_OFFLOAD_fini_device (int n)
1431 {
1432   struct agent_info *agent = get_agent_info (n);
1433   if (!agent->initialized)
1434     return;
1435 
1436   struct module_info *next_module = agent->first_module;
1437   while (next_module)
1438     {
1439       struct module_info *module = next_module;
1440       next_module = module->next;
1441       destroy_module (module);
1442       free (module);
1443     }
1444   agent->first_module = NULL;
1445   destroy_hsa_program (agent);
1446 
1447   release_agent_shared_libraries (agent);
1448 
1449   hsa_status_t status = hsa_queue_destroy (agent->command_q);
1450   if (status != HSA_STATUS_SUCCESS)
1451     hsa_fatal ("Error destroying command queue", status);
1452   status = hsa_queue_destroy (agent->kernel_dispatch_command_q);
1453   if (status != HSA_STATUS_SUCCESS)
1454     hsa_fatal ("Error destroying kernel dispatch command queue", status);
1455   if (pthread_mutex_destroy (&agent->prog_mutex))
1456     GOMP_PLUGIN_fatal ("Failed to destroy an HSA agent program mutex");
1457   if (pthread_rwlock_destroy (&agent->modules_rwlock))
1458     GOMP_PLUGIN_fatal ("Failed to destroy an HSA agent rwlock");
1459   agent->initialized = false;
1460 }
1461 
1462 /* Part of the libgomp plugin interface.  Not implemented as it is not required
1463    for HSA.  */
1464 
1465 void *
GOMP_OFFLOAD_alloc(int ord,size_t size)1466 GOMP_OFFLOAD_alloc (int ord, size_t size)
1467 {
1468   GOMP_PLUGIN_fatal ("HSA GOMP_OFFLOAD_alloc is not implemented because "
1469 		     "it should never be called");
1470 }
1471 
1472 /* Part of the libgomp plugin interface.  Not implemented as it is not required
1473    for HSA.  */
1474 
1475 void
GOMP_OFFLOAD_free(int ord,void * ptr)1476 GOMP_OFFLOAD_free (int ord, void *ptr)
1477 {
1478   GOMP_PLUGIN_fatal ("HSA GOMP_OFFLOAD_free is not implemented because "
1479 		     "it should never be called");
1480 }
1481 
1482 /* Part of the libgomp plugin interface.  Not implemented as it is not required
1483    for HSA.  */
1484 
1485 void *
GOMP_OFFLOAD_dev2host(int ord,void * dst,const void * src,size_t n)1486 GOMP_OFFLOAD_dev2host (int ord, void *dst, const void *src, size_t n)
1487 {
1488   GOMP_PLUGIN_fatal ("HSA GOMP_OFFLOAD_dev2host is not implemented because "
1489 		     "it should never be called");
1490 }
1491 
1492 /* Part of the libgomp plugin interface.  Not implemented as it is not required
1493    for HSA.  */
1494 
1495 void *
GOMP_OFFLOAD_host2dev(int ord,void * dst,const void * src,size_t n)1496 GOMP_OFFLOAD_host2dev (int ord, void *dst, const void *src, size_t n)
1497 {
1498   GOMP_PLUGIN_fatal ("HSA GOMP_OFFLOAD_host2dev is not implemented because "
1499 		     "it should never be called");
1500 }
1501 
1502 /* Part of the libgomp plugin interface.  Not implemented as it is not required
1503    for HSA.  */
1504 
1505 void *
GOMP_OFFLOAD_dev2dev(int ord,void * dst,const void * src,size_t n)1506 GOMP_OFFLOAD_dev2dev (int ord, void *dst, const void *src, size_t n)
1507 {
1508   GOMP_PLUGIN_fatal ("HSA GOMP_OFFLOAD_dev2dev is not implemented because "
1509 		     "it should never be called");
1510 }
1511