1 /* Plugin for HSAIL execution.
2
3 Copyright (C) 2013-2016 Free Software Foundation, Inc.
4
5 Contributed by Martin Jambor <mjambor@suse.cz> and
6 Martin Liska <mliska@suse.cz>.
7
8 This file is part of the GNU Offloading and Multi Processing Library
9 (libgomp).
10
11 Libgomp is free software; you can redistribute it and/or modify it
12 under the terms of the GNU General Public License as published by
13 the Free Software Foundation; either version 3, or (at your option)
14 any later version.
15
16 Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
17 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
18 FOR A PARTICULAR PURPOSE. See the GNU General Public License for
19 more details.
20
21 Under Section 7 of GPL version 3, you are granted additional
22 permissions described in the GCC Runtime Library Exception, version
23 3.1, as published by the Free Software Foundation.
24
25 You should have received a copy of the GNU General Public License and
26 a copy of the GCC Runtime Library Exception along with this program;
27 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
28 <http://www.gnu.org/licenses/>. */
29
30 #include <stdio.h>
31 #include <stdlib.h>
32 #include <string.h>
33 #include <pthread.h>
34 #include <hsa.h>
35 #include <hsa_ext_finalize.h>
36 #include <dlfcn.h>
37 #include "libgomp-plugin.h"
38 #include "gomp-constants.h"
39
40 /* Keep the following GOMP prefixed structures in sync with respective parts of
41 the compiler. */
42
43 /* Structure describing the run-time and grid properties of an HSA kernel
44 lauch. */
45
46 struct GOMP_kernel_launch_attributes
47 {
48 /* Number of dimensions the workload has. Maximum number is 3. */
49 uint32_t ndim;
50 /* Size of the grid in the three respective dimensions. */
51 uint32_t gdims[3];
52 /* Size of work-groups in the respective dimensions. */
53 uint32_t wdims[3];
54 };
55
56 /* Collection of information needed for a dispatch of a kernel from a
57 kernel. */
58
59 struct GOMP_hsa_kernel_dispatch
60 {
61 /* Pointer to a command queue associated with a kernel dispatch agent. */
62 void *queue;
63 /* Pointer to reserved memory for OMP data struct copying. */
64 void *omp_data_memory;
65 /* Pointer to a memory space used for kernel arguments passing. */
66 void *kernarg_address;
67 /* Kernel object. */
68 uint64_t object;
69 /* Synchronization signal used for dispatch synchronization. */
70 uint64_t signal;
71 /* Private segment size. */
72 uint32_t private_segment_size;
73 /* Group segment size. */
74 uint32_t group_segment_size;
75 /* Number of children kernel dispatches. */
76 uint64_t kernel_dispatch_count;
77 /* Debug purpose argument. */
78 uint64_t debug;
79 /* Levels-var ICV. */
80 uint64_t omp_level;
81 /* Kernel dispatch structures created for children kernel dispatches. */
82 struct GOMP_hsa_kernel_dispatch **children_dispatches;
83 /* Number of threads. */
84 uint32_t omp_num_threads;
85 };
86
87 /* Part of the libgomp plugin interface. Return the name of the accelerator,
88 which is "hsa". */
89
90 const char *
GOMP_OFFLOAD_get_name(void)91 GOMP_OFFLOAD_get_name (void)
92 {
93 return "hsa";
94 }
95
96 /* Part of the libgomp plugin interface. Return the specific capabilities the
97 HSA accelerator have. */
98
99 unsigned int
GOMP_OFFLOAD_get_caps(void)100 GOMP_OFFLOAD_get_caps (void)
101 {
102 return GOMP_OFFLOAD_CAP_SHARED_MEM | GOMP_OFFLOAD_CAP_OPENMP_400;
103 }
104
105 /* Part of the libgomp plugin interface. Identify as HSA accelerator. */
106
107 int
GOMP_OFFLOAD_get_type(void)108 GOMP_OFFLOAD_get_type (void)
109 {
110 return OFFLOAD_TARGET_TYPE_HSA;
111 }
112
113 /* Return the libgomp version number we're compatible with. There is
114 no requirement for cross-version compatibility. */
115
116 unsigned
GOMP_OFFLOAD_version(void)117 GOMP_OFFLOAD_version (void)
118 {
119 return GOMP_VERSION;
120 }
121
122 /* Flag to decide whether print to stderr information about what is going on.
123 Set in init_debug depending on environment variables. */
124
125 static bool debug;
126
127 /* Flag to decide if the runtime should suppress a possible fallback to host
128 execution. */
129
130 static bool suppress_host_fallback;
131
132 /* Initialize debug and suppress_host_fallback according to the environment. */
133
134 static void
init_enviroment_variables(void)135 init_enviroment_variables (void)
136 {
137 if (getenv ("HSA_DEBUG"))
138 debug = true;
139 else
140 debug = false;
141
142 if (getenv ("HSA_SUPPRESS_HOST_FALLBACK"))
143 suppress_host_fallback = true;
144 else
145 suppress_host_fallback = false;
146 }
147
148 /* Print a logging message with PREFIX to stderr if HSA_DEBUG value
149 is set to true. */
150
151 #define HSA_LOG(prefix, ...) \
152 do \
153 { \
154 if (debug) \
155 { \
156 fprintf (stderr, prefix); \
157 fprintf (stderr, __VA_ARGS__); \
158 } \
159 } \
160 while (false);
161
162 /* Print a debugging message to stderr. */
163
164 #define HSA_DEBUG(...) HSA_LOG ("HSA debug: ", __VA_ARGS__)
165
166 /* Print a warning message to stderr. */
167
168 #define HSA_WARNING(...) HSA_LOG ("HSA warning: ", __VA_ARGS__)
169
170 /* Print HSA warning STR with an HSA STATUS code. */
171
172 static void
hsa_warn(const char * str,hsa_status_t status)173 hsa_warn (const char *str, hsa_status_t status)
174 {
175 if (!debug)
176 return;
177
178 const char *hsa_error;
179 hsa_status_string (status, &hsa_error);
180
181 fprintf (stderr, "HSA warning: %s\nRuntime message: %s", str, hsa_error);
182 }
183
184 /* Report a fatal error STR together with the HSA error corresponding to STATUS
185 and terminate execution of the current process. */
186
187 static void
hsa_fatal(const char * str,hsa_status_t status)188 hsa_fatal (const char *str, hsa_status_t status)
189 {
190 const char *hsa_error;
191 hsa_status_string (status, &hsa_error);
192 GOMP_PLUGIN_fatal ("HSA fatal error: %s\nRuntime message: %s", str,
193 hsa_error);
194 }
195
196 struct hsa_kernel_description
197 {
198 const char *name;
199 unsigned omp_data_size;
200 bool gridified_kernel_p;
201 unsigned kernel_dependencies_count;
202 const char **kernel_dependencies;
203 };
204
205 struct global_var_info
206 {
207 const char *name;
208 void *address;
209 };
210
211 /* Data passed by the static initializer of a compilation unit containing BRIG
212 to GOMP_offload_register. */
213
214 struct brig_image_desc
215 {
216 hsa_ext_module_t brig_module;
217 const unsigned kernel_count;
218 struct hsa_kernel_description *kernel_infos;
219 const unsigned global_variable_count;
220 struct global_var_info *global_variables;
221 };
222
223 struct agent_info;
224
225 /* Information required to identify, finalize and run any given kernel. */
226
227 struct kernel_info
228 {
229 /* Name of the kernel, required to locate it within the brig module. */
230 const char *name;
231 /* Size of memory space for OMP data. */
232 unsigned omp_data_size;
233 /* The specific agent the kernel has been or will be finalized for and run
234 on. */
235 struct agent_info *agent;
236 /* The specific module where the kernel takes place. */
237 struct module_info *module;
238 /* Mutex enforcing that at most once thread ever initializes a kernel for
239 use. A thread should have locked agent->modules_rwlock for reading before
240 acquiring it. */
241 pthread_mutex_t init_mutex;
242 /* Flag indicating whether the kernel has been initialized and all fields
243 below it contain valid data. */
244 bool initialized;
245 /* Flag indicating that the kernel has a problem that blocks an execution. */
246 bool initialization_failed;
247 /* The object to be put into the dispatch queue. */
248 uint64_t object;
249 /* Required size of kernel arguments. */
250 uint32_t kernarg_segment_size;
251 /* Required size of group segment. */
252 uint32_t group_segment_size;
253 /* Required size of private segment. */
254 uint32_t private_segment_size;
255 /* List of all kernel dependencies. */
256 const char **dependencies;
257 /* Number of dependencies. */
258 unsigned dependencies_count;
259 /* Maximum OMP data size necessary for kernel from kernel dispatches. */
260 unsigned max_omp_data_size;
261 /* True if the kernel is gridified. */
262 bool gridified_kernel_p;
263 };
264
265 /* Information about a particular brig module, its image and kernels. */
266
267 struct module_info
268 {
269 /* The next and previous module in the linked list of modules of an agent. */
270 struct module_info *next, *prev;
271 /* The description with which the program has registered the image. */
272 struct brig_image_desc *image_desc;
273
274 /* Number of kernels in this module. */
275 int kernel_count;
276 /* An array of kernel_info structures describing each kernel in this
277 module. */
278 struct kernel_info kernels[];
279 };
280
281 /* Information about shared brig library. */
282
283 struct brig_library_info
284 {
285 char *file_name;
286 hsa_ext_module_t image;
287 };
288
289 /* Description of an HSA GPU agent and the program associated with it. */
290
291 struct agent_info
292 {
293 /* The HSA ID of the agent. Assigned when hsa_context is initialized. */
294 hsa_agent_t id;
295 /* Whether the agent has been initialized. The fields below are usable only
296 if it has been. */
297 bool initialized;
298 /* The HSA ISA of this agent. */
299 hsa_isa_t isa;
300 /* Command queue of the agent. */
301 hsa_queue_t *command_q;
302 /* Kernel from kernel dispatch command queue. */
303 hsa_queue_t *kernel_dispatch_command_q;
304 /* The HSA memory region from which to allocate kernel arguments. */
305 hsa_region_t kernarg_region;
306
307 /* Read-write lock that protects kernels which are running or about to be run
308 from interference with loading and unloading of images. Needs to be
309 locked for reading while a kernel is being run, and for writing if the
310 list of modules is manipulated (and thus the HSA program invalidated). */
311 pthread_rwlock_t modules_rwlock;
312 /* The first module in a linked list of modules associated with this
313 kernel. */
314 struct module_info *first_module;
315
316 /* Mutex enforcing that only one thread will finalize the HSA program. A
317 thread should have locked agent->modules_rwlock for reading before
318 acquiring it. */
319 pthread_mutex_t prog_mutex;
320 /* Flag whether the HSA program that consists of all the modules has been
321 finalized. */
322 bool prog_finalized;
323 /* Flag whether the program was finalized but with a failure. */
324 bool prog_finalized_error;
325 /* HSA executable - the finalized program that is used to locate kernels. */
326 hsa_executable_t executable;
327 /* List of BRIG libraries. */
328 struct brig_library_info **brig_libraries;
329 /* Number of loaded shared BRIG libraries. */
330 unsigned brig_libraries_count;
331 };
332
333 /* Information about the whole HSA environment and all of its agents. */
334
335 struct hsa_context_info
336 {
337 /* Whether the structure has been initialized. */
338 bool initialized;
339 /* Number of usable GPU HSA agents in the system. */
340 int agent_count;
341 /* Array of agent_info structures describing the individual HSA agents. */
342 struct agent_info *agents;
343 };
344
345 /* Information about the whole HSA environment and all of its agents. */
346
347 static struct hsa_context_info hsa_context;
348
349 /* Find kernel for an AGENT by name provided in KERNEL_NAME. */
350
351 static struct kernel_info *
get_kernel_for_agent(struct agent_info * agent,const char * kernel_name)352 get_kernel_for_agent (struct agent_info *agent, const char *kernel_name)
353 {
354 struct module_info *module = agent->first_module;
355
356 while (module)
357 {
358 for (unsigned i = 0; i < module->kernel_count; i++)
359 if (strcmp (module->kernels[i].name, kernel_name) == 0)
360 return &module->kernels[i];
361
362 module = module->next;
363 }
364
365 return NULL;
366 }
367
368 /* Return true if the agent is a GPU and acceptable of concurrent submissions
369 from different threads. */
370
371 static bool
suitable_hsa_agent_p(hsa_agent_t agent)372 suitable_hsa_agent_p (hsa_agent_t agent)
373 {
374 hsa_device_type_t device_type;
375 hsa_status_t status
376 = hsa_agent_get_info (agent, HSA_AGENT_INFO_DEVICE, &device_type);
377 if (status != HSA_STATUS_SUCCESS || device_type != HSA_DEVICE_TYPE_GPU)
378 return false;
379
380 uint32_t features = 0;
381 status = hsa_agent_get_info (agent, HSA_AGENT_INFO_FEATURE, &features);
382 if (status != HSA_STATUS_SUCCESS
383 || !(features & HSA_AGENT_FEATURE_KERNEL_DISPATCH))
384 return false;
385 hsa_queue_type_t queue_type;
386 status = hsa_agent_get_info (agent, HSA_AGENT_INFO_QUEUE_TYPE, &queue_type);
387 if (status != HSA_STATUS_SUCCESS
388 || (queue_type != HSA_QUEUE_TYPE_MULTI))
389 return false;
390
391 return true;
392 }
393
394 /* Callback of hsa_iterate_agents, if AGENT is a GPU device, increment
395 agent_count in hsa_context. */
396
397 static hsa_status_t
count_gpu_agents(hsa_agent_t agent,void * data)398 count_gpu_agents (hsa_agent_t agent, void *data __attribute__ ((unused)))
399 {
400 if (suitable_hsa_agent_p (agent))
401 hsa_context.agent_count++;
402 return HSA_STATUS_SUCCESS;
403 }
404
405 /* Callback of hsa_iterate_agents, if AGENT is a GPU device, assign the agent
406 id to the describing structure in the hsa context. The index of the
407 structure is pointed to by DATA, increment it afterwards. */
408
409 static hsa_status_t
assign_agent_ids(hsa_agent_t agent,void * data)410 assign_agent_ids (hsa_agent_t agent, void *data)
411 {
412 if (suitable_hsa_agent_p (agent))
413 {
414 int *agent_index = (int *) data;
415 hsa_context.agents[*agent_index].id = agent;
416 ++*agent_index;
417 }
418 return HSA_STATUS_SUCCESS;
419 }
420
421 /* Initialize hsa_context if it has not already been done. */
422
423 static void
init_hsa_context(void)424 init_hsa_context (void)
425 {
426 hsa_status_t status;
427 int agent_index = 0;
428
429 if (hsa_context.initialized)
430 return;
431 init_enviroment_variables ();
432 status = hsa_init ();
433 if (status != HSA_STATUS_SUCCESS)
434 hsa_fatal ("Run-time could not be initialized", status);
435 HSA_DEBUG ("HSA run-time initialized\n");
436 status = hsa_iterate_agents (count_gpu_agents, NULL);
437 if (status != HSA_STATUS_SUCCESS)
438 hsa_fatal ("HSA GPU devices could not be enumerated", status);
439 HSA_DEBUG ("There are %i HSA GPU devices.\n", hsa_context.agent_count);
440
441 hsa_context.agents
442 = GOMP_PLUGIN_malloc_cleared (hsa_context.agent_count
443 * sizeof (struct agent_info));
444 status = hsa_iterate_agents (assign_agent_ids, &agent_index);
445 if (agent_index != hsa_context.agent_count)
446 GOMP_PLUGIN_fatal ("Failed to assign IDs to all HSA agents");
447 hsa_context.initialized = true;
448 }
449
450 /* Callback of dispatch queues to report errors. */
451
452 static void
queue_callback(hsa_status_t status,hsa_queue_t * queue,void * data)453 queue_callback (hsa_status_t status,
454 hsa_queue_t *queue __attribute__ ((unused)),
455 void *data __attribute__ ((unused)))
456 {
457 hsa_fatal ("Asynchronous queue error", status);
458 }
459
460 /* Callback of hsa_agent_iterate_regions. Determine if a memory REGION can be
461 used for kernarg allocations and if so write it to the memory pointed to by
462 DATA and break the query. */
463
464 static hsa_status_t
get_kernarg_memory_region(hsa_region_t region,void * data)465 get_kernarg_memory_region (hsa_region_t region, void *data)
466 {
467 hsa_status_t status;
468 hsa_region_segment_t segment;
469
470 status = hsa_region_get_info (region, HSA_REGION_INFO_SEGMENT, &segment);
471 if (status != HSA_STATUS_SUCCESS)
472 return status;
473 if (segment != HSA_REGION_SEGMENT_GLOBAL)
474 return HSA_STATUS_SUCCESS;
475
476 uint32_t flags;
477 status = hsa_region_get_info (region, HSA_REGION_INFO_GLOBAL_FLAGS, &flags);
478 if (status != HSA_STATUS_SUCCESS)
479 return status;
480 if (flags & HSA_REGION_GLOBAL_FLAG_KERNARG)
481 {
482 hsa_region_t *ret = (hsa_region_t *) data;
483 *ret = region;
484 return HSA_STATUS_INFO_BREAK;
485 }
486 return HSA_STATUS_SUCCESS;
487 }
488
489 /* Part of the libgomp plugin interface. Return the number of HSA devices on
490 the system. */
491
492 int
GOMP_OFFLOAD_get_num_devices(void)493 GOMP_OFFLOAD_get_num_devices (void)
494 {
495 init_hsa_context ();
496 return hsa_context.agent_count;
497 }
498
499 /* Part of the libgomp plugin interface. Initialize agent number N so that it
500 can be used for computation. */
501
502 void
GOMP_OFFLOAD_init_device(int n)503 GOMP_OFFLOAD_init_device (int n)
504 {
505 init_hsa_context ();
506 if (n >= hsa_context.agent_count)
507 GOMP_PLUGIN_fatal ("Request to initialize non-existing HSA device %i", n);
508 struct agent_info *agent = &hsa_context.agents[n];
509
510 if (agent->initialized)
511 return;
512
513 if (pthread_rwlock_init (&agent->modules_rwlock, NULL))
514 GOMP_PLUGIN_fatal ("Failed to initialize an HSA agent rwlock");
515 if (pthread_mutex_init (&agent->prog_mutex, NULL))
516 GOMP_PLUGIN_fatal ("Failed to initialize an HSA agent program mutex");
517
518 uint32_t queue_size;
519 hsa_status_t status;
520 status = hsa_agent_get_info (agent->id, HSA_AGENT_INFO_QUEUE_MAX_SIZE,
521 &queue_size);
522 if (status != HSA_STATUS_SUCCESS)
523 hsa_fatal ("Error requesting maximum queue size of the HSA agent", status);
524 status = hsa_agent_get_info (agent->id, HSA_AGENT_INFO_ISA, &agent->isa);
525 if (status != HSA_STATUS_SUCCESS)
526 hsa_fatal ("Error querying the ISA of the agent", status);
527 status = hsa_queue_create (agent->id, queue_size, HSA_QUEUE_TYPE_MULTI,
528 queue_callback, NULL, UINT32_MAX, UINT32_MAX,
529 &agent->command_q);
530 if (status != HSA_STATUS_SUCCESS)
531 hsa_fatal ("Error creating command queue", status);
532
533 status = hsa_queue_create (agent->id, queue_size, HSA_QUEUE_TYPE_MULTI,
534 queue_callback, NULL, UINT32_MAX, UINT32_MAX,
535 &agent->kernel_dispatch_command_q);
536 if (status != HSA_STATUS_SUCCESS)
537 hsa_fatal ("Error creating kernel dispatch command queue", status);
538
539 agent->kernarg_region.handle = (uint64_t) -1;
540 status = hsa_agent_iterate_regions (agent->id, get_kernarg_memory_region,
541 &agent->kernarg_region);
542 if (agent->kernarg_region.handle == (uint64_t) -1)
543 GOMP_PLUGIN_fatal ("Could not find suitable memory region for kernel "
544 "arguments");
545 HSA_DEBUG ("HSA agent initialized, queue has id %llu\n",
546 (long long unsigned) agent->command_q->id);
547 HSA_DEBUG ("HSA agent initialized, kernel dispatch queue has id %llu\n",
548 (long long unsigned) agent->kernel_dispatch_command_q->id);
549 agent->initialized = true;
550 }
551
552 /* Verify that hsa_context has already been initialized and return the
553 agent_info structure describing device number N. */
554
555 static struct agent_info *
get_agent_info(int n)556 get_agent_info (int n)
557 {
558 if (!hsa_context.initialized)
559 GOMP_PLUGIN_fatal ("Attempt to use uninitialized HSA context.");
560 if (n >= hsa_context.agent_count)
561 GOMP_PLUGIN_fatal ("Request to operate on anon-existing HSA device %i", n);
562 if (!hsa_context.agents[n].initialized)
563 GOMP_PLUGIN_fatal ("Attempt to use an uninitialized HSA agent.");
564 return &hsa_context.agents[n];
565 }
566
567 /* Insert MODULE to the linked list of modules of AGENT. */
568
569 static void
add_module_to_agent(struct agent_info * agent,struct module_info * module)570 add_module_to_agent (struct agent_info *agent, struct module_info *module)
571 {
572 if (agent->first_module)
573 agent->first_module->prev = module;
574 module->next = agent->first_module;
575 module->prev = NULL;
576 agent->first_module = module;
577 }
578
579 /* Remove MODULE from the linked list of modules of AGENT. */
580
581 static void
remove_module_from_agent(struct agent_info * agent,struct module_info * module)582 remove_module_from_agent (struct agent_info *agent, struct module_info *module)
583 {
584 if (agent->first_module == module)
585 agent->first_module = module->next;
586 if (module->prev)
587 module->prev->next = module->next;
588 if (module->next)
589 module->next->prev = module->prev;
590 }
591
592 /* Free the HSA program in agent and everything associated with it and set
593 agent->prog_finalized and the initialized flags of all kernels to false. */
594
595 static void
destroy_hsa_program(struct agent_info * agent)596 destroy_hsa_program (struct agent_info *agent)
597 {
598 if (!agent->prog_finalized || agent->prog_finalized_error)
599 return;
600
601 hsa_status_t status;
602
603 HSA_DEBUG ("Destroying the current HSA program.\n");
604
605 status = hsa_executable_destroy (agent->executable);
606 if (status != HSA_STATUS_SUCCESS)
607 hsa_fatal ("Could not destroy HSA executable", status);
608
609 struct module_info *module;
610 for (module = agent->first_module; module; module = module->next)
611 {
612 int i;
613 for (i = 0; i < module->kernel_count; i++)
614 module->kernels[i].initialized = false;
615 }
616 agent->prog_finalized = false;
617 }
618
619 /* Part of the libgomp plugin interface. Load BRIG module described by struct
620 brig_image_desc in TARGET_DATA and return references to kernel descriptors
621 in TARGET_TABLE. */
622
623 int
GOMP_OFFLOAD_load_image(int ord,unsigned version,void * target_data,struct addr_pair ** target_table)624 GOMP_OFFLOAD_load_image (int ord, unsigned version, void *target_data,
625 struct addr_pair **target_table)
626 {
627 if (GOMP_VERSION_DEV (version) > GOMP_VERSION_HSA)
628 GOMP_PLUGIN_fatal ("Offload data incompatible with HSA plugin"
629 " (expected %u, received %u)",
630 GOMP_VERSION_HSA, GOMP_VERSION_DEV (version));
631
632 struct brig_image_desc *image_desc = (struct brig_image_desc *) target_data;
633 struct agent_info *agent;
634 struct addr_pair *pair;
635 struct module_info *module;
636 struct kernel_info *kernel;
637 int kernel_count = image_desc->kernel_count;
638
639 agent = get_agent_info (ord);
640 if (pthread_rwlock_wrlock (&agent->modules_rwlock))
641 GOMP_PLUGIN_fatal ("Unable to write-lock an HSA agent rwlock");
642 if (agent->prog_finalized)
643 destroy_hsa_program (agent);
644
645 HSA_DEBUG ("Encountered %d kernels in an image\n", kernel_count);
646 pair = GOMP_PLUGIN_malloc (kernel_count * sizeof (struct addr_pair));
647 *target_table = pair;
648 module = (struct module_info *)
649 GOMP_PLUGIN_malloc_cleared (sizeof (struct module_info)
650 + kernel_count * sizeof (struct kernel_info));
651 module->image_desc = image_desc;
652 module->kernel_count = kernel_count;
653
654 kernel = &module->kernels[0];
655
656 /* Allocate memory for kernel dependencies. */
657 for (unsigned i = 0; i < kernel_count; i++)
658 {
659 pair->start = (uintptr_t) kernel;
660 pair->end = (uintptr_t) (kernel + 1);
661
662 struct hsa_kernel_description *d = &image_desc->kernel_infos[i];
663 kernel->agent = agent;
664 kernel->module = module;
665 kernel->name = d->name;
666 kernel->omp_data_size = d->omp_data_size;
667 kernel->gridified_kernel_p = d->gridified_kernel_p;
668 kernel->dependencies_count = d->kernel_dependencies_count;
669 kernel->dependencies = d->kernel_dependencies;
670 if (pthread_mutex_init (&kernel->init_mutex, NULL))
671 GOMP_PLUGIN_fatal ("Failed to initialize an HSA kernel mutex");
672
673 kernel++;
674 pair++;
675 }
676
677 add_module_to_agent (agent, module);
678 if (pthread_rwlock_unlock (&agent->modules_rwlock))
679 GOMP_PLUGIN_fatal ("Unable to unlock an HSA agent rwlock");
680 return kernel_count;
681 }
682
683 /* Add a shared BRIG library from a FILE_NAME to an AGENT. */
684
685 static struct brig_library_info *
add_shared_library(const char * file_name,struct agent_info * agent)686 add_shared_library (const char *file_name, struct agent_info *agent)
687 {
688 struct brig_library_info *library = NULL;
689
690 void *f = dlopen (file_name, RTLD_NOW);
691 void *start = dlsym (f, "__brig_start");
692 void *end = dlsym (f, "__brig_end");
693
694 if (start == NULL || end == NULL)
695 return NULL;
696
697 unsigned size = end - start;
698 char *buf = (char *) GOMP_PLUGIN_malloc (size);
699 memcpy (buf, start, size);
700
701 library = GOMP_PLUGIN_malloc (sizeof (struct agent_info));
702 library->file_name = (char *) GOMP_PLUGIN_malloc
703 ((strlen (file_name) + 1));
704 strcpy (library->file_name, file_name);
705 library->image = (hsa_ext_module_t) buf;
706
707 return library;
708 }
709
710 /* Release memory used for BRIG shared libraries that correspond
711 to an AGENT. */
712
713 static void
release_agent_shared_libraries(struct agent_info * agent)714 release_agent_shared_libraries (struct agent_info *agent)
715 {
716 for (unsigned i = 0; i < agent->brig_libraries_count; i++)
717 if (agent->brig_libraries[i])
718 {
719 free (agent->brig_libraries[i]->file_name);
720 free (agent->brig_libraries[i]->image);
721 free (agent->brig_libraries[i]);
722 }
723
724 free (agent->brig_libraries);
725 }
726
727 /* Create and finalize the program consisting of all loaded modules. */
728
729 static void
create_and_finalize_hsa_program(struct agent_info * agent)730 create_and_finalize_hsa_program (struct agent_info *agent)
731 {
732 hsa_status_t status;
733 hsa_ext_program_t prog_handle;
734 int mi = 0;
735
736 if (pthread_mutex_lock (&agent->prog_mutex))
737 GOMP_PLUGIN_fatal ("Could not lock an HSA agent program mutex");
738 if (agent->prog_finalized)
739 goto final;
740
741 status = hsa_ext_program_create (HSA_MACHINE_MODEL_LARGE, HSA_PROFILE_FULL,
742 HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT,
743 NULL, &prog_handle);
744 if (status != HSA_STATUS_SUCCESS)
745 hsa_fatal ("Could not create an HSA program", status);
746
747 HSA_DEBUG ("Created a finalized program\n");
748
749 struct module_info *module = agent->first_module;
750 while (module)
751 {
752 status = hsa_ext_program_add_module (prog_handle,
753 module->image_desc->brig_module);
754 if (status != HSA_STATUS_SUCCESS)
755 hsa_fatal ("Could not add a module to the HSA program", status);
756 module = module->next;
757 mi++;
758 }
759
760 /* Load all shared libraries. */
761 const char *libraries[] = { "libhsamath.so", "libhsastd.so" };
762 const unsigned libraries_count = sizeof (libraries) / sizeof (const char *);
763
764 agent->brig_libraries_count = libraries_count;
765 agent->brig_libraries = GOMP_PLUGIN_malloc_cleared
766 (sizeof (struct brig_library_info) * libraries_count);
767
768 for (unsigned i = 0; i < libraries_count; i++)
769 {
770 struct brig_library_info *library = add_shared_library (libraries[i],
771 agent);
772 if (library == NULL)
773 {
774 HSA_WARNING ("Could not open a shared BRIG library: %s\n",
775 libraries[i]);
776 continue;
777 }
778
779 status = hsa_ext_program_add_module (prog_handle, library->image);
780 if (status != HSA_STATUS_SUCCESS)
781 hsa_warn ("Could not add a shared BRIG library the HSA program",
782 status);
783 else
784 HSA_DEBUG ("a shared BRIG library has been added to a program: %s\n",
785 libraries[i]);
786 }
787
788 hsa_ext_control_directives_t control_directives;
789 memset (&control_directives, 0, sizeof (control_directives));
790 hsa_code_object_t code_object;
791 status = hsa_ext_program_finalize (prog_handle, agent->isa,
792 HSA_EXT_FINALIZER_CALL_CONVENTION_AUTO,
793 control_directives, "",
794 HSA_CODE_OBJECT_TYPE_PROGRAM,
795 &code_object);
796 if (status != HSA_STATUS_SUCCESS)
797 {
798 hsa_warn ("Finalization of the HSA program failed", status);
799 goto failure;
800 }
801
802 HSA_DEBUG ("Finalization done\n");
803 hsa_ext_program_destroy (prog_handle);
804
805 status
806 = hsa_executable_create (HSA_PROFILE_FULL, HSA_EXECUTABLE_STATE_UNFROZEN,
807 "", &agent->executable);
808 if (status != HSA_STATUS_SUCCESS)
809 hsa_fatal ("Could not create HSA executable", status);
810
811 module = agent->first_module;
812 while (module)
813 {
814 /* Initialize all global variables declared in the module. */
815 for (unsigned i = 0; i < module->image_desc->global_variable_count; i++)
816 {
817 struct global_var_info *var;
818 var = &module->image_desc->global_variables[i];
819 status
820 = hsa_executable_global_variable_define (agent->executable,
821 var->name, var->address);
822
823 HSA_DEBUG ("Defining global variable: %s, address: %p\n", var->name,
824 var->address);
825
826 if (status != HSA_STATUS_SUCCESS)
827 hsa_fatal ("Could not define a global variable in the HSA program",
828 status);
829 }
830
831 module = module->next;
832 }
833
834 status = hsa_executable_load_code_object (agent->executable, agent->id,
835 code_object, "");
836 if (status != HSA_STATUS_SUCCESS)
837 hsa_fatal ("Could not add a code object to the HSA executable", status);
838 status = hsa_executable_freeze (agent->executable, "");
839 if (status != HSA_STATUS_SUCCESS)
840 hsa_fatal ("Could not freeze the HSA executable", status);
841
842 HSA_DEBUG ("Froze HSA executable with the finalized code object\n");
843
844 /* If all goes good, jump to final. */
845 goto final;
846
847 failure:
848 agent->prog_finalized_error = true;
849
850 final:
851 agent->prog_finalized = true;
852
853 if (pthread_mutex_unlock (&agent->prog_mutex))
854 GOMP_PLUGIN_fatal ("Could not unlock an HSA agent program mutex");
855 }
856
857 /* Create kernel dispatch data structure for given KERNEL. */
858
859 static struct GOMP_hsa_kernel_dispatch *
create_single_kernel_dispatch(struct kernel_info * kernel,unsigned omp_data_size)860 create_single_kernel_dispatch (struct kernel_info *kernel,
861 unsigned omp_data_size)
862 {
863 struct agent_info *agent = kernel->agent;
864 struct GOMP_hsa_kernel_dispatch *shadow
865 = GOMP_PLUGIN_malloc_cleared (sizeof (struct GOMP_hsa_kernel_dispatch));
866
867 shadow->queue = agent->command_q;
868 shadow->omp_data_memory
869 = omp_data_size > 0 ? GOMP_PLUGIN_malloc (omp_data_size) : NULL;
870 unsigned dispatch_count = kernel->dependencies_count;
871 shadow->kernel_dispatch_count = dispatch_count;
872
873 shadow->children_dispatches
874 = GOMP_PLUGIN_malloc (dispatch_count * sizeof (shadow));
875
876 shadow->object = kernel->object;
877
878 hsa_signal_t sync_signal;
879 hsa_status_t status = hsa_signal_create (1, 0, NULL, &sync_signal);
880 if (status != HSA_STATUS_SUCCESS)
881 hsa_fatal ("Error creating the HSA sync signal", status);
882
883 shadow->signal = sync_signal.handle;
884 shadow->private_segment_size = kernel->private_segment_size;
885 shadow->group_segment_size = kernel->group_segment_size;
886
887 status
888 = hsa_memory_allocate (agent->kernarg_region, kernel->kernarg_segment_size,
889 &shadow->kernarg_address);
890 if (status != HSA_STATUS_SUCCESS)
891 hsa_fatal ("Could not allocate memory for HSA kernel arguments", status);
892
893 return shadow;
894 }
895
896 /* Release data structure created for a kernel dispatch in SHADOW argument. */
897
898 static void
release_kernel_dispatch(struct GOMP_hsa_kernel_dispatch * shadow)899 release_kernel_dispatch (struct GOMP_hsa_kernel_dispatch *shadow)
900 {
901 HSA_DEBUG ("Released kernel dispatch: %p has value: %lu (%p)\n", shadow,
902 shadow->debug, (void *) shadow->debug);
903
904 hsa_memory_free (shadow->kernarg_address);
905
906 hsa_signal_t s;
907 s.handle = shadow->signal;
908 hsa_signal_destroy (s);
909
910 free (shadow->omp_data_memory);
911
912 for (unsigned i = 0; i < shadow->kernel_dispatch_count; i++)
913 release_kernel_dispatch (shadow->children_dispatches[i]);
914
915 free (shadow->children_dispatches);
916 free (shadow);
917 }
918
919 /* Initialize a KERNEL without its dependencies. MAX_OMP_DATA_SIZE is used
920 to calculate maximum necessary memory for OMP data allocation. */
921
922 static void
init_single_kernel(struct kernel_info * kernel,unsigned * max_omp_data_size)923 init_single_kernel (struct kernel_info *kernel, unsigned *max_omp_data_size)
924 {
925 hsa_status_t status;
926 struct agent_info *agent = kernel->agent;
927 hsa_executable_symbol_t kernel_symbol;
928 status = hsa_executable_get_symbol (agent->executable, NULL, kernel->name,
929 agent->id, 0, &kernel_symbol);
930 if (status != HSA_STATUS_SUCCESS)
931 {
932 hsa_warn ("Could not find symbol for kernel in the code object", status);
933 goto failure;
934 }
935 HSA_DEBUG ("Located kernel %s\n", kernel->name);
936 status
937 = hsa_executable_symbol_get_info (kernel_symbol,
938 HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT,
939 &kernel->object);
940 if (status != HSA_STATUS_SUCCESS)
941 hsa_fatal ("Could not extract a kernel object from its symbol", status);
942 status = hsa_executable_symbol_get_info
943 (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE,
944 &kernel->kernarg_segment_size);
945 if (status != HSA_STATUS_SUCCESS)
946 hsa_fatal ("Could not get info about kernel argument size", status);
947 status = hsa_executable_symbol_get_info
948 (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE,
949 &kernel->group_segment_size);
950 if (status != HSA_STATUS_SUCCESS)
951 hsa_fatal ("Could not get info about kernel group segment size", status);
952 status = hsa_executable_symbol_get_info
953 (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE,
954 &kernel->private_segment_size);
955 if (status != HSA_STATUS_SUCCESS)
956 hsa_fatal ("Could not get info about kernel private segment size",
957 status);
958
959 HSA_DEBUG ("Kernel structure for %s fully initialized with "
960 "following segment sizes: \n", kernel->name);
961 HSA_DEBUG (" group_segment_size: %u\n",
962 (unsigned) kernel->group_segment_size);
963 HSA_DEBUG (" private_segment_size: %u\n",
964 (unsigned) kernel->private_segment_size);
965 HSA_DEBUG (" kernarg_segment_size: %u\n",
966 (unsigned) kernel->kernarg_segment_size);
967 HSA_DEBUG (" omp_data_size: %u\n", kernel->omp_data_size);
968 HSA_DEBUG (" gridified_kernel_p: %u\n", kernel->gridified_kernel_p);
969
970 if (kernel->omp_data_size > *max_omp_data_size)
971 *max_omp_data_size = kernel->omp_data_size;
972
973 for (unsigned i = 0; i < kernel->dependencies_count; i++)
974 {
975 struct kernel_info *dependency
976 = get_kernel_for_agent (agent, kernel->dependencies[i]);
977
978 if (dependency == NULL)
979 {
980 HSA_DEBUG ("Could not find a dependency for a kernel: %s, "
981 "dependency name: %s\n", kernel->name,
982 kernel->dependencies[i]);
983 goto failure;
984 }
985
986 if (dependency->dependencies_count > 0)
987 {
988 HSA_DEBUG ("HSA does not allow kernel dispatching code with "
989 "a depth bigger than one\n")
990 goto failure;
991 }
992
993 init_single_kernel (dependency, max_omp_data_size);
994 }
995
996 return;
997
998 failure:
999 kernel->initialization_failed = true;
1000 }
1001
1002 /* Indent stream F by INDENT spaces. */
1003
1004 static void
indent_stream(FILE * f,unsigned indent)1005 indent_stream (FILE *f, unsigned indent)
1006 {
1007 fprintf (f, "%*s", indent, "");
1008 }
1009
1010 /* Dump kernel DISPATCH data structure and indent it by INDENT spaces. */
1011
1012 static void
print_kernel_dispatch(struct GOMP_hsa_kernel_dispatch * dispatch,unsigned indent)1013 print_kernel_dispatch (struct GOMP_hsa_kernel_dispatch *dispatch, unsigned indent)
1014 {
1015 indent_stream (stderr, indent);
1016 fprintf (stderr, "this: %p\n", dispatch);
1017 indent_stream (stderr, indent);
1018 fprintf (stderr, "queue: %p\n", dispatch->queue);
1019 indent_stream (stderr, indent);
1020 fprintf (stderr, "omp_data_memory: %p\n", dispatch->omp_data_memory);
1021 indent_stream (stderr, indent);
1022 fprintf (stderr, "kernarg_address: %p\n", dispatch->kernarg_address);
1023 indent_stream (stderr, indent);
1024 fprintf (stderr, "object: %lu\n", dispatch->object);
1025 indent_stream (stderr, indent);
1026 fprintf (stderr, "signal: %lu\n", dispatch->signal);
1027 indent_stream (stderr, indent);
1028 fprintf (stderr, "private_segment_size: %u\n",
1029 dispatch->private_segment_size);
1030 indent_stream (stderr, indent);
1031 fprintf (stderr, "group_segment_size: %u\n",
1032 dispatch->group_segment_size);
1033 indent_stream (stderr, indent);
1034 fprintf (stderr, "children dispatches: %lu\n",
1035 dispatch->kernel_dispatch_count);
1036 indent_stream (stderr, indent);
1037 fprintf (stderr, "omp_num_threads: %u\n",
1038 dispatch->omp_num_threads);
1039 fprintf (stderr, "\n");
1040
1041 for (unsigned i = 0; i < dispatch->kernel_dispatch_count; i++)
1042 print_kernel_dispatch (dispatch->children_dispatches[i], indent + 2);
1043 }
1044
1045 /* Create kernel dispatch data structure for a KERNEL and all its
1046 dependencies. */
1047
1048 static struct GOMP_hsa_kernel_dispatch *
create_kernel_dispatch(struct kernel_info * kernel,unsigned omp_data_size)1049 create_kernel_dispatch (struct kernel_info *kernel, unsigned omp_data_size)
1050 {
1051 struct GOMP_hsa_kernel_dispatch *shadow
1052 = create_single_kernel_dispatch (kernel, omp_data_size);
1053 shadow->omp_num_threads = 64;
1054 shadow->debug = 0;
1055 shadow->omp_level = kernel->gridified_kernel_p ? 1 : 0;
1056
1057 /* Create kernel dispatch data structures. We do not allow to have
1058 a kernel dispatch with depth bigger than one. */
1059 for (unsigned i = 0; i < kernel->dependencies_count; i++)
1060 {
1061 struct kernel_info *dependency
1062 = get_kernel_for_agent (kernel->agent, kernel->dependencies[i]);
1063 shadow->children_dispatches[i]
1064 = create_single_kernel_dispatch (dependency, omp_data_size);
1065 shadow->children_dispatches[i]->queue
1066 = kernel->agent->kernel_dispatch_command_q;
1067 shadow->children_dispatches[i]->omp_level = 1;
1068 }
1069
1070 return shadow;
1071 }
1072
1073 /* Do all the work that is necessary before running KERNEL for the first time.
1074 The function assumes the program has been created, finalized and frozen by
1075 create_and_finalize_hsa_program. */
1076
1077 static void
init_kernel(struct kernel_info * kernel)1078 init_kernel (struct kernel_info *kernel)
1079 {
1080 if (pthread_mutex_lock (&kernel->init_mutex))
1081 GOMP_PLUGIN_fatal ("Could not lock an HSA kernel initialization mutex");
1082 if (kernel->initialized)
1083 {
1084 if (pthread_mutex_unlock (&kernel->init_mutex))
1085 GOMP_PLUGIN_fatal ("Could not unlock an HSA kernel initialization "
1086 "mutex");
1087
1088 return;
1089 }
1090
1091 /* Precomputed maximum size of OMP data necessary for a kernel from kernel
1092 dispatch operation. */
1093 init_single_kernel (kernel, &kernel->max_omp_data_size);
1094
1095 if (!kernel->initialization_failed)
1096 HSA_DEBUG ("\n");
1097
1098 kernel->initialized = true;
1099 if (pthread_mutex_unlock (&kernel->init_mutex))
1100 GOMP_PLUGIN_fatal ("Could not unlock an HSA kernel initialization "
1101 "mutex");
1102 }
1103
1104 /* Parse the target attributes INPUT provided by the compiler and return true
1105 if we should run anything all. If INPUT is NULL, fill DEF with default
1106 values, then store INPUT or DEF into *RESULT. */
1107
1108 static bool
parse_target_attributes(void ** input,struct GOMP_kernel_launch_attributes * def,struct GOMP_kernel_launch_attributes ** result)1109 parse_target_attributes (void **input,
1110 struct GOMP_kernel_launch_attributes *def,
1111 struct GOMP_kernel_launch_attributes **result)
1112 {
1113 if (!input)
1114 GOMP_PLUGIN_fatal ("No target arguments provided");
1115
1116 bool attrs_found = false;
1117 while (*input)
1118 {
1119 uintptr_t id = (uintptr_t) *input;
1120 if ((id & GOMP_TARGET_ARG_DEVICE_MASK) == GOMP_DEVICE_HSA
1121 && ((id & GOMP_TARGET_ARG_ID_MASK)
1122 == GOMP_TARGET_ARG_HSA_KERNEL_ATTRIBUTES))
1123 {
1124 input++;
1125 attrs_found = true;
1126 break;
1127 }
1128
1129 if (id & GOMP_TARGET_ARG_SUBSEQUENT_PARAM)
1130 input++;
1131 input++;
1132 }
1133
1134 if (!attrs_found)
1135 {
1136 def->ndim = 1;
1137 def->gdims[0] = 1;
1138 def->gdims[1] = 1;
1139 def->gdims[2] = 1;
1140 def->wdims[0] = 1;
1141 def->wdims[1] = 1;
1142 def->wdims[2] = 1;
1143 *result = def;
1144 HSA_DEBUG ("GOMP_OFFLOAD_run called with no launch attributes\n");
1145 return true;
1146 }
1147
1148 struct GOMP_kernel_launch_attributes *kla;
1149 kla = (struct GOMP_kernel_launch_attributes *) *input;
1150 *result = kla;
1151 if (kla->ndim != 1)
1152 GOMP_PLUGIN_fatal ("HSA does not yet support number of dimensions "
1153 "different from one.");
1154 if (kla->gdims[0] == 0)
1155 return false;
1156
1157 HSA_DEBUG ("GOMP_OFFLOAD_run called with grid size %u and group size %u\n",
1158 kla->gdims[0], kla->wdims[0]);
1159
1160 return true;
1161 }
1162
1163 /* Return true if the HSA runtime can run function FN_PTR. */
1164
1165 bool
GOMP_OFFLOAD_can_run(void * fn_ptr)1166 GOMP_OFFLOAD_can_run (void *fn_ptr)
1167 {
1168 struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
1169 struct agent_info *agent = kernel->agent;
1170 create_and_finalize_hsa_program (agent);
1171
1172 if (agent->prog_finalized_error)
1173 goto failure;
1174
1175 init_kernel (kernel);
1176 if (kernel->initialization_failed)
1177 goto failure;
1178
1179 return true;
1180
1181 failure:
1182 if (suppress_host_fallback)
1183 GOMP_PLUGIN_fatal ("HSA host fallback has been suppressed");
1184 HSA_DEBUG ("HSA target cannot be launched, doing a host fallback\n");
1185 return false;
1186 }
1187
1188 /* Atomically store pair of uint16_t values (HEADER and REST) to a PACKET. */
1189
1190 void
packet_store_release(uint32_t * packet,uint16_t header,uint16_t rest)1191 packet_store_release (uint32_t* packet, uint16_t header, uint16_t rest)
1192 {
1193 __atomic_store_n (packet, header | (rest << 16), __ATOMIC_RELEASE);
1194 }
1195
1196 /* Part of the libgomp plugin interface. Run a kernel on device N and pass it
1197 an array of pointers in VARS as a parameter. The kernel is identified by
1198 FN_PTR which must point to a kernel_info structure. */
1199
1200 void
GOMP_OFFLOAD_run(int n,void * fn_ptr,void * vars,void ** args)1201 GOMP_OFFLOAD_run (int n, void *fn_ptr, void *vars, void **args)
1202 {
1203 struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
1204 struct agent_info *agent = kernel->agent;
1205 struct GOMP_kernel_launch_attributes def;
1206 struct GOMP_kernel_launch_attributes *kla;
1207 if (!parse_target_attributes (args, &def, &kla))
1208 {
1209 HSA_DEBUG ("Will not run HSA kernel because the grid size is zero\n");
1210 return;
1211 }
1212 if (pthread_rwlock_rdlock (&agent->modules_rwlock))
1213 GOMP_PLUGIN_fatal ("Unable to read-lock an HSA agent rwlock");
1214
1215 if (!agent->initialized)
1216 GOMP_PLUGIN_fatal ("Agent must be initialized");
1217
1218 if (!kernel->initialized)
1219 GOMP_PLUGIN_fatal ("Called kernel must be initialized");
1220
1221 struct GOMP_hsa_kernel_dispatch *shadow
1222 = create_kernel_dispatch (kernel, kernel->max_omp_data_size);
1223
1224 if (debug)
1225 {
1226 fprintf (stderr, "\nKernel has following dependencies:\n");
1227 print_kernel_dispatch (shadow, 2);
1228 }
1229
1230 uint64_t index = hsa_queue_add_write_index_release (agent->command_q, 1);
1231 HSA_DEBUG ("Got AQL index %llu\n", (long long int) index);
1232
1233 /* Wait until the queue is not full before writing the packet. */
1234 while (index - hsa_queue_load_read_index_acquire (agent->command_q)
1235 >= agent->command_q->size)
1236 ;
1237
1238 hsa_kernel_dispatch_packet_t *packet;
1239 packet = ((hsa_kernel_dispatch_packet_t *) agent->command_q->base_address)
1240 + index % agent->command_q->size;
1241
1242 memset (((uint8_t *) packet) + 4, 0, sizeof (*packet) - 4);
1243 packet->grid_size_x = kla->gdims[0];
1244 uint32_t wgs = kla->wdims[0];
1245 if (wgs == 0)
1246 /* TODO: Provide a default via environment. */
1247 wgs = 64;
1248 else if (wgs > kla->gdims[0])
1249 wgs = kla->gdims[0];
1250 packet->workgroup_size_x = wgs;
1251 packet->grid_size_y = 1;
1252 packet->workgroup_size_y = 1;
1253 packet->grid_size_z = 1;
1254 packet->workgroup_size_z = 1;
1255 packet->private_segment_size = kernel->private_segment_size;
1256 packet->group_segment_size = kernel->group_segment_size;
1257 packet->kernel_object = kernel->object;
1258 packet->kernarg_address = shadow->kernarg_address;
1259 hsa_signal_t s;
1260 s.handle = shadow->signal;
1261 packet->completion_signal = s;
1262 hsa_signal_store_relaxed (s, 1);
1263 memcpy (shadow->kernarg_address, &vars, sizeof (vars));
1264
1265 /* PR hsa/70337. */
1266 size_t vars_size = sizeof (vars);
1267 if (kernel->kernarg_segment_size > vars_size)
1268 {
1269 if (kernel->kernarg_segment_size != vars_size
1270 + sizeof (struct hsa_kernel_runtime *))
1271 GOMP_PLUGIN_fatal ("Kernel segment size has an unexpected value");
1272 memcpy (packet->kernarg_address + vars_size, &shadow,
1273 sizeof (struct hsa_kernel_runtime *));
1274 }
1275
1276 HSA_DEBUG ("Copying kernel runtime pointer to kernarg_address\n");
1277
1278 uint16_t header;
1279 header = HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
1280 header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
1281 header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
1282
1283 HSA_DEBUG ("Going to dispatch kernel %s\n", kernel->name);
1284
1285 packet_store_release ((uint32_t *) packet, header,
1286 1 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS);
1287
1288 hsa_signal_store_release (agent->command_q->doorbell_signal, index);
1289
1290 /* TODO: GPU agents in Carrizo APUs cannot properly update L2 cache for
1291 signal wait and signal load operations on their own and we need to
1292 periodically call the hsa_signal_load_acquire on completion signals of
1293 children kernels in the CPU to make that happen. As soon the
1294 limitation will be resolved, this workaround can be removed. */
1295
1296 HSA_DEBUG ("Kernel dispatched, waiting for completion\n");
1297
1298 /* Root signal waits with 1ms timeout. */
1299 while (hsa_signal_wait_acquire (s, HSA_SIGNAL_CONDITION_LT, 1, 1000 * 1000,
1300 HSA_WAIT_STATE_BLOCKED) != 0)
1301 for (unsigned i = 0; i < shadow->kernel_dispatch_count; i++)
1302 {
1303 hsa_signal_t child_s;
1304 child_s.handle = shadow->children_dispatches[i]->signal;
1305
1306 HSA_DEBUG ("Waiting for children completion signal: %lu\n",
1307 shadow->children_dispatches[i]->signal);
1308 hsa_signal_load_acquire (child_s);
1309 }
1310
1311 release_kernel_dispatch (shadow);
1312
1313 if (pthread_rwlock_unlock (&agent->modules_rwlock))
1314 GOMP_PLUGIN_fatal ("Unable to unlock an HSA agent rwlock");
1315 }
1316
1317 /* Information to be passed to a thread running a kernel asycnronously. */
1318
1319 struct async_run_info
1320 {
1321 int device;
1322 void *tgt_fn;
1323 void *tgt_vars;
1324 void **args;
1325 void *async_data;
1326 };
1327
1328 /* Thread routine to run a kernel asynchronously. */
1329
1330 static void *
run_kernel_asynchronously(void * thread_arg)1331 run_kernel_asynchronously (void *thread_arg)
1332 {
1333 struct async_run_info *info = (struct async_run_info *) thread_arg;
1334 int device = info->device;
1335 void *tgt_fn = info->tgt_fn;
1336 void *tgt_vars = info->tgt_vars;
1337 void **args = info->args;
1338 void *async_data = info->async_data;
1339
1340 free (info);
1341 GOMP_OFFLOAD_run (device, tgt_fn, tgt_vars, args);
1342 GOMP_PLUGIN_target_task_completion (async_data);
1343 return NULL;
1344 }
1345
1346 /* Part of the libgomp plugin interface. Run a kernel like GOMP_OFFLOAD_run
1347 does, but asynchronously and call GOMP_PLUGIN_target_task_completion when it
1348 has finished. */
1349
1350 void
GOMP_OFFLOAD_async_run(int device,void * tgt_fn,void * tgt_vars,void ** args,void * async_data)1351 GOMP_OFFLOAD_async_run (int device, void *tgt_fn, void *tgt_vars,
1352 void **args, void *async_data)
1353 {
1354 pthread_t pt;
1355 struct async_run_info *info;
1356 HSA_DEBUG ("GOMP_OFFLOAD_async_run invoked\n")
1357 info = GOMP_PLUGIN_malloc (sizeof (struct async_run_info));
1358
1359 info->device = device;
1360 info->tgt_fn = tgt_fn;
1361 info->tgt_vars = tgt_vars;
1362 info->args = args;
1363 info->async_data = async_data;
1364
1365 int err = pthread_create (&pt, NULL, &run_kernel_asynchronously, info);
1366 if (err != 0)
1367 GOMP_PLUGIN_fatal ("HSA asynchronous thread creation failed: %s",
1368 strerror (err));
1369 err = pthread_detach (pt);
1370 if (err != 0)
1371 GOMP_PLUGIN_fatal ("Failed to detach a thread to run HSA kernel "
1372 "asynchronously: %s", strerror (err));
1373 }
1374
1375 /* Deinitialize all information associated with MODULE and kernels within
1376 it. */
1377
1378 void
destroy_module(struct module_info * module)1379 destroy_module (struct module_info *module)
1380 {
1381 int i;
1382 for (i = 0; i < module->kernel_count; i++)
1383 if (pthread_mutex_destroy (&module->kernels[i].init_mutex))
1384 GOMP_PLUGIN_fatal ("Failed to destroy an HSA kernel initialization "
1385 "mutex");
1386 }
1387
1388 /* Part of the libgomp plugin interface. Unload BRIG module described by
1389 struct brig_image_desc in TARGET_DATA from agent number N. */
1390
1391 void
GOMP_OFFLOAD_unload_image(int n,unsigned version,void * target_data)1392 GOMP_OFFLOAD_unload_image (int n, unsigned version, void *target_data)
1393 {
1394 if (GOMP_VERSION_DEV (version) > GOMP_VERSION_HSA)
1395 GOMP_PLUGIN_fatal ("Offload data incompatible with HSA plugin"
1396 " (expected %u, received %u)",
1397 GOMP_VERSION_HSA, GOMP_VERSION_DEV (version));
1398
1399 struct agent_info *agent;
1400 agent = get_agent_info (n);
1401 if (pthread_rwlock_wrlock (&agent->modules_rwlock))
1402 GOMP_PLUGIN_fatal ("Unable to write-lock an HSA agent rwlock");
1403
1404 struct module_info *module = agent->first_module;
1405 while (module)
1406 {
1407 if (module->image_desc == target_data)
1408 break;
1409 module = module->next;
1410 }
1411 if (!module)
1412 GOMP_PLUGIN_fatal ("Attempt to unload an image that has never been "
1413 "loaded before");
1414
1415 remove_module_from_agent (agent, module);
1416 destroy_module (module);
1417 free (module);
1418 destroy_hsa_program (agent);
1419 if (pthread_rwlock_unlock (&agent->modules_rwlock))
1420 GOMP_PLUGIN_fatal ("Unable to unlock an HSA agent rwlock");
1421 }
1422
1423 /* Part of the libgomp plugin interface. Deinitialize all information and
1424 status associated with agent number N. We do not attempt any
1425 synchronization, assuming the user and libgomp will not attempt
1426 deinitialization of a device that is in any way being used at the same
1427 time. */
1428
1429 void
GOMP_OFFLOAD_fini_device(int n)1430 GOMP_OFFLOAD_fini_device (int n)
1431 {
1432 struct agent_info *agent = get_agent_info (n);
1433 if (!agent->initialized)
1434 return;
1435
1436 struct module_info *next_module = agent->first_module;
1437 while (next_module)
1438 {
1439 struct module_info *module = next_module;
1440 next_module = module->next;
1441 destroy_module (module);
1442 free (module);
1443 }
1444 agent->first_module = NULL;
1445 destroy_hsa_program (agent);
1446
1447 release_agent_shared_libraries (agent);
1448
1449 hsa_status_t status = hsa_queue_destroy (agent->command_q);
1450 if (status != HSA_STATUS_SUCCESS)
1451 hsa_fatal ("Error destroying command queue", status);
1452 status = hsa_queue_destroy (agent->kernel_dispatch_command_q);
1453 if (status != HSA_STATUS_SUCCESS)
1454 hsa_fatal ("Error destroying kernel dispatch command queue", status);
1455 if (pthread_mutex_destroy (&agent->prog_mutex))
1456 GOMP_PLUGIN_fatal ("Failed to destroy an HSA agent program mutex");
1457 if (pthread_rwlock_destroy (&agent->modules_rwlock))
1458 GOMP_PLUGIN_fatal ("Failed to destroy an HSA agent rwlock");
1459 agent->initialized = false;
1460 }
1461
1462 /* Part of the libgomp plugin interface. Not implemented as it is not required
1463 for HSA. */
1464
1465 void *
GOMP_OFFLOAD_alloc(int ord,size_t size)1466 GOMP_OFFLOAD_alloc (int ord, size_t size)
1467 {
1468 GOMP_PLUGIN_fatal ("HSA GOMP_OFFLOAD_alloc is not implemented because "
1469 "it should never be called");
1470 }
1471
1472 /* Part of the libgomp plugin interface. Not implemented as it is not required
1473 for HSA. */
1474
1475 void
GOMP_OFFLOAD_free(int ord,void * ptr)1476 GOMP_OFFLOAD_free (int ord, void *ptr)
1477 {
1478 GOMP_PLUGIN_fatal ("HSA GOMP_OFFLOAD_free is not implemented because "
1479 "it should never be called");
1480 }
1481
1482 /* Part of the libgomp plugin interface. Not implemented as it is not required
1483 for HSA. */
1484
1485 void *
GOMP_OFFLOAD_dev2host(int ord,void * dst,const void * src,size_t n)1486 GOMP_OFFLOAD_dev2host (int ord, void *dst, const void *src, size_t n)
1487 {
1488 GOMP_PLUGIN_fatal ("HSA GOMP_OFFLOAD_dev2host is not implemented because "
1489 "it should never be called");
1490 }
1491
1492 /* Part of the libgomp plugin interface. Not implemented as it is not required
1493 for HSA. */
1494
1495 void *
GOMP_OFFLOAD_host2dev(int ord,void * dst,const void * src,size_t n)1496 GOMP_OFFLOAD_host2dev (int ord, void *dst, const void *src, size_t n)
1497 {
1498 GOMP_PLUGIN_fatal ("HSA GOMP_OFFLOAD_host2dev is not implemented because "
1499 "it should never be called");
1500 }
1501
1502 /* Part of the libgomp plugin interface. Not implemented as it is not required
1503 for HSA. */
1504
1505 void *
GOMP_OFFLOAD_dev2dev(int ord,void * dst,const void * src,size_t n)1506 GOMP_OFFLOAD_dev2dev (int ord, void *dst, const void *src, size_t n)
1507 {
1508 GOMP_PLUGIN_fatal ("HSA GOMP_OFFLOAD_dev2dev is not implemented because "
1509 "it should never be called");
1510 }
1511