1 /* workitems.c -- The main runtime entry that performs work-item execution in
2    various ways and the builtin functions closely related to the
3    implementation.
4 
5    Copyright (C) 2015-2020 Free Software Foundation, Inc.
6    Contributed by Pekka Jaaskelainen <pekka.jaaskelainen@parmance.com>
7    for General Processor Tech.
8 
9    Permission is hereby granted, free of charge, to any person obtaining a
10    copy of this software and associated documentation files
11    (the "Software"), to deal in the Software without restriction, including
12    without limitation the rights to use, copy, modify, merge, publish,
13    distribute, sublicense, and/or sell copies of the Software, and to
14    permit persons to whom the Software is furnished to do so, subject to
15    the following conditions:
16 
17    The above copyright notice and this permission notice shall be included
18    in all copies or substantial portions of the Software.
19 
20    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
21    OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22    MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
23    IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
24    DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
25    OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
26    USE OR OTHER DEALINGS IN THE SOFTWARE.
27 */
28 
29 /* The fiber based multiple work-item work-group execution uses ucontext
30    based user mode threading.  However, if gccbrig is able to optimize the
31    kernel to a much faster work-group function that implements the multiple
32    WI execution using loops instead of fibers requiring slow context switches,
33    the fiber-based implementation won't be called.
34  */
35 
36 #include <stdlib.h>
37 #include <signal.h>
38 #include <string.h>
39 
40 #include "workitems.h"
41 #include "phsa-rt.h"
42 
43 #ifdef HAVE_FIBERS
44 #include "fibers.h"
45 #endif
46 
47 #ifdef BENCHMARK_PHSA_RT
48 #include <stdio.h>
49 #include <time.h>
50 
51 static uint64_t wi_count = 0;
52 static uint64_t wis_skipped = 0;
53 static uint64_t wi_total = 0;
54 static clock_t start_time;
55 
56 #endif
57 
58 #ifdef DEBUG_PHSA_RT
59 #include <stdio.h>
60 #endif
61 
62 #define PRIVATE_SEGMENT_ALIGN 256
63 #define FIBER_STACK_SIZE (64*1024)
64 #define GROUP_SEGMENT_ALIGN 256
65 
66 /* Preserve this amount of additional space in the alloca stack as we need to
67    store the alloca frame pointer to the alloca frame, thus must preserve
68    space for it.  This thus supports at most 1024 functions with allocas in
69    a call chain.  */
70 #define ALLOCA_OVERHEAD 1024*4
71 
72 uint32_t __hsail_workitemabsid (uint32_t dim, PHSAWorkItem *context);
73 
74 uint32_t __hsail_workitemid (uint32_t dim, PHSAWorkItem *context);
75 
76 uint32_t __hsail_gridgroups (uint32_t dim, PHSAWorkItem *context);
77 
78 uint32_t __hsail_currentworkgroupsize (uint32_t dim, PHSAWorkItem *wi);
79 
80 uint32_t __hsail_workgroupsize (uint32_t dim, PHSAWorkItem *wi);
81 
void
phsa_fatal_error (int code)
{
  /* Abort the whole process with the given exit code.  Used for
     unrecoverable runtime conditions such as failed allocations or
     exceeding PHSA_MAX_WG_SIZE; the code identifies the failure site.  */
  exit (code);
}
87 
88 #ifdef HAVE_FIBERS
89 /* ucontext-based work-item thread implementation.  Runs all work-items in
90    separate fibers.  */
91 
static void
phsa_work_item_thread (int arg0, int arg1)
{
  /* The fiber API passes the context pointer split into two ints;
     reassemble it before use.  */
  void *arg = fiber_int_args_to_ptr (arg0, arg1);

  PHSAWorkItem *wi = (PHSAWorkItem *) arg;
  volatile PHSAWorkGroup *wg = wi->wg;
  PHSAKernelLaunchData *l_data = wi->launch_data;

  do
    {
      /* Wait until all WI fibers of the gang are ready to start the
	 next work-group.  */
      int retcode
	= fiber_barrier_reach ((fiber_barrier_t *) l_data->wg_start_barrier);

      /* At this point the threads can assume that either more_wgs is 0 or
	 the current_work_group_* is set to point to the WG executed next.  */
      if (!wi->wg->more_wgs)
	break;

      /* Adopt the coordinates of the WG chosen for this round.  */
      wi->group_x = wg->x;
      wi->group_y = wg->y;
      wi->group_z = wg->z;

      /* Cache the effective (possibly partial) WG dimensions.  */
      wi->cur_wg_size_x = __hsail_currentworkgroupsize (0, wi);
      wi->cur_wg_size_y = __hsail_currentworkgroupsize (1, wi);
      wi->cur_wg_size_z = __hsail_currentworkgroupsize (2, wi);

#ifdef DEBUG_PHSA_RT
      printf (
	"Running work-item %lu/%lu/%lu for wg %lu/%lu/%lu / %lu/%lu/%lu...\n",
	wi->x, wi->y, wi->z, wi->group_x, wi->group_y, wi->group_z,
	l_data->wg_max_x, l_data->wg_max_y, l_data->wg_max_z);
#endif

      /* Run the kernel only for WIs that fall inside the current WG's
	 effective size; the extra WIs of a partial WG are skipped.  */
      if (wi->x < __hsail_currentworkgroupsize (0, wi)
	  && wi->y < __hsail_currentworkgroupsize (1, wi)
	  && wi->z < __hsail_currentworkgroupsize (2, wi))
	{
	  l_data->kernel (l_data->kernarg_addr, wi, wg->group_base_ptr,
			  wg->initial_group_offset, wg->private_base_ptr);
#ifdef DEBUG_PHSA_RT
	  printf ("done.\n");
#endif
#ifdef BENCHMARK_PHSA_RT
	  wi_count++;
#endif
	}
      else
	{
#ifdef DEBUG_PHSA_RT
	  printf ("skipped (partial WG).\n");
#endif
#ifdef BENCHMARK_PHSA_RT
	  wis_skipped++;
#endif
	}

      retcode
	= fiber_barrier_reach ((fiber_barrier_t *)
			       l_data->wg_completion_barrier);

      /* The first thread updates the WG to execute next etc.  */

      if (retcode == 0)
	{
#ifdef EXECUTE_WGS_BACKWARDS
	  /* Step to the previous WG in x-fastest order, wrapping each
	     dimension at its minimum; clearing more_wgs terminates the
	     loop in every WI fiber at the next start barrier.  */
	  if (wg->x == l_data->wg_min_x)
	    {
	      wg->x = l_data->wg_max_x - 1;
	      if (wg->y == l_data->wg_min_y)
		{
		  wg->y = l_data->wg_max_y - 1;
		  if (wg->z == l_data->wg_min_z)
		    wg->more_wgs = 0;
		  else
		    wg->z--;
		}
	      else
		wg->y--;
	    }
	  else
	    wg->x--;
#else
	  /* Step to the next WG in x-fastest order, wrapping each
	     dimension at its (exclusive) maximum.  */
	  if (wg->x + 1 >= l_data->wg_max_x)
	    {
	      wg->x = l_data->wg_min_x;
	      if (wg->y + 1 >= l_data->wg_max_y)
		{
		  wg->y = l_data->wg_min_y;
		  if (wg->z + 1 >= l_data->wg_max_z)
		    wg->more_wgs = 0;
		  else
		    wg->z++;
		}
	      else
		wg->y++;
	    }
	  else
	    wg->x++;
#endif
	  wi->group_x = wg->x;
	  wi->group_y = wg->y;
	  wi->group_z = wg->z;

	  wi->cur_wg_size_x = __hsail_currentworkgroupsize (0, wi);
	  wi->cur_wg_size_y = __hsail_currentworkgroupsize (1, wi);
	  wi->cur_wg_size_z = __hsail_currentworkgroupsize (2, wi);

	  /* Reinitialize the work-group barrier according to the new WG's
	     size, which might not be the same as the previous ones, due
	     to "partial WGs".  */
	  size_t wg_size = __hsail_currentworkgroupsize (0, wi)
			   * __hsail_currentworkgroupsize (1, wi)
			   * __hsail_currentworkgroupsize (2, wi);

#ifdef DEBUG_PHSA_RT
	  printf ("Reinitializing the WG barrier to %lu.\n", wg_size);
#endif
	  fiber_barrier_init ((fiber_barrier_t *)
			      wi->launch_data->wg_sync_barrier,
			      wg_size);

#ifdef BENCHMARK_PHSA_RT
	  if (wi_count % 1000 == 0)
	    {
	      clock_t spent_time = clock () - start_time;
	      double spent_time_sec = (double) spent_time / CLOCKS_PER_SEC;
	      double wis_per_sec = wi_count / spent_time_sec;
	      uint64_t eta_sec
		= (wi_total - wi_count - wis_skipped) / wis_per_sec;

	      printf ("%lu WIs executed %lu skipped in %lus (%lu WIs/s, ETA in "
		      "%lu s)\n",
		      wi_count, wis_skipped, (uint64_t) spent_time_sec,
		      (uint64_t) wis_per_sec, (uint64_t) eta_sec);
	    }
#endif
	}
    }
  while (1);

  /* Terminate this fiber; control returns to the joiner in
     phsa_execute_wi_gang.  */
  fiber_exit ();
}
235 #endif
236 
/* Parenthesize both the arguments and the whole expansion so operands
   with lower-precedence operators (e.g. MIN (x & 1, y)) parse as
   intended.  NOTE: arguments are still evaluated twice; do not pass
   expressions with side effects.  */
#define MIN(a, b) (((a) < (b)) ? (a) : (b))
#define MAX(a, b) (((a) > (b)) ? (a) : (b))
239 
240 #ifdef HAVE_FIBERS
241 /* Spawns a given number of work-items to execute a set of work-groups,
242    blocks until their completion.  */
243 
244 static void
phsa_execute_wi_gang(PHSAKernelLaunchData * context,void * group_base_ptr,uint32_t group_local_offset,size_t wg_size_x,size_t wg_size_y,size_t wg_size_z)245 phsa_execute_wi_gang (PHSAKernelLaunchData *context, void *group_base_ptr,
246 		      uint32_t group_local_offset, size_t wg_size_x,
247 		      size_t wg_size_y, size_t wg_size_z)
248 {
249   PHSAWorkItem *wi_threads = NULL;
250   PHSAWorkGroup wg;
251   size_t flat_wi_id = 0, x, y, z, max_x, max_y, max_z;
252   uint32_t group_x, group_y, group_z;
253   fiber_barrier_t wg_start_barrier;
254   fiber_barrier_t wg_completion_barrier;
255   fiber_barrier_t wg_sync_barrier;
256 
257   max_x = wg_size_x == 0 ? 1 : wg_size_x;
258   max_y = wg_size_y == 0 ? 1 : wg_size_y;
259   max_z = wg_size_z == 0 ? 1 : wg_size_z;
260 
261   size_t wg_size = max_x * max_y * max_z;
262   if (wg_size > PHSA_MAX_WG_SIZE)
263     phsa_fatal_error (2);
264 
265   wg.private_segment_total_size = context->dp->private_segment_size * wg_size;
266   if (wg.private_segment_total_size > 0
267       && posix_memalign (&wg.private_base_ptr, PRIVATE_SEGMENT_ALIGN,
268 			 wg.private_segment_total_size)
269 	   != 0)
270     phsa_fatal_error (3);
271 
272   wg.alloca_stack_p = wg.private_segment_total_size + ALLOCA_OVERHEAD;
273   wg.alloca_frame_p = wg.alloca_stack_p;
274   wg.initial_group_offset = group_local_offset;
275 
276 #ifdef EXECUTE_WGS_BACKWARDS
277   group_x = context->wg_max_x - 1;
278   group_y = context->wg_max_y - 1;
279   group_z = context->wg_max_z - 1;
280 #else
281   group_x = context->wg_min_x;
282   group_y = context->wg_min_y;
283   group_z = context->wg_min_z;
284 #endif
285 
286   fiber_barrier_init (&wg_sync_barrier, wg_size);
287   fiber_barrier_init (&wg_start_barrier, wg_size);
288   fiber_barrier_init (&wg_completion_barrier, wg_size);
289 
290   context->wg_start_barrier = &wg_start_barrier;
291   context->wg_sync_barrier = &wg_sync_barrier;
292   context->wg_completion_barrier = &wg_completion_barrier;
293 
294   wg.more_wgs = 1;
295   wg.group_base_ptr = group_base_ptr;
296 
297 #ifdef BENCHMARK_PHSA_RT
298   wi_count = 0;
299   wis_skipped = 0;
300   start_time = clock ();
301 #endif
302   wi_threads = malloc (sizeof (PHSAWorkItem) * max_x * max_y * max_z);
303   for (x = 0; x < max_x; ++x)
304     for (y = 0; y < max_y; ++y)
305       for (z = 0; z < max_z; ++z)
306 	{
307 	  PHSAWorkItem *wi = &wi_threads[flat_wi_id];
308 	  wi->launch_data = context;
309 	  wi->wg = &wg;
310 
311 	  wg.x = wi->group_x = group_x;
312 	  wg.y = wi->group_y = group_y;
313 	  wg.z = wi->group_z = group_z;
314 
315 	  wi->wg_size_x = context->dp->workgroup_size_x;
316 	  wi->wg_size_y = context->dp->workgroup_size_y;
317 	  wi->wg_size_z = context->dp->workgroup_size_z;
318 
319 	  wi->cur_wg_size_x = __hsail_currentworkgroupsize (0, wi);
320 	  wi->cur_wg_size_y = __hsail_currentworkgroupsize (1, wi);
321 	  wi->cur_wg_size_z = __hsail_currentworkgroupsize (2, wi);
322 
323 	  wi->x = x;
324 	  wi->y = y;
325 	  wi->z = z;
326 
327 	  /* TODO: set the stack size according to the private
328 		   segment size.  Too big stack consumes huge amount of
329 		   memory in case of huge number of WIs and a too small stack
330 		   will fail in mysterious and potentially dangerous ways.  */
331 
332 	  fiber_init (&wi->fiber, phsa_work_item_thread, wi,
333 		      FIBER_STACK_SIZE, PRIVATE_SEGMENT_ALIGN);
334 	  ++flat_wi_id;
335 	}
336 
337   do
338     {
339       --flat_wi_id;
340       fiber_join (&wi_threads[flat_wi_id].fiber);
341     }
342   while (flat_wi_id > 0);
343 
344   if (wg.private_segment_total_size > 0)
345     free (wg.private_base_ptr);
346 
347   free (wi_threads);
348 }
349 
350 /* Spawn the work-item threads to execute work-groups and let
351    them execute all the WGs, including a potential partial WG.  */
352 
353 static void
phsa_spawn_work_items(PHSAKernelLaunchData * context,void * group_base_ptr,uint32_t group_local_offset)354 phsa_spawn_work_items (PHSAKernelLaunchData *context, void *group_base_ptr,
355 		       uint32_t group_local_offset)
356 {
357   hsa_kernel_dispatch_packet_t *dp = context->dp;
358   size_t x, y, z;
359 
360   context->group_segment_start_addr = (size_t) group_base_ptr;
361 
362   /* HSA seems to allow the WG size to be larger than the grid size.  We need to
363      saturate the effective WG size to the grid size to prevent the extra WIs
364      from executing.  */
365   size_t sat_wg_size_x, sat_wg_size_y, sat_wg_size_z, sat_wg_size;
366   sat_wg_size_x = MIN (dp->workgroup_size_x, dp->grid_size_x);
367   sat_wg_size_y = MIN (dp->workgroup_size_y, dp->grid_size_y);
368   sat_wg_size_z = MIN (dp->workgroup_size_z, dp->grid_size_z);
369   sat_wg_size = sat_wg_size_x * sat_wg_size_y * sat_wg_size_z;
370 
371 #ifdef BENCHMARK_PHSA_RT
372   wi_total = (uint64_t) dp->grid_size_x
373 	     * (dp->grid_size_y > 0 ? dp->grid_size_y : 1)
374 	     * (dp->grid_size_z > 0 ? dp->grid_size_z : 1);
375 #endif
376 
377   /* For now execute all work groups in a single coarse thread (does not utilize
378      multicore/multithread).  */
379   context->wg_min_x = context->wg_min_y = context->wg_min_z = 0;
380 
381   int dims = dp->setup & 0x3;
382 
383   context->wg_max_x = ((uint64_t) dp->grid_size_x + dp->workgroup_size_x - 1)
384 		      / dp->workgroup_size_x;
385 
386   context->wg_max_y
387     = dims < 2 ? 1 : ((uint64_t) dp->grid_size_y + dp->workgroup_size_y - 1)
388 		       / dp->workgroup_size_y;
389 
390   context->wg_max_z
391     = dims < 3 ? 1 : ((uint64_t) dp->grid_size_z + dp->workgroup_size_z - 1)
392 		       / dp->workgroup_size_z;
393 
394 #ifdef DEBUG_PHSA_RT
395   printf ("### launching work-groups %lu/%lu/%lu to %lu/%lu/%lu with "
396 	  "wg size %lu/%lu/%lu grid size %u/%u/%u\n",
397 	  context->wg_min_x, context->wg_min_y, context->wg_min_z,
398 	  context->wg_max_x, context->wg_max_y, context->wg_max_z,
399 	  sat_wg_size_x, sat_wg_size_y, sat_wg_size_z, dp->grid_size_x,
400 	  dp->grid_size_y, dp->grid_size_z);
401 #endif
402 
403   phsa_execute_wi_gang (context, group_base_ptr, group_local_offset,
404 			sat_wg_size_x, sat_wg_size_y, sat_wg_size_z);
405 }
406 #endif
407 
408 /* Executes the given work-group function for all work groups in the grid.
409 
410    A work-group function is a version of the original kernel which executes
411    the kernel for all work-items in a work-group.  It is produced by gccbrig
412    if it can handle the kernel's barrier usage and is much faster way to
413    execute massive numbers of work-items in a non-SPMD machine than fibers
414    (easily 100x faster).  */
415 static void
phsa_execute_work_groups(PHSAKernelLaunchData * context,void * group_base_ptr,uint32_t group_local_offset)416 phsa_execute_work_groups (PHSAKernelLaunchData *context, void *group_base_ptr,
417 			  uint32_t group_local_offset)
418 {
419   hsa_kernel_dispatch_packet_t *dp = context->dp;
420   size_t x, y, z, wg_x, wg_y, wg_z;
421 
422   context->group_segment_start_addr = (size_t) group_base_ptr;
423 
424   /* HSA seems to allow the WG size to be larger than the grid size.  We need
425      to saturate the effective WG size to the grid size to prevent the extra WIs
426      from executing.  */
427   size_t sat_wg_size_x, sat_wg_size_y, sat_wg_size_z, sat_wg_size;
428   sat_wg_size_x = MIN (dp->workgroup_size_x, dp->grid_size_x);
429   sat_wg_size_y = MIN (dp->workgroup_size_y, dp->grid_size_y);
430   sat_wg_size_z = MIN (dp->workgroup_size_z, dp->grid_size_z);
431   sat_wg_size = sat_wg_size_x * sat_wg_size_y * sat_wg_size_z;
432 
433 #ifdef BENCHMARK_PHSA_RT
434   wi_total = (uint64_t) dp->grid_size_x
435 	     * (dp->grid_size_y > 0 ? dp->grid_size_y : 1)
436 	     * (dp->grid_size_z > 0 ? dp->grid_size_z : 1);
437 #endif
438 
439   context->wg_min_x = context->wg_min_y = context->wg_min_z = 0;
440 
441   int dims = dp->setup & 0x3;
442 
443   context->wg_max_x = ((uint64_t) dp->grid_size_x + dp->workgroup_size_x - 1)
444 		      / dp->workgroup_size_x;
445 
446   context->wg_max_y
447     = dims < 2 ? 1 : ((uint64_t) dp->grid_size_y + dp->workgroup_size_y - 1)
448 		       / dp->workgroup_size_y;
449 
450   context->wg_max_z
451     = dims < 3 ? 1 : ((uint64_t) dp->grid_size_z + dp->workgroup_size_z - 1)
452 		       / dp->workgroup_size_z;
453 
454 #ifdef DEBUG_PHSA_RT
455   printf ("### launching work-groups %lu/%lu/%lu to %lu/%lu/%lu with "
456 	  "wg size %lu/%lu/%lu grid size %u/%u/%u\n",
457 	  context->wg_min_x, context->wg_min_y, context->wg_min_z,
458 	  context->wg_max_x, context->wg_max_y, context->wg_max_z,
459 	  sat_wg_size_x, sat_wg_size_y, sat_wg_size_z, dp->grid_size_x,
460 	  dp->grid_size_y, dp->grid_size_z);
461 #endif
462 
463   PHSAWorkItem wi;
464   PHSAWorkGroup wg;
465   wi.wg = &wg;
466   wi.x = wi.y = wi.z = 0;
467   wi.launch_data = context;
468 
469 #ifdef BENCHMARK_PHSA_RT
470   start_time = clock ();
471   uint64_t wg_count = 0;
472 #endif
473 
474   size_t wg_size = __hsail_workgroupsize (0, &wi)
475 		   * __hsail_workgroupsize (1, &wi)
476 		   * __hsail_workgroupsize (2, &wi);
477 
478   void *private_base_ptr = NULL;
479   if (dp->private_segment_size > 0
480       && posix_memalign (&private_base_ptr, PRIVATE_SEGMENT_ALIGN,
481 			 dp->private_segment_size * wg_size)
482 	   != 0)
483     phsa_fatal_error (3);
484 
485   wg.alloca_stack_p = dp->private_segment_size * wg_size + ALLOCA_OVERHEAD;
486   wg.alloca_frame_p = wg.alloca_stack_p;
487 
488   wg.private_base_ptr = private_base_ptr;
489   wg.group_base_ptr = group_base_ptr;
490 
491 #ifdef DEBUG_PHSA_RT
492   printf ("priv seg size %u wg_size %lu @ %p\n", dp->private_segment_size,
493 	  wg_size, private_base_ptr);
494 #endif
495 
496   for (wg_z = context->wg_min_z; wg_z < context->wg_max_z; ++wg_z)
497     for (wg_y = context->wg_min_y; wg_y < context->wg_max_y; ++wg_y)
498       for (wg_x = context->wg_min_x; wg_x < context->wg_max_x; ++wg_x)
499 	{
500 	  wi.group_x = wg_x;
501 	  wi.group_y = wg_y;
502 	  wi.group_z = wg_z;
503 
504 	  wi.wg_size_x = context->dp->workgroup_size_x;
505 	  wi.wg_size_y = context->dp->workgroup_size_y;
506 	  wi.wg_size_z = context->dp->workgroup_size_z;
507 
508 	  wi.cur_wg_size_x = __hsail_currentworkgroupsize (0, &wi);
509 	  wi.cur_wg_size_y = __hsail_currentworkgroupsize (1, &wi);
510 	  wi.cur_wg_size_z = __hsail_currentworkgroupsize (2, &wi);
511 
512 	  context->kernel (context->kernarg_addr, &wi, group_base_ptr,
513 			   group_local_offset, private_base_ptr);
514 
515 #if defined (BENCHMARK_PHSA_RT)
516 	  wg_count++;
517 	  if (wg_count % 1000000 == 0)
518 	    {
519 	      clock_t spent_time = clock () - start_time;
520 	      uint64_t wi_count = wg_x * sat_wg_size_x + wg_y * sat_wg_size_y
521 				  + wg_z * sat_wg_size_z;
522 	      double spent_time_sec = (double) spent_time / CLOCKS_PER_SEC;
523 	      double wis_per_sec = wi_count / spent_time_sec;
524 	      uint64_t eta_sec = (wi_total - wi_count) / wis_per_sec;
525 
526 	      printf ("%lu WIs executed in %lus (%lu WIs/s, ETA in %lu s)\n",
527 		      wi_count, (uint64_t) spent_time_sec,
528 		      (uint64_t) wis_per_sec, (uint64_t) eta_sec);
529 	    }
530 #endif
531 	}
532 
533 #ifdef BENCHMARK_PHSA_RT
534   clock_t spent_time = clock () - start_time;
535   double spent_time_sec = (double) spent_time / CLOCKS_PER_SEC;
536   double wis_per_sec = wi_total / spent_time_sec;
537 
538   printf ("### %lu WIs executed in %lu s (%lu WIs / s)\n", wi_total,
539 	  (uint64_t) spent_time_sec, (uint64_t) wis_per_sec);
540 #endif
541   free (private_base_ptr);
542   private_base_ptr = NULL;
543 }
544 
545 /* gccbrig generates the following from each HSAIL kernel:
546 
547    1) The actual kernel function (a single work-item kernel or a work-group
548       function) generated from HSAIL (BRIG).
549 
550 	 static void _Kernel (void* args, void* context, void* group_base_ptr)
551 	 {
552 	   ...
553 	 }
554 
555   2) A public facing kernel function that is called from the PHSA runtime:
556 
557    a) A single work-item function (that requires fibers for multi-WI):
558 
559       void Kernel (void* context)
560       {
561 	 __launch_launch_kernel (_Kernel, context);
562       }
563 
564       or
565 
566     b) a when gccbrig could generate a work-group function:
567 
568       void Kernel (void* context)
569       {
570 		__hsail_launch_wg_function (_Kernel, context);
571       }
572 */
573 
574 #ifdef HAVE_FIBERS
575 
void
__hsail_launch_kernel (gccbrigKernelFunc kernel, PHSAKernelLaunchData *context,
		       void *group_base_ptr, uint32_t group_local_offset)
{
  /* Entry point used when gccbrig produced a single work-item kernel:
     record the kernel in the launch data and run it for every WI via
     the fiber-based gang.  */
  context->kernel = kernel;
  phsa_spawn_work_items (context, group_base_ptr, group_local_offset);
}
583 #endif
584 
void
__hsail_launch_wg_function (gccbrigKernelFunc kernel,
			    PHSAKernelLaunchData *context, void *group_base_ptr,
			    uint32_t group_local_offset)
{
  /* Entry point used when gccbrig produced a work-group function:
     record it in the launch data and execute it once per WG without
     fibers (the fast path).  */
  context->kernel = kernel;
  phsa_execute_work_groups (context, group_base_ptr, group_local_offset);
}
593 
594 uint32_t
__hsail_workitemabsid(uint32_t dim,PHSAWorkItem * context)595 __hsail_workitemabsid (uint32_t dim, PHSAWorkItem *context)
596 {
597   hsa_kernel_dispatch_packet_t *dp = context->launch_data->dp;
598 
599   uint32_t id;
600   switch (dim)
601     {
602     default:
603     case 0:
604       /* Overflow semantics in the case of WG dim > grid dim.  */
605       id = ((uint64_t) context->group_x * dp->workgroup_size_x + context->x)
606 	   % dp->grid_size_x;
607       break;
608     case 1:
609       id = ((uint64_t) context->group_y * dp->workgroup_size_y + context->y)
610 	   % dp->grid_size_y;
611       break;
612     case 2:
613       id = ((uint64_t) context->group_z * dp->workgroup_size_z + context->z)
614 	   % dp->grid_size_z;
615       break;
616     }
617   return id;
618 }
619 
620 uint64_t
__hsail_workitemabsid_u64(uint32_t dim,PHSAWorkItem * context)621 __hsail_workitemabsid_u64 (uint32_t dim, PHSAWorkItem *context)
622 {
623   hsa_kernel_dispatch_packet_t *dp = context->launch_data->dp;
624 
625   uint64_t id;
626   switch (dim)
627     {
628     default:
629     case 0:
630       /* Overflow semantics in the case of WG dim > grid dim.  */
631       id = ((uint64_t) context->group_x * dp->workgroup_size_x + context->x)
632 	   % dp->grid_size_x;
633       break;
634     case 1:
635       id = ((uint64_t) context->group_y * dp->workgroup_size_y + context->y)
636 	   % dp->grid_size_y;
637       break;
638     case 2:
639       id = ((uint64_t) context->group_z * dp->workgroup_size_z + context->z)
640 	   % dp->grid_size_z;
641       break;
642     }
643   return id;
644 }
645 
646 
647 uint32_t
__hsail_workitemid(uint32_t dim,PHSAWorkItem * context)648 __hsail_workitemid (uint32_t dim, PHSAWorkItem *context)
649 {
650   PHSAWorkItem *c = (PHSAWorkItem *) context;
651   hsa_kernel_dispatch_packet_t *dp = context->launch_data->dp;
652 
653   /* The number of dimensions is in the two least significant bits.  */
654   int dims = dp->setup & 0x3;
655 
656   uint32_t id;
657   switch (dim)
658     {
659     default:
660     case 0:
661       id = c->x;
662       break;
663     case 1:
664       id = dims < 2 ? 0 : c->y;
665       break;
666     case 2:
667       id = dims < 3 ? 0 : c->z;
668       break;
669     }
670   return id;
671 }
672 
673 uint32_t
__hsail_gridgroups(uint32_t dim,PHSAWorkItem * context)674 __hsail_gridgroups (uint32_t dim, PHSAWorkItem *context)
675 {
676   hsa_kernel_dispatch_packet_t *dp = context->launch_data->dp;
677   int dims = dp->setup & 0x3;
678 
679   uint32_t id;
680   switch (dim)
681     {
682     default:
683     case 0:
684       id = (dp->grid_size_x + dp->workgroup_size_x - 1) / dp->workgroup_size_x;
685       break;
686     case 1:
687       id = dims < 2 ? 1 : (dp->grid_size_y + dp->workgroup_size_y - 1)
688 			    / dp->workgroup_size_y;
689       break;
690     case 2:
691       id = dims < 3 ? 1 : (dp->grid_size_z + dp->workgroup_size_z - 1)
692 			    / dp->workgroup_size_z;
693       break;
694     }
695   return id;
696 }
697 
698 uint32_t
__hsail_workitemflatid(PHSAWorkItem * c)699 __hsail_workitemflatid (PHSAWorkItem *c)
700 {
701   hsa_kernel_dispatch_packet_t *dp = c->launch_data->dp;
702 
703   return c->x + c->y * dp->workgroup_size_x
704 	 + c->z * dp->workgroup_size_x * dp->workgroup_size_y;
705 }
706 
707 uint32_t
__hsail_currentworkitemflatid(PHSAWorkItem * c)708 __hsail_currentworkitemflatid (PHSAWorkItem *c)
709 {
710   hsa_kernel_dispatch_packet_t *dp = c->launch_data->dp;
711 
712   return c->x + c->y * __hsail_currentworkgroupsize (0, c)
713 	 + c->z * __hsail_currentworkgroupsize (0, c)
714 	     * __hsail_currentworkgroupsize (1, c);
715 }
716 
717 void
__hsail_setworkitemid(uint32_t dim,uint32_t id,PHSAWorkItem * context)718 __hsail_setworkitemid (uint32_t dim, uint32_t id, PHSAWorkItem *context)
719 {
720   switch (dim)
721     {
722     default:
723     case 0:
724       context->x = id;
725       break;
726     case 1:
727       context->y = id;
728       break;
729     case 2:
730       context->z = id;
731       break;
732     }
733 }
734 
735 uint64_t
__hsail_workitemflatabsid_u64(PHSAWorkItem * context)736 __hsail_workitemflatabsid_u64 (PHSAWorkItem *context)
737 {
738   PHSAWorkItem *c = (PHSAWorkItem *) context;
739   hsa_kernel_dispatch_packet_t *dp = context->launch_data->dp;
740 
741   /* Work-item flattened absolute ID = ID0 + ID1 * max0 + ID2 * max0 * max1.  */
742   uint64_t id0 = __hsail_workitemabsid (0, context);
743   uint64_t id1 = __hsail_workitemabsid (1, context);
744   uint64_t id2 = __hsail_workitemabsid (2, context);
745 
746   uint64_t max0 = dp->grid_size_x;
747   uint64_t max1 = dp->grid_size_y;
748   uint64_t id = id0 + id1 * max0 + id2 * max0 * max1;
749 
750   return id;
751 }
752 
753 uint32_t
__hsail_workitemflatabsid_u32(PHSAWorkItem * context)754 __hsail_workitemflatabsid_u32 (PHSAWorkItem *context)
755 {
756   PHSAWorkItem *c = (PHSAWorkItem *) context;
757   hsa_kernel_dispatch_packet_t *dp = context->launch_data->dp;
758 
759   /* work-item flattened absolute ID = ID0 + ID1 * max0 + ID2 * max0 * max1.  */
760   uint64_t id0 = __hsail_workitemabsid (0, context);
761   uint64_t id1 = __hsail_workitemabsid (1, context);
762   uint64_t id2 = __hsail_workitemabsid (2, context);
763 
764   uint64_t max0 = dp->grid_size_x;
765   uint64_t max1 = dp->grid_size_y;
766   uint64_t id = id0 + id1 * max0 + id2 * max0 * max1;
767   return (uint32_t) id;
768 }
769 
770 uint32_t
__hsail_currentworkgroupsize(uint32_t dim,PHSAWorkItem * wi)771 __hsail_currentworkgroupsize (uint32_t dim, PHSAWorkItem *wi)
772 {
773   hsa_kernel_dispatch_packet_t *dp = wi->launch_data->dp;
774   uint32_t wg_size = 0;
775   switch (dim)
776     {
777     default:
778     case 0:
779       if ((uint64_t) wi->group_x < dp->grid_size_x / dp->workgroup_size_x)
780 	wg_size = dp->workgroup_size_x; /* Full WG.  */
781       else
782 	wg_size = dp->grid_size_x % dp->workgroup_size_x; /* Partial WG.  */
783       break;
784     case 1:
785       if ((uint64_t) wi->group_y < dp->grid_size_y / dp->workgroup_size_y)
786 	wg_size = dp->workgroup_size_y; /* Full WG.  */
787       else
788 	wg_size = dp->grid_size_y % dp->workgroup_size_y; /* Partial WG.  */
789       break;
790     case 2:
791       if ((uint64_t) wi->group_z < dp->grid_size_z / dp->workgroup_size_z)
792 	wg_size = dp->workgroup_size_z; /* Full WG.  */
793       else
794 	wg_size = dp->grid_size_z % dp->workgroup_size_z; /* Partial WG.  */
795       break;
796     }
797   return wg_size;
798 }
799 
800 uint32_t
__hsail_workgroupsize(uint32_t dim,PHSAWorkItem * wi)801 __hsail_workgroupsize (uint32_t dim, PHSAWorkItem *wi)
802 {
803   hsa_kernel_dispatch_packet_t *dp = wi->launch_data->dp;
804   switch (dim)
805     {
806     default:
807     case 0:
808       return dp->workgroup_size_x;
809     case 1:
810       return dp->workgroup_size_y;
811     case 2:
812       return dp->workgroup_size_z;
813     }
814 }
815 
816 uint32_t
__hsail_gridsize(uint32_t dim,PHSAWorkItem * wi)817 __hsail_gridsize (uint32_t dim, PHSAWorkItem *wi)
818 {
819   hsa_kernel_dispatch_packet_t *dp = wi->launch_data->dp;
820   switch (dim)
821     {
822     default:
823     case 0:
824       return dp->grid_size_x;
825     case 1:
826       return dp->grid_size_y;
827     case 2:
828       return dp->grid_size_z;
829     }
830 }
831 
832 uint32_t
__hsail_workgroupid(uint32_t dim,PHSAWorkItem * wi)833 __hsail_workgroupid (uint32_t dim, PHSAWorkItem *wi)
834 {
835   switch (dim)
836     {
837     default:
838     case 0:
839       return wi->group_x;
840     case 1:
841       return wi->group_y;
842     case 2:
843       return wi->group_z;
844     }
845 }
846 
847 uint32_t
__hsail_dim(PHSAWorkItem * wi)848 __hsail_dim (PHSAWorkItem *wi)
849 {
850   hsa_kernel_dispatch_packet_t *dp = wi->launch_data->dp;
851   return dp->setup & 0x3;
852 }
853 
uint64_t
__hsail_packetid (PHSAWorkItem *wi)
{
  /* The AQL queue packet id of the currently executed dispatch.  */
  return wi->launch_data->packet_id;
}
859 
uint32_t
__hsail_packetcompletionsig_sig32 (PHSAWorkItem *wi)
{
  /* The completion signal handle of the dispatch packet, truncated to
     32 bits (small-machine-model variant).  */
  return (uint32_t) wi->launch_data->dp->completion_signal.handle;
}
865 
uint64_t
__hsail_packetcompletionsig_sig64 (PHSAWorkItem *wi)
{
  /* The full 64-bit completion signal handle of the dispatch packet.  */
  return (uint64_t) (wi->launch_data->dp->completion_signal.handle);
}
871 
872 #ifdef HAVE_FIBERS
void
__hsail_barrier (PHSAWorkItem *wi)
{
  /* HSAIL work-group barrier: block this WI fiber until all WI fibers
     of the current WG reach the shared sync barrier.  */
  fiber_barrier_reach ((fiber_barrier_t *) wi->launch_data->wg_sync_barrier);
}
878 #endif
879 
880 /* Return a 32b private segment address that points to a dynamically
881    allocated chunk of 'size' with 'align'.
882 
883    Allocates the space from the end of the private segment allocated
884    for the whole work group.  In implementations with separate private
885    memories per WI, we will need to have a stack pointer per WI.  But in
886    the current implementation, the segment is shared, so we possibly
887    save some space in case all WIs do not call the alloca.
888 
889    The "alloca frames" are organized as follows:
890 
891    wg->alloca_stack_p points to the last allocated data (initially
892    outside the private segment)
893    wg->alloca_frame_p points to the first address _outside_ the current
894    function's allocations (initially to the same as alloca_stack_p)
895 
896    The data is allocated downwards from the end of the private segment.
897 
898    In the beginning of a new function which has allocas, a new alloca
899    frame is pushed which adds the current alloca_frame_p (the current
900    function's frame starting point) to the top of the alloca stack and
901    alloca_frame_p is set to the current stack position.
902 
903    At the exit points of a function with allocas, the alloca frame
904    is popped before returning.  This involves popping the alloca_frame_p
905    to the one of the previous function in the call stack, and alloca_stack_p
906    similarly, to the position of the last word alloca'd by the previous
907    function.
908  */
909 
uint32_t
__hsail_alloca (uint32_t size, uint32_t align, PHSAWorkItem *wi)
{
  volatile PHSAWorkGroup *wg = wi->wg;
  /* Grow downwards from the current top of the WG-shared alloca
     stack.  */
  int64_t new_pos = wg->alloca_stack_p - size;
  /* Decrement until the offset satisfies the alignment.
     NOTE(review): assumes align > 0; align == 0 would divide by zero.  */
  while (new_pos % align != 0)
    new_pos--;
  /* Ran out of the private segment (stack grew below offset 0).  */
  if (new_pos < 0)
    phsa_fatal_error (2);

  wg->alloca_stack_p = new_pos;

#ifdef DEBUG_ALLOCA
  printf ("--- alloca (%u, %u) sp @%u fp @%u\n", size, align,
	  wg->alloca_stack_p, wg->alloca_frame_p);
#endif
  /* The returned value is an offset into the WG's private segment, not
     a host pointer.  */
  return new_pos;
}
928 
929 /* Initializes a new "alloca frame" in the private segment.
930    This should be called at all the function entry points in case
931    the function contains at least one call to alloca.  */
932 
void
__hsail_alloca_push_frame (PHSAWorkItem *wi)
{
  volatile PHSAWorkGroup *wg = wi->wg;

  /* Store the alloca_frame_p without any alignment padding so
     we know exactly where the previous frame ended after popping
     it.  */
#ifdef DEBUG_ALLOCA
  printf ("--- push frame ");
#endif
  /* Reserve a 4-byte, unaligned slot and save the caller's frame
     pointer there, linking the frames into a chain; the new frame then
     starts at that slot.  */
  uint32_t last_word_offs = __hsail_alloca (4, 1, wi);
  memcpy (wg->private_base_ptr + last_word_offs,
	  (const void *) &wg->alloca_frame_p, 4);
  wg->alloca_frame_p = last_word_offs;

#ifdef DEBUG_ALLOCA
  printf ("--- sp @%u fp @%u\n", wg->alloca_stack_p, wg->alloca_frame_p);
#endif
}
953 
954 /* Frees the current "alloca frame" and restores the frame
955    pointer.
956    This should be called at all the function return points in case
957    the function contains at least one call to alloca.  Restores the
958    alloca stack to the condition it was before pushing the frame
959    the last time.  */
void
__hsail_alloca_pop_frame (PHSAWorkItem *wi)
{
  volatile PHSAWorkGroup *wg = wi->wg;

  /* Discard everything alloca'd by the current function...  */
  wg->alloca_stack_p = wg->alloca_frame_p;
  /* ...and restore the caller's frame pointer from the 4-byte slot
     saved by __hsail_alloca_push_frame.  */
  memcpy ((void *) &wg->alloca_frame_p,
	  (const void *) (wg->private_base_ptr + wg->alloca_frame_p), 4);
  /* Now frame_p points to the beginning of the previous function's
     frame and stack_p to its end.  */

  /* Release the 4-byte frame-pointer slot itself.  */
  wg->alloca_stack_p += 4;

#ifdef DEBUG_ALLOCA
  printf ("--- pop frame sp @%u fp @%u\n", wg->alloca_stack_p,
	  wg->alloca_frame_p);
#endif
}
978