1 /* Plugin for NVPTX execution.
2 
3    Copyright (C) 2013-2021 Free Software Foundation, Inc.
4 
5    Contributed by Mentor Embedded.
6 
7    This file is part of the GNU Offloading and Multi Processing Library
8    (libgomp).
9 
10    Libgomp is free software; you can redistribute it and/or modify it
11    under the terms of the GNU General Public License as published by
12    the Free Software Foundation; either version 3, or (at your option)
13    any later version.
14 
15    Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
16    WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
17    FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
18    more details.
19 
20    Under Section 7 of GPL version 3, you are granted additional
21    permissions described in the GCC Runtime Library Exception, version
22    3.1, as published by the Free Software Foundation.
23 
24    You should have received a copy of the GNU General Public License and
25    a copy of the GCC Runtime Library Exception along with this program;
26    see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
27    <http://www.gnu.org/licenses/>.  */
28 
29 /* Nvidia PTX-specific parts of OpenACC support.  The cuda driver
30    library appears to hold some implicit state, but the documentation
31    is not clear as to what that state might be, or how one might
32    propagate it from one thread to another.  */
33 
34 #define _GNU_SOURCE
35 #include "openacc.h"
36 #include "config.h"
37 #include "symcat.h"
38 #include "libgomp-plugin.h"
39 #include "oacc-plugin.h"
40 #include "gomp-constants.h"
41 #include "oacc-int.h"
42 
43 #include <pthread.h>
44 #include <cuda.h>
45 #include <stdbool.h>
46 #include <limits.h>
47 #include <string.h>
48 #include <stdio.h>
49 #include <unistd.h>
50 #include <assert.h>
51 #include <errno.h>
52 
53 /* An arbitrary fixed limit (128MB) for the size of the OpenMP soft stacks
54    block to cache between kernel invocations.  For soft-stacks blocks bigger
55    than this, we will free the block before attempting another GPU memory
56    allocation (i.e. in GOMP_OFFLOAD_alloc).  Otherwise, if an allocation fails,
57    we will free the cached soft-stacks block anyway, then retry the
58    allocation.  If that fails too, we lose.  */
59 
60 #define SOFTSTACK_CACHE_LIMIT 134217728
61 
62 #if CUDA_VERSION < 6000
63 extern CUresult cuGetErrorString (CUresult, const char **);
64 #define CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR 82
65 #endif
66 
67 #if CUDA_VERSION >= 6050
68 #undef cuLinkCreate
69 #undef cuLinkAddData
70 CUresult cuLinkAddData (CUlinkState, CUjitInputType, void *, size_t,
71 			const char *, unsigned, CUjit_option *, void **);
72 CUresult cuLinkCreate (unsigned, CUjit_option *, void **, CUlinkState *);
73 #else
74 typedef size_t (*CUoccupancyB2DSize)(int);
75 CUresult cuLinkAddData_v2 (CUlinkState, CUjitInputType, void *, size_t,
76 			   const char *, unsigned, CUjit_option *, void **);
77 CUresult cuLinkCreate_v2 (unsigned, CUjit_option *, void **, CUlinkState *);
78 CUresult cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction,
79 					  CUoccupancyB2DSize, size_t, int);
80 #endif
81 
82 #define DO_PRAGMA(x) _Pragma (#x)
83 
84 #if PLUGIN_NVPTX_DYNAMIC
85 # include <dlfcn.h>
86 
87 struct cuda_lib_s {
88 
89 # define CUDA_ONE_CALL(call)			\
90   __typeof (call) *call;
91 # define CUDA_ONE_CALL_MAYBE_NULL(call)		\
92   CUDA_ONE_CALL (call)
93 #include "cuda-lib.def"
94 # undef CUDA_ONE_CALL
95 # undef CUDA_ONE_CALL_MAYBE_NULL
96 
97 } cuda_lib;
98 
99 /* -1 if init_cuda_lib has not been called yet, false
100    if it has been and failed, true if it has been and succeeded.  */
101 static signed char cuda_lib_inited = -1;
102 
103 /* Dynamically load the CUDA driver library and initialize function
104    pointers; return false if unsuccessful, true if successful.  */
105 static bool
106 init_cuda_lib (void)
107 {
108   if (cuda_lib_inited != -1)
109     return cuda_lib_inited;
110   const char *cuda_runtime_lib = "libcuda.so.1";
111   void *h = dlopen (cuda_runtime_lib, RTLD_LAZY);
112   cuda_lib_inited = false;
113   if (h == NULL)
114     return false;
115 
116 # define CUDA_ONE_CALL(call) CUDA_ONE_CALL_1 (call, false)
117 # define CUDA_ONE_CALL_MAYBE_NULL(call) CUDA_ONE_CALL_1 (call, true)
118 # define CUDA_ONE_CALL_1(call, allow_null)		\
119   cuda_lib.call = dlsym (h, #call);	\
120   if (!allow_null && cuda_lib.call == NULL)		\
121     return false;
122 #include "cuda-lib.def"
123 # undef CUDA_ONE_CALL
124 # undef CUDA_ONE_CALL_1
125 # undef CUDA_ONE_CALL_MAYBE_NULL
126 
127   cuda_lib_inited = true;
128   return true;
129 }
130 # define CUDA_CALL_PREFIX cuda_lib.
131 #else
132 
133 # define CUDA_ONE_CALL(call)
134 # define CUDA_ONE_CALL_MAYBE_NULL(call) DO_PRAGMA (weak call)
135 #include "cuda-lib.def"
136 #undef CUDA_ONE_CALL_MAYBE_NULL
137 #undef CUDA_ONE_CALL
138 
139 # define CUDA_CALL_PREFIX
140 # define init_cuda_lib() true
141 #endif
142 
143 #include "secure_getenv.h"
144 
145 #undef MIN
146 #undef MAX
147 #define MIN(X,Y) ((X) < (Y) ? (X) : (Y))
148 #define MAX(X,Y) ((X) > (Y) ? (X) : (Y))
149 
150 /* Convenience macros for the frequently used CUDA library call and
151    error handling sequence, as well as for CUDA library calls that
152    do the error checking themselves or don't do it at all.  */
153 
154 #define CUDA_CALL_ERET(ERET, FN, ...)		\
155   do {						\
156     unsigned __r				\
157       = CUDA_CALL_PREFIX FN (__VA_ARGS__);	\
158     if (__r != CUDA_SUCCESS)			\
159       {						\
160 	GOMP_PLUGIN_error (#FN " error: %s",	\
161 			   cuda_error (__r));	\
162 	return ERET;				\
163       }						\
164   } while (0)
165 
166 #define CUDA_CALL(FN, ...)			\
167   CUDA_CALL_ERET (false, FN, __VA_ARGS__)
168 
169 #define CUDA_CALL_ASSERT(FN, ...)		\
170   do {						\
171     unsigned __r				\
172       = CUDA_CALL_PREFIX FN (__VA_ARGS__);	\
173     if (__r != CUDA_SUCCESS)			\
174       {						\
175 	GOMP_PLUGIN_fatal (#FN " error: %s",	\
176 			   cuda_error (__r));	\
177       }						\
178   } while (0)
179 
180 #define CUDA_CALL_NOCHECK(FN, ...)		\
181   CUDA_CALL_PREFIX FN (__VA_ARGS__)
182 
183 #define CUDA_CALL_EXISTS(FN)			\
184   CUDA_CALL_PREFIX FN
185 
186 static const char *
187 cuda_error (CUresult r)
188 {
189   const char *fallback = "unknown cuda error";
190   const char *desc;
191 
192   if (!CUDA_CALL_EXISTS (cuGetErrorString))
193     return fallback;
194 
195   r = CUDA_CALL_NOCHECK (cuGetErrorString, r, &desc);
196   if (r == CUDA_SUCCESS)
197     return desc;
198 
199   return fallback;
200 }
201 
202 /* Version of the CUDA driver, in the same MAJOR.MINOR format that is used by
203    Nvidia, such as in the 'deviceQuery' program (Nvidia's CUDA samples). */
204 static char cuda_driver_version_s[30];
205 
206 static unsigned int instantiated_devices = 0;
207 static pthread_mutex_t ptx_dev_lock = PTHREAD_MUTEX_INITIALIZER;
208 
209 /* NVPTX/CUDA specific definition of asynchronous queues.  */
210 struct goacc_asyncqueue
211 {
212   CUstream cuda_stream;
213 };
214 
215 struct nvptx_callback
216 {
217   void (*fn) (void *);
218   void *ptr;
219   struct goacc_asyncqueue *aq;
220   struct nvptx_callback *next;
221 };
222 
223 /* Thread-specific data for PTX.  */
224 
225 struct nvptx_thread
226 {
227   /* We currently have this embedded inside the plugin because libgomp manages
228      devices through integer target_ids.  This might be better if using an
229      opaque target-specific pointer directly from gomp_device_descr.  */
230   struct ptx_device *ptx_dev;
231 };
232 
233 /* Target data function launch information.  */
234 
235 struct targ_fn_launch
236 {
237   const char *fn;
238   unsigned short dim[GOMP_DIM_MAX];
239 };
240 
241 /* Target PTX object information.  */
242 
243 struct targ_ptx_obj
244 {
245   const char *code;
246   size_t size;
247 };
248 
249 /* Target data image information.  */
250 
251 typedef struct nvptx_tdata
252 {
253   const struct targ_ptx_obj *ptx_objs;
254   unsigned ptx_num;
255 
256   const char *const *var_names;
257   unsigned var_num;
258 
259   const struct targ_fn_launch *fn_descs;
260   unsigned fn_num;
261 } nvptx_tdata_t;
262 
263 /* Descriptor of a loaded function.  */
264 
265 struct targ_fn_descriptor
266 {
267   CUfunction fn;
268   const struct targ_fn_launch *launch;
269   int regs_per_thread;
270   int max_threads_per_block;
271 };
272 
273 /* A loaded PTX image.  */
274 struct ptx_image_data
275 {
276   const void *target_data;
277   CUmodule module;
278 
279   struct targ_fn_descriptor *fns;  /* Array of functions.  */
280 
281   struct ptx_image_data *next;
282 };
283 
284 struct ptx_free_block
285 {
286   void *ptr;
287   struct ptx_free_block *next;
288 };
289 
290 struct ptx_device
291 {
292   CUcontext ctx;
293   bool ctx_shared;
294   CUdevice dev;
295 
296   int ord;
297   bool overlap;
298   bool map;
299   bool concur;
300   bool mkern;
301   int mode;
302   int clock_khz;
303   int num_sms;
304   int regs_per_block;
305   int regs_per_sm;
306   int warp_size;
307   int max_threads_per_block;
308   int max_threads_per_multiprocessor;
309   int default_dims[GOMP_DIM_MAX];
310 
311   /* Length as used by the CUDA Runtime API ('struct cudaDeviceProp').  */
312   char name[256];
313 
314   struct ptx_image_data *images;  /* Images loaded on device.  */
315   pthread_mutex_t image_lock;     /* Lock for above list.  */
316 
317   struct ptx_free_block *free_blocks;
318   pthread_mutex_t free_blocks_lock;
319 
320   /* OpenMP stacks, cached between kernel invocations.  */
321   struct
322     {
323       CUdeviceptr ptr;
324       size_t size;
325       pthread_mutex_t lock;
326     } omp_stacks;
327 
328   struct ptx_device *next;
329 };
330 
331 static struct ptx_device **ptx_devices;
332 
333 static inline struct nvptx_thread *
334 nvptx_thread (void)
335 {
336   return (struct nvptx_thread *) GOMP_PLUGIN_acc_thread ();
337 }
338 
339 /* Initialize the device.  Return TRUE on success, else FALSE.  PTX_DEV_LOCK
340    should be locked on entry and remains locked on exit.  */
341 
342 static bool
343 nvptx_init (void)
344 {
345   int ndevs;
346 
347   if (instantiated_devices != 0)
348     return true;
349 
350   if (!init_cuda_lib ())
351     return false;
352 
353   CUDA_CALL (cuInit, 0);
354 
355   int cuda_driver_version;
356   CUDA_CALL_ERET (NULL, cuDriverGetVersion, &cuda_driver_version);
357   snprintf (cuda_driver_version_s, sizeof cuda_driver_version_s,
358 	    "CUDA Driver %u.%u",
359 	    cuda_driver_version / 1000, cuda_driver_version % 1000 / 10);
360 
361   CUDA_CALL (cuDeviceGetCount, &ndevs);
362   ptx_devices = GOMP_PLUGIN_malloc_cleared (sizeof (struct ptx_device *)
363 					    * ndevs);
364 
365   return true;
366 }
367 
368 /* Select the N'th PTX device for the current host thread.  The device must
369    have been previously opened before calling this function.  */
370 
371 static bool
372 nvptx_attach_host_thread_to_device (int n)
373 {
374   CUdevice dev;
375   CUresult r;
376   struct ptx_device *ptx_dev;
377   CUcontext thd_ctx;
378 
379   r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &dev);
380   if (r == CUDA_ERROR_NOT_PERMITTED)
381     {
382       /* Assume we're in a CUDA callback, just return true.  */
383       return true;
384     }
385   if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
386     {
387       GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r));
388       return false;
389     }
390 
391   if (r != CUDA_ERROR_INVALID_CONTEXT && dev == n)
392     return true;
393   else
394     {
395       CUcontext old_ctx;
396 
397       ptx_dev = ptx_devices[n];
398       if (!ptx_dev)
399 	{
400 	  GOMP_PLUGIN_error ("device %d not found", n);
401 	  return false;
402 	}
403 
404       CUDA_CALL (cuCtxGetCurrent, &thd_ctx);
405 
406       /* We don't necessarily have a current context (e.g. if it has been
407          destroyed).  Pop it if we do though.  */
408       if (thd_ctx != NULL)
409 	CUDA_CALL (cuCtxPopCurrent, &old_ctx);
410 
411       CUDA_CALL (cuCtxPushCurrent, ptx_dev->ctx);
412     }
413   return true;
414 }
415 
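/* Open CUDA device N: create or attach to its CUDA context, query the
   hardware properties used for launch heuristics, and return a newly
   allocated ptx_device descriptor, or NULL on error.  */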
416 static struct ptx_device *
417 nvptx_open_device (int n)
418 {
419   struct ptx_device *ptx_dev;
420   CUdevice dev, ctx_dev;
421   CUresult r;
422   int async_engines, pi;
423 
424   CUDA_CALL_ERET (NULL, cuDeviceGet, &dev, n);
425 
426   ptx_dev = GOMP_PLUGIN_malloc (sizeof (struct ptx_device));
427 
428   ptx_dev->ord = n;
429   ptx_dev->dev = dev;
430   ptx_dev->ctx_shared = false;
431 
432   r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &ctx_dev);
433   if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
434     {
435       GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r));
436       return NULL;
437     }
438 
439   if (r != CUDA_ERROR_INVALID_CONTEXT && ctx_dev != dev)
440     {
441       /* The current host thread has an active context for a different device.
442          Detach it.  */
443       CUcontext old_ctx;
444       CUDA_CALL_ERET (NULL, cuCtxPopCurrent, &old_ctx);
445     }
446 
447   CUDA_CALL_ERET (NULL, cuCtxGetCurrent, &ptx_dev->ctx);
448 
449   if (!ptx_dev->ctx)
450     CUDA_CALL_ERET (NULL, cuCtxCreate, &ptx_dev->ctx, CU_CTX_SCHED_AUTO, dev);
451   else
452     ptx_dev->ctx_shared = true;
453 
454   CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
455 		  &pi, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev);
456   ptx_dev->overlap = pi;
457 
458   CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
459 		  &pi, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev);
460   ptx_dev->map = pi;
461 
462   CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
463 		  &pi, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev);
464   ptx_dev->concur = pi;
465 
466   CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
467 		  &pi, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
468   ptx_dev->mode = pi;
469 
470   CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
471 		  &pi, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);
472   ptx_dev->mkern = pi;
473 
474   CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
475 		  &pi, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
476   ptx_dev->clock_khz = pi;
477 
478   CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
479 		  &pi, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev);
480   ptx_dev->num_sms = pi;
481 
482   CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
483 		  &pi, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, dev);
484   ptx_dev->regs_per_block = pi;
485 
486   /* CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR is defined only
487      in CUDA 6.0 and newer.  */
488   r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &pi,
489 			 CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR,
490 			 dev);
491   /* Fallback: use limit of registers per block, which is usually equal.  */
492   if (r == CUDA_ERROR_INVALID_VALUE)
493     pi = ptx_dev->regs_per_block;
494   else if (r != CUDA_SUCCESS)
495     {
496       GOMP_PLUGIN_error ("cuDeviceGetAttribute error: %s", cuda_error (r));
497       return NULL;
498     }
499   ptx_dev->regs_per_sm = pi;
500 
501   CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
502 		  &pi, CU_DEVICE_ATTRIBUTE_WARP_SIZE, dev);
503   if (pi != 32)
504     {
505       GOMP_PLUGIN_error ("Only warp size 32 is supported");
506       return NULL;
507     }
508   ptx_dev->warp_size = pi;
509 
510   CUDA_CALL_ERET (NULL, cuDeviceGetAttribute, &pi,
511 		  CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, dev);
512   ptx_dev->max_threads_per_block = pi;
513 
514   CUDA_CALL_ERET (NULL, cuDeviceGetAttribute, &pi,
515 		  CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, dev);
516   ptx_dev->max_threads_per_multiprocessor = pi;
517 
518   r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &async_engines,
519 			 CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev);
520   if (r != CUDA_SUCCESS)
521     async_engines = 1;
522 
523   for (int i = 0; i != GOMP_DIM_MAX; i++)
524     ptx_dev->default_dims[i] = 0;
525 
526   CUDA_CALL_ERET (NULL, cuDeviceGetName, ptx_dev->name, sizeof ptx_dev->name,
527 		  dev);
528 
529   ptx_dev->images = NULL;
530   pthread_mutex_init (&ptx_dev->image_lock, NULL);
531 
532   ptx_dev->free_blocks = NULL;
533   pthread_mutex_init (&ptx_dev->free_blocks_lock, NULL);
534 
535   ptx_dev->omp_stacks.ptr = 0;
536   ptx_dev->omp_stacks.size = 0;
537   pthread_mutex_init (&ptx_dev->omp_stacks.lock, NULL);
538 
539   return ptx_dev;
540 }
541 
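/* Release all resources held by PTX_DEV: deferred free blocks, the cached
   OpenMP soft-stacks block, and (unless it was shared) the CUDA context.  */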
542 static bool
543 nvptx_close_device (struct ptx_device *ptx_dev)
544 {
545   if (!ptx_dev)
546     return true;
547 
548   for (struct ptx_free_block *b = ptx_dev->free_blocks; b;)
549     {
550       struct ptx_free_block *b_next = b->next;
551       CUDA_CALL (cuMemFree, (CUdeviceptr) b->ptr);
552       free (b);
553       b = b_next;
554     }
555 
556   pthread_mutex_destroy (&ptx_dev->free_blocks_lock);
557   pthread_mutex_destroy (&ptx_dev->image_lock);
558 
559   pthread_mutex_destroy (&ptx_dev->omp_stacks.lock);
560 
561   if (ptx_dev->omp_stacks.ptr)
562     CUDA_CALL (cuMemFree, ptx_dev->omp_stacks.ptr);
563 
564   if (!ptx_dev->ctx_shared)
565     CUDA_CALL (cuCtxDestroy, ptx_dev->ctx);
566 
567   free (ptx_dev);
568   return true;
569 }
570 
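/* Return the number of CUDA devices available, or 0 if the CUDA driver
   could not be loaded or initialized.  */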
571 static int
572 nvptx_get_num_devices (void)
573 {
574   int n;
575 
576   /* This function will be called before the plugin has been initialized in
577      order to enumerate available devices, but CUDA API routines can't be used
578      until cuInit has been called.  Just call it now (but don't yet do any
579      further initialization).  */
580   if (instantiated_devices == 0)
581     {
582       if (!init_cuda_lib ())
583 	return 0;
584       CUresult r = CUDA_CALL_NOCHECK (cuInit, 0);
585       /* This is not an error: e.g. we may have CUDA libraries installed but
586          no devices available.  */
587       if (r != CUDA_SUCCESS)
588 	{
589 	  GOMP_PLUGIN_debug (0, "Disabling nvptx offloading; cuInit: %s\n",
590 			     cuda_error (r));
591 	  return 0;
592 	}
593     }
594 
595   CUDA_CALL_ERET (-1, cuDeviceGetCount, &n);
596   return n;
597 }
598 
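/* Report the value of environment variable VAR_NAME (or its absence)
   via the libgomp debug output.  */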
599 static void
600 notify_var (const char *var_name, const char *env_var)
601 {
602   if (env_var == NULL)
603     GOMP_PLUGIN_debug (0, "%s: <Not defined>\n", var_name);
604   else
605     GOMP_PLUGIN_debug (0, "%s: '%s'\n", var_name, env_var);
606 }
607 
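/* Parse the GOMP_NVPTX_JIT environment variable.  Currently only '-O<n>'
   (0 <= n <= 4) is recognized; the level is stored in *GOMP_NVPTX_O.  */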
608 static void
609 process_GOMP_NVPTX_JIT (intptr_t *gomp_nvptx_o)
610 {
611   const char *var_name = "GOMP_NVPTX_JIT";
612   const char *env_var = secure_getenv (var_name);
613   notify_var (var_name, env_var);
614 
615   if (env_var == NULL)
616     return;
617 
618   const char *c = env_var;
619   while (*c != '\0')
620     {
621       while (*c == ' ')
622 	c++;
623 
624       if (c[0] == '-' && c[1] == 'O'
625 	  && '0' <= c[2] && c[2] <= '4'
626 	  && (c[3] == '\0' || c[3] == ' '))
627 	{
628 	  *gomp_nvptx_o = c[2] - '0';
629 	  c += 3;
630 	  continue;
631 	}
632 
633       GOMP_PLUGIN_error ("Error parsing %s", var_name);
634       break;
635     }
636 }
637 
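/* JIT-link the NUM_OBJS PTX objects in PTX_OBJS and load the result as a
   CUDA module, returned in *MODULE.  Return true on success.  */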
638 static bool
639 link_ptx (CUmodule *module, const struct targ_ptx_obj *ptx_objs,
640 	  unsigned num_objs)
641 {
642   CUjit_option opts[7];
643   void *optvals[7];
644   float elapsed = 0.0;
645   char elog[1024];
646   char ilog[16384];
647   CUlinkState linkstate;
648   CUresult r;
649   void *linkout;
650   size_t linkoutsize __attribute__ ((unused));
651 
652   opts[0] = CU_JIT_WALL_TIME;
653   optvals[0] = &elapsed;
654 
655   opts[1] = CU_JIT_INFO_LOG_BUFFER;
656   optvals[1] = &ilog[0];
657 
658   opts[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
659   optvals[2] = (void *) sizeof ilog;
660 
661   opts[3] = CU_JIT_ERROR_LOG_BUFFER;
662   optvals[3] = &elog[0];
663 
664   opts[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
665   optvals[4] = (void *) sizeof elog;
666 
667   opts[5] = CU_JIT_LOG_VERBOSE;
668   optvals[5] = (void *) 1;
669 
670   static intptr_t gomp_nvptx_o = -1;
671 
672   static bool init_done = false;
673   if (!init_done)
674     {
675       process_GOMP_NVPTX_JIT (&gomp_nvptx_o);
676       init_done = true;
677     }
678 
679   int nopts = 6;
680   if (gomp_nvptx_o != -1)
681     {
682       opts[nopts] = CU_JIT_OPTIMIZATION_LEVEL;
683       optvals[nopts] = (void *) gomp_nvptx_o;
684       nopts++;
685     }
686 
687   if (CUDA_CALL_EXISTS (cuLinkCreate_v2))
688     CUDA_CALL (cuLinkCreate_v2, nopts, opts, optvals, &linkstate);
689   else
690     CUDA_CALL (cuLinkCreate, nopts, opts, optvals, &linkstate);
691 
692   for (; num_objs--; ptx_objs++)
693     {
694       /* cuLinkAddData's 'data' argument erroneously omits the const
695 	 qualifier.  */
696       GOMP_PLUGIN_debug (0, "Loading:\n---\n%s\n---\n", ptx_objs->code);
697       if (CUDA_CALL_EXISTS (cuLinkAddData_v2))
698 	r = CUDA_CALL_NOCHECK (cuLinkAddData_v2, linkstate, CU_JIT_INPUT_PTX,
699 			       (char *) ptx_objs->code, ptx_objs->size,
700 			       0, 0, 0, 0);
701       else
702 	r = CUDA_CALL_NOCHECK (cuLinkAddData, linkstate, CU_JIT_INPUT_PTX,
703 			       (char *) ptx_objs->code, ptx_objs->size,
704 			       0, 0, 0, 0);
705       if (r != CUDA_SUCCESS)
706 	{
707 	  GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
708 	  GOMP_PLUGIN_error ("cuLinkAddData (ptx_code) error: %s",
709 			     cuda_error (r));
710 	  return false;
711 	}
712     }
713 
714   GOMP_PLUGIN_debug (0, "Linking\n");
715   r = CUDA_CALL_NOCHECK (cuLinkComplete, linkstate, &linkout, &linkoutsize);
716 
717   GOMP_PLUGIN_debug (0, "Link complete: %fms\n", elapsed);
718   GOMP_PLUGIN_debug (0, "Link log %s\n", &ilog[0]);
719 
720   if (r != CUDA_SUCCESS)
721     {
722       GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
723       GOMP_PLUGIN_error ("cuLinkComplete error: %s", cuda_error (r));
724       return false;
725     }
726 
727   CUDA_CALL (cuModuleLoadData, module, linkout);
728   CUDA_CALL (cuLinkDestroy, linkstate);
729   return true;
730 }
731 
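/* Launch the OpenACC kernel described by FN on STREAM, passing the device
   pointer DP as its single kernel argument.  Launch dimensions come from
   DIMS; any dimension left as zero is filled in from compiler-provided or
   runtime-computed defaults.  */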
732 static void
733 nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
734 	    unsigned *dims, void *targ_mem_desc,
735 	    CUdeviceptr dp, CUstream stream)
736 {
737   struct targ_fn_descriptor *targ_fn = (struct targ_fn_descriptor *) fn;
738   CUfunction function;
739   int i;
740   void *kargs[1];
741   struct nvptx_thread *nvthd = nvptx_thread ();
742   int warp_size = nvthd->ptx_dev->warp_size;
743 
744   function = targ_fn->fn;
745 
746   /* Initialize the launch dimensions.  Typically this is constant,
747      provided by the device compiler, but we must permit runtime
748      values.  */
749   int seen_zero = 0;
750   for (i = 0; i != GOMP_DIM_MAX; i++)
751     {
752       if (targ_fn->launch->dim[i])
753        dims[i] = targ_fn->launch->dim[i];
754       if (!dims[i])
755        seen_zero = 1;
756     }
757 
758   if (seen_zero)
759     {
760       pthread_mutex_lock (&ptx_dev_lock);
761 
762       static int gomp_openacc_dims[GOMP_DIM_MAX];
763       if (!gomp_openacc_dims[0])
764 	{
765 	  /* See if the user provided GOMP_OPENACC_DIM environment
766 	     variable to specify runtime defaults.  */
767 	  for (int i = 0; i < GOMP_DIM_MAX; ++i)
768 	    gomp_openacc_dims[i] = GOMP_PLUGIN_acc_default_dim (i);
769 	}
770 
771       if (!nvthd->ptx_dev->default_dims[0])
772 	{
773 	  int default_dims[GOMP_DIM_MAX];
774 	  for (int i = 0; i < GOMP_DIM_MAX; ++i)
775 	    default_dims[i] = gomp_openacc_dims[i];
776 
777 	  int gang, worker, vector;
778 	  {
779 	    int block_size = nvthd->ptx_dev->max_threads_per_block;
780 	    int cpu_size = nvthd->ptx_dev->max_threads_per_multiprocessor;
781 	    int dev_size = nvthd->ptx_dev->num_sms;
782 	    GOMP_PLUGIN_debug (0, " warp_size=%d, block_size=%d,"
783 			       " dev_size=%d, cpu_size=%d\n",
784 			       warp_size, block_size, dev_size, cpu_size);
785 
786 	    gang = (cpu_size / block_size) * dev_size;
787 	    worker = block_size / warp_size;
788 	    vector = warp_size;
789 	  }
790 
791 	  /* There is no upper bound on the gang size.  The best size
792 	     matches the hardware configuration.  Logical gangs are
793 	     scheduled onto physical hardware.  To maximize usage, we
794 	     should guess a large number.  */
795 	  if (default_dims[GOMP_DIM_GANG] < 1)
796 	    default_dims[GOMP_DIM_GANG] = gang ? gang : 1024;
797 	  /* The worker size must not exceed the hardware.  */
798 	  if (default_dims[GOMP_DIM_WORKER] < 1
799 	      || (default_dims[GOMP_DIM_WORKER] > worker && gang))
800 	    default_dims[GOMP_DIM_WORKER] = worker;
801 	  /* The vector size must exactly match the hardware.  */
802 	  if (default_dims[GOMP_DIM_VECTOR] < 1
803 	      || (default_dims[GOMP_DIM_VECTOR] != vector && gang))
804 	    default_dims[GOMP_DIM_VECTOR] = vector;
805 
806 	  GOMP_PLUGIN_debug (0, " default dimensions [%d,%d,%d]\n",
807 			     default_dims[GOMP_DIM_GANG],
808 			     default_dims[GOMP_DIM_WORKER],
809 			     default_dims[GOMP_DIM_VECTOR]);
810 
811 	  for (i = 0; i != GOMP_DIM_MAX; i++)
812 	    nvthd->ptx_dev->default_dims[i] = default_dims[i];
813 	}
814       pthread_mutex_unlock (&ptx_dev_lock);
815 
816       {
817 	bool default_dim_p[GOMP_DIM_MAX];
818 	for (i = 0; i != GOMP_DIM_MAX; i++)
819 	  default_dim_p[i] = !dims[i];
820 
821 	if (!CUDA_CALL_EXISTS (cuOccupancyMaxPotentialBlockSize))
822 	  {
823 	    for (i = 0; i != GOMP_DIM_MAX; i++)
824 	      if (default_dim_p[i])
825 		dims[i] = nvthd->ptx_dev->default_dims[i];
826 
827 	    if (default_dim_p[GOMP_DIM_VECTOR])
828 	      dims[GOMP_DIM_VECTOR]
829 		= MIN (dims[GOMP_DIM_VECTOR],
830 		       (targ_fn->max_threads_per_block / warp_size
831 			* warp_size));
832 
833 	    if (default_dim_p[GOMP_DIM_WORKER])
834 	      dims[GOMP_DIM_WORKER]
835 		= MIN (dims[GOMP_DIM_WORKER],
836 		       targ_fn->max_threads_per_block / dims[GOMP_DIM_VECTOR]);
837 	  }
838 	else
839 	  {
840 	    /* Handle the case that the compiler allows the runtime to choose
841 	       the vector-length conservatively, by ignoring
842 	       gomp_openacc_dims[GOMP_DIM_VECTOR].  TODO: actually handle
843 	       it.  */
844 	    int vectors = 0;
845 	    /* TODO: limit gomp_openacc_dims[GOMP_DIM_WORKER] such that
846 	       gomp_openacc_dims[GOMP_DIM_WORKER] * actual_vectors does not
847 	       exceed targ_fn->max_threads_per_block. */
848 	    int workers = gomp_openacc_dims[GOMP_DIM_WORKER];
849 	    int gangs = gomp_openacc_dims[GOMP_DIM_GANG];
850 	    int grids, blocks;
851 
852 	    CUDA_CALL_ASSERT (cuOccupancyMaxPotentialBlockSize, &grids,
853 			      &blocks, function, NULL, 0,
854 			      dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR]);
855 	    GOMP_PLUGIN_debug (0, "cuOccupancyMaxPotentialBlockSize: "
856 			       "grid = %d, block = %d\n", grids, blocks);
857 
858 	    /* Keep the num_gangs proportional to the block size.  In
859 	       the case where a block size is limited by shared-memory
860 	       or the register file capacity, the runtime will not
861 	       excessively over-assign gangs to the multiprocessor
862 	       units if their state is going to be swapped out even
863 	       more than necessary. The constant factor 2 is there to
864 	       prevent threads from idling when there is insufficient
865 	       work for them.  */
866 	    if (gangs == 0)
867 	      gangs = 2 * grids * (blocks / warp_size);
868 
869 	    if (vectors == 0)
870 	      vectors = warp_size;
871 
872 	    if (workers == 0)
873 	      {
874 		int actual_vectors = (default_dim_p[GOMP_DIM_VECTOR]
875 				      ? vectors
876 				      : dims[GOMP_DIM_VECTOR]);
877 		workers = blocks / actual_vectors;
878 		workers = MAX (workers, 1);
879 		/* If we need a per-worker barrier ... .  */
880 		if (actual_vectors > 32)
881 		  /* Don't use more barriers than available.  */
882 		  workers = MIN (workers, 15);
883 	      }
884 
885 	    for (i = 0; i != GOMP_DIM_MAX; i++)
886 	      if (default_dim_p[i])
887 		switch (i)
888 		  {
889 		  case GOMP_DIM_GANG: dims[i] = gangs; break;
890 		  case GOMP_DIM_WORKER: dims[i] = workers; break;
891 		  case GOMP_DIM_VECTOR: dims[i] = vectors; break;
892 		  default: GOMP_PLUGIN_fatal ("invalid dim");
893 		  }
894 	  }
895       }
896     }
897 
898   /* Check if the accelerator has sufficient hardware resources to
899      launch the offloaded kernel.  */
900   if (dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR]
901       > targ_fn->max_threads_per_block)
902     {
903       const char *msg
904 	= ("The Nvidia accelerator has insufficient resources to launch '%s'"
905 	   " with num_workers = %d and vector_length = %d"
906 	   "; "
907 	   "recompile the program with 'num_workers = x and vector_length = y'"
908 	   " on that offloaded region or '-fopenacc-dim=:x:y' where"
909 	   " x * y <= %d"
910 	   ".\n");
911       GOMP_PLUGIN_fatal (msg, targ_fn->launch->fn, dims[GOMP_DIM_WORKER],
912 			 dims[GOMP_DIM_VECTOR], targ_fn->max_threads_per_block);
913     }
914 
915   /* Check if the accelerator has sufficient barrier resources to
916      launch the offloaded kernel.  */
917   if (dims[GOMP_DIM_WORKER] > 15 && dims[GOMP_DIM_VECTOR] > 32)
918     {
919       const char *msg
920 	= ("The Nvidia accelerator has insufficient barrier resources to launch"
921 	   " '%s' with num_workers = %d and vector_length = %d"
922 	   "; "
923 	   "recompile the program with 'num_workers = x' on that offloaded"
924 	   " region or '-fopenacc-dim=:x:' where x <= 15"
925 	   "; "
926 	   "or, recompile the program with 'vector_length = 32' on that"
927 	   " offloaded region or '-fopenacc-dim=::32'"
928 	   ".\n");
929 	GOMP_PLUGIN_fatal (msg, targ_fn->launch->fn, dims[GOMP_DIM_WORKER],
930 			   dims[GOMP_DIM_VECTOR]);
931     }
932 
933   GOMP_PLUGIN_debug (0, "  %s: kernel %s: launch"
934 		     " gangs=%u, workers=%u, vectors=%u\n",
935 		     __FUNCTION__, targ_fn->launch->fn, dims[GOMP_DIM_GANG],
936 		     dims[GOMP_DIM_WORKER], dims[GOMP_DIM_VECTOR]);
937 
938   // OpenACC		CUDA
939   //
940   // num_gangs		nctaid.x
941   // num_workers	ntid.y
942   // vector length	ntid.x
943 
944   struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
945   acc_prof_info *prof_info = thr->prof_info;
946   acc_event_info enqueue_launch_event_info;
947   acc_api_info *api_info = thr->api_info;
948   bool profiling_p = __builtin_expect (prof_info != NULL, false);
949   if (profiling_p)
950     {
951       prof_info->event_type = acc_ev_enqueue_launch_start;
952 
953       enqueue_launch_event_info.launch_event.event_type
954 	= prof_info->event_type;
955       enqueue_launch_event_info.launch_event.valid_bytes
956 	= _ACC_LAUNCH_EVENT_INFO_VALID_BYTES;
957       enqueue_launch_event_info.launch_event.parent_construct
958 	= acc_construct_parallel;
959       enqueue_launch_event_info.launch_event.implicit = 1;
960       enqueue_launch_event_info.launch_event.tool_info = NULL;
961       enqueue_launch_event_info.launch_event.kernel_name = targ_fn->launch->fn;
962       enqueue_launch_event_info.launch_event.num_gangs
963 	= dims[GOMP_DIM_GANG];
964       enqueue_launch_event_info.launch_event.num_workers
965 	= dims[GOMP_DIM_WORKER];
966       enqueue_launch_event_info.launch_event.vector_length
967 	= dims[GOMP_DIM_VECTOR];
968 
969       api_info->device_api = acc_device_api_cuda;
970 
971       GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &enqueue_launch_event_info,
972 					    api_info);
973     }
974 
975   kargs[0] = &dp;
976   CUDA_CALL_ASSERT (cuLaunchKernel, function,
977 		    dims[GOMP_DIM_GANG], 1, 1,
978 		    dims[GOMP_DIM_VECTOR], dims[GOMP_DIM_WORKER], 1,
979 		    0, stream, kargs, 0);
980 
981   if (profiling_p)
982     {
983       prof_info->event_type = acc_ev_enqueue_launch_end;
984       enqueue_launch_event_info.launch_event.event_type
985 	= prof_info->event_type;
986       GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &enqueue_launch_event_info,
987 					    api_info);
988     }
989 
990   GOMP_PLUGIN_debug (0, "  %s: kernel %s: finished\n", __FUNCTION__,
991 		     targ_fn->launch->fn);
992 }
993 
994 void * openacc_get_current_cuda_context (void);
995 
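/* Dispatch an acc_ev_alloc profiling event for an allocation of S bytes
   at device pointer DP.  */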
996 static void
997 goacc_profiling_acc_ev_alloc (struct goacc_thread *thr, void *dp, size_t s)
998 {
999   acc_prof_info *prof_info = thr->prof_info;
1000   acc_event_info data_event_info;
1001   acc_api_info *api_info = thr->api_info;
1002 
1003   prof_info->event_type = acc_ev_alloc;
1004 
1005   data_event_info.data_event.event_type = prof_info->event_type;
1006   data_event_info.data_event.valid_bytes = _ACC_DATA_EVENT_INFO_VALID_BYTES;
1007   data_event_info.data_event.parent_construct = acc_construct_parallel;
1008   data_event_info.data_event.implicit = 1;
1009   data_event_info.data_event.tool_info = NULL;
1010   data_event_info.data_event.var_name = NULL;
1011   data_event_info.data_event.bytes = s;
1012   data_event_info.data_event.host_ptr = NULL;
1013   data_event_info.data_event.device_ptr = dp;
1014 
1015   api_info->device_api = acc_device_api_cuda;
1016 
1017   GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info, api_info);
1018 }
1019 
1020 /* Free the cached soft-stacks block if it is above the SOFTSTACK_CACHE_LIMIT
1021    size threshold, or if FORCE is true.  */
1022 
1023 static void
1024 nvptx_stacks_free (struct ptx_device *ptx_dev, bool force)
1025 {
1026   pthread_mutex_lock (&ptx_dev->omp_stacks.lock);
1027   if (ptx_dev->omp_stacks.ptr
1028       && (force || ptx_dev->omp_stacks.size > SOFTSTACK_CACHE_LIMIT))
1029     {
1030       CUresult r = CUDA_CALL_NOCHECK (cuMemFree, ptx_dev->omp_stacks.ptr);
1031       if (r != CUDA_SUCCESS)
1032 	GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r));
1033       ptx_dev->omp_stacks.ptr = 0;
1034       ptx_dev->omp_stacks.size = 0;
1035     }
1036   pthread_mutex_unlock (&ptx_dev->omp_stacks.lock);
1037 }
1038 
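/* Allocate S bytes of device memory and return a pointer to it, or NULL on
   failure.  If SUPPRESS_ERRORS, an out-of-memory condition is not reported
   as an error.  */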
1039 static void *
1040 nvptx_alloc (size_t s, bool suppress_errors)
1041 {
1042   CUdeviceptr d;
1043 
1044   CUresult r = CUDA_CALL_NOCHECK (cuMemAlloc, &d, s);
1045   if (suppress_errors && r == CUDA_ERROR_OUT_OF_MEMORY)
1046     return NULL;
1047   else if (r != CUDA_SUCCESS)
1048     {
1049       GOMP_PLUGIN_error ("nvptx_alloc error: %s", cuda_error (r));
1050       return NULL;
1051     }
1052 
1053   /* NOTE: We only do profiling stuff if the memory allocation succeeds.  */
1054   struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
1055   bool profiling_p
1056     = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
1057   if (profiling_p)
1058     goacc_profiling_acc_ev_alloc (thr, (void *) d, s);
1059 
1060   return (void *) d;
1061 }
1062 
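/* Dispatch an acc_ev_free profiling event for device pointer P.  */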
1063 static void
1064 goacc_profiling_acc_ev_free (struct goacc_thread *thr, void *p)
1065 {
1066   acc_prof_info *prof_info = thr->prof_info;
1067   acc_event_info data_event_info;
1068   acc_api_info *api_info = thr->api_info;
1069 
1070   prof_info->event_type = acc_ev_free;
1071 
1072   data_event_info.data_event.event_type = prof_info->event_type;
1073   data_event_info.data_event.valid_bytes = _ACC_DATA_EVENT_INFO_VALID_BYTES;
1074   data_event_info.data_event.parent_construct = acc_construct_parallel;
1075   data_event_info.data_event.implicit = 1;
1076   data_event_info.data_event.tool_info = NULL;
1077   data_event_info.data_event.var_name = NULL;
1078   data_event_info.data_event.bytes = -1;
1079   data_event_info.data_event.host_ptr = NULL;
1080   data_event_info.data_event.device_ptr = p;
1081 
1082   api_info->device_api = acc_device_api_cuda;
1083 
1084   GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info, api_info);
1085 }
1086 
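/* Free the device memory at P.  If called from a CUDA callback context,
   where CUDA calls are not allowed, queue the block on PTX_DEV's
   free_blocks list to be released later.  */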
1087 static bool
1088 nvptx_free (void *p, struct ptx_device *ptx_dev)
1089 {
1090   CUdeviceptr pb;
1091   size_t ps;
1092 
1093   CUresult r = CUDA_CALL_NOCHECK (cuMemGetAddressRange, &pb, &ps,
1094 				  (CUdeviceptr) p);
1095   if (r == CUDA_ERROR_NOT_PERMITTED)
1096     {
1097       /* We assume that this error indicates we are in a CUDA callback context,
1098 	 where no CUDA calls are allowed (see cuStreamAddCallback
1099 	 documentation for description). Arrange to free this piece of device
1100 	 memory later.  */
1101       struct ptx_free_block *n
1102 	= GOMP_PLUGIN_malloc (sizeof (struct ptx_free_block));
1103       n->ptr = p;
1104       pthread_mutex_lock (&ptx_dev->free_blocks_lock);
1105       n->next = ptx_dev->free_blocks;
1106       ptx_dev->free_blocks = n;
1107       pthread_mutex_unlock (&ptx_dev->free_blocks_lock);
1108       return true;
1109     }
1110   else if (r != CUDA_SUCCESS)
1111     {
1112       GOMP_PLUGIN_error ("cuMemGetAddressRange error: %s", cuda_error (r));
1113       return false;
1114     }
1115   if ((CUdeviceptr) p != pb)
1116     {
1117       GOMP_PLUGIN_error ("invalid device address");
1118       return false;
1119     }
1120 
1121   CUDA_CALL (cuMemFree, (CUdeviceptr) p);
1122   struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
1123   bool profiling_p
1124     = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
1125   if (profiling_p)
1126     goacc_profiling_acc_ev_free (thr, p);
1127 
1128   return true;
1129 }
1130 
1131 static void *
1132 nvptx_get_current_cuda_device (void)
1133 {
1134   struct nvptx_thread *nvthd = nvptx_thread ();
1135 
1136   if (!nvthd || !nvthd->ptx_dev)
1137     return NULL;
1138 
1139   return &nvthd->ptx_dev->dev;
1140 }
1141 
1142 static void *
1143 nvptx_get_current_cuda_context (void)
1144 {
1145   struct nvptx_thread *nvthd = nvptx_thread ();
1146 
1147   if (!nvthd || !nvthd->ptx_dev)
1148     return NULL;
1149 
1150   return nvthd->ptx_dev->ctx;
1151 }
1152 
1153 /* Plugin entry points.  */
1154 
1155 const char *
1156 GOMP_OFFLOAD_get_name (void)
1157 {
1158   return "nvptx";
1159 }
1160 
1161 unsigned int
1162 GOMP_OFFLOAD_get_caps (void)
1163 {
1164   return GOMP_OFFLOAD_CAP_OPENACC_200 | GOMP_OFFLOAD_CAP_OPENMP_400;
1165 }
1166 
1167 int
1168 GOMP_OFFLOAD_get_type (void)
1169 {
1170   return OFFLOAD_TARGET_TYPE_NVIDIA_PTX;
1171 }
1172 
1173 int
1174 GOMP_OFFLOAD_get_num_devices (void)
1175 {
1176   return nvptx_get_num_devices ();
1177 }
1178 
1179 bool
1180 GOMP_OFFLOAD_init_device (int n)
1181 {
1182   struct ptx_device *dev;
1183 
1184   pthread_mutex_lock (&ptx_dev_lock);
1185 
1186   if (!nvptx_init () || ptx_devices[n] != NULL)
1187     {
1188       pthread_mutex_unlock (&ptx_dev_lock);
1189       return false;
1190     }
1191 
1192   dev = nvptx_open_device (n);
1193   if (dev)
1194     {
1195       ptx_devices[n] = dev;
1196       instantiated_devices++;
1197     }
1198 
1199   pthread_mutex_unlock (&ptx_dev_lock);
1200 
1201   return dev != NULL;
1202 }
1203 
1204 bool
1205 GOMP_OFFLOAD_fini_device (int n)
1206 {
1207   pthread_mutex_lock (&ptx_dev_lock);
1208 
1209   if (ptx_devices[n] != NULL)
1210     {
1211       if (!nvptx_attach_host_thread_to_device (n)
1212 	  || !nvptx_close_device (ptx_devices[n]))
1213 	{
1214 	  pthread_mutex_unlock (&ptx_dev_lock);
1215 	  return false;
1216 	}
1217       ptx_devices[n] = NULL;
1218       instantiated_devices--;
1219     }
1220 
1221   if (instantiated_devices == 0)
1222     {
1223       free (ptx_devices);
1224       ptx_devices = NULL;
1225     }
1226 
1227   pthread_mutex_unlock (&ptx_dev_lock);
1228   return true;
1229 }
1230 
1231 /* Return the libgomp version number we're compatible with.  There is
1232    no requirement for cross-version compatibility.  */
1233 
1234 unsigned
1235 GOMP_OFFLOAD_version (void)
1236 {
1237   return GOMP_VERSION;
1238 }
1239 
1240 /* Initialize __nvptx_clocktick, if present in MODULE.  */
1241 
1242 static void
1243 nvptx_set_clocktick (CUmodule module, struct ptx_device *dev)
1244 {
1245   CUdeviceptr dptr;
1246   CUresult r = CUDA_CALL_NOCHECK (cuModuleGetGlobal, &dptr, NULL,
1247 				  module, "__nvptx_clocktick");
1248   if (r == CUDA_ERROR_NOT_FOUND)
1249     return;
1250   if (r != CUDA_SUCCESS)
1251     GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r));
1252   double __nvptx_clocktick = 1e-3 / dev->clock_khz;
1253   r = CUDA_CALL_NOCHECK (cuMemcpyHtoD, dptr, &__nvptx_clocktick,
1254 			 sizeof (__nvptx_clocktick));
1255   if (r != CUDA_SUCCESS)
1256     GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r));
1257 }
1258 
1259 /* Load the (partial) program described by TARGET_DATA to device
1260    number ORD.  Allocate and return TARGET_TABLE.  */
1261 
1262 int
1263 GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
1264 			 struct addr_pair **target_table)
1265 {
1266   CUmodule module;
1267   const char *const *var_names;
1268   const struct targ_fn_launch *fn_descs;
1269   unsigned int fn_entries, var_entries, other_entries, i, j;
1270   struct targ_fn_descriptor *targ_fns;
1271   struct addr_pair *targ_tbl;
1272   const nvptx_tdata_t *img_header = (const nvptx_tdata_t *) target_data;
1273   struct ptx_image_data *new_image;
1274   struct ptx_device *dev;
1275 
1276   if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
1277     {
1278       GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
1279 			 " (expected %u, received %u)",
1280 			 GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version));
1281       return -1;
1282     }
1283 
1284   if (!nvptx_attach_host_thread_to_device (ord)
1285       || !link_ptx (&module, img_header->ptx_objs, img_header->ptx_num))
1286     return -1;
1287 
1288   dev = ptx_devices[ord];
1289 
1290   /* The mkoffload utility emits a struct of pointers/integers at the
1291      start of each offload image.  The array of kernel names and the
1292      function addresses form a one-to-one correspondence.  */
1293 
1294   var_entries = img_header->var_num;
1295   var_names = img_header->var_names;
1296   fn_entries = img_header->fn_num;
1297   fn_descs = img_header->fn_descs;
1298 
1299   /* Currently, the only other entry kind is 'device number'.  */
1300   other_entries = 1;
1301 
1302   targ_tbl = GOMP_PLUGIN_malloc (sizeof (struct addr_pair)
1303 				 * (fn_entries + var_entries + other_entries));
1304   targ_fns = GOMP_PLUGIN_malloc (sizeof (struct targ_fn_descriptor)
1305 				 * fn_entries);
1306 
1307   *target_table = targ_tbl;
1308 
1309   new_image = GOMP_PLUGIN_malloc (sizeof (struct ptx_image_data));
1310   new_image->target_data = target_data;
1311   new_image->module = module;
1312   new_image->fns = targ_fns;
1313 
1314   pthread_mutex_lock (&dev->image_lock);
1315   new_image->next = dev->images;
1316   dev->images = new_image;
1317   pthread_mutex_unlock (&dev->image_lock);
1318 
1319   for (i = 0; i < fn_entries; i++, targ_fns++, targ_tbl++)
1320     {
1321       CUfunction function;
1322       int nregs, mthrs;
1323 
1324       CUDA_CALL_ERET (-1, cuModuleGetFunction, &function, module,
1325 		      fn_descs[i].fn);
1326       CUDA_CALL_ERET (-1, cuFuncGetAttribute, &nregs,
1327 		      CU_FUNC_ATTRIBUTE_NUM_REGS, function);
1328       CUDA_CALL_ERET (-1, cuFuncGetAttribute, &mthrs,
1329 		      CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, function);
1330 
1331       targ_fns->fn = function;
1332       targ_fns->launch = &fn_descs[i];
1333       targ_fns->regs_per_thread = nregs;
1334       targ_fns->max_threads_per_block = mthrs;
1335 
1336       targ_tbl->start = (uintptr_t) targ_fns;
1337       targ_tbl->end = targ_tbl->start + 1;
1338     }
1339 
1340   for (j = 0; j < var_entries; j++, targ_tbl++)
1341     {
1342       CUdeviceptr var;
1343       size_t bytes;
1344 
1345       CUDA_CALL_ERET (-1, cuModuleGetGlobal,
1346 		      &var, &bytes, module, var_names[j]);
1347 
1348       targ_tbl->start = (uintptr_t) var;
1349       targ_tbl->end = targ_tbl->start + bytes;
1350     }
1351 
1352   CUdeviceptr device_num_varptr;
1353   size_t device_num_varsize;
1354   CUresult r = CUDA_CALL_NOCHECK (cuModuleGetGlobal, &device_num_varptr,
1355 				  &device_num_varsize, module,
1356 				  STRINGX (GOMP_DEVICE_NUM_VAR));
1357   if (r == CUDA_SUCCESS)
1358     {
1359       targ_tbl->start = (uintptr_t) device_num_varptr;
1360       targ_tbl->end = (uintptr_t) (device_num_varptr + device_num_varsize);
1361     }
1362   else
1363     /* The 'GOMP_DEVICE_NUM_VAR' variable was not in this image.  */
1364     targ_tbl->start = targ_tbl->end = 0;
1365   targ_tbl++;
1366 
1367   nvptx_set_clocktick (module, dev);
1368 
1369   return fn_entries + var_entries + other_entries;
1370 }
1371 
1372 /* Unload the program described by TARGET_DATA.  DEV_DATA is the
1373    function descriptors allocated by G_O_load_image.  */
1374 
1375 bool
1376 GOMP_OFFLOAD_unload_image (int ord, unsigned version, const void *target_data)
1377 {
1378   struct ptx_image_data *image, **prev_p;
1379   struct ptx_device *dev = ptx_devices[ord];
1380 
1381   if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
1382     {
1383       GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
1384 			 " (expected %u, received %u)",
1385 			 GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version));
1386       return false;
1387     }
1388 
1389   bool ret = true;
1390   pthread_mutex_lock (&dev->image_lock);
1391   for (prev_p = &dev->images; (image = *prev_p) != 0; prev_p = &image->next)
1392     if (image->target_data == target_data)
1393       {
1394 	*prev_p = image->next;
1395 	if (CUDA_CALL_NOCHECK (cuModuleUnload, image->module) != CUDA_SUCCESS)
1396 	  ret = false;
1397 	free (image->fns);
1398 	free (image);
1399 	break;
1400       }
1401   pthread_mutex_unlock (&dev->image_lock);
1402   return ret;
1403 }
1404 
1405 void *
1406 GOMP_OFFLOAD_alloc (int ord, size_t size)
1407 {
1408   if (!nvptx_attach_host_thread_to_device (ord))
1409     return NULL;
1410 
1411   struct ptx_device *ptx_dev = ptx_devices[ord];
1412   struct ptx_free_block *blocks, *tmp;
1413 
1414   pthread_mutex_lock (&ptx_dev->free_blocks_lock);
1415   blocks = ptx_dev->free_blocks;
1416   ptx_dev->free_blocks = NULL;
1417   pthread_mutex_unlock (&ptx_dev->free_blocks_lock);
1418 
1419   nvptx_stacks_free (ptx_dev, false);
1420 
1421   while (blocks)
1422     {
1423       tmp = blocks->next;
1424       nvptx_free (blocks->ptr, ptx_dev);
1425       free (blocks);
1426       blocks = tmp;
1427     }
1428 
1429   void *d = nvptx_alloc (size, true);
1430   if (d)
1431     return d;
1432   else
1433     {
1434       /* Memory allocation failed.  Try freeing the stacks block, and
1435 	 retrying.  */
1436       nvptx_stacks_free (ptx_dev, true);
1437       return nvptx_alloc (size, false);
1438     }
1439 }
1440 
1441 bool
1442 GOMP_OFFLOAD_free (int ord, void *ptr)
1443 {
1444   return (nvptx_attach_host_thread_to_device (ord)
1445 	  && nvptx_free (ptr, ptx_devices[ord]));
1446 }
1447 
1448 void
1449 GOMP_OFFLOAD_openacc_exec (void (*fn) (void *), size_t mapnum,
1450 			   void **hostaddrs, void **devaddrs,
1451 			   unsigned *dims, void *targ_mem_desc)
1452 {
1453   GOMP_PLUGIN_debug (0, "  %s: prepare mappings\n", __FUNCTION__);
1454 
1455   struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
1456   acc_prof_info *prof_info = thr->prof_info;
1457   acc_event_info data_event_info;
1458   acc_api_info *api_info = thr->api_info;
1459   bool profiling_p = __builtin_expect (prof_info != NULL, false);
1460 
1461   void **hp = NULL;
1462   CUdeviceptr dp = 0;
1463 
1464   if (mapnum > 0)
1465     {
1466       size_t s = mapnum * sizeof (void *);
1467       hp = alloca (s);
1468       for (int i = 0; i < mapnum; i++)
1469 	hp[i] = (devaddrs[i] ? devaddrs[i] : hostaddrs[i]);
1470       CUDA_CALL_ASSERT (cuMemAlloc, &dp, s);
1471       if (profiling_p)
1472 	goacc_profiling_acc_ev_alloc (thr, (void *) dp, s);
1473     }
1474 
1475   /* Copy the (device) pointers to arguments to the device (dp and hp might in
1476      fact have the same value on a unified-memory system).  */
1477   if (mapnum > 0)
1478     {
1479       if (profiling_p)
1480 	{
1481 	  prof_info->event_type = acc_ev_enqueue_upload_start;
1482 
1483 	  data_event_info.data_event.event_type = prof_info->event_type;
1484 	  data_event_info.data_event.valid_bytes
1485 	    = _ACC_DATA_EVENT_INFO_VALID_BYTES;
1486 	  data_event_info.data_event.parent_construct
1487 	    = acc_construct_parallel;
1488 	  data_event_info.data_event.implicit = 1; /* Always implicit.  */
1489 	  data_event_info.data_event.tool_info = NULL;
1490 	  data_event_info.data_event.var_name = NULL;
1491 	  data_event_info.data_event.bytes = mapnum * sizeof (void *);
1492 	  data_event_info.data_event.host_ptr = hp;
1493 	  data_event_info.data_event.device_ptr = (const void *) dp;
1494 
1495 	  api_info->device_api = acc_device_api_cuda;
1496 
1497 	  GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
1498 						api_info);
1499 	}
1500       CUDA_CALL_ASSERT (cuMemcpyHtoD, dp, (void *) hp,
1501 			mapnum * sizeof (void *));
1502       if (profiling_p)
1503 	{
1504 	  prof_info->event_type = acc_ev_enqueue_upload_end;
1505 	  data_event_info.data_event.event_type = prof_info->event_type;
1506 	  GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
1507 						api_info);
1508 	}
1509     }
1510 
1511   nvptx_exec (fn, mapnum, hostaddrs, devaddrs, dims, targ_mem_desc,
1512 	      dp, NULL);
1513 
1514   CUresult r = CUDA_CALL_NOCHECK (cuStreamSynchronize, NULL);
1515   const char *maybe_abort_msg = "(perhaps abort was called)";
1516   if (r == CUDA_ERROR_LAUNCH_FAILED)
1517     GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s %s\n", cuda_error (r),
1518 		       maybe_abort_msg);
1519   else if (r != CUDA_SUCCESS)
1520     GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
1521 
1522   CUDA_CALL_ASSERT (cuMemFree, dp);
1523   if (profiling_p)
1524     goacc_profiling_acc_ev_free (thr, (void *) dp);
1525 }
1526 
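/* Stream callback used to free the kernel argument block allocated by
   GOMP_OFFLOAD_openacc_async_exec once the asynchronous launch is done.  */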
1527 static void
1528 cuda_free_argmem (void *ptr)
1529 {
1530   void **block = (void **) ptr;
1531   nvptx_free (block[0], (struct ptx_device *) block[1]);
1532   free (block);
1533 }
1534 
1535 void
1536 GOMP_OFFLOAD_openacc_async_exec (void (*fn) (void *), size_t mapnum,
1537 				 void **hostaddrs, void **devaddrs,
1538 				 unsigned *dims, void *targ_mem_desc,
1539 				 struct goacc_asyncqueue *aq)
1540 {
1541   GOMP_PLUGIN_debug (0, "  %s: prepare mappings\n", __FUNCTION__);
1542 
1543   struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
1544   acc_prof_info *prof_info = thr->prof_info;
1545   acc_event_info data_event_info;
1546   acc_api_info *api_info = thr->api_info;
1547   bool profiling_p = __builtin_expect (prof_info != NULL, false);
1548 
1549   void **hp = NULL;
1550   CUdeviceptr dp = 0;
1551   void **block = NULL;
1552 
1553   if (mapnum > 0)
1554     {
1555       size_t s = mapnum * sizeof (void *);
1556       block = (void **) GOMP_PLUGIN_malloc (2 * sizeof (void *) + s);
1557       hp = block + 2;
1558       for (int i = 0; i < mapnum; i++)
1559 	hp[i] = (devaddrs[i] ? devaddrs[i] : hostaddrs[i]);
1560       CUDA_CALL_ASSERT (cuMemAlloc, &dp, s);
1561       if (profiling_p)
1562 	goacc_profiling_acc_ev_alloc (thr, (void *) dp, s);
1563     }
1564 
1565   /* Copy the (device) pointers to arguments to the device (dp and hp might in
1566      fact have the same value on a unified-memory system).  */
1567   if (mapnum > 0)
1568     {
1569       if (profiling_p)
1570 	{
1571 	  prof_info->event_type = acc_ev_enqueue_upload_start;
1572 
1573 	  data_event_info.data_event.event_type = prof_info->event_type;
1574 	  data_event_info.data_event.valid_bytes
1575 	    = _ACC_DATA_EVENT_INFO_VALID_BYTES;
1576 	  data_event_info.data_event.parent_construct
1577 	    = acc_construct_parallel;
1578 	  data_event_info.data_event.implicit = 1; /* Always implicit.  */
1579 	  data_event_info.data_event.tool_info = NULL;
1580 	  data_event_info.data_event.var_name = NULL;
1581 	  data_event_info.data_event.bytes = mapnum * sizeof (void *);
1582 	  data_event_info.data_event.host_ptr = hp;
1583 	  data_event_info.data_event.device_ptr = (const void *) dp;
1584 
1585 	  api_info->device_api = acc_device_api_cuda;
1586 
1587 	  GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
1588 						api_info);
1589 	}
1590 
1591       CUDA_CALL_ASSERT (cuMemcpyHtoDAsync, dp, (void *) hp,
1592 			mapnum * sizeof (void *), aq->cuda_stream);
1593       block[0] = (void *) dp;
1594 
1595       struct nvptx_thread *nvthd =
1596 	(struct nvptx_thread *) GOMP_PLUGIN_acc_thread ();
1597       block[1] = (void *) nvthd->ptx_dev;
1598 
1599       if (profiling_p)
1600 	{
1601 	  prof_info->event_type = acc_ev_enqueue_upload_end;
1602 	  data_event_info.data_event.event_type = prof_info->event_type;
1603 	  GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
1604 						api_info);
1605 	}
1606     }
1607 
1608   nvptx_exec (fn, mapnum, hostaddrs, devaddrs, dims, targ_mem_desc,
1609 	      dp, aq->cuda_stream);
1610 
1611   if (mapnum > 0)
1612     GOMP_OFFLOAD_openacc_async_queue_callback (aq, cuda_free_argmem, block);
1613 }
1614 
1615 void *
1616 GOMP_OFFLOAD_openacc_create_thread_data (int ord)
1617 {
1618   struct ptx_device *ptx_dev;
1619   struct nvptx_thread *nvthd
1620     = GOMP_PLUGIN_malloc (sizeof (struct nvptx_thread));
1621   CUcontext thd_ctx;
1622 
1623   ptx_dev = ptx_devices[ord];
1624 
1625   assert (ptx_dev);
1626 
1627   CUDA_CALL_ASSERT (cuCtxGetCurrent, &thd_ctx);
1628 
1629   assert (ptx_dev->ctx);
1630 
1631   if (!thd_ctx)
1632     CUDA_CALL_ASSERT (cuCtxPushCurrent, ptx_dev->ctx);
1633 
1634   nvthd->ptx_dev = ptx_dev;
1635 
1636   return (void *) nvthd;
1637 }
1638 
1639 void
1640 GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
1641 {
1642   free (data);
1643 }
1644 
1645 void *
1646 GOMP_OFFLOAD_openacc_cuda_get_current_device (void)
1647 {
1648   return nvptx_get_current_cuda_device ();
1649 }
1650 
1651 void *
1652 GOMP_OFFLOAD_openacc_cuda_get_current_context (void)
1653 {
1654   return nvptx_get_current_cuda_context ();
1655 }
1656 
1657 /* This returns a CUstream.  */
1658 void *
1659 GOMP_OFFLOAD_openacc_cuda_get_stream (struct goacc_asyncqueue *aq)
1660 {
1661   return (void *) aq->cuda_stream;
1662 }
1663 
1664 /* This takes a CUstream.  */
1665 int
1666 GOMP_OFFLOAD_openacc_cuda_set_stream (struct goacc_asyncqueue *aq, void *stream)
1667 {
1668   if (aq->cuda_stream)
1669     {
1670       CUDA_CALL_ASSERT (cuStreamSynchronize, aq->cuda_stream);
1671       CUDA_CALL_ASSERT (cuStreamDestroy, aq->cuda_stream);
1672     }
1673 
1674   aq->cuda_stream = (CUstream) stream;
1675   return 1;
1676 }
1677 
1678 struct goacc_asyncqueue *
1679 GOMP_OFFLOAD_openacc_async_construct (int device __attribute__((unused)))
1680 {
1681   CUstream stream = NULL;
1682   CUDA_CALL_ERET (NULL, cuStreamCreate, &stream, CU_STREAM_DEFAULT);
1683 
1684   struct goacc_asyncqueue *aq
1685     = GOMP_PLUGIN_malloc (sizeof (struct goacc_asyncqueue));
1686   aq->cuda_stream = stream;
1687   return aq;
1688 }
1689 
1690 bool
1691 GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue *aq)
1692 {
1693   CUDA_CALL_ERET (false, cuStreamDestroy, aq->cuda_stream);
1694   free (aq);
1695   return true;
1696 }
1697 
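/* Poll AQ's stream: return 1 if all enqueued work has completed, 0 if work
   is still pending, and -1 on error.  */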
int
GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue *aq)
{
  CUresult r = CUDA_CALL_NOCHECK (cuStreamQuery, aq->cuda_stream);
  if (r == CUDA_SUCCESS)
    return 1;
  if (r == CUDA_ERROR_NOT_READY)
    return 0;

  GOMP_PLUGIN_error ("cuStreamQuery error: %s", cuda_error (r));
  return -1;
}

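/* Wait for all operations enqueued on AQ's stream to complete.  */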
bool
GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue *aq)
{
  CUDA_CALL_ERET (false, cuStreamSynchronize, aq->cuda_stream);
  return true;
}

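/* Order AQ2 after AQ1: record an event on AQ1's stream and make AQ2's stream
   wait for it before executing any further work.  */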
bool
GOMP_OFFLOAD_openacc_async_serialize (struct goacc_asyncqueue *aq1,
				      struct goacc_asyncqueue *aq2)
{
  CUevent e;
  CUDA_CALL_ERET (false, cuEventCreate, &e, CU_EVENT_DISABLE_TIMING);
  CUDA_CALL_ERET (false, cuEventRecord, e, aq1->cuda_stream);
  CUDA_CALL_ERET (false, cuStreamWaitEvent, aq2->cuda_stream, e, 0);
  return true;
}

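/* Trampoline invoked by the CUDA driver for a stream callback: check the
   stream's status, invoke the user callback, and free its descriptor.  */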
static void
cuda_callback_wrapper (CUstream stream, CUresult res, void *ptr)
{
  if (res != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("%s error: %s", __FUNCTION__, cuda_error (res));
  struct nvptx_callback *cb = (struct nvptx_callback *) ptr;
  cb->fn (cb->ptr);
  free (ptr);
}

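/* Arrange for CALLBACK_FN (USERPTR) to be called once all work currently
   enqueued on AQ has completed.  */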
void
GOMP_OFFLOAD_openacc_async_queue_callback (struct goacc_asyncqueue *aq,
					   void (*callback_fn)(void *),
					   void *userptr)
{
  struct nvptx_callback *b = GOMP_PLUGIN_malloc (sizeof (*b));
  b->fn = callback_fn;
  b->ptr = userptr;
  b->aq = aq;
  CUDA_CALL_ASSERT (cuStreamAddCallback, aq->cuda_stream,
		    cuda_callback_wrapper, (void *) b, 0);
}

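/* Validate host pointer H, device pointer D and size S before a memory copy:
   reject null or identical addresses and transfers that would run past the
   end of the device allocation containing D.  */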
static bool
cuda_memcpy_sanity_check (const void *h, const void *d, size_t s)
{
  CUdeviceptr pb;
  size_t ps;
  if (!s)
    return true;
  if (!d)
    {
      GOMP_PLUGIN_error ("invalid device address");
      return false;
    }
  CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) d);
  if (!pb)
    {
      GOMP_PLUGIN_error ("invalid device address");
      return false;
    }
  if (!h)
    {
      GOMP_PLUGIN_error ("invalid host address");
      return false;
    }
  if (d == h)
    {
      GOMP_PLUGIN_error ("invalid host or device address");
      return false;
    }
  if ((void *)(d + s) > (void *)(pb + ps))
    {
      GOMP_PLUGIN_error ("invalid size");
      return false;
    }
  return true;
}

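/* Synchronously copy N bytes from host SRC to device DST on device ORD,
   after validating the addresses with cuda_memcpy_sanity_check.  */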
bool
GOMP_OFFLOAD_host2dev (int ord, void *dst, const void *src, size_t n)
{
  if (!nvptx_attach_host_thread_to_device (ord)
      || !cuda_memcpy_sanity_check (src, dst, n))
    return false;
  CUDA_CALL (cuMemcpyHtoD, (CUdeviceptr) dst, src, n);
  return true;
}

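/* Synchronously copy N bytes from device SRC to host DST on device ORD,
   after validating the addresses with cuda_memcpy_sanity_check.  */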
bool
GOMP_OFFLOAD_dev2host (int ord, void *dst, const void *src, size_t n)
{
  if (!nvptx_attach_host_thread_to_device (ord)
      || !cuda_memcpy_sanity_check (dst, src, n))
    return false;
  CUDA_CALL (cuMemcpyDtoH, dst, (CUdeviceptr) src, n);
  return true;
}

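/* Copy N bytes between two device addresses on device ORD, enqueued as an
   asynchronous device-to-device copy on the default stream.  */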
bool
GOMP_OFFLOAD_dev2dev (int ord, void *dst, const void *src, size_t n)
{
  CUDA_CALL (cuMemcpyDtoDAsync, (CUdeviceptr) dst, (CUdeviceptr) src, n, NULL);
  return true;
}

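/* As GOMP_OFFLOAD_host2dev, but enqueued asynchronously on AQ's stream.  */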
bool
GOMP_OFFLOAD_openacc_async_host2dev (int ord, void *dst, const void *src,
				     size_t n, struct goacc_asyncqueue *aq)
{
  if (!nvptx_attach_host_thread_to_device (ord)
      || !cuda_memcpy_sanity_check (src, dst, n))
    return false;
  CUDA_CALL (cuMemcpyHtoDAsync, (CUdeviceptr) dst, src, n, aq->cuda_stream);
  return true;
}

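/* As GOMP_OFFLOAD_dev2host, but enqueued asynchronously on AQ's stream.  */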
bool
GOMP_OFFLOAD_openacc_async_dev2host (int ord, void *dst, const void *src,
				     size_t n, struct goacc_asyncqueue *aq)
{
  if (!nvptx_attach_host_thread_to_device (ord)
      || !cuda_memcpy_sanity_check (dst, src, n))
    return false;
  CUDA_CALL (cuMemcpyDtoHAsync, dst, (CUdeviceptr) src, n, aq->cuda_stream);
  return true;
}

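/* Answer the OpenACC device property queries for device N: total and free
   memory, device name, vendor, and driver version.  */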
union goacc_property_value
GOMP_OFFLOAD_openacc_get_property (int n, enum goacc_property prop)
{
  union goacc_property_value propval = { .val = 0 };

  pthread_mutex_lock (&ptx_dev_lock);

  if (n >= nvptx_get_num_devices () || n < 0 || ptx_devices[n] == NULL)
    {
      pthread_mutex_unlock (&ptx_dev_lock);
      return propval;
    }

  struct ptx_device *ptx_dev = ptx_devices[n];
  switch (prop)
    {
    case GOACC_PROPERTY_MEMORY:
      {
	size_t total_mem;

	CUDA_CALL_ERET (propval, cuDeviceTotalMem, &total_mem, ptx_dev->dev);
	propval.val = total_mem;
      }
      break;
    case GOACC_PROPERTY_FREE_MEMORY:
      {
	size_t total_mem;
	size_t free_mem;
	CUdevice ctxdev;

	CUDA_CALL_ERET (propval, cuCtxGetDevice, &ctxdev);
	if (ptx_dev->dev == ctxdev)
	  CUDA_CALL_ERET (propval, cuMemGetInfo, &free_mem, &total_mem);
	else if (ptx_dev->ctx)
	  {
	    CUcontext old_ctx;

	    CUDA_CALL_ERET (propval, cuCtxPushCurrent, ptx_dev->ctx);
	    CUDA_CALL_ERET (propval, cuMemGetInfo, &free_mem, &total_mem);
	    CUDA_CALL_ASSERT (cuCtxPopCurrent, &old_ctx);
	  }
	else
	  {
	    CUcontext new_ctx;

	    CUDA_CALL_ERET (propval, cuCtxCreate, &new_ctx, CU_CTX_SCHED_AUTO,
			    ptx_dev->dev);
	    CUDA_CALL_ERET (propval, cuMemGetInfo, &free_mem, &total_mem);
	    CUDA_CALL_ASSERT (cuCtxDestroy, new_ctx);
	  }
	propval.val = free_mem;
      }
      break;
    case GOACC_PROPERTY_NAME:
      propval.ptr = ptx_dev->name;
      break;
    case GOACC_PROPERTY_VENDOR:
      propval.ptr = "Nvidia";
      break;
    case GOACC_PROPERTY_DRIVER:
      propval.ptr = cuda_driver_version_s;
      break;
    default:
      break;
    }

  pthread_mutex_unlock (&ptx_dev_lock);
  return propval;
}

/* Adjust launch dimensions: pick good values for the number of blocks and
   warps, and ensure that the number of warps does not exceed CUDA limits or
   GCC's own limits.  */

static void
nvptx_adjust_launch_bounds (struct targ_fn_descriptor *fn,
			    struct ptx_device *ptx_dev,
			    int *teams_p, int *threads_p)
{
  int max_warps_block = fn->max_threads_per_block / 32;
  /* A maximum of 32 warps per block is an implementation limit in the NVPTX
     backend and libgcc, which matches the documented limit of all GPUs as of
     2015.  */
  if (max_warps_block > 32)
    max_warps_block = 32;
  if (*threads_p <= 0)
    *threads_p = 8;
  if (*threads_p > max_warps_block)
    *threads_p = max_warps_block;

  int regs_per_block = fn->regs_per_thread * 32 * *threads_p;
  /* This is an estimate of how many blocks the device can host
     simultaneously.  The actual limit, which may be lower, can be queried
     with the "occupancy control" driver interface (since CUDA 6.0).  */
  int max_blocks = ptx_dev->regs_per_sm / regs_per_block * ptx_dev->num_sms;
  if (*teams_p <= 0 || *teams_p > max_blocks)
    *teams_p = max_blocks;
}

/* Return the size of per-warp stacks (see gcc -msoft-stack) to use for OpenMP
   target regions.  */

static size_t
nvptx_stacks_size ()
{
  return 128 * 1024;
}

/* Return contiguous storage for NUM stacks, each SIZE bytes.  The lock for
   the storage should be held on entry, and remains held on exit.  */

static void *
nvptx_stacks_acquire (struct ptx_device *ptx_dev, size_t size, int num)
{
  if (ptx_dev->omp_stacks.ptr && ptx_dev->omp_stacks.size >= size * num)
    return (void *) ptx_dev->omp_stacks.ptr;

  /* Free the old, too-small stacks.  */
  if (ptx_dev->omp_stacks.ptr)
    {
      CUresult r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));
      r = CUDA_CALL_NOCHECK (cuMemFree, ptx_dev->omp_stacks.ptr);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r));
    }

  /* Make new and bigger stacks, and remember where we put them and how big
     they are.  */
  CUresult r = CUDA_CALL_NOCHECK (cuMemAlloc, &ptx_dev->omp_stacks.ptr,
				  size * num);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemAlloc error: %s", cuda_error (r));

  ptx_dev->omp_stacks.size = size * num;

  return (void *) ptx_dev->omp_stacks.ptr;
}

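/* Run an OpenMP target region on device ORD: decode the team and thread-limit
   arguments from ARGS, clamp them via nvptx_adjust_launch_bounds, acquire
   soft-stack storage, launch the kernel, and wait for it to finish.  */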
void
GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
{
  struct targ_fn_descriptor *tgt_fn_desc
    = (struct targ_fn_descriptor *) tgt_fn;
  CUfunction function = tgt_fn_desc->fn;
  const struct targ_fn_launch *launch = tgt_fn_desc->launch;
  const char *fn_name = launch->fn;
  CUresult r;
  struct ptx_device *ptx_dev = ptx_devices[ord];
  const char *maybe_abort_msg = "(perhaps abort was called)";
  int teams = 0, threads = 0;

  if (!args)
    GOMP_PLUGIN_fatal ("No target arguments provided");
  while (*args)
    {
      intptr_t id = (intptr_t) *args++, val;
      if (id & GOMP_TARGET_ARG_SUBSEQUENT_PARAM)
	val = (intptr_t) *args++;
      else
	val = id >> GOMP_TARGET_ARG_VALUE_SHIFT;
      if ((id & GOMP_TARGET_ARG_DEVICE_MASK) != GOMP_TARGET_ARG_DEVICE_ALL)
	continue;
      val = val > INT_MAX ? INT_MAX : val;
      id &= GOMP_TARGET_ARG_ID_MASK;
      if (id == GOMP_TARGET_ARG_NUM_TEAMS)
	teams = val;
      else if (id == GOMP_TARGET_ARG_THREAD_LIMIT)
	threads = val;
    }
  nvptx_adjust_launch_bounds (tgt_fn, ptx_dev, &teams, &threads);

  size_t stack_size = nvptx_stacks_size ();

  pthread_mutex_lock (&ptx_dev->omp_stacks.lock);
  void *stacks = nvptx_stacks_acquire (ptx_dev, stack_size, teams * threads);
  void *fn_args[] = {tgt_vars, stacks, (void *) stack_size};
  size_t fn_args_size = sizeof fn_args;
  void *config[] = {
    CU_LAUNCH_PARAM_BUFFER_POINTER, fn_args,
    CU_LAUNCH_PARAM_BUFFER_SIZE, &fn_args_size,
    CU_LAUNCH_PARAM_END
  };
  GOMP_PLUGIN_debug (0, "  %s: kernel %s: launch"
		     " [(teams: %u), 1, 1] [(lanes: 32), (threads: %u), 1]\n",
		     __FUNCTION__, fn_name, teams, threads);
  r = CUDA_CALL_NOCHECK (cuLaunchKernel, function, teams, 1, 1,
			 32, threads, 1, 0, NULL, NULL, config);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r));

  r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
  if (r == CUDA_ERROR_LAUNCH_FAILED)
    GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s", cuda_error (r),
		       maybe_abort_msg);
  else if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));

  pthread_mutex_unlock (&ptx_dev->omp_stacks.lock);
}

/* TODO: Implement GOMP_OFFLOAD_async_run.  */