1 /* Plugin for NVPTX execution.
2 
3    Copyright (C) 2013-2018 Free Software Foundation, Inc.
4 
5    Contributed by Mentor Embedded.
6 
7    This file is part of the GNU Offloading and Multi Processing Library
8    (libgomp).
9 
10    Libgomp is free software; you can redistribute it and/or modify it
11    under the terms of the GNU General Public License as published by
12    the Free Software Foundation; either version 3, or (at your option)
13    any later version.
14 
15    Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
16    WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
17    FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
18    more details.
19 
20    Under Section 7 of GPL version 3, you are granted additional
21    permissions described in the GCC Runtime Library Exception, version
22    3.1, as published by the Free Software Foundation.
23 
24    You should have received a copy of the GNU General Public License and
25    a copy of the GCC Runtime Library Exception along with this program;
26    see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
27    <http://www.gnu.org/licenses/>.  */
28 
29 /* Nvidia PTX-specific parts of OpenACC support.  The CUDA driver
30    library appears to hold some implicit state, but the documentation
31    is not clear as to what that state might be, or how one might
32    propagate it from one thread to another.  */
33 
34 #include "openacc.h"
35 #include "config.h"
36 #include "libgomp-plugin.h"
37 #include "oacc-plugin.h"
38 #include "gomp-constants.h"
39 
40 #include <pthread.h>
41 #include <cuda.h>
42 #include <stdbool.h>
43 #include <stdint.h>
44 #include <limits.h>
45 #include <string.h>
46 #include <stdio.h>
47 #include <unistd.h>
48 #include <assert.h>
49 #include <errno.h>
50 
51 #if PLUGIN_NVPTX_DYNAMIC
52 # include <dlfcn.h>
53 
54 # define CUDA_CALLS \
55 CUDA_ONE_CALL (cuCtxCreate)		\
56 CUDA_ONE_CALL (cuCtxDestroy)		\
57 CUDA_ONE_CALL (cuCtxGetCurrent)		\
58 CUDA_ONE_CALL (cuCtxGetDevice)		\
59 CUDA_ONE_CALL (cuCtxPopCurrent)		\
60 CUDA_ONE_CALL (cuCtxPushCurrent)	\
61 CUDA_ONE_CALL (cuCtxSynchronize)	\
62 CUDA_ONE_CALL (cuDeviceGet)		\
63 CUDA_ONE_CALL (cuDeviceGetAttribute)	\
64 CUDA_ONE_CALL (cuDeviceGetCount)	\
65 CUDA_ONE_CALL (cuEventCreate)		\
66 CUDA_ONE_CALL (cuEventDestroy)		\
67 CUDA_ONE_CALL (cuEventElapsedTime)	\
68 CUDA_ONE_CALL (cuEventQuery)		\
69 CUDA_ONE_CALL (cuEventRecord)		\
70 CUDA_ONE_CALL (cuEventSynchronize)	\
71 CUDA_ONE_CALL (cuFuncGetAttribute)	\
72 CUDA_ONE_CALL (cuGetErrorString)	\
73 CUDA_ONE_CALL (cuInit)			\
74 CUDA_ONE_CALL (cuLaunchKernel)		\
75 CUDA_ONE_CALL (cuLinkAddData)		\
76 CUDA_ONE_CALL (cuLinkComplete)		\
77 CUDA_ONE_CALL (cuLinkCreate)		\
78 CUDA_ONE_CALL (cuLinkDestroy)		\
79 CUDA_ONE_CALL (cuMemAlloc)		\
80 CUDA_ONE_CALL (cuMemAllocHost)		\
81 CUDA_ONE_CALL (cuMemcpy)		\
82 CUDA_ONE_CALL (cuMemcpyDtoDAsync)	\
83 CUDA_ONE_CALL (cuMemcpyDtoH)		\
84 CUDA_ONE_CALL (cuMemcpyDtoHAsync)	\
85 CUDA_ONE_CALL (cuMemcpyHtoD)		\
86 CUDA_ONE_CALL (cuMemcpyHtoDAsync)	\
87 CUDA_ONE_CALL (cuMemFree)		\
88 CUDA_ONE_CALL (cuMemFreeHost)		\
89 CUDA_ONE_CALL (cuMemGetAddressRange)	\
90 CUDA_ONE_CALL (cuMemHostGetDevicePointer)\
91 CUDA_ONE_CALL (cuModuleGetFunction)	\
92 CUDA_ONE_CALL (cuModuleGetGlobal)	\
93 CUDA_ONE_CALL (cuModuleLoad)		\
94 CUDA_ONE_CALL (cuModuleLoadData)	\
95 CUDA_ONE_CALL (cuModuleUnload)		\
96 CUDA_ONE_CALL (cuStreamCreate)		\
97 CUDA_ONE_CALL (cuStreamDestroy)		\
98 CUDA_ONE_CALL (cuStreamQuery)		\
99 CUDA_ONE_CALL (cuStreamSynchronize)	\
100 CUDA_ONE_CALL (cuStreamWaitEvent)
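/* First expansion of CUDA_CALLS: declare one function pointer per driver
   entry point.  init_cuda_lib below redefines CUDA_ONE_CALL so that a second
   expansion resolves each pointer with dlsym.  */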
101 # define CUDA_ONE_CALL(call) \
102   __typeof (call) *call;
103 struct cuda_lib_s {
104   CUDA_CALLS
105 } cuda_lib;
106 
107 /* -1 if init_cuda_lib has not been called yet, false
108    if it has been and failed, true if it has been and succeeded.  */
109 static signed char cuda_lib_inited = -1;
110 
111 /* Dynamically load the CUDA driver library and initialize function
112    pointers; return false if unsuccessful, true if successful.  */
113 static bool
114 init_cuda_lib (void)
115 {
116   if (cuda_lib_inited != -1)
117     return cuda_lib_inited;
118   const char *cuda_runtime_lib = "libcuda.so.1";
119   void *h = dlopen (cuda_runtime_lib, RTLD_LAZY);
120   cuda_lib_inited = false;
121   if (h == NULL)
122     return false;
123 # undef CUDA_ONE_CALL
124 # define CUDA_ONE_CALL(call) CUDA_ONE_CALL_1 (call)
125 # define CUDA_ONE_CALL_1(call) \
126   cuda_lib.call = dlsym (h, #call);	\
127   if (cuda_lib.call == NULL)		\
128     return false;
129   CUDA_CALLS
130   cuda_lib_inited = true;
131   return true;
132 }
133 # undef CUDA_ONE_CALL
134 # undef CUDA_ONE_CALL_1
135 # define CUDA_CALL_PREFIX cuda_lib.
136 #else
137 # define CUDA_CALL_PREFIX
138 # define init_cuda_lib() true
139 #endif
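/* When the plugin is linked against the CUDA driver directly, the calls below
   resolve to the cuXxx entry points themselves and init_cuda_lib is a no-op
   that always succeeds.  */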
140 
141 /* Convenience macros for the frequently used CUDA library call and
142    error handling sequence as well as CUDA library calls that
143    do the error checking themselves or don't do it at all.  */
144 
145 #define CUDA_CALL_ERET(ERET, FN, ...)		\
146   do {						\
147     unsigned __r				\
148       = CUDA_CALL_PREFIX FN (__VA_ARGS__);	\
149     if (__r != CUDA_SUCCESS)			\
150       {						\
151 	GOMP_PLUGIN_error (#FN " error: %s",	\
152 			   cuda_error (__r));	\
153 	return ERET;				\
154       }						\
155   } while (0)
156 
157 #define CUDA_CALL(FN, ...)			\
158   CUDA_CALL_ERET (false, FN, __VA_ARGS__)
159 
160 #define CUDA_CALL_ASSERT(FN, ...)		\
161   do {						\
162     unsigned __r				\
163       = CUDA_CALL_PREFIX FN (__VA_ARGS__);	\
164     if (__r != CUDA_SUCCESS)			\
165       {						\
166 	GOMP_PLUGIN_fatal (#FN " error: %s",	\
167 			   cuda_error (__r));	\
168       }						\
169   } while (0)
170 
171 #define CUDA_CALL_NOCHECK(FN, ...)		\
172   CUDA_CALL_PREFIX FN (__VA_ARGS__)
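/* For example,
     CUDA_CALL (cuMemAlloc, &d, s);
   calls cuMemAlloc (through the cuda_lib function-pointer table when it was
   dlopened), reports any failure via GOMP_PLUGIN_error and makes the
   enclosing function return false.  CUDA_CALL_ASSERT instead aborts via
   GOMP_PLUGIN_fatal, and CUDA_CALL_NOCHECK leaves checking to the caller.  */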
173 
174 static const char *
175 cuda_error (CUresult r)
176 {
177 #if CUDA_VERSION < 7000
178   /* Specified in documentation and present in library from at least
179      5.5.  Not declared in header file prior to 7.0.  */
180   extern CUresult cuGetErrorString (CUresult, const char **);
181 #endif
182   const char *desc;
183 
184   r = CUDA_CALL_NOCHECK (cuGetErrorString, r, &desc);
185   if (r != CUDA_SUCCESS)
186     desc = "unknown cuda error";
187 
188   return desc;
189 }
190 
191 static unsigned int instantiated_devices = 0;
192 static pthread_mutex_t ptx_dev_lock = PTHREAD_MUTEX_INITIALIZER;
193 
194 struct ptx_stream
195 {
196   CUstream stream;
197   pthread_t host_thread;
198   bool multithreaded;
199 
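  /* Page-locked host buffer H and its device-visible alias D, holding this
     stream's circular argument buffer.  H_BEGIN/H_END bound the buffer,
     H_NEXT is where the next chunk is carved out, H_PREV is the most
     recently pushed chunk, and H_TAIL is the oldest chunk not yet popped.  */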
200   CUdeviceptr d;
201   void *h;
202   void *h_begin;
203   void *h_end;
204   void *h_next;
205   void *h_prev;
206   void *h_tail;
207 
208   struct ptx_stream *next;
209 };
210 
211 /* Thread-specific data for PTX.  */
212 
213 struct nvptx_thread
214 {
215   struct ptx_stream *current_stream;
216   struct ptx_device *ptx_dev;
217 };
218 
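/* Header of one chunk in a stream's circular argument buffer.  map_init
   allocates the backing page with cuMemAllocHost and obtains its device
   alias with cuMemHostGetDevicePointer; map_push reserves a chunk (prefixed
   by this header) for a kernel's argument block, and map_pop retires the
   oldest chunk once the corresponding launch is done with it.  */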
219 struct map
220 {
221   int     async;
222   size_t  size;
223   char    mappings[0];
224 };
225 
226 static bool
227 map_init (struct ptx_stream *s)
228 {
229   int size = getpagesize ();
230 
231   assert (s);
232   assert (!s->d);
233   assert (!s->h);
234 
235   CUDA_CALL (cuMemAllocHost, &s->h, size);
236   CUDA_CALL (cuMemHostGetDevicePointer, &s->d, s->h, 0);
237 
238   assert (s->h);
239 
240   s->h_begin = s->h;
241   s->h_end = s->h_begin + size;
242   s->h_next = s->h_prev = s->h_tail = s->h_begin;
243 
244   assert (s->h_next);
245   assert (s->h_end);
246   return true;
247 }
248 
249 static bool
250 map_fini (struct ptx_stream *s)
251 {
252   CUDA_CALL (cuMemFreeHost, s->h);
253   return true;
254 }
255 
256 static void
257 map_pop (struct ptx_stream *s)
258 {
259   struct map *m;
260 
261   assert (s != NULL);
262   assert (s->h_next);
263   assert (s->h_prev);
264   assert (s->h_tail);
265 
266   m = s->h_tail;
267 
268   s->h_tail += m->size;
269 
270   if (s->h_tail >= s->h_end)
271     s->h_tail = s->h_begin + (int) (s->h_tail - s->h_end);
272 
273   if (s->h_next == s->h_tail)
274     s->h_prev = s->h_next;
275 
276   assert (s->h_next >= s->h_begin);
277   assert (s->h_tail >= s->h_begin);
278   assert (s->h_prev >= s->h_begin);
279 
280   assert (s->h_next <= s->h_end);
281   assert (s->h_tail <= s->h_end);
282   assert (s->h_prev <= s->h_end);
283 }
284 
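/* Reserve SIZE bytes (plus a struct map header) in stream S's argument
   buffer on behalf of async queue ASYNC, wrapping back to the start of the
   page when the remaining space is too small.  The host and device addresses
   of the usable chunk are returned in *H and *D.  */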
285 static void
286 map_push (struct ptx_stream *s, int async, size_t size, void **h, void **d)
287 {
288   int left;
289   int offset;
290   struct map *m;
291 
292   assert (s != NULL);
293 
294   left = s->h_end - s->h_next;
295   size += sizeof (struct map);
296 
297   assert (s->h_prev);
298   assert (s->h_next);
299 
300   if (size >= left)
301     {
302       m = s->h_prev;
303       m->size += left;
304       s->h_next = s->h_begin;
305 
306       if (s->h_next + size > s->h_end)
307 	GOMP_PLUGIN_fatal ("unable to push map");
308     }
309 
310   assert (s->h_next);
311 
312   m = s->h_next;
313   m->async = async;
314   m->size = size;
315 
316   offset = (void *)&m->mappings[0] - s->h;
317 
318   *d = (void *)(s->d + offset);
319   *h = (void *)(s->h + offset);
320 
321   s->h_prev = s->h_next;
322   s->h_next += size;
323 
324   assert (s->h_prev);
325   assert (s->h_next);
326 
327   assert (s->h_next >= s->h_begin);
328   assert (s->h_tail >= s->h_begin);
329   assert (s->h_prev >= s->h_begin);
330   assert (s->h_next <= s->h_end);
331   assert (s->h_tail <= s->h_end);
332   assert (s->h_prev <= s->h_end);
333 
334   return;
335 }
336 
337 /* Target data function launch information.  */
338 
339 struct targ_fn_launch
340 {
341   const char *fn;
342   unsigned short dim[GOMP_DIM_MAX];
343 };
344 
345 /* Target PTX object information.  */
346 
347 struct targ_ptx_obj
348 {
349   const char *code;
350   size_t size;
351 };
352 
353 /* Target data image information.  */
354 
355 typedef struct nvptx_tdata
356 {
357   const struct targ_ptx_obj *ptx_objs;
358   unsigned ptx_num;
359 
360   const char *const *var_names;
361   unsigned var_num;
362 
363   const struct targ_fn_launch *fn_descs;
364   unsigned fn_num;
365 } nvptx_tdata_t;
366 
367 /* Descriptor of a loaded function.  */
368 
369 struct targ_fn_descriptor
370 {
371   CUfunction fn;
372   const struct targ_fn_launch *launch;
373   int regs_per_thread;
374   int max_threads_per_block;
375 };
376 
377 /* A loaded PTX image.  */
378 struct ptx_image_data
379 {
380   const void *target_data;
381   CUmodule module;
382 
383   struct targ_fn_descriptor *fns;  /* Array of functions.  */
384 
385   struct ptx_image_data *next;
386 };
387 
388 struct ptx_device
389 {
390   CUcontext ctx;
391   bool ctx_shared;
392   CUdevice dev;
393   struct ptx_stream *null_stream;
394   /* All non-null streams associated with this device (actually context),
395      either created implicitly or passed in from the user (via
396      acc_set_cuda_stream).  */
397   struct ptx_stream *active_streams;
398   struct {
399     struct ptx_stream **arr;
400     int size;
401   } async_streams;
402   /* A lock for use when manipulating the above stream list and array.  */
403   pthread_mutex_t stream_lock;
404   int ord;
405   bool overlap;
406   bool map;
407   bool concur;
408   bool mkern;
409   int  mode;
410   int clock_khz;
411   int num_sms;
412   int regs_per_block;
413   int regs_per_sm;
414 
415   struct ptx_image_data *images;  /* Images loaded on device.  */
416   pthread_mutex_t image_lock;     /* Lock for above list.  */
417 
418   struct ptx_device *next;
419 };
420 
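/* Kinds of asynchronous work tracked on the global ptx_events list: memory
   copies, kernel launches, explicit synchronisation events, and deferred
   unmapping of variables after an asynchronous region.  */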
421 enum ptx_event_type
422 {
423   PTX_EVT_MEM,
424   PTX_EVT_KNL,
425   PTX_EVT_SYNC,
426   PTX_EVT_ASYNC_CLEANUP
427 };
428 
429 struct ptx_event
430 {
431   CUevent *evt;
432   int type;
433   void *addr;
434   int ord;
435   int val;
436 
437   struct ptx_event *next;
438 };
439 
440 static pthread_mutex_t ptx_event_lock;
441 static struct ptx_event *ptx_events;
442 
443 static struct ptx_device **ptx_devices;
444 
445 static inline struct nvptx_thread *
446 nvptx_thread (void)
447 {
448   return (struct nvptx_thread *) GOMP_PLUGIN_acc_thread ();
449 }
450 
451 static bool
452 init_streams_for_device (struct ptx_device *ptx_dev, int concurrency)
453 {
454   int i;
455   struct ptx_stream *null_stream
456     = GOMP_PLUGIN_malloc (sizeof (struct ptx_stream));
457 
458   null_stream->stream = NULL;
459   null_stream->host_thread = pthread_self ();
460   null_stream->multithreaded = true;
461   null_stream->d = (CUdeviceptr) NULL;
462   null_stream->h = NULL;
463   if (!map_init (null_stream))
464     return false;
465 
466   ptx_dev->null_stream = null_stream;
467   ptx_dev->active_streams = NULL;
468   pthread_mutex_init (&ptx_dev->stream_lock, NULL);
469 
470   if (concurrency < 1)
471     concurrency = 1;
472 
473   /* This is just a guess -- make space for as many async streams as the
474      current device is capable of concurrently executing.  This can grow
475      later as necessary.  No streams are created yet.  */
476   ptx_dev->async_streams.arr
477     = GOMP_PLUGIN_malloc (concurrency * sizeof (struct ptx_stream *));
478   ptx_dev->async_streams.size = concurrency;
479 
480   for (i = 0; i < concurrency; i++)
481     ptx_dev->async_streams.arr[i] = NULL;
482 
483   return true;
484 }
485 
486 static bool
487 fini_streams_for_device (struct ptx_device *ptx_dev)
488 {
489   free (ptx_dev->async_streams.arr);
490 
491   bool ret = true;
492   while (ptx_dev->active_streams != NULL)
493     {
494       struct ptx_stream *s = ptx_dev->active_streams;
495       ptx_dev->active_streams = ptx_dev->active_streams->next;
496 
497       ret &= map_fini (s);
498 
499       CUresult r = CUDA_CALL_NOCHECK (cuStreamDestroy, s->stream);
500       if (r != CUDA_SUCCESS)
501 	{
502 	  GOMP_PLUGIN_error ("cuStreamDestroy error: %s", cuda_error (r));
503 	  ret = false;
504 	}
505       free (s);
506     }
507 
508   ret &= map_fini (ptx_dev->null_stream);
509   free (ptx_dev->null_stream);
510   return ret;
511 }
512 
513 /* Select a stream for the (OpenACC-semantics) ASYNC argument for the current
514    thread THREAD (and also current device/context).  If CREATE is true, create
515    the stream if it does not exist (or use EXISTING if it is non-NULL), and
516    associate the stream with the same thread argument.  Return the stream to
517    use as the result.  */
518 
519 static struct ptx_stream *
520 select_stream_for_async (int async, pthread_t thread, bool create,
521 			 CUstream existing)
522 {
523   struct nvptx_thread *nvthd = nvptx_thread ();
524   /* Local copy of TLS variable.  */
525   struct ptx_device *ptx_dev = nvthd->ptx_dev;
526   struct ptx_stream *stream = NULL;
527   int orig_async = async;
528 
529   /* The special value acc_async_noval (-1) maps (for now) to an
530      implicitly-created stream, which is then handled the same as any other
531      numbered async stream.  Other options are available, e.g. using the null
532      stream for anonymous async operations, or choosing an idle stream from an
533      active set.  But, stick with this for now.  */
534   if (async > acc_async_sync)
535     async++;
536 
537   if (create)
538     pthread_mutex_lock (&ptx_dev->stream_lock);
539 
540   /* NOTE: AFAICT there's no particular need for acc_async_sync to map to the
541      null stream, and in fact better performance may be obtainable if it doesn't
542      (because the null stream enforces overly-strict synchronisation with
543      respect to other streams for legacy reasons, and that's probably not
544      needed with OpenACC).  Maybe investigate later.  */
545   if (async == acc_async_sync)
546     stream = ptx_dev->null_stream;
547   else if (async >= 0 && async < ptx_dev->async_streams.size
548 	   && ptx_dev->async_streams.arr[async] && !(create && existing))
549     stream = ptx_dev->async_streams.arr[async];
550   else if (async >= 0 && create)
551     {
552       if (async >= ptx_dev->async_streams.size)
553 	{
554 	  int i, newsize = ptx_dev->async_streams.size * 2;
555 
556 	  if (async >= newsize)
557 	    newsize = async + 1;
558 
559 	  ptx_dev->async_streams.arr
560 	    = GOMP_PLUGIN_realloc (ptx_dev->async_streams.arr,
561 				   newsize * sizeof (struct ptx_stream *));
562 
563 	  for (i = ptx_dev->async_streams.size; i < newsize; i++)
564 	    ptx_dev->async_streams.arr[i] = NULL;
565 
566 	  ptx_dev->async_streams.size = newsize;
567 	}
568 
569       /* Create a new stream on-demand if there isn't one already, or if we're
570 	 setting a particular async value to an existing (externally-provided)
571 	 stream.  */
572       if (!ptx_dev->async_streams.arr[async] || existing)
573         {
574 	  CUresult r;
575 	  struct ptx_stream *s
576 	    = GOMP_PLUGIN_malloc (sizeof (struct ptx_stream));
577 
578 	  if (existing)
579 	    s->stream = existing;
580 	  else
581 	    {
582 	      r = CUDA_CALL_NOCHECK (cuStreamCreate, &s->stream,
583 				     CU_STREAM_DEFAULT);
584 	      if (r != CUDA_SUCCESS)
585 		{
586 		  pthread_mutex_unlock (&ptx_dev->stream_lock);
587 		  GOMP_PLUGIN_fatal ("cuStreamCreate error: %s",
588 				     cuda_error (r));
589 		}
590 	    }
591 
592 	  /* If CREATE is true, we're going to be queueing some work on this
593 	     stream.  Associate it with the current host thread.  */
594 	  s->host_thread = thread;
595 	  s->multithreaded = false;
596 
597 	  s->d = (CUdeviceptr) NULL;
598 	  s->h = NULL;
599 	  if (!map_init (s))
600 	    {
601 	      pthread_mutex_unlock (&ptx_dev->stream_lock);
602 	      GOMP_PLUGIN_fatal ("map_init fail");
603 	    }
604 
605 	  s->next = ptx_dev->active_streams;
606 	  ptx_dev->active_streams = s;
607 	  ptx_dev->async_streams.arr[async] = s;
608 	}
609 
610       stream = ptx_dev->async_streams.arr[async];
611     }
612   else if (async < 0)
613     {
614       if (create)
615 	pthread_mutex_unlock (&ptx_dev->stream_lock);
616       GOMP_PLUGIN_fatal ("bad async %d", async);
617     }
618 
619   if (create)
620     {
621       assert (stream != NULL);
622 
623       /* If we're trying to use the same stream from different threads
624 	 simultaneously, set stream->multithreaded to true.  This affects the
625 	 behaviour of acc_async_test_all and acc_wait_all, which are supposed to
626 	 only wait for asynchronous launches from the same host thread they are
627 	 invoked on.  If multiple threads use the same async value, we make note
628 	 of that here and fall back to testing/waiting for all threads in those
629 	 functions.  */
630       if (thread != stream->host_thread)
631         stream->multithreaded = true;
632 
633       pthread_mutex_unlock (&ptx_dev->stream_lock);
634     }
635   else if (stream && !stream->multithreaded
636 	   && !pthread_equal (stream->host_thread, thread))
637     GOMP_PLUGIN_fatal ("async %d used on wrong thread", orig_async);
638 
639   return stream;
640 }
641 
642 /* Initialize the device.  Return TRUE on success, else FALSE.  PTX_DEV_LOCK
643    should be locked on entry and remains locked on exit.  */
644 
645 static bool
646 nvptx_init (void)
647 {
648   int ndevs;
649 
650   if (instantiated_devices != 0)
651     return true;
652 
653   ptx_events = NULL;
654   pthread_mutex_init (&ptx_event_lock, NULL);
655 
656   if (!init_cuda_lib ())
657     return false;
658 
659   CUDA_CALL (cuInit, 0);
660 
661   CUDA_CALL (cuDeviceGetCount, &ndevs);
662   ptx_devices = GOMP_PLUGIN_malloc_cleared (sizeof (struct ptx_device *)
663 					    * ndevs);
664   return true;
665 }
666 
667 /* Select the N'th PTX device for the current host thread.  The device must
668    have been opened before calling this function.  */
669 
670 static bool
671 nvptx_attach_host_thread_to_device (int n)
672 {
673   CUdevice dev;
674   CUresult r;
675   struct ptx_device *ptx_dev;
676   CUcontext thd_ctx;
677 
678   r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &dev);
679   if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
680     {
681       GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r));
682       return false;
683     }
684 
685   if (r != CUDA_ERROR_INVALID_CONTEXT && dev == n)
686     return true;
687   else
688     {
689       CUcontext old_ctx;
690 
691       ptx_dev = ptx_devices[n];
692       if (!ptx_dev)
693 	{
694 	  GOMP_PLUGIN_error ("device %d not found", n);
695 	  return false;
696 	}
697 
698       CUDA_CALL (cuCtxGetCurrent, &thd_ctx);
699 
700       /* We don't necessarily have a current context (e.g. if it has been
701          destroyed).  Pop it if we do though.  */
702       if (thd_ctx != NULL)
703 	CUDA_CALL (cuCtxPopCurrent, &old_ctx);
704 
705       CUDA_CALL (cuCtxPushCurrent, ptx_dev->ctx);
706     }
707   return true;
708 }
709 
710 static struct ptx_device *
711 nvptx_open_device (int n)
712 {
713   struct ptx_device *ptx_dev;
714   CUdevice dev, ctx_dev;
715   CUresult r;
716   int async_engines, pi;
717 
718   CUDA_CALL_ERET (NULL, cuDeviceGet, &dev, n);
719 
720   ptx_dev = GOMP_PLUGIN_malloc (sizeof (struct ptx_device));
721 
722   ptx_dev->ord = n;
723   ptx_dev->dev = dev;
724   ptx_dev->ctx_shared = false;
725 
726   r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &ctx_dev);
727   if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
728     {
729       GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r));
730       return NULL;
731     }
732 
733   if (r != CUDA_ERROR_INVALID_CONTEXT && ctx_dev != dev)
734     {
735       /* The current host thread has an active context for a different device.
736          Detach it.  */
737       CUcontext old_ctx;
738       CUDA_CALL_ERET (NULL, cuCtxPopCurrent, &old_ctx);
739     }
740 
741   CUDA_CALL_ERET (NULL, cuCtxGetCurrent, &ptx_dev->ctx);
742 
743   if (!ptx_dev->ctx)
744     CUDA_CALL_ERET (NULL, cuCtxCreate, &ptx_dev->ctx, CU_CTX_SCHED_AUTO, dev);
745   else
746     ptx_dev->ctx_shared = true;
747 
748   CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
749 		  &pi, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev);
750   ptx_dev->overlap = pi;
751 
752   CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
753 		  &pi, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev);
754   ptx_dev->map = pi;
755 
756   CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
757 		  &pi, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev);
758   ptx_dev->concur = pi;
759 
760   CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
761 		  &pi, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
762   ptx_dev->mode = pi;
763 
764   CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
765 		  &pi, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);
766   ptx_dev->mkern = pi;
767 
768   CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
769 		  &pi, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
770   ptx_dev->clock_khz = pi;
771 
772   CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
773 		  &pi, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev);
774   ptx_dev->num_sms = pi;
775 
776   CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
777 		  &pi, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, dev);
778   ptx_dev->regs_per_block = pi;
779 
780   /* CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR = 82 is defined only
781      in CUDA 6.0 and newer.  */
782   r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &pi, 82, dev);
783   /* Fallback: use limit of registers per block, which is usually equal.  */
784   if (r == CUDA_ERROR_INVALID_VALUE)
785     pi = ptx_dev->regs_per_block;
786   else if (r != CUDA_SUCCESS)
787     {
788       GOMP_PLUGIN_error ("cuDeviceGetAttribute error: %s", cuda_error (r));
789       return NULL;
790     }
791   ptx_dev->regs_per_sm = pi;
792 
793   CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
794 		  &pi, CU_DEVICE_ATTRIBUTE_WARP_SIZE, dev);
795   if (pi != 32)
796     {
797       GOMP_PLUGIN_error ("Only warp size 32 is supported");
798       return NULL;
799     }
800 
801   r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &async_engines,
802 			 CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev);
803   if (r != CUDA_SUCCESS)
804     async_engines = 1;
805 
806   ptx_dev->images = NULL;
807   pthread_mutex_init (&ptx_dev->image_lock, NULL);
808 
809   if (!init_streams_for_device (ptx_dev, async_engines))
810     return NULL;
811 
812   return ptx_dev;
813 }
814 
815 static bool
816 nvptx_close_device (struct ptx_device *ptx_dev)
817 {
818   if (!ptx_dev)
819     return true;
820 
821   if (!fini_streams_for_device (ptx_dev))
822     return false;
823 
824   pthread_mutex_destroy (&ptx_dev->image_lock);
825 
826   if (!ptx_dev->ctx_shared)
827     CUDA_CALL (cuCtxDestroy, ptx_dev->ctx);
828 
829   free (ptx_dev);
830   return true;
831 }
832 
833 static int
834 nvptx_get_num_devices (void)
835 {
836   int n;
837 
838   /* PR libgomp/65099: Currently, we only support offloading in 64-bit
839      configurations.  */
840   if (sizeof (void *) != 8)
841     {
842       GOMP_PLUGIN_debug (0, "Disabling nvptx offloading;"
843 			 " only 64-bit configurations are supported\n");
844       return 0;
845     }
846 
847   /* This function will be called before the plugin has been initialized in
848      order to enumerate available devices, but CUDA API routines can't be used
849      until cuInit has been called.  Just call it now (but don't yet do any
850      further initialization).  */
851   if (instantiated_devices == 0)
852     {
853       if (!init_cuda_lib ())
854 	return 0;
855       CUresult r = CUDA_CALL_NOCHECK (cuInit, 0);
856       /* This is not an error: e.g. we may have CUDA libraries installed but
857          no devices available.  */
858       if (r != CUDA_SUCCESS)
859 	{
860 	  GOMP_PLUGIN_debug (0, "Disabling nvptx offloading; cuInit: %s\n",
861 			     cuda_error (r));
862 	  return 0;
863 	}
864     }
865 
866   CUDA_CALL_ERET (-1, cuDeviceGetCount, &n);
867   return n;
868 }
869 
870 static void
871 notify_var (const char *var_name, const char *env_var)
872 {
873   if (env_var == NULL)
874     GOMP_PLUGIN_debug (0, "%s: <Not defined>\n", var_name);
875   else
876     GOMP_PLUGIN_debug (0, "%s: '%s'\n", var_name, env_var);
877 }
878 
879 static bool
880 link_ptx (CUmodule *module, const struct targ_ptx_obj *ptx_objs,
881 	  unsigned num_objs)
882 {
883   CUjit_option opts[6];
884   void *optvals[6];
885   float elapsed = 0.0;
886   char elog[1024];
887   char ilog[16384];
888   CUlinkState linkstate;
889   CUresult r;
890   void *linkout;
891   size_t linkoutsize __attribute__ ((unused));
892 
893   opts[0] = CU_JIT_WALL_TIME;
894   optvals[0] = &elapsed;
895 
896   opts[1] = CU_JIT_INFO_LOG_BUFFER;
897   optvals[1] = &ilog[0];
898 
899   opts[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
900   optvals[2] = (void *) sizeof ilog;
901 
902   opts[3] = CU_JIT_ERROR_LOG_BUFFER;
903   optvals[3] = &elog[0];
904 
905   opts[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
906   optvals[4] = (void *) sizeof elog;
907 
908   opts[5] = CU_JIT_LOG_VERBOSE;
909   optvals[5] = (void *) 1;
910 
911   CUDA_CALL (cuLinkCreate, 6, opts, optvals, &linkstate);
912 
913   for (; num_objs--; ptx_objs++)
914     {
915       /* cuLinkAddData's 'data' argument erroneously omits the const
916 	 qualifier.  */
917       GOMP_PLUGIN_debug (0, "Loading:\n---\n%s\n---\n", ptx_objs->code);
918       r = CUDA_CALL_NOCHECK (cuLinkAddData, linkstate, CU_JIT_INPUT_PTX,
919 			     (char *) ptx_objs->code, ptx_objs->size,
920 			     0, 0, 0, 0);
921       if (r != CUDA_SUCCESS)
922 	{
923 	  GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
924 	  GOMP_PLUGIN_error ("cuLinkAddData (ptx_code) error: %s",
925 			     cuda_error (r));
926 	  return false;
927 	}
928     }
929 
930   GOMP_PLUGIN_debug (0, "Linking\n");
931   r = CUDA_CALL_NOCHECK (cuLinkComplete, linkstate, &linkout, &linkoutsize);
932 
933   GOMP_PLUGIN_debug (0, "Link complete: %fms\n", elapsed);
934   GOMP_PLUGIN_debug (0, "Link log %s\n", &ilog[0]);
935 
936   if (r != CUDA_SUCCESS)
937     {
938       GOMP_PLUGIN_error ("cuLinkComplete error: %s", cuda_error (r));
939       return false;
940     }
941 
942   CUDA_CALL (cuModuleLoadData, module, linkout);
943   CUDA_CALL (cuLinkDestroy, linkstate);
944   return true;
945 }
946 
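/* Walk the ptx_events list and release every event belonging to the current
   device that has completed, destroying the CUDA event and, for kernel
   launches, popping the associated argument-buffer chunk.  If MEMMAP_LOCKABLE
   is false, PTX_EVT_ASYNC_CLEANUP events are skipped, because processing
   them requires the memory-map lock that a caller may already hold.  */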
947 static void
948 event_gc (bool memmap_lockable)
949 {
950   struct ptx_event *ptx_event = ptx_events;
951   struct ptx_event *async_cleanups = NULL;
952   struct nvptx_thread *nvthd = nvptx_thread ();
953 
954   pthread_mutex_lock (&ptx_event_lock);
955 
956   while (ptx_event != NULL)
957     {
958       CUresult r;
959       struct ptx_event *e = ptx_event;
960 
961       ptx_event = ptx_event->next;
962 
963       if (e->ord != nvthd->ptx_dev->ord)
964 	continue;
965 
966       r = CUDA_CALL_NOCHECK (cuEventQuery, *e->evt);
967       if (r == CUDA_SUCCESS)
968 	{
969 	  bool append_async = false;
970 	  CUevent *te;
971 
972 	  te = e->evt;
973 
974 	  switch (e->type)
975 	    {
976 	    case PTX_EVT_MEM:
977 	    case PTX_EVT_SYNC:
978 	      break;
979 
980 	    case PTX_EVT_KNL:
981 	      map_pop (e->addr);
982 	      break;
983 
984 	    case PTX_EVT_ASYNC_CLEANUP:
985 	      {
986 		/* The function GOMP_PLUGIN_async_unmap_vars needs to claim the
987 		   memory-map splay tree lock for the current device, so we
988 		   can't call it when one of our callers has already claimed
989 		   the lock.  In that case, just delay the GC for this event
990 		   until later.  */
991 		if (!memmap_lockable)
992 		  continue;
993 
994 		append_async = true;
995 	      }
996 	      break;
997 	    }
998 
999 	  CUDA_CALL_NOCHECK (cuEventDestroy, *te);
1000 	  free ((void *)te);
1001 
1002 	  /* Unlink 'e' from ptx_events list.  */
1003 	  if (ptx_events == e)
1004 	    ptx_events = ptx_events->next;
1005 	  else
1006 	    {
1007 	      struct ptx_event *e_ = ptx_events;
1008 	      while (e_->next != e)
1009 		e_ = e_->next;
1010 	      e_->next = e_->next->next;
1011 	    }
1012 
1013 	  if (append_async)
1014 	    {
1015 	      e->next = async_cleanups;
1016 	      async_cleanups = e;
1017 	    }
1018 	  else
1019 	    free (e);
1020 	}
1021     }
1022 
1023   pthread_mutex_unlock (&ptx_event_lock);
1024 
1025   /* We have to do these here, after ptx_event_lock is released.  */
1026   while (async_cleanups)
1027     {
1028       struct ptx_event *e = async_cleanups;
1029       async_cleanups = async_cleanups->next;
1030 
1031       GOMP_PLUGIN_async_unmap_vars (e->addr, e->val);
1032       free (e);
1033     }
1034 }
1035 
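/* Record a new event of TYPE on the ptx_events list for the current device,
   wrapping the CUDA event E together with the associated address H and
   value VAL for later processing by event_gc.  */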
1036 static void
1037 event_add (enum ptx_event_type type, CUevent *e, void *h, int val)
1038 {
1039   struct ptx_event *ptx_event;
1040   struct nvptx_thread *nvthd = nvptx_thread ();
1041 
1042   assert (type == PTX_EVT_MEM || type == PTX_EVT_KNL || type == PTX_EVT_SYNC
1043 	  || type == PTX_EVT_ASYNC_CLEANUP);
1044 
1045   ptx_event = GOMP_PLUGIN_malloc (sizeof (struct ptx_event));
1046   ptx_event->type = type;
1047   ptx_event->evt = e;
1048   ptx_event->addr = h;
1049   ptx_event->ord = nvthd->ptx_dev->ord;
1050   ptx_event->val = val;
1051 
1052   pthread_mutex_lock (&ptx_event_lock);
1053 
1054   ptx_event->next = ptx_events;
1055   ptx_events = ptx_event;
1056 
1057   pthread_mutex_unlock (&ptx_event_lock);
1058 }
1059 
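/* Launch the offloaded kernel FN (really a targ_fn_descriptor) with MAPNUM
   device pointers DEVADDRS as its argument block, on the stream selected by
   ASYNC.  DIMS gives the launch geometry; compiler-provided values take
   precedence, and any remaining zero entries are filled in from defaults
   derived from GOMP_OPENACC_DIM or the hardware.  */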
1060 static void
1061 nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
1062 	    int async, unsigned *dims, void *targ_mem_desc)
1063 {
1064   struct targ_fn_descriptor *targ_fn = (struct targ_fn_descriptor *) fn;
1065   CUfunction function;
1066   CUresult r;
1067   int i;
1068   struct ptx_stream *dev_str;
1069   void *kargs[1];
1070   void *hp, *dp;
1071   struct nvptx_thread *nvthd = nvptx_thread ();
1072   const char *maybe_abort_msg = "(perhaps abort was called)";
1073 
1074   function = targ_fn->fn;
1075 
1076   dev_str = select_stream_for_async (async, pthread_self (), false, NULL);
1077   assert (dev_str == nvthd->current_stream);
1078 
1079   /* Initialize the launch dimensions.  Typically this is constant,
1080      provided by the device compiler, but we must permit runtime
1081      values.  */
1082   int seen_zero = 0;
1083   for (i = 0; i != GOMP_DIM_MAX; i++)
1084     {
1085       if (targ_fn->launch->dim[i])
1086        dims[i] = targ_fn->launch->dim[i];
1087       if (!dims[i])
1088        seen_zero = 1;
1089     }
1090 
1091   if (seen_zero)
1092     {
1093       /* See if the user provided GOMP_OPENACC_DIM environment
1094 	 variable to specify runtime defaults. */
1095       static int default_dims[GOMP_DIM_MAX];
1096 
1097       pthread_mutex_lock (&ptx_dev_lock);
1098       if (!default_dims[0])
1099 	{
1100 	  const char *var_name = "GOMP_OPENACC_DIM";
1101 	  /* We only read the environment variable once.  You can't
1102 	     change it in the middle of execution.  The syntax is
1103 	     the same as for the -fopenacc-dim compilation option.  */
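	  /* For example, GOMP_OPENACC_DIM=1024::32 requests 1024 gangs and a
	     vector length of 32, while an empty field (here the worker count)
	     keeps the default computed below.  */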
1104 	  const char *env_var = getenv (var_name);
1105 	  notify_var (var_name, env_var);
1106 	  if (env_var)
1107 	    {
1108 	      const char *pos = env_var;
1109 
1110 	      for (i = 0; *pos && i != GOMP_DIM_MAX; i++)
1111 		{
1112 		  if (i && *pos++ != ':')
1113 		    break;
1114 		  if (*pos != ':')
1115 		    {
1116 		      const char *eptr;
1117 
1118 		      errno = 0;
1119 		      long val = strtol (pos, (char **)&eptr, 10);
1120 		      if (errno || val < 0 || (unsigned)val != val)
1121 			break;
1122 		      default_dims[i] = (int)val;
1123 		      pos = eptr;
1124 		    }
1125 		}
1126 	    }
1127 
1128 	  int warp_size, block_size, dev_size, cpu_size;
1129 	  CUdevice dev = nvptx_thread()->ptx_dev->dev;
1130 	  /* 32 is the default for known hardware.  */
1131 	  int gang = 0, worker = 32, vector = 32;
1132 	  CUdevice_attribute cu_tpb, cu_ws, cu_mpc, cu_tpm;
1133 
1134 	  cu_tpb = CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK;
1135 	  cu_ws = CU_DEVICE_ATTRIBUTE_WARP_SIZE;
1136 	  cu_mpc = CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT;
1137 	  cu_tpm  = CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR;
1138 
1139 	  if (CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &block_size, cu_tpb,
1140 				 dev) == CUDA_SUCCESS
1141 	      && CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &warp_size, cu_ws,
1142 				    dev) == CUDA_SUCCESS
1143 	      && CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &dev_size, cu_mpc,
1144 				    dev) == CUDA_SUCCESS
1145 	      && CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &cpu_size, cu_tpm,
1146 				    dev) == CUDA_SUCCESS)
1147 	    {
1148 	      GOMP_PLUGIN_debug (0, " warp_size=%d, block_size=%d,"
1149 				 " dev_size=%d, cpu_size=%d\n",
1150 				 warp_size, block_size, dev_size, cpu_size);
1151 	      gang = (cpu_size / block_size) * dev_size;
1152 	      worker = block_size / warp_size;
1153 	      vector = warp_size;
1154 	    }
1155 
1156 	  /* There is no upper bound on the gang size.  The best size
1157 	     matches the hardware configuration.  Logical gangs are
1158 	     scheduled onto physical hardware.  To maximize usage, we
1159 	     should guess a large number.  */
1160 	  if (default_dims[GOMP_DIM_GANG] < 1)
1161 	    default_dims[GOMP_DIM_GANG] = gang ? gang : 1024;
1162 	  /* The worker size must not exceed the hardware.  */
1163 	  if (default_dims[GOMP_DIM_WORKER] < 1
1164 	      || (default_dims[GOMP_DIM_WORKER] > worker && gang))
1165 	    default_dims[GOMP_DIM_WORKER] = worker;
1166 	  /* The vector size must exactly match the hardware.  */
1167 	  if (default_dims[GOMP_DIM_VECTOR] < 1
1168 	      || (default_dims[GOMP_DIM_VECTOR] != vector && gang))
1169 	    default_dims[GOMP_DIM_VECTOR] = vector;
1170 
1171 	  GOMP_PLUGIN_debug (0, " default dimensions [%d,%d,%d]\n",
1172 			     default_dims[GOMP_DIM_GANG],
1173 			     default_dims[GOMP_DIM_WORKER],
1174 			     default_dims[GOMP_DIM_VECTOR]);
1175 	}
1176       pthread_mutex_unlock (&ptx_dev_lock);
1177 
1178       for (i = 0; i != GOMP_DIM_MAX; i++)
1179 	if (!dims[i])
1180 	  dims[i] = default_dims[i];
1181     }
1182 
1183   /* This reserves a chunk of a pre-allocated page of memory mapped on both
1184      the host and the device. HP is a host pointer to the new chunk, and DP is
1185      the corresponding device pointer.  */
1186   map_push (dev_str, async, mapnum * sizeof (void *), &hp, &dp);
1187 
1188   GOMP_PLUGIN_debug (0, "  %s: prepare mappings\n", __FUNCTION__);
1189 
1190   /* Copy the array of arguments to the mapped page.  */
1191   for (i = 0; i < mapnum; i++)
1192     ((void **) hp)[i] = devaddrs[i];
1193 
1194   /* Copy the (device) pointers to arguments to the device (dp and hp might in
1195      fact have the same value on a unified-memory system).  */
1196   CUDA_CALL_ASSERT (cuMemcpy, (CUdeviceptr) dp, (CUdeviceptr) hp,
1197 		    mapnum * sizeof (void *));
1198   GOMP_PLUGIN_debug (0, "  %s: kernel %s: launch"
1199 		     " gangs=%u, workers=%u, vectors=%u\n",
1200 		     __FUNCTION__, targ_fn->launch->fn, dims[GOMP_DIM_GANG],
1201 		     dims[GOMP_DIM_WORKER], dims[GOMP_DIM_VECTOR]);
1202 
1203   // OpenACC		CUDA
1204   //
1205   // num_gangs		nctaid.x
1206   // num_workers	ntid.y
1207   // vector length	ntid.x
1208 
1209   kargs[0] = &dp;
1210   CUDA_CALL_ASSERT (cuLaunchKernel, function,
1211 		    dims[GOMP_DIM_GANG], 1, 1,
1212 		    dims[GOMP_DIM_VECTOR], dims[GOMP_DIM_WORKER], 1,
1213 		    0, dev_str->stream, kargs, 0);
1214 
1215 #ifndef DISABLE_ASYNC
1216   if (async < acc_async_noval)
1217     {
1218       r = CUDA_CALL_NOCHECK (cuStreamSynchronize, dev_str->stream);
1219       if (r == CUDA_ERROR_LAUNCH_FAILED)
1220 	GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s %s\n", cuda_error (r),
1221 			   maybe_abort_msg);
1222       else if (r != CUDA_SUCCESS)
1223         GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
1224     }
1225   else
1226     {
1227       CUevent *e;
1228 
1229       e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));
1230 
1231       r = CUDA_CALL_NOCHECK (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
1232       if (r == CUDA_ERROR_LAUNCH_FAILED)
1233 	GOMP_PLUGIN_fatal ("cuEventCreate error: %s %s\n", cuda_error (r),
1234 			   maybe_abort_msg);
1235       else if (r != CUDA_SUCCESS)
1236         GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));
1237 
1238       event_gc (true);
1239 
1240       CUDA_CALL_ASSERT (cuEventRecord, *e, dev_str->stream);
1241 
1242       event_add (PTX_EVT_KNL, e, (void *)dev_str, 0);
1243     }
1244 #else
1245   r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
1246   if (r == CUDA_ERROR_LAUNCH_FAILED)
1247     GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
1248 		       maybe_abort_msg);
1249   else if (r != CUDA_SUCCESS)
1250     GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));
1251 #endif
1252 
1253   GOMP_PLUGIN_debug (0, "  %s: kernel %s: finished\n", __FUNCTION__,
1254 		     targ_fn->launch->fn);
1255 
1256 #ifndef DISABLE_ASYNC
1257   if (async < acc_async_noval)
1258 #endif
1259     map_pop (dev_str);
1260 }
1261 
1262 void * openacc_get_current_cuda_context (void);
1263 
1264 static void *
1265 nvptx_alloc (size_t s)
1266 {
1267   CUdeviceptr d;
1268 
1269   CUDA_CALL_ERET (NULL, cuMemAlloc, &d, s);
1270   return (void *) d;
1271 }
1272 
1273 static bool
1274 nvptx_free (void *p)
1275 {
1276   CUdeviceptr pb;
1277   size_t ps;
1278 
1279   CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) p);
1280   if ((CUdeviceptr) p != pb)
1281     {
1282       GOMP_PLUGIN_error ("invalid device address");
1283       return false;
1284     }
1285 
1286   CUDA_CALL (cuMemFree, (CUdeviceptr) p);
1287   return true;
1288 }
1289 
1290 
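/* Copy S bytes from host address H to device address D.  The copy is queued
   asynchronously when a stream other than the null stream is current;
   otherwise it is performed synchronously.  Return false on an invalid
   address or size.  */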
1291 static bool
1292 nvptx_host2dev (void *d, const void *h, size_t s)
1293 {
1294   CUdeviceptr pb;
1295   size_t ps;
1296   struct nvptx_thread *nvthd = nvptx_thread ();
1297 
1298   if (!s)
1299     return true;
1300   if (!d)
1301     {
1302       GOMP_PLUGIN_error ("invalid device address");
1303       return false;
1304     }
1305 
1306   CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) d);
1307 
1308   if (!pb)
1309     {
1310       GOMP_PLUGIN_error ("invalid device address");
1311       return false;
1312     }
1313   if (!h)
1314     {
1315       GOMP_PLUGIN_error ("invalid host address");
1316       return false;
1317     }
1318   if (d == h)
1319     {
1320       GOMP_PLUGIN_error ("invalid host or device address");
1321       return false;
1322     }
1323   if ((void *)(d + s) > (void *)(pb + ps))
1324     {
1325       GOMP_PLUGIN_error ("invalid size");
1326       return false;
1327     }
1328 
1329 #ifndef DISABLE_ASYNC
1330   if (nvthd && nvthd->current_stream != nvthd->ptx_dev->null_stream)
1331     {
1332       CUevent *e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));
1333       CUDA_CALL (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
1334       event_gc (false);
1335       CUDA_CALL (cuMemcpyHtoDAsync,
1336 		 (CUdeviceptr) d, h, s, nvthd->current_stream->stream);
1337       CUDA_CALL (cuEventRecord, *e, nvthd->current_stream->stream);
1338       event_add (PTX_EVT_MEM, e, (void *)h, 0);
1339     }
1340   else
1341 #endif
1342     CUDA_CALL (cuMemcpyHtoD, (CUdeviceptr) d, h, s);
1343 
1344   return true;
1345 }
1346 
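/* Copy S bytes from device address D to host address H, with the same
   asynchronous behaviour and error checking as nvptx_host2dev.  */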
1347 static bool
1348 nvptx_dev2host (void *h, const void *d, size_t s)
1349 {
1350   CUdeviceptr pb;
1351   size_t ps;
1352   struct nvptx_thread *nvthd = nvptx_thread ();
1353 
1354   if (!s)
1355     return true;
1356   if (!d)
1357     {
1358       GOMP_PLUGIN_error ("invalid device address");
1359       return false;
1360     }
1361 
1362   CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) d);
1363 
1364   if (!pb)
1365     {
1366       GOMP_PLUGIN_error ("invalid device address");
1367       return false;
1368     }
1369   if (!h)
1370     {
1371       GOMP_PLUGIN_error ("invalid host address");
1372       return false;
1373     }
1374   if (d == h)
1375     {
1376       GOMP_PLUGIN_error ("invalid host or device address");
1377       return false;
1378     }
1379   if ((void *)(d + s) > (void *)(pb + ps))
1380     {
1381       GOMP_PLUGIN_error ("invalid size");
1382       return false;
1383     }
1384 
1385 #ifndef DISABLE_ASYNC
1386   if (nvthd && nvthd->current_stream != nvthd->ptx_dev->null_stream)
1387     {
1388       CUevent *e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
1389       CUDA_CALL (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
1390       event_gc (false);
1391       CUDA_CALL (cuMemcpyDtoHAsync,
1392 		 h, (CUdeviceptr) d, s, nvthd->current_stream->stream);
1393       CUDA_CALL (cuEventRecord, *e, nvthd->current_stream->stream);
1394       event_add (PTX_EVT_MEM, e, (void *)h, 0);
1395     }
1396   else
1397 #endif
1398     CUDA_CALL (cuMemcpyDtoH, h, (CUdeviceptr) d, s);
1399 
1400   return true;
1401 }
1402 
1403 static void
1404 nvptx_set_async (int async)
1405 {
1406   struct nvptx_thread *nvthd = nvptx_thread ();
1407   nvthd->current_stream
1408     = select_stream_for_async (async, pthread_self (), true, NULL);
1409 }
1410 
1411 static int
1412 nvptx_async_test (int async)
1413 {
1414   CUresult r;
1415   struct ptx_stream *s;
1416 
1417   s = select_stream_for_async (async, pthread_self (), false, NULL);
1418 
1419   if (!s)
1420     GOMP_PLUGIN_fatal ("unknown async %d", async);
1421 
1422   r = CUDA_CALL_NOCHECK (cuStreamQuery, s->stream);
1423   if (r == CUDA_SUCCESS)
1424     {
1425       /* The oacc-parallel.c:goacc_wait function calls this hook to determine
1426 	 whether all work has completed on this stream, and if so omits the call
1427 	 to the wait hook.  If that happens, event_gc might not get called
1428 	 (which prevents variables from getting unmapped and their associated
1429 	 device storage freed), so call it here.  */
1430       event_gc (true);
1431       return 1;
1432     }
1433   else if (r == CUDA_ERROR_NOT_READY)
1434     return 0;
1435 
1436   GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));
1437 
1438   return 0;
1439 }
1440 
1441 static int
1442 nvptx_async_test_all (void)
1443 {
1444   struct ptx_stream *s;
1445   pthread_t self = pthread_self ();
1446   struct nvptx_thread *nvthd = nvptx_thread ();
1447 
1448   pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);
1449 
1450   for (s = nvthd->ptx_dev->active_streams; s != NULL; s = s->next)
1451     {
1452       if ((s->multithreaded || pthread_equal (s->host_thread, self))
1453 	  && CUDA_CALL_NOCHECK (cuStreamQuery,
1454 				s->stream) == CUDA_ERROR_NOT_READY)
1455 	{
1456 	  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
1457 	  return 0;
1458 	}
1459     }
1460 
1461   pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
1462 
1463   event_gc (true);
1464 
1465   return 1;
1466 }
1467 
1468 static void
1469 nvptx_wait (int async)
1470 {
1471   struct ptx_stream *s;
1472 
1473   s = select_stream_for_async (async, pthread_self (), false, NULL);
1474   if (!s)
1475     GOMP_PLUGIN_fatal ("unknown async %d", async);
1476 
1477   CUDA_CALL_ASSERT (cuStreamSynchronize, s->stream);
1478 
1479   event_gc (true);
1480 }
1481 
1482 static void
1483 nvptx_wait_async (int async1, int async2)
1484 {
1485   CUevent *e;
1486   struct ptx_stream *s1, *s2;
1487   pthread_t self = pthread_self ();
1488 
1489   /* The stream that is waiting (rather than being waited for) doesn't
1490      necessarily have to exist already.  */
1491   s2 = select_stream_for_async (async2, self, true, NULL);
1492 
1493   s1 = select_stream_for_async (async1, self, false, NULL);
1494   if (!s1)
1495     GOMP_PLUGIN_fatal ("invalid async 1\n");
1496 
1497   if (s1 == s2)
1498     GOMP_PLUGIN_fatal ("identical parameters");
1499 
1500   e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
1501 
1502   CUDA_CALL_ASSERT (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
1503 
1504   event_gc (true);
1505 
1506   CUDA_CALL_ASSERT (cuEventRecord, *e, s1->stream);
1507 
1508   event_add (PTX_EVT_SYNC, e, NULL, 0);
1509 
1510   CUDA_CALL_ASSERT (cuStreamWaitEvent, s2->stream, *e, 0);
1511 }
1512 
1513 static void
1514 nvptx_wait_all (void)
1515 {
1516   CUresult r;
1517   struct ptx_stream *s;
1518   pthread_t self = pthread_self ();
1519   struct nvptx_thread *nvthd = nvptx_thread ();
1520 
1521   pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);
1522 
1523   /* Wait for active streams initiated by this thread (or by multiple threads)
1524      to complete.  */
1525   for (s = nvthd->ptx_dev->active_streams; s != NULL; s = s->next)
1526     {
1527       if (s->multithreaded || pthread_equal (s->host_thread, self))
1528 	{
1529 	  r = CUDA_CALL_NOCHECK (cuStreamQuery, s->stream);
1530 	  if (r == CUDA_SUCCESS)
1531 	    continue;
1532 	  else if (r != CUDA_ERROR_NOT_READY)
1533 	    GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));
1534 
1535 	  CUDA_CALL_ASSERT (cuStreamSynchronize, s->stream);
1536 	}
1537     }
1538 
1539   pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
1540 
1541   event_gc (true);
1542 }
1543 
1544 static void
1545 nvptx_wait_all_async (int async)
1546 {
1547   struct ptx_stream *waiting_stream, *other_stream;
1548   CUevent *e;
1549   struct nvptx_thread *nvthd = nvptx_thread ();
1550   pthread_t self = pthread_self ();
1551 
1552   /* The stream doing the waiting.  This could be the first mention of the
1553      stream, so create it if necessary.  */
1554   waiting_stream
1555     = select_stream_for_async (async, pthread_self (), true, NULL);
1556 
1557   /* Launches on the null stream already block on other streams in the
1558      context.  */
1559   if (!waiting_stream || waiting_stream == nvthd->ptx_dev->null_stream)
1560     return;
1561 
1562   event_gc (true);
1563 
1564   pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);
1565 
1566   for (other_stream = nvthd->ptx_dev->active_streams;
1567        other_stream != NULL;
1568        other_stream = other_stream->next)
1569     {
1570       if (!other_stream->multithreaded
1571 	  && !pthread_equal (other_stream->host_thread, self))
1572 	continue;
1573 
1574       e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
1575 
1576       CUDA_CALL_ASSERT (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
1577 
1578       /* Record an event on the waited-for stream.  */
1579       CUDA_CALL_ASSERT (cuEventRecord, *e, other_stream->stream);
1580 
1581       event_add (PTX_EVT_SYNC, e, NULL, 0);
1582 
1583       CUDA_CALL_ASSERT (cuStreamWaitEvent, waiting_stream->stream, *e, 0);
1584    }
1585 
1586   pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
1587 }
1588 
1589 static void *
1590 nvptx_get_current_cuda_device (void)
1591 {
1592   struct nvptx_thread *nvthd = nvptx_thread ();
1593 
1594   if (!nvthd || !nvthd->ptx_dev)
1595     return NULL;
1596 
1597   return &nvthd->ptx_dev->dev;
1598 }
1599 
1600 static void *
1601 nvptx_get_current_cuda_context (void)
1602 {
1603   struct nvptx_thread *nvthd = nvptx_thread ();
1604 
1605   if (!nvthd || !nvthd->ptx_dev)
1606     return NULL;
1607 
1608   return nvthd->ptx_dev->ctx;
1609 }
1610 
1611 static void *
1612 nvptx_get_cuda_stream (int async)
1613 {
1614   struct ptx_stream *s;
1615   struct nvptx_thread *nvthd = nvptx_thread ();
1616 
1617   if (!nvthd || !nvthd->ptx_dev)
1618     return NULL;
1619 
1620   s = select_stream_for_async (async, pthread_self (), false, NULL);
1621 
1622   return s ? s->stream : NULL;
1623 }
1624 
1625 static int
1626 nvptx_set_cuda_stream (int async, void *stream)
1627 {
1628   struct ptx_stream *oldstream;
1629   pthread_t self = pthread_self ();
1630   struct nvptx_thread *nvthd = nvptx_thread ();
1631 
1632   if (async < 0)
1633     GOMP_PLUGIN_fatal ("bad async %d", async);
1634 
1635   pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);
1636 
1637   /* We have a list of active streams and an array mapping async values to
1638      entries of that list.  We need to take "ownership" of the passed-in stream,
1639      and add it to our list, removing the previous entry also (if there was one)
1640      in order to prevent resource leaks.  Note the potential for surprise
1641      here: maybe we should keep track of passed-in streams and leave it up to
1642      the user to tidy those up, but that doesn't work for stream handles
1643      returned from acc_get_cuda_stream above...  */
1644 
1645   oldstream = select_stream_for_async (async, self, false, NULL);
1646 
1647   if (oldstream)
1648     {
1649       if (nvthd->ptx_dev->active_streams == oldstream)
1650 	nvthd->ptx_dev->active_streams = nvthd->ptx_dev->active_streams->next;
1651       else
1652 	{
1653 	  struct ptx_stream *s = nvthd->ptx_dev->active_streams;
1654 	  while (s->next != oldstream)
1655 	    s = s->next;
1656 	  s->next = s->next->next;
1657 	}
1658 
1659       CUDA_CALL_ASSERT (cuStreamDestroy, oldstream->stream);
1660 
1661       if (!map_fini (oldstream))
1662 	GOMP_PLUGIN_fatal ("error when freeing host memory");
1663 
1664       free (oldstream);
1665     }
1666 
1667   pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
1668 
1669   (void) select_stream_for_async (async, self, true, (CUstream) stream);
1670 
1671   return 1;
1672 }
1673 
1674 /* Plugin entry points.  */
1675 
1676 const char *
1677 GOMP_OFFLOAD_get_name (void)
1678 {
1679   return "nvptx";
1680 }
1681 
1682 unsigned int
1683 GOMP_OFFLOAD_get_caps (void)
1684 {
1685   return GOMP_OFFLOAD_CAP_OPENACC_200 | GOMP_OFFLOAD_CAP_OPENMP_400;
1686 }
1687 
1688 int
1689 GOMP_OFFLOAD_get_type (void)
1690 {
1691   return OFFLOAD_TARGET_TYPE_NVIDIA_PTX;
1692 }
1693 
1694 int
1695 GOMP_OFFLOAD_get_num_devices (void)
1696 {
1697   return nvptx_get_num_devices ();
1698 }
1699 
1700 bool
1701 GOMP_OFFLOAD_init_device (int n)
1702 {
1703   struct ptx_device *dev;
1704 
1705   pthread_mutex_lock (&ptx_dev_lock);
1706 
1707   if (!nvptx_init () || ptx_devices[n] != NULL)
1708     {
1709       pthread_mutex_unlock (&ptx_dev_lock);
1710       return false;
1711     }
1712 
1713   dev = nvptx_open_device (n);
1714   if (dev)
1715     {
1716       ptx_devices[n] = dev;
1717       instantiated_devices++;
1718     }
1719 
1720   pthread_mutex_unlock (&ptx_dev_lock);
1721 
1722   return dev != NULL;
1723 }
1724 
1725 bool
1726 GOMP_OFFLOAD_fini_device (int n)
1727 {
1728   pthread_mutex_lock (&ptx_dev_lock);
1729 
1730   if (ptx_devices[n] != NULL)
1731     {
1732       if (!nvptx_attach_host_thread_to_device (n)
1733 	  || !nvptx_close_device (ptx_devices[n]))
1734 	{
1735 	  pthread_mutex_unlock (&ptx_dev_lock);
1736 	  return false;
1737 	}
1738       ptx_devices[n] = NULL;
1739       instantiated_devices--;
1740     }
1741 
1742   pthread_mutex_unlock (&ptx_dev_lock);
1743   return true;
1744 }
1745 
1746 /* Return the libgomp version number we're compatible with.  There is
1747    no requirement for cross-version compatibility.  */
1748 
1749 unsigned
1750 GOMP_OFFLOAD_version (void)
1751 {
1752   return GOMP_VERSION;
1753 }
1754 
1755 /* Initialize __nvptx_clocktick, if present in MODULE.  */
1756 
1757 static void
1758 nvptx_set_clocktick (CUmodule module, struct ptx_device *dev)
1759 {
1760   CUdeviceptr dptr;
1761   CUresult r = CUDA_CALL_NOCHECK (cuModuleGetGlobal, &dptr, NULL,
1762 				  module, "__nvptx_clocktick");
1763   if (r == CUDA_ERROR_NOT_FOUND)
1764     return;
1765   if (r != CUDA_SUCCESS)
1766     GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r));
1767   double __nvptx_clocktick = 1e-3 / dev->clock_khz;
1768   r = CUDA_CALL_NOCHECK (cuMemcpyHtoD, dptr, &__nvptx_clocktick,
1769 			 sizeof (__nvptx_clocktick));
1770   if (r != CUDA_SUCCESS)
1771     GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r));
1772 }
1773 
1774 /* Load the (partial) program described by TARGET_DATA to device
1775    number ORD.  Allocate and return TARGET_TABLE.  */
1776 
1777 int
1778 GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
1779 			 struct addr_pair **target_table)
1780 {
1781   CUmodule module;
1782   const char *const *var_names;
1783   const struct targ_fn_launch *fn_descs;
1784   unsigned int fn_entries, var_entries, i, j;
1785   struct targ_fn_descriptor *targ_fns;
1786   struct addr_pair *targ_tbl;
1787   const nvptx_tdata_t *img_header = (const nvptx_tdata_t *) target_data;
1788   struct ptx_image_data *new_image;
1789   struct ptx_device *dev;
1790 
1791   if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
1792     {
1793       GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
1794 			 " (expected %u, received %u)",
1795 			 GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version));
1796       return -1;
1797     }
1798 
1799   if (!nvptx_attach_host_thread_to_device (ord)
1800       || !link_ptx (&module, img_header->ptx_objs, img_header->ptx_num))
1801     return -1;
1802 
1803   dev = ptx_devices[ord];
1804 
1805   /* The mkoffload utility emits a struct of pointers/integers at the
1806      start of each offload image.  The array of kernel names and the
1807      function addresses form a one-to-one correspondence.  */
1808 
  var_entries = img_header->var_num;
  var_names = img_header->var_names;
  fn_entries = img_header->fn_num;
  fn_descs = img_header->fn_descs;

  targ_tbl = GOMP_PLUGIN_malloc (sizeof (struct addr_pair)
				 * (fn_entries + var_entries));
  targ_fns = GOMP_PLUGIN_malloc (sizeof (struct targ_fn_descriptor)
				 * fn_entries);

  *target_table = targ_tbl;

  new_image = GOMP_PLUGIN_malloc (sizeof (struct ptx_image_data));
  new_image->target_data = target_data;
  new_image->module = module;
  new_image->fns = targ_fns;

  pthread_mutex_lock (&dev->image_lock);
  new_image->next = dev->images;
  dev->images = new_image;
  pthread_mutex_unlock (&dev->image_lock);

  for (i = 0; i < fn_entries; i++, targ_fns++, targ_tbl++)
    {
      CUfunction function;
      int nregs, mthrs;

      CUDA_CALL_ERET (-1, cuModuleGetFunction, &function, module,
		      fn_descs[i].fn);
      CUDA_CALL_ERET (-1, cuFuncGetAttribute, &nregs,
		      CU_FUNC_ATTRIBUTE_NUM_REGS, function);
      CUDA_CALL_ERET (-1, cuFuncGetAttribute, &mthrs,
		      CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, function);

      targ_fns->fn = function;
      targ_fns->launch = &fn_descs[i];
      targ_fns->regs_per_thread = nregs;
      targ_fns->max_threads_per_block = mthrs;

      targ_tbl->start = (uintptr_t) targ_fns;
      targ_tbl->end = targ_tbl->start + 1;
    }

  for (j = 0; j < var_entries; j++, targ_tbl++)
    {
      CUdeviceptr var;
      size_t bytes;

      CUDA_CALL_ERET (-1, cuModuleGetGlobal,
		      &var, &bytes, module, var_names[j]);

      targ_tbl->start = (uintptr_t) var;
      targ_tbl->end = targ_tbl->start + bytes;
    }

  nvptx_set_clocktick (module, dev);

  return fn_entries + var_entries;
}

/* Unload the program described by TARGET_DATA.  DEV_DATA is the
   function descriptors allocated by G_O_load_image.  */

bool
GOMP_OFFLOAD_unload_image (int ord, unsigned version, const void *target_data)
{
  struct ptx_image_data *image, **prev_p;
  struct ptx_device *dev = ptx_devices[ord];

  if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
    {
      GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
			 " (expected %u, received %u)",
			 GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version));
      return false;
    }

  bool ret = true;
  pthread_mutex_lock (&dev->image_lock);
  for (prev_p = &dev->images; (image = *prev_p) != 0; prev_p = &image->next)
    if (image->target_data == target_data)
      {
	*prev_p = image->next;
	if (CUDA_CALL_NOCHECK (cuModuleUnload, image->module) != CUDA_SUCCESS)
	  ret = false;
	free (image->fns);
	free (image);
	break;
      }
  pthread_mutex_unlock (&dev->image_lock);
  return ret;
}

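/* Allocate SIZE bytes of device memory on device ORD; return NULL if the
   device cannot be made current or the allocation fails.  */
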
void *
GOMP_OFFLOAD_alloc (int ord, size_t size)
{
  if (!nvptx_attach_host_thread_to_device (ord))
    return NULL;
  return nvptx_alloc (size);
}

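/* Free device memory at PTR on device ORD.  */
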
bool
GOMP_OFFLOAD_free (int ord, void *ptr)
{
  return (nvptx_attach_host_thread_to_device (ord)
	  && nvptx_free (ptr));
}

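/* Copy N bytes from device address SRC to host address DST on device ORD.  */
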
bool
GOMP_OFFLOAD_dev2host (int ord, void *dst, const void *src, size_t n)
{
  return (nvptx_attach_host_thread_to_device (ord)
	  && nvptx_dev2host (dst, src, n));
}

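/* Copy N bytes from host address SRC to device address DST on device ORD.  */
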
bool
GOMP_OFFLOAD_host2dev (int ord, void *dst, const void *src, size_t n)
{
  return (nvptx_attach_host_thread_to_device (ord)
	  && nvptx_host2dev (dst, src, n));
}

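/* Copy N bytes between two device addresses on device ORD, using an
   asynchronous device-to-device copy on the null stream.  */
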
bool
GOMP_OFFLOAD_dev2dev (int ord, void *dst, const void *src, size_t n)
{
  struct ptx_device *ptx_dev = ptx_devices[ord];
  CUDA_CALL (cuMemcpyDtoDAsync, (CUdeviceptr) dst, (CUdeviceptr) src, n,
				ptx_dev->null_stream->stream);
  return true;
}

void (*device_run) (int n, void *fn_ptr, void *vars) = NULL;

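/* Launch an OpenACC offload region: hand FN, the mapped addresses and the
   requested launch dimensions straight to nvptx_exec.  */
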
void
GOMP_OFFLOAD_openacc_exec (void (*fn) (void *), size_t mapnum,
			   void **hostaddrs, void **devaddrs,
			   int async, unsigned *dims, void *targ_mem_desc)
{
  nvptx_exec (fn, mapnum, hostaddrs, devaddrs, async, dims, targ_mem_desc);
}

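/* Record an event on the current thread's stream and queue it as an
   asynchronous-cleanup event, so that TARG_MEM_DESC is released once all
   previously submitted work on ASYNC has completed.  */
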
void
GOMP_OFFLOAD_openacc_register_async_cleanup (void *targ_mem_desc, int async)
{
  struct nvptx_thread *nvthd = nvptx_thread ();
  CUevent *e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));

  CUDA_CALL_ASSERT (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
  CUDA_CALL_ASSERT (cuEventRecord, *e, nvthd->current_stream->stream);
  event_add (PTX_EVT_ASYNC_CLEANUP, e, targ_mem_desc, async);
}

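/* Return nonzero if all work queued on asynchronous queue ASYNC has
   completed.  */
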
int
GOMP_OFFLOAD_openacc_async_test (int async)
{
  return nvptx_async_test (async);
}

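/* Return nonzero if all asynchronous queues have completed their work.  */
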
int
GOMP_OFFLOAD_openacc_async_test_all (void)
{
  return nvptx_async_test_all ();
}

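/* Wait for all work queued on asynchronous queue ASYNC to complete.  */
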
void
GOMP_OFFLOAD_openacc_async_wait (int async)
{
  nvptx_wait (async);
}

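/* Make asynchronous queue ASYNC2 wait for the work currently queued on
   ASYNC1.  */
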
void
GOMP_OFFLOAD_openacc_async_wait_async (int async1, int async2)
{
  nvptx_wait_async (async1, async2);
}

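/* Wait for all work on all asynchronous queues to complete.  */
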
void
GOMP_OFFLOAD_openacc_async_wait_all (void)
{
  nvptx_wait_all ();
}

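/* Make asynchronous queue ASYNC wait for the work currently queued on all
   other queues.  */
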
void
GOMP_OFFLOAD_openacc_async_wait_all_async (int async)
{
  nvptx_wait_all_async (async);
}

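/* Select ASYNC as the current asynchronous queue for the calling thread.  */
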
void
GOMP_OFFLOAD_openacc_async_set_async (int async)
{
  nvptx_set_async (async);
}

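/* Set up OpenACC per-thread data for a host thread using device ORD: make
   sure the device's CUDA context is current and start out on the null
   stream.  */
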
void *
GOMP_OFFLOAD_openacc_create_thread_data (int ord)
{
  struct ptx_device *ptx_dev;
  struct nvptx_thread *nvthd
    = GOMP_PLUGIN_malloc (sizeof (struct nvptx_thread));
  CUcontext thd_ctx;

  ptx_dev = ptx_devices[ord];

  assert (ptx_dev);

  CUDA_CALL_ASSERT (cuCtxGetCurrent, &thd_ctx);

  assert (ptx_dev->ctx);

  if (!thd_ctx)
    CUDA_CALL_ASSERT (cuCtxPushCurrent, ptx_dev->ctx);

  nvthd->current_stream = ptx_dev->null_stream;
  nvthd->ptx_dev = ptx_dev;

  return (void *) nvthd;
}

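/* Free the per-thread data allocated by
   GOMP_OFFLOAD_openacc_create_thread_data.  */
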
void
GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
{
  free (data);
}

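/* Return the CUDA device associated with the calling thread, for CUDA
   interoperability.  */
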
void *
GOMP_OFFLOAD_openacc_cuda_get_current_device (void)
{
  return nvptx_get_current_cuda_device ();
}

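/* Return the CUDA context associated with the calling thread, for CUDA
   interoperability.  */
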
void *
GOMP_OFFLOAD_openacc_cuda_get_current_context (void)
{
  return nvptx_get_current_cuda_context ();
}

/* NOTE: This returns a CUstream, not a ptx_stream pointer.  */

void *
GOMP_OFFLOAD_openacc_cuda_get_stream (int async)
{
  return nvptx_get_cuda_stream (async);
}

/* NOTE: This takes a CUstream, not a ptx_stream pointer.  */

int
GOMP_OFFLOAD_openacc_cuda_set_stream (int async, void *stream)
{
  return nvptx_set_cuda_stream (async, stream);
}

/* Adjust launch dimensions: pick good values for number of blocks and warps
   and ensure that number of warps does not exceed CUDA limits as well as GCC's
   own limits.  */

static void
nvptx_adjust_launch_bounds (struct targ_fn_descriptor *fn,
			    struct ptx_device *ptx_dev,
			    int *teams_p, int *threads_p)
{
  int max_warps_block = fn->max_threads_per_block / 32;
  /* Maximum 32 warps per block is an implementation limit in the NVPTX
     backend and libgcc, which matches the documented limit of all GPUs
     as of 2015.  */
  if (max_warps_block > 32)
    max_warps_block = 32;
  if (*threads_p <= 0)
    *threads_p = 8;
  if (*threads_p > max_warps_block)
    *threads_p = max_warps_block;

  int regs_per_block = fn->regs_per_thread * 32 * *threads_p;
  /* This is an estimate of how many blocks the device can host
     simultaneously.  The actual limit, which may be lower, can be queried
     with the "occupancy control" driver interface (since CUDA 6.0).  */
  int max_blocks = ptx_dev->regs_per_sm / regs_per_block * ptx_dev->num_sms;
  if (*teams_p <= 0 || *teams_p > max_blocks)
    *teams_p = max_blocks;
}

/* Return the size of per-warp stacks (see gcc -msoft-stack) to use for OpenMP
   target regions.  */

static size_t
nvptx_stacks_size ()
{
  return 128 * 1024;
}

/* Return contiguous storage for NUM stacks, each SIZE bytes.  */

static void *
nvptx_stacks_alloc (size_t size, int num)
{
  CUdeviceptr stacks;
  CUresult r = CUDA_CALL_NOCHECK (cuMemAlloc, &stacks, size * num);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemAlloc error: %s", cuda_error (r));
  return (void *) stacks;
}

/* Release storage previously allocated by nvptx_stacks_alloc.  */

static void
nvptx_stacks_free (void *p, int num)
{
  CUresult r = CUDA_CALL_NOCHECK (cuMemFree, (CUdeviceptr) p);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r));
}

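/* Launch an OpenMP target region on device ORD.  TGT_FN is the function
   descriptor returned by GOMP_OFFLOAD_load_image, TGT_VARS the argument
   block, and ARGS the encoded launch parameters (number of teams and
   thread limit).  */
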
void
GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
{
  CUfunction function = ((struct targ_fn_descriptor *) tgt_fn)->fn;
  CUresult r;
  struct ptx_device *ptx_dev = ptx_devices[ord];
  const char *maybe_abort_msg = "(perhaps abort was called)";
  int teams = 0, threads = 0;

  if (!args)
    GOMP_PLUGIN_fatal ("No target arguments provided");
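  /* Decode ARGS, keeping only parameters addressed to all devices: the
     requested number of teams and the thread limit.  */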
  while (*args)
    {
      intptr_t id = (intptr_t) *args++, val;
      if (id & GOMP_TARGET_ARG_SUBSEQUENT_PARAM)
	val = (intptr_t) *args++;
      else
	val = id >> GOMP_TARGET_ARG_VALUE_SHIFT;
      if ((id & GOMP_TARGET_ARG_DEVICE_MASK) != GOMP_TARGET_ARG_DEVICE_ALL)
	continue;
      val = val > INT_MAX ? INT_MAX : val;
      id &= GOMP_TARGET_ARG_ID_MASK;
      if (id == GOMP_TARGET_ARG_NUM_TEAMS)
	teams = val;
      else if (id == GOMP_TARGET_ARG_THREAD_LIMIT)
	threads = val;
    }
  nvptx_adjust_launch_bounds (tgt_fn, ptx_dev, &teams, &threads);

  size_t stack_size = nvptx_stacks_size ();
  void *stacks = nvptx_stacks_alloc (stack_size, teams * threads);
  void *fn_args[] = {tgt_vars, stacks, (void *) stack_size};
  size_t fn_args_size = sizeof fn_args;
  void *config[] = {
    CU_LAUNCH_PARAM_BUFFER_POINTER, fn_args,
    CU_LAUNCH_PARAM_BUFFER_SIZE, &fn_args_size,
    CU_LAUNCH_PARAM_END
  };
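  /* Launch TEAMS blocks of 32 x THREADS threads on the null stream;
     blockDim.x is the warp size, so each block runs THREADS warps.  */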
  r = CUDA_CALL_NOCHECK (cuLaunchKernel, function, teams, 1, 1,
			 32, threads, 1, 0, ptx_dev->null_stream->stream,
			 NULL, config);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r));

  r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
  if (r == CUDA_ERROR_LAUNCH_FAILED)
    GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
		       maybe_abort_msg);
  else if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));
  nvptx_stacks_free (stacks, teams * threads);
}
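/* Asynchronous variant of GOMP_OFFLOAD_run.  Not implemented by this
   plugin.  */
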
void
GOMP_OFFLOAD_async_run (int ord, void *tgt_fn, void *tgt_vars, void **args,
			void *async_data)
{
  GOMP_PLUGIN_fatal ("GOMP_OFFLOAD_async_run unimplemented");
}