1 /* Plugin for NVPTX execution.
2
3 Copyright (C) 2013-2018 Free Software Foundation, Inc.
4
5 Contributed by Mentor Embedded.
6
7 This file is part of the GNU Offloading and Multi Processing Library
8 (libgomp).
9
10 Libgomp is free software; you can redistribute it and/or modify it
11 under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 3, or (at your option)
13 any later version.
14
15 Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
16 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
17 FOR A PARTICULAR PURPOSE. See the GNU General Public License for
18 more details.
19
20 Under Section 7 of GPL version 3, you are granted additional
21 permissions described in the GCC Runtime Library Exception, version
22 3.1, as published by the Free Software Foundation.
23
24 You should have received a copy of the GNU General Public License and
25 a copy of the GCC Runtime Library Exception along with this program;
26 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
27 <http://www.gnu.org/licenses/>. */
28
29 /* Nvidia PTX-specific parts of OpenACC support.  The CUDA driver
30    library appears to hold some implicit state, but the documentation
31    is not clear as to what that state might be, or how one might
32    propagate it from one thread to another.  */
33
34 #include "openacc.h"
35 #include "config.h"
36 #include "libgomp-plugin.h"
37 #include "oacc-plugin.h"
38 #include "gomp-constants.h"
39
40 #include <pthread.h>
41 #include <cuda.h>
42 #include <stdbool.h>
43 #include <stdint.h>
44 #include <limits.h>
45 #include <string.h>
46 #include <stdio.h>
47 #include <unistd.h>
48 #include <assert.h>
49 #include <errno.h>
50
51 #if PLUGIN_NVPTX_DYNAMIC
52 # include <dlfcn.h>
53
54 # define CUDA_CALLS \
55 CUDA_ONE_CALL (cuCtxCreate) \
56 CUDA_ONE_CALL (cuCtxDestroy) \
57 CUDA_ONE_CALL (cuCtxGetCurrent) \
58 CUDA_ONE_CALL (cuCtxGetDevice) \
59 CUDA_ONE_CALL (cuCtxPopCurrent) \
60 CUDA_ONE_CALL (cuCtxPushCurrent) \
61 CUDA_ONE_CALL (cuCtxSynchronize) \
62 CUDA_ONE_CALL (cuDeviceGet) \
63 CUDA_ONE_CALL (cuDeviceGetAttribute) \
64 CUDA_ONE_CALL (cuDeviceGetCount) \
65 CUDA_ONE_CALL (cuEventCreate) \
66 CUDA_ONE_CALL (cuEventDestroy) \
67 CUDA_ONE_CALL (cuEventElapsedTime) \
68 CUDA_ONE_CALL (cuEventQuery) \
69 CUDA_ONE_CALL (cuEventRecord) \
70 CUDA_ONE_CALL (cuEventSynchronize) \
71 CUDA_ONE_CALL (cuFuncGetAttribute) \
72 CUDA_ONE_CALL (cuGetErrorString) \
73 CUDA_ONE_CALL (cuInit) \
74 CUDA_ONE_CALL (cuLaunchKernel) \
75 CUDA_ONE_CALL (cuLinkAddData) \
76 CUDA_ONE_CALL (cuLinkComplete) \
77 CUDA_ONE_CALL (cuLinkCreate) \
78 CUDA_ONE_CALL (cuLinkDestroy) \
79 CUDA_ONE_CALL (cuMemAlloc) \
80 CUDA_ONE_CALL (cuMemAllocHost) \
81 CUDA_ONE_CALL (cuMemcpy) \
82 CUDA_ONE_CALL (cuMemcpyDtoDAsync) \
83 CUDA_ONE_CALL (cuMemcpyDtoH) \
84 CUDA_ONE_CALL (cuMemcpyDtoHAsync) \
85 CUDA_ONE_CALL (cuMemcpyHtoD) \
86 CUDA_ONE_CALL (cuMemcpyHtoDAsync) \
87 CUDA_ONE_CALL (cuMemFree) \
88 CUDA_ONE_CALL (cuMemFreeHost) \
89 CUDA_ONE_CALL (cuMemGetAddressRange) \
90 CUDA_ONE_CALL (cuMemHostGetDevicePointer)\
91 CUDA_ONE_CALL (cuModuleGetFunction) \
92 CUDA_ONE_CALL (cuModuleGetGlobal) \
93 CUDA_ONE_CALL (cuModuleLoad) \
94 CUDA_ONE_CALL (cuModuleLoadData) \
95 CUDA_ONE_CALL (cuModuleUnload) \
96 CUDA_ONE_CALL (cuStreamCreate) \
97 CUDA_ONE_CALL (cuStreamDestroy) \
98 CUDA_ONE_CALL (cuStreamQuery) \
99 CUDA_ONE_CALL (cuStreamSynchronize) \
100 CUDA_ONE_CALL (cuStreamWaitEvent)
101 # define CUDA_ONE_CALL(call) \
102 __typeof (call) *call;
103 struct cuda_lib_s {
104 CUDA_CALLS
105 } cuda_lib;
106
107 /* -1 if init_cuda_lib has not been called yet, false
108 if it has been and failed, true if it has been and succeeded. */
109 static signed char cuda_lib_inited = -1;
110
111 /* Dynamically load the CUDA driver library (libcuda.so.1) and initialize
112    its function pointers; return false if unsuccessful, true if successful.  */
113 static bool
114 init_cuda_lib (void)
115 {
116 if (cuda_lib_inited != -1)
117 return cuda_lib_inited;
118 const char *cuda_runtime_lib = "libcuda.so.1";
119 void *h = dlopen (cuda_runtime_lib, RTLD_LAZY);
120 cuda_lib_inited = false;
121 if (h == NULL)
122 return false;
123 # undef CUDA_ONE_CALL
124 # define CUDA_ONE_CALL(call) CUDA_ONE_CALL_1 (call)
125 # define CUDA_ONE_CALL_1(call) \
126 cuda_lib.call = dlsym (h, #call); \
127 if (cuda_lib.call == NULL) \
128 return false;
129 CUDA_CALLS
130 cuda_lib_inited = true;
131 return true;
132 }
133 # undef CUDA_ONE_CALL
134 # undef CUDA_ONE_CALL_1
135 # define CUDA_CALL_PREFIX cuda_lib.
136 #else
137 # define CUDA_CALL_PREFIX
138 # define init_cuda_lib() true
139 #endif
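
/* Illustrative sketch, not part of the build: with PLUGIN_NVPTX_DYNAMIC a
   driver entry point is reached through the dlsym-filled table above, so
     CUDA_CALL_PREFIX cuInit (0)
   expands to cuda_lib.cuInit (0); without PLUGIN_NVPTX_DYNAMIC the prefix is
   empty and the same text is a direct call against -lcuda.  */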
140
141 /* Convenience macros for the frequently used CUDA library call and
142 error handling sequence as well as CUDA library calls that
143 do the error checking themselves or don't do it at all. */
144
145 #define CUDA_CALL_ERET(ERET, FN, ...) \
146 do { \
147 unsigned __r \
148 = CUDA_CALL_PREFIX FN (__VA_ARGS__); \
149 if (__r != CUDA_SUCCESS) \
150 { \
151 GOMP_PLUGIN_error (#FN " error: %s", \
152 cuda_error (__r)); \
153 return ERET; \
154 } \
155 } while (0)
156
157 #define CUDA_CALL(FN, ...) \
158 CUDA_CALL_ERET (false, FN, __VA_ARGS__)
159
160 #define CUDA_CALL_ASSERT(FN, ...) \
161 do { \
162 unsigned __r \
163 = CUDA_CALL_PREFIX FN (__VA_ARGS__); \
164 if (__r != CUDA_SUCCESS) \
165 { \
166 GOMP_PLUGIN_fatal (#FN " error: %s", \
167 cuda_error (__r)); \
168 } \
169 } while (0)
170
171 #define CUDA_CALL_NOCHECK(FN, ...) \
172 CUDA_CALL_PREFIX FN (__VA_ARGS__)
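
/* Usage pattern, as seen throughout this file: CUDA_CALL_ERET for callers
   with an arbitrary error return value, e.g.
     CUDA_CALL_ERET (NULL, cuMemAlloc, &d, s);
   CUDA_CALL for bool-returning callers, CUDA_CALL_ASSERT where failure is
   fatal, and CUDA_CALL_NOCHECK where the CUresult is examined by hand.  */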
173
174 static const char *
175 cuda_error (CUresult r)
176 {
177 #if CUDA_VERSION < 7000
178   /* Specified in the documentation and present in the library since at
179      least CUDA 5.5.  Not declared in the header file prior to 7.0.  */
180 extern CUresult cuGetErrorString (CUresult, const char **);
181 #endif
182 const char *desc;
183
184 r = CUDA_CALL_NOCHECK (cuGetErrorString, r, &desc);
185 if (r != CUDA_SUCCESS)
186 desc = "unknown cuda error";
187
188 return desc;
189 }
190
191 static unsigned int instantiated_devices = 0;
192 static pthread_mutex_t ptx_dev_lock = PTHREAD_MUTEX_INITIALIZER;
193
194 struct ptx_stream
195 {
196 CUstream stream;
197 pthread_t host_thread;
198 bool multithreaded;
199
200 CUdeviceptr d;	/* Device view of the host-mapped argument page.  */
201 void *h;	/* Host view of the same page.  */
202 void *h_begin;	/* Start of the circular buffer within H.  */
203 void *h_end;	/* One past the end of the buffer.  */
204 void *h_next;	/* Where the next block will be pushed.  */
205 void *h_prev;	/* The most recently pushed block.  */
206 void *h_tail;	/* The oldest block still live.  */
207
208 struct ptx_stream *next;
209 };
210
211 /* Thread-specific data for PTX. */
212
213 struct nvptx_thread
214 {
215 struct ptx_stream *current_stream;
216 struct ptx_device *ptx_dev;
217 };
218
219 struct map
220 {
221 int async;
222 size_t size;
223 char mappings[0];
224 };
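
/* Layout sketch: each entry in the page-sized buffer managed below is a
   struct map header followed immediately by its payload, with SIZE counting
   the header too, so the oldest entry is retired in map_pop simply by
     s->h_tail += ((struct map *) s->h_tail)->size;  */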
225
226 static bool
227 map_init (struct ptx_stream *s)
228 {
229 int size = getpagesize ();
230
231 assert (s);
232 assert (!s->d);
233 assert (!s->h);
234
235 CUDA_CALL (cuMemAllocHost, &s->h, size);
236 CUDA_CALL (cuMemHostGetDevicePointer, &s->d, s->h, 0);
237
238 assert (s->h);
239
240 s->h_begin = s->h;
241 s->h_end = s->h_begin + size;
242 s->h_next = s->h_prev = s->h_tail = s->h_begin;
243
244 assert (s->h_next);
245 assert (s->h_end);
246 return true;
247 }
248
249 static bool
250 map_fini (struct ptx_stream *s)
251 {
252 CUDA_CALL (cuMemFreeHost, s->h);
253 return true;
254 }
255
256 static void
257 map_pop (struct ptx_stream *s)
258 {
259 struct map *m;
260
261 assert (s != NULL);
262 assert (s->h_next);
263 assert (s->h_prev);
264 assert (s->h_tail);
265
266 m = s->h_tail;
267
268 s->h_tail += m->size;
269
270 if (s->h_tail >= s->h_end)
271 s->h_tail = s->h_begin + (int) (s->h_tail - s->h_end);
272
273 if (s->h_next == s->h_tail)
274 s->h_prev = s->h_next;
275
276 assert (s->h_next >= s->h_begin);
277 assert (s->h_tail >= s->h_begin);
278 assert (s->h_prev >= s->h_begin);
279
280 assert (s->h_next <= s->h_end);
281 assert (s->h_tail <= s->h_end);
282 assert (s->h_prev <= s->h_end);
283 }
284
285 static void
286 map_push (struct ptx_stream *s, int async, size_t size, void **h, void **d)
287 {
288 int left;
289 int offset;
290 struct map *m;
291
292 assert (s != NULL);
293
294 left = s->h_end - s->h_next;
295 size += sizeof (struct map);
296
297 assert (s->h_prev);
298 assert (s->h_next);
299
300 if (size >= left)
301 {
302 m = s->h_prev;
303 m->size += left;
304 s->h_next = s->h_begin;
305
306 if (s->h_next + size > s->h_end)
307 GOMP_PLUGIN_fatal ("unable to push map");
308 }
309
310 assert (s->h_next);
311
312 m = s->h_next;
313 m->async = async;
314 m->size = size;
315
316 offset = (void *)&m->mappings[0] - s->h;
317
318 *d = (void *)(s->d + offset);
319 *h = (void *)(s->h + offset);
320
321 s->h_prev = s->h_next;
322 s->h_next += size;
323
324 assert (s->h_prev);
325 assert (s->h_next);
326
327 assert (s->h_next >= s->h_begin);
328 assert (s->h_tail >= s->h_begin);
329 assert (s->h_prev >= s->h_begin);
330 assert (s->h_next <= s->h_end);
331 assert (s->h_tail <= s->h_end);
332 assert (s->h_prev <= s->h_end);
333
334 return;
335 }
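
/* Together, map_push and map_pop implement a per-stream circular buffer of
   argument blocks: nvptx_exec pushes a block of MAPNUM pointers before each
   kernel launch, and the matching map_pop runs either straight after a
   synchronous launch or from event_gc once the kernel's event completes.  */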
336
337 /* Target data function launch information. */
338
339 struct targ_fn_launch
340 {
341 const char *fn;
342 unsigned short dim[GOMP_DIM_MAX];
343 };
344
345 /* Target PTX object information. */
346
347 struct targ_ptx_obj
348 {
349 const char *code;
350 size_t size;
351 };
352
353 /* Target data image information. */
354
355 typedef struct nvptx_tdata
356 {
357 const struct targ_ptx_obj *ptx_objs;
358 unsigned ptx_num;
359
360 const char *const *var_names;
361 unsigned var_num;
362
363 const struct targ_fn_launch *fn_descs;
364 unsigned fn_num;
365 } nvptx_tdata_t;
366
367 /* Descriptor of a loaded function. */
368
369 struct targ_fn_descriptor
370 {
371 CUfunction fn;
372 const struct targ_fn_launch *launch;
373 int regs_per_thread;
374 int max_threads_per_block;
375 };
376
377 /* A loaded PTX image. */
378 struct ptx_image_data
379 {
380 const void *target_data;
381 CUmodule module;
382
383 struct targ_fn_descriptor *fns; /* Array of functions. */
384
385 struct ptx_image_data *next;
386 };
387
388 struct ptx_device
389 {
390 CUcontext ctx;
391 bool ctx_shared;
392 CUdevice dev;
393 struct ptx_stream *null_stream;
394 /* All non-null streams associated with this device (actually context),
395 either created implicitly or passed in from the user (via
396 acc_set_cuda_stream). */
397 struct ptx_stream *active_streams;
398 struct {
399 struct ptx_stream **arr;
400 int size;
401 } async_streams;
402 /* A lock for use when manipulating the above stream list and array. */
403 pthread_mutex_t stream_lock;
404 int ord;
405 bool overlap;
406 bool map;
407 bool concur;
408 bool mkern;
409 int mode;
410 int clock_khz;
411 int num_sms;
412 int regs_per_block;
413 int regs_per_sm;
414
415 struct ptx_image_data *images; /* Images loaded on device. */
416 pthread_mutex_t image_lock; /* Lock for above list. */
417
418 struct ptx_device *next;
419 };
420
421 enum ptx_event_type
422 {
423 PTX_EVT_MEM,
424 PTX_EVT_KNL,
425 PTX_EVT_SYNC,
426 PTX_EVT_ASYNC_CLEANUP
427 };
428
429 struct ptx_event
430 {
431 CUevent *evt;
432 int type;
433 void *addr;
434 int ord;
435 int val;
436
437 struct ptx_event *next;
438 };
439
440 static pthread_mutex_t ptx_event_lock;
441 static struct ptx_event *ptx_events;
442
443 static struct ptx_device **ptx_devices;
444
445 static inline struct nvptx_thread *
446 nvptx_thread (void)
447 {
448 return (struct nvptx_thread *) GOMP_PLUGIN_acc_thread ();
449 }
450
451 static bool
452 init_streams_for_device (struct ptx_device *ptx_dev, int concurrency)
453 {
454 int i;
455 struct ptx_stream *null_stream
456 = GOMP_PLUGIN_malloc (sizeof (struct ptx_stream));
457
458 null_stream->stream = NULL;
459 null_stream->host_thread = pthread_self ();
460 null_stream->multithreaded = true;
461 null_stream->d = (CUdeviceptr) NULL;
462 null_stream->h = NULL;
463 if (!map_init (null_stream))
464 return false;
465
466 ptx_dev->null_stream = null_stream;
467 ptx_dev->active_streams = NULL;
468 pthread_mutex_init (&ptx_dev->stream_lock, NULL);
469
470 if (concurrency < 1)
471 concurrency = 1;
472
473 /* This is just a guess -- make space for as many async streams as the
474 current device is capable of concurrently executing. This can grow
475 later as necessary. No streams are created yet. */
476 ptx_dev->async_streams.arr
477 = GOMP_PLUGIN_malloc (concurrency * sizeof (struct ptx_stream *));
478 ptx_dev->async_streams.size = concurrency;
479
480 for (i = 0; i < concurrency; i++)
481 ptx_dev->async_streams.arr[i] = NULL;
482
483 return true;
484 }
485
486 static bool
487 fini_streams_for_device (struct ptx_device *ptx_dev)
488 {
489 free (ptx_dev->async_streams.arr);
490
491 bool ret = true;
492 while (ptx_dev->active_streams != NULL)
493 {
494 struct ptx_stream *s = ptx_dev->active_streams;
495 ptx_dev->active_streams = ptx_dev->active_streams->next;
496
497 ret &= map_fini (s);
498
499 CUresult r = CUDA_CALL_NOCHECK (cuStreamDestroy, s->stream);
500 if (r != CUDA_SUCCESS)
501 {
502 GOMP_PLUGIN_error ("cuStreamDestroy error: %s", cuda_error (r));
503 ret = false;
504 }
505 free (s);
506 }
507
508 ret &= map_fini (ptx_dev->null_stream);
509 free (ptx_dev->null_stream);
510 return ret;
511 }
512
513 /* Select a stream for (OpenACC-semantics) ASYNC argument for the current
514 thread THREAD (and also current device/context). If CREATE is true, create
515 the stream if it does not exist (or use EXISTING if it is non-NULL), and
516 associate the stream with THREAD.  Return the stream to use.  */
518
519 static struct ptx_stream *
520 select_stream_for_async (int async, pthread_t thread, bool create,
521 CUstream existing)
522 {
523 struct nvptx_thread *nvthd = nvptx_thread ();
524 /* Local copy of TLS variable. */
525 struct ptx_device *ptx_dev = nvthd->ptx_dev;
526 struct ptx_stream *stream = NULL;
527 int orig_async = async;
528
529 /* The special value acc_async_noval (-1) maps (for now) to an
530 implicitly-created stream, which is then handled the same as any other
531 numbered async stream. Other options are available, e.g. using the null
532 stream for anonymous async operations, or choosing an idle stream from an
533 active set. But, stick with this for now. */
534 if (async > acc_async_sync)
535 async++;
536
537 if (create)
538 pthread_mutex_lock (&ptx_dev->stream_lock);
539
540 /* NOTE: AFAICT there's no particular need for acc_async_sync to map to the
541 null stream, and in fact better performance may be obtainable if it doesn't
542 (because the null stream enforces overly-strict synchronisation with
543 respect to other streams for legacy reasons, and that's probably not
544 needed with OpenACC). Maybe investigate later. */
545 if (async == acc_async_sync)
546 stream = ptx_dev->null_stream;
547 else if (async >= 0 && async < ptx_dev->async_streams.size
548 && ptx_dev->async_streams.arr[async] && !(create && existing))
549 stream = ptx_dev->async_streams.arr[async];
550 else if (async >= 0 && create)
551 {
552 if (async >= ptx_dev->async_streams.size)
553 {
554 int i, newsize = ptx_dev->async_streams.size * 2;
555
556 if (async >= newsize)
557 newsize = async + 1;
558
559 ptx_dev->async_streams.arr
560 = GOMP_PLUGIN_realloc (ptx_dev->async_streams.arr,
561 newsize * sizeof (struct ptx_stream *));
562
563 for (i = ptx_dev->async_streams.size; i < newsize; i++)
564 ptx_dev->async_streams.arr[i] = NULL;
565
566 ptx_dev->async_streams.size = newsize;
567 }
568
569 /* Create a new stream on-demand if there isn't one already, or if we're
570 setting a particular async value to an existing (externally-provided)
571 stream. */
572 if (!ptx_dev->async_streams.arr[async] || existing)
573 {
574 CUresult r;
575 struct ptx_stream *s
576 = GOMP_PLUGIN_malloc (sizeof (struct ptx_stream));
577
578 if (existing)
579 s->stream = existing;
580 else
581 {
582 r = CUDA_CALL_NOCHECK (cuStreamCreate, &s->stream,
583 CU_STREAM_DEFAULT);
584 if (r != CUDA_SUCCESS)
585 {
586 pthread_mutex_unlock (&ptx_dev->stream_lock);
587 GOMP_PLUGIN_fatal ("cuStreamCreate error: %s",
588 cuda_error (r));
589 }
590 }
591
592 /* If CREATE is true, we're going to be queueing some work on this
593 stream. Associate it with the current host thread. */
594 s->host_thread = thread;
595 s->multithreaded = false;
596
597 s->d = (CUdeviceptr) NULL;
598 s->h = NULL;
599 if (!map_init (s))
600 {
601 pthread_mutex_unlock (&ptx_dev->stream_lock);
602 GOMP_PLUGIN_fatal ("map_init fail");
603 }
604
605 s->next = ptx_dev->active_streams;
606 ptx_dev->active_streams = s;
607 ptx_dev->async_streams.arr[async] = s;
608 }
609
610 stream = ptx_dev->async_streams.arr[async];
611 }
612 else if (async < 0)
613 {
614 if (create)
615 pthread_mutex_unlock (&ptx_dev->stream_lock);
616 GOMP_PLUGIN_fatal ("bad async %d", async);
617 }
618
619 if (create)
620 {
621 assert (stream != NULL);
622
623 /* If we're trying to use the same stream from different threads
624 simultaneously, set stream->multithreaded to true. This affects the
625 behaviour of acc_async_test_all and acc_wait_all, which are supposed to
626 only wait for asynchronous launches from the same host thread they are
627 invoked on. If multiple threads use the same async value, we make note
628 of that here and fall back to testing/waiting for all threads in those
629 functions. */
630 if (thread != stream->host_thread)
631 stream->multithreaded = true;
632
633 pthread_mutex_unlock (&ptx_dev->stream_lock);
634 }
635 else if (stream && !stream->multithreaded
636 && !pthread_equal (stream->host_thread, thread))
637 GOMP_PLUGIN_fatal ("async %d used on wrong thread", orig_async);
638
639 return stream;
640 }
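
/* Callers use three patterns here: a plain lookup such as
     s = select_stream_for_async (async, pthread_self (), false, NULL);
   which must find an existing stream, on-demand creation with CREATE true,
   and nvptx_set_cuda_stream's registration of a user-supplied CUstream
   through the EXISTING argument.  */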
641
642 /* Initialize the device. Return TRUE on success, else FALSE. PTX_DEV_LOCK
643 should be locked on entry and remains locked on exit. */
644
645 static bool
646 nvptx_init (void)
647 {
648 int ndevs;
649
650 if (instantiated_devices != 0)
651 return true;
652
653 ptx_events = NULL;
654 pthread_mutex_init (&ptx_event_lock, NULL);
655
656 if (!init_cuda_lib ())
657 return false;
658
659 CUDA_CALL (cuInit, 0);
660
661 CUDA_CALL (cuDeviceGetCount, &ndevs);
662 ptx_devices = GOMP_PLUGIN_malloc_cleared (sizeof (struct ptx_device *)
663 * ndevs);
664 return true;
665 }
666
667 /* Select the N'th PTX device for the current host thread.  The device must
668 have been opened before calling this function.  */
669
670 static bool
671 nvptx_attach_host_thread_to_device (int n)
672 {
673 CUdevice dev;
674 CUresult r;
675 struct ptx_device *ptx_dev;
676 CUcontext thd_ctx;
677
678 r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &dev);
679 if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
680 {
681 GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r));
682 return false;
683 }
684
685 if (r != CUDA_ERROR_INVALID_CONTEXT && dev == n)
686 return true;
687 else
688 {
689 CUcontext old_ctx;
690
691 ptx_dev = ptx_devices[n];
692 if (!ptx_dev)
693 {
694 GOMP_PLUGIN_error ("device %d not found", n);
695 return false;
696 }
697
698 CUDA_CALL (cuCtxGetCurrent, &thd_ctx);
699
700 /* We don't necessarily have a current context (e.g. if it has been
701 destroyed).  Pop it if we do, though.  */
702 if (thd_ctx != NULL)
703 CUDA_CALL (cuCtxPopCurrent, &old_ctx);
704
705 CUDA_CALL (cuCtxPushCurrent, ptx_dev->ctx);
706 }
707 return true;
708 }
709
710 static struct ptx_device *
711 nvptx_open_device (int n)
712 {
713 struct ptx_device *ptx_dev;
714 CUdevice dev, ctx_dev;
715 CUresult r;
716 int async_engines, pi;
717
718 CUDA_CALL_ERET (NULL, cuDeviceGet, &dev, n);
719
720 ptx_dev = GOMP_PLUGIN_malloc (sizeof (struct ptx_device));
721
722 ptx_dev->ord = n;
723 ptx_dev->dev = dev;
724 ptx_dev->ctx_shared = false;
725
726 r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &ctx_dev);
727 if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
728 {
729 GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r));
730 return NULL;
731 }
732
733 if (r != CUDA_ERROR_INVALID_CONTEXT && ctx_dev != dev)
734 {
735 /* The current host thread has an active context for a different device.
736 Detach it. */
737 CUcontext old_ctx;
738 CUDA_CALL_ERET (NULL, cuCtxPopCurrent, &old_ctx);
739 }
740
741 CUDA_CALL_ERET (NULL, cuCtxGetCurrent, &ptx_dev->ctx);
742
743 if (!ptx_dev->ctx)
744 CUDA_CALL_ERET (NULL, cuCtxCreate, &ptx_dev->ctx, CU_CTX_SCHED_AUTO, dev);
745 else
746 ptx_dev->ctx_shared = true;
747
748 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
749 &pi, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev);
750 ptx_dev->overlap = pi;
751
752 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
753 &pi, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev);
754 ptx_dev->map = pi;
755
756 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
757 &pi, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev);
758 ptx_dev->concur = pi;
759
760 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
761 &pi, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
762 ptx_dev->mode = pi;
763
764 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
765 &pi, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);
766 ptx_dev->mkern = pi;
767
768 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
769 &pi, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
770 ptx_dev->clock_khz = pi;
771
772 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
773 &pi, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev);
774 ptx_dev->num_sms = pi;
775
776 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
777 &pi, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, dev);
778 ptx_dev->regs_per_block = pi;
779
780 /* CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR = 82 is defined only
781 in CUDA 6.0 and newer. */
782 r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &pi, 82, dev);
783 /* Fallback: use limit of registers per block, which is usually equal. */
784 if (r == CUDA_ERROR_INVALID_VALUE)
785 pi = ptx_dev->regs_per_block;
786 else if (r != CUDA_SUCCESS)
787 {
788 GOMP_PLUGIN_error ("cuDeviceGetAttribute error: %s", cuda_error (r));
789 return NULL;
790 }
791 ptx_dev->regs_per_sm = pi;
792
793 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
794 &pi, CU_DEVICE_ATTRIBUTE_WARP_SIZE, dev);
795 if (pi != 32)
796 {
797 GOMP_PLUGIN_error ("Only warp size 32 is supported");
798 return NULL;
799 }
800
801 r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &async_engines,
802 CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev);
803 if (r != CUDA_SUCCESS)
804 async_engines = 1;
805
806 ptx_dev->images = NULL;
807 pthread_mutex_init (&ptx_dev->image_lock, NULL);
808
809 if (!init_streams_for_device (ptx_dev, async_engines))
810 return NULL;
811
812 return ptx_dev;
813 }
814
815 static bool
816 nvptx_close_device (struct ptx_device *ptx_dev)
817 {
818 if (!ptx_dev)
819 return true;
820
821 if (!fini_streams_for_device (ptx_dev))
822 return false;
823
824 pthread_mutex_destroy (&ptx_dev->image_lock);
825
826 if (!ptx_dev->ctx_shared)
827 CUDA_CALL (cuCtxDestroy, ptx_dev->ctx);
828
829 free (ptx_dev);
830 return true;
831 }
832
833 static int
834 nvptx_get_num_devices (void)
835 {
836 int n;
837
838 /* PR libgomp/65099: Currently, we only support offloading in 64-bit
839 configurations. */
840 if (sizeof (void *) != 8)
841 {
842 GOMP_PLUGIN_debug (0, "Disabling nvptx offloading;"
843 " only 64-bit configurations are supported\n");
844 return 0;
845 }
846
847 /* This function will be called before the plugin has been initialized in
848 order to enumerate available devices, but CUDA API routines can't be used
849 until cuInit has been called. Just call it now (but don't yet do any
850 further initialization). */
851 if (instantiated_devices == 0)
852 {
853 if (!init_cuda_lib ())
854 return 0;
855 CUresult r = CUDA_CALL_NOCHECK (cuInit, 0);
856 /* This is not an error: e.g. we may have CUDA libraries installed but
857 no devices available. */
858 if (r != CUDA_SUCCESS)
859 {
860 GOMP_PLUGIN_debug (0, "Disabling nvptx offloading; cuInit: %s\n",
861 cuda_error (r));
862 return 0;
863 }
864 }
865
866 CUDA_CALL_ERET (-1, cuDeviceGetCount, &n);
867 return n;
868 }
869
870 static void
871 notify_var (const char *var_name, const char *env_var)
872 {
873 if (env_var == NULL)
874 GOMP_PLUGIN_debug (0, "%s: <Not defined>\n", var_name);
875 else
876 GOMP_PLUGIN_debug (0, "%s: '%s'\n", var_name, env_var);
877 }
878
879 static bool
880 link_ptx (CUmodule *module, const struct targ_ptx_obj *ptx_objs,
881 unsigned num_objs)
882 {
883 CUjit_option opts[6];
884 void *optvals[6];
885 float elapsed = 0.0;
886 char elog[1024];
887 char ilog[16384];
888 CUlinkState linkstate;
889 CUresult r;
890 void *linkout;
891 size_t linkoutsize __attribute__ ((unused));
892
893 opts[0] = CU_JIT_WALL_TIME;
894 optvals[0] = &elapsed;
895
896 opts[1] = CU_JIT_INFO_LOG_BUFFER;
897 optvals[1] = &ilog[0];
898
899 opts[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
900 optvals[2] = (void *) sizeof ilog;
901
902 opts[3] = CU_JIT_ERROR_LOG_BUFFER;
903 optvals[3] = &elog[0];
904
905 opts[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
906 optvals[4] = (void *) sizeof elog;
907
908 opts[5] = CU_JIT_LOG_VERBOSE;
909 optvals[5] = (void *) 1;
910
911 CUDA_CALL (cuLinkCreate, 6, opts, optvals, &linkstate);
912
913 for (; num_objs--; ptx_objs++)
914 {
915 /* cuLinkAddData's 'data' argument erroneously omits the const
916 qualifier. */
917 GOMP_PLUGIN_debug (0, "Loading:\n---\n%s\n---\n", ptx_objs->code);
918 r = CUDA_CALL_NOCHECK (cuLinkAddData, linkstate, CU_JIT_INPUT_PTX,
919 (char *) ptx_objs->code, ptx_objs->size,
920 0, 0, 0, 0);
921 if (r != CUDA_SUCCESS)
922 {
923 GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
924 GOMP_PLUGIN_error ("cuLinkAddData (ptx_code) error: %s",
925 cuda_error (r));
926 return false;
927 }
928 }
929
930 GOMP_PLUGIN_debug (0, "Linking\n");
931 r = CUDA_CALL_NOCHECK (cuLinkComplete, linkstate, &linkout, &linkoutsize);
932
933 GOMP_PLUGIN_debug (0, "Link complete: %fms\n", elapsed);
934 GOMP_PLUGIN_debug (0, "Link log %s\n", &ilog[0]);
935
936 if (r != CUDA_SUCCESS)
937 {
938 GOMP_PLUGIN_error ("cuLinkComplete error: %s", cuda_error (r));
939 return false;
940 }
941
942 CUDA_CALL (cuModuleLoadData, module, linkout);
943 CUDA_CALL (cuLinkDestroy, linkstate);
944 return true;
945 }
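
/* The option arrays passed to cuLinkCreate above capture the JIT wall time
   and route the driver's info and error logs into local buffers, so a failed
   link can be reported via GOMP_PLUGIN_error with the driver's own text.  */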
946
947 static void
948 event_gc (bool memmap_lockable)
949 {
950 struct ptx_event *ptx_event = ptx_events;
951 struct ptx_event *async_cleanups = NULL;
952 struct nvptx_thread *nvthd = nvptx_thread ();
953
954 pthread_mutex_lock (&ptx_event_lock);
955
956 while (ptx_event != NULL)
957 {
958 CUresult r;
959 struct ptx_event *e = ptx_event;
960
961 ptx_event = ptx_event->next;
962
963 if (e->ord != nvthd->ptx_dev->ord)
964 continue;
965
966 r = CUDA_CALL_NOCHECK (cuEventQuery, *e->evt);
967 if (r == CUDA_SUCCESS)
968 {
969 bool append_async = false;
970 CUevent *te;
971
972 te = e->evt;
973
974 switch (e->type)
975 {
976 case PTX_EVT_MEM:
977 case PTX_EVT_SYNC:
978 break;
979
980 case PTX_EVT_KNL:
981 map_pop (e->addr);
982 break;
983
984 case PTX_EVT_ASYNC_CLEANUP:
985 {
986 /* The function GOMP_PLUGIN_async_unmap_vars needs to claim the
987 memory-map splay tree lock for the current device, so we
988 can't call it when one of our callers has already claimed
989 the lock. In that case, just delay the GC for this event
990 until later. */
991 if (!memmap_lockable)
992 continue;
993
994 append_async = true;
995 }
996 break;
997 }
998
999 CUDA_CALL_NOCHECK (cuEventDestroy, *te);
1000 free ((void *)te);
1001
1002 /* Unlink 'e' from ptx_events list. */
1003 if (ptx_events == e)
1004 ptx_events = ptx_events->next;
1005 else
1006 {
1007 struct ptx_event *e_ = ptx_events;
1008 while (e_->next != e)
1009 e_ = e_->next;
1010 e_->next = e_->next->next;
1011 }
1012
1013 if (append_async)
1014 {
1015 e->next = async_cleanups;
1016 async_cleanups = e;
1017 }
1018 else
1019 free (e);
1020 }
1021 }
1022
1023 pthread_mutex_unlock (&ptx_event_lock);
1024
1025 /* We have to do these here, after ptx_event_lock is released. */
1026 while (async_cleanups)
1027 {
1028 struct ptx_event *e = async_cleanups;
1029 async_cleanups = async_cleanups->next;
1030
1031 GOMP_PLUGIN_async_unmap_vars (e->addr, e->val);
1032 free (e);
1033 }
1034 }
1035
1036 static void
1037 event_add (enum ptx_event_type type, CUevent *e, void *h, int val)
1038 {
1039 struct ptx_event *ptx_event;
1040 struct nvptx_thread *nvthd = nvptx_thread ();
1041
1042 assert (type == PTX_EVT_MEM || type == PTX_EVT_KNL || type == PTX_EVT_SYNC
1043 || type == PTX_EVT_ASYNC_CLEANUP);
1044
1045 ptx_event = GOMP_PLUGIN_malloc (sizeof (struct ptx_event));
1046 ptx_event->type = type;
1047 ptx_event->evt = e;
1048 ptx_event->addr = h;
1049 ptx_event->ord = nvthd->ptx_dev->ord;
1050 ptx_event->val = val;
1051
1052 pthread_mutex_lock (&ptx_event_lock);
1053
1054 ptx_event->next = ptx_events;
1055 ptx_events = ptx_event;
1056
1057 pthread_mutex_unlock (&ptx_event_lock);
1058 }
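
/* Lifecycle sketch: asynchronous operations allocate a CUevent, record it on
   the relevant stream and register it via event_add; a later event_gc pass
   polls each event with cuEventQuery and, once it has completed, performs the
   per-type cleanup (e.g. map_pop for PTX_EVT_KNL) and destroys the event.  */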
1059
1060 static void
1061 nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
1062 int async, unsigned *dims, void *targ_mem_desc)
1063 {
1064 struct targ_fn_descriptor *targ_fn = (struct targ_fn_descriptor *) fn;
1065 CUfunction function;
1066 CUresult r;
1067 int i;
1068 struct ptx_stream *dev_str;
1069 void *kargs[1];
1070 void *hp, *dp;
1071 struct nvptx_thread *nvthd = nvptx_thread ();
1072 const char *maybe_abort_msg = "(perhaps abort was called)";
1073
1074 function = targ_fn->fn;
1075
1076 dev_str = select_stream_for_async (async, pthread_self (), false, NULL);
1077 assert (dev_str == nvthd->current_stream);
1078
1079 /* Initialize the launch dimensions. Typically this is constant,
1080 provided by the device compiler, but we must permit runtime
1081 values. */
1082 int seen_zero = 0;
1083 for (i = 0; i != GOMP_DIM_MAX; i++)
1084 {
1085 if (targ_fn->launch->dim[i])
1086 dims[i] = targ_fn->launch->dim[i];
1087 if (!dims[i])
1088 seen_zero = 1;
1089 }
1090
1091 if (seen_zero)
1092 {
1093 /* See if the user provided GOMP_OPENACC_DIM environment
1094 variable to specify runtime defaults. */
1095 static int default_dims[GOMP_DIM_MAX];
1096
1097 pthread_mutex_lock (&ptx_dev_lock);
1098 if (!default_dims[0])
1099 {
1100 const char *var_name = "GOMP_OPENACC_DIM";
1101 /* We only read the environment variable once; it cannot be
1102 changed in the middle of execution.  The syntax is the same as
1103 for the -fopenacc-dim compilation option (see the example below).  */
1104 const char *env_var = getenv (var_name);
1105 notify_var (var_name, env_var);
1106 if (env_var)
1107 {
1108 const char *pos = env_var;
1109
1110 for (i = 0; *pos && i != GOMP_DIM_MAX; i++)
1111 {
1112 if (i && *pos++ != ':')
1113 break;
1114 if (*pos != ':')
1115 {
1116 const char *eptr;
1117
1118 errno = 0;
1119 long val = strtol (pos, (char **)&eptr, 10);
1120 if (errno || val < 0 || (unsigned)val != val)
1121 break;
1122 default_dims[i] = (int)val;
1123 pos = eptr;
1124 }
1125 }
1126 }
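
/* For example (assumed values, matching the parser above):
   GOMP_OPENACC_DIM=5120:4:32 sets all three dimensions, while
   GOMP_OPENACC_DIM=::32 sets only the vector length.  */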
1127
1128 int warp_size, block_size, dev_size, cpu_size;
1129 CUdevice dev = nvptx_thread ()->ptx_dev->dev;
1130 /* 32 is the default for known hardware. */
1131 int gang = 0, worker = 32, vector = 32;
1132 CUdevice_attribute cu_tpb, cu_ws, cu_mpc, cu_tpm;
1133
1134 cu_tpb = CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK;
1135 cu_ws = CU_DEVICE_ATTRIBUTE_WARP_SIZE;
1136 cu_mpc = CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT;
1137 cu_tpm = CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR;
1138
1139 if (CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &block_size, cu_tpb,
1140 dev) == CUDA_SUCCESS
1141 && CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &warp_size, cu_ws,
1142 dev) == CUDA_SUCCESS
1143 && CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &dev_size, cu_mpc,
1144 dev) == CUDA_SUCCESS
1145 && CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &cpu_size, cu_tpm,
1146 dev) == CUDA_SUCCESS)
1147 {
1148 GOMP_PLUGIN_debug (0, " warp_size=%d, block_size=%d,"
1149 " dev_size=%d, cpu_size=%d\n",
1150 warp_size, block_size, dev_size, cpu_size);
1151 gang = (cpu_size / block_size) * dev_size;
1152 worker = block_size / warp_size;
1153 vector = warp_size;
1154 }
1155
1156 /* There is no upper bound on the gang size. The best size
1157 matches the hardware configuration. Logical gangs are
1158 scheduled onto physical hardware. To maximize usage, we
1159 should guess a large number. */
1160 if (default_dims[GOMP_DIM_GANG] < 1)
1161 default_dims[GOMP_DIM_GANG] = gang ? gang : 1024;
1162 /* The worker size must not exceed the hardware. */
1163 if (default_dims[GOMP_DIM_WORKER] < 1
1164 || (default_dims[GOMP_DIM_WORKER] > worker && gang))
1165 default_dims[GOMP_DIM_WORKER] = worker;
1166 /* The vector size must exactly match the hardware. */
1167 if (default_dims[GOMP_DIM_VECTOR] < 1
1168 || (default_dims[GOMP_DIM_VECTOR] != vector && gang))
1169 default_dims[GOMP_DIM_VECTOR] = vector;
1170
1171 GOMP_PLUGIN_debug (0, " default dimensions [%d,%d,%d]\n",
1172 default_dims[GOMP_DIM_GANG],
1173 default_dims[GOMP_DIM_WORKER],
1174 default_dims[GOMP_DIM_VECTOR]);
1175 }
1176 pthread_mutex_unlock (&ptx_dev_lock);
1177
1178 for (i = 0; i != GOMP_DIM_MAX; i++)
1179 if (!dims[i])
1180 dims[i] = default_dims[i];
1181 }
1182
1183 /* This reserves a chunk of a pre-allocated page of memory mapped on both
1184 the host and the device. HP is a host pointer to the new chunk, and DP is
1185 the corresponding device pointer. */
1186 map_push (dev_str, async, mapnum * sizeof (void *), &hp, &dp);
1187
1188 GOMP_PLUGIN_debug (0, " %s: prepare mappings\n", __FUNCTION__);
1189
1190 /* Copy the array of arguments to the mapped page. */
1191 for (i = 0; i < mapnum; i++)
1192 ((void **) hp)[i] = devaddrs[i];
1193
1194 /* Copy the (device) pointers to arguments to the device (dp and hp might in
1195 fact have the same value on a unified-memory system). */
1196 CUDA_CALL_ASSERT (cuMemcpy, (CUdeviceptr) dp, (CUdeviceptr) hp,
1197 mapnum * sizeof (void *));
1198 GOMP_PLUGIN_debug (0, " %s: kernel %s: launch"
1199 " gangs=%u, workers=%u, vectors=%u\n",
1200 __FUNCTION__, targ_fn->launch->fn, dims[GOMP_DIM_GANG],
1201 dims[GOMP_DIM_WORKER], dims[GOMP_DIM_VECTOR]);
1202
1203 // OpenACC		CUDA
1204 //
1205 // num_gangs		nctaid.x
1206 // num_workers		ntid.y
1207 // vector length	ntid.x
1208
1209 kargs[0] = &dp;
1210 CUDA_CALL_ASSERT (cuLaunchKernel, function,
1211 dims[GOMP_DIM_GANG], 1, 1,
1212 dims[GOMP_DIM_VECTOR], dims[GOMP_DIM_WORKER], 1,
1213 0, dev_str->stream, kargs, 0);
1214
1215 #ifndef DISABLE_ASYNC
1216 if (async < acc_async_noval)
1217 {
1218 r = CUDA_CALL_NOCHECK (cuStreamSynchronize, dev_str->stream);
1219 if (r == CUDA_ERROR_LAUNCH_FAILED)
1220 GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s %s\n", cuda_error (r),
1221 maybe_abort_msg);
1222 else if (r != CUDA_SUCCESS)
1223 GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
1224 }
1225 else
1226 {
1227 CUevent *e;
1228
1229 e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));
1230
1231 r = CUDA_CALL_NOCHECK (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
1232 if (r == CUDA_ERROR_LAUNCH_FAILED)
1233 GOMP_PLUGIN_fatal ("cuEventCreate error: %s %s\n", cuda_error (r),
1234 maybe_abort_msg);
1235 else if (r != CUDA_SUCCESS)
1236 GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));
1237
1238 event_gc (true);
1239
1240 CUDA_CALL_ASSERT (cuEventRecord, *e, dev_str->stream);
1241
1242 event_add (PTX_EVT_KNL, e, (void *)dev_str, 0);
1243 }
1244 #else
1245 r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
1246 if (r == CUDA_ERROR_LAUNCH_FAILED)
1247 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
1248 maybe_abort_msg);
1249 else if (r != CUDA_SUCCESS)
1250 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));
1251 #endif
1252
1253 GOMP_PLUGIN_debug (0, " %s: kernel %s: finished\n", __FUNCTION__,
1254 targ_fn->launch->fn);
1255
1256 #ifndef DISABLE_ASYNC
1257 if (async < acc_async_noval)
1258 #endif
1259 map_pop (dev_str);
1260 }
1261
1262 void * openacc_get_current_cuda_context (void);
1263
1264 static void *
1265 nvptx_alloc (size_t s)
1266 {
1267 CUdeviceptr d;
1268
1269 CUDA_CALL_ERET (NULL, cuMemAlloc, &d, s);
1270 return (void *) d;
1271 }
1272
1273 static bool
1274 nvptx_free (void *p)
1275 {
1276 CUdeviceptr pb;
1277 size_t ps;
1278
1279 CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) p);
1280 if ((CUdeviceptr) p != pb)
1281 {
1282 GOMP_PLUGIN_error ("invalid device address");
1283 return false;
1284 }
1285
1286 CUDA_CALL (cuMemFree, (CUdeviceptr) p);
1287 return true;
1288 }
1289
1290
1291 static bool
1292 nvptx_host2dev (void *d, const void *h, size_t s)
1293 {
1294 CUdeviceptr pb;
1295 size_t ps;
1296 struct nvptx_thread *nvthd = nvptx_thread ();
1297
1298 if (!s)
1299 return true;
1300 if (!d)
1301 {
1302 GOMP_PLUGIN_error ("invalid device address");
1303 return false;
1304 }
1305
1306 CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) d);
1307
1308 if (!pb)
1309 {
1310 GOMP_PLUGIN_error ("invalid device address");
1311 return false;
1312 }
1313 if (!h)
1314 {
1315 GOMP_PLUGIN_error ("invalid host address");
1316 return false;
1317 }
1318 if (d == h)
1319 {
1320 GOMP_PLUGIN_error ("invalid host or device address");
1321 return false;
1322 }
1323 if ((void *)(d + s) > (void *)(pb + ps))
1324 {
1325 GOMP_PLUGIN_error ("invalid size");
1326 return false;
1327 }
1328
1329 #ifndef DISABLE_ASYNC
1330 if (nvthd && nvthd->current_stream != nvthd->ptx_dev->null_stream)
1331 {
1332 CUevent *e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));
1333 CUDA_CALL (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
1334 event_gc (false);
1335 CUDA_CALL (cuMemcpyHtoDAsync,
1336 (CUdeviceptr) d, h, s, nvthd->current_stream->stream);
1337 CUDA_CALL (cuEventRecord, *e, nvthd->current_stream->stream);
1338 event_add (PTX_EVT_MEM, e, (void *)h, 0);
1339 }
1340 else
1341 #endif
1342 CUDA_CALL (cuMemcpyHtoD, (CUdeviceptr) d, h, s);
1343
1344 return true;
1345 }
1346
1347 static bool
1348 nvptx_dev2host (void *h, const void *d, size_t s)
1349 {
1350 CUdeviceptr pb;
1351 size_t ps;
1352 struct nvptx_thread *nvthd = nvptx_thread ();
1353
1354 if (!s)
1355 return true;
1356 if (!d)
1357 {
1358 GOMP_PLUGIN_error ("invalid device address");
1359 return false;
1360 }
1361
1362 CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) d);
1363
1364 if (!pb)
1365 {
1366 GOMP_PLUGIN_error ("invalid device address");
1367 return false;
1368 }
1369 if (!h)
1370 {
1371 GOMP_PLUGIN_error ("invalid host address");
1372 return false;
1373 }
1374 if (d == h)
1375 {
1376 GOMP_PLUGIN_error ("invalid host or device address");
1377 return false;
1378 }
1379 if ((void *)(d + s) > (void *)(pb + ps))
1380 {
1381 GOMP_PLUGIN_error ("invalid size");
1382 return false;
1383 }
1384
1385 #ifndef DISABLE_ASYNC
1386 if (nvthd && nvthd->current_stream != nvthd->ptx_dev->null_stream)
1387 {
1388 CUevent *e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
1389 CUDA_CALL (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
1390 event_gc (false);
1391 CUDA_CALL (cuMemcpyDtoHAsync,
1392 h, (CUdeviceptr) d, s, nvthd->current_stream->stream);
1393 CUDA_CALL (cuEventRecord, *e, nvthd->current_stream->stream);
1394 event_add (PTX_EVT_MEM, e, (void *)h, 0);
1395 }
1396 else
1397 #endif
1398 CUDA_CALL (cuMemcpyDtoH, h, (CUdeviceptr) d, s);
1399
1400 return true;
1401 }
1402
1403 static void
1404 nvptx_set_async (int async)
1405 {
1406 struct nvptx_thread *nvthd = nvptx_thread ();
1407 nvthd->current_stream
1408 = select_stream_for_async (async, pthread_self (), true, NULL);
1409 }
1410
1411 static int
1412 nvptx_async_test (int async)
1413 {
1414 CUresult r;
1415 struct ptx_stream *s;
1416
1417 s = select_stream_for_async (async, pthread_self (), false, NULL);
1418
1419 if (!s)
1420 GOMP_PLUGIN_fatal ("unknown async %d", async);
1421
1422 r = CUDA_CALL_NOCHECK (cuStreamQuery, s->stream);
1423 if (r == CUDA_SUCCESS)
1424 {
1425 /* The oacc-parallel.c:goacc_wait function calls this hook to determine
1426 whether all work has completed on this stream, and if so omits the call
1427 to the wait hook. If that happens, event_gc might not get called
1428 (which prevents variables from getting unmapped and their associated
1429 device storage freed), so call it here. */
1430 event_gc (true);
1431 return 1;
1432 }
1433 else if (r == CUDA_ERROR_NOT_READY)
1434 return 0;
1435
1436 GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));
1437
1438 return 0;
1439 }
1440
1441 static int
1442 nvptx_async_test_all (void)
1443 {
1444 struct ptx_stream *s;
1445 pthread_t self = pthread_self ();
1446 struct nvptx_thread *nvthd = nvptx_thread ();
1447
1448 pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);
1449
1450 for (s = nvthd->ptx_dev->active_streams; s != NULL; s = s->next)
1451 {
1452 if ((s->multithreaded || pthread_equal (s->host_thread, self))
1453 && CUDA_CALL_NOCHECK (cuStreamQuery,
1454 s->stream) == CUDA_ERROR_NOT_READY)
1455 {
1456 pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
1457 return 0;
1458 }
1459 }
1460
1461 pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
1462
1463 event_gc (true);
1464
1465 return 1;
1466 }
1467
1468 static void
1469 nvptx_wait (int async)
1470 {
1471 struct ptx_stream *s;
1472
1473 s = select_stream_for_async (async, pthread_self (), false, NULL);
1474 if (!s)
1475 GOMP_PLUGIN_fatal ("unknown async %d", async);
1476
1477 CUDA_CALL_ASSERT (cuStreamSynchronize, s->stream);
1478
1479 event_gc (true);
1480 }
1481
1482 static void
1483 nvptx_wait_async (int async1, int async2)
1484 {
1485 CUevent *e;
1486 struct ptx_stream *s1, *s2;
1487 pthread_t self = pthread_self ();
1488
1489 /* The stream that is waiting (rather than being waited for) doesn't
1490 necessarily have to exist already. */
1491 s2 = select_stream_for_async (async2, self, true, NULL);
1492
1493 s1 = select_stream_for_async (async1, self, false, NULL);
1494 if (!s1)
1495 GOMP_PLUGIN_fatal ("invalid async 1\n");
1496
1497 if (s1 == s2)
1498 GOMP_PLUGIN_fatal ("identical parameters");
1499
1500 e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
1501
1502 CUDA_CALL_ASSERT (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
1503
1504 event_gc (true);
1505
1506 CUDA_CALL_ASSERT (cuEventRecord, *e, s1->stream);
1507
1508 event_add (PTX_EVT_SYNC, e, NULL, 0);
1509
1510 CUDA_CALL_ASSERT (cuStreamWaitEvent, s2->stream, *e, 0);
1511 }
1512
1513 static void
1514 nvptx_wait_all (void)
1515 {
1516 CUresult r;
1517 struct ptx_stream *s;
1518 pthread_t self = pthread_self ();
1519 struct nvptx_thread *nvthd = nvptx_thread ();
1520
1521 pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);
1522
1523 /* Wait for active streams initiated by this thread (or by multiple threads)
1524 to complete. */
1525 for (s = nvthd->ptx_dev->active_streams; s != NULL; s = s->next)
1526 {
1527 if (s->multithreaded || pthread_equal (s->host_thread, self))
1528 {
1529 r = CUDA_CALL_NOCHECK (cuStreamQuery, s->stream);
1530 if (r == CUDA_SUCCESS)
1531 continue;
1532 else if (r != CUDA_ERROR_NOT_READY)
1533 GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));
1534
1535 CUDA_CALL_ASSERT (cuStreamSynchronize, s->stream);
1536 }
1537 }
1538
1539 pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
1540
1541 event_gc (true);
1542 }
1543
1544 static void
1545 nvptx_wait_all_async (int async)
1546 {
1547 struct ptx_stream *waiting_stream, *other_stream;
1548 CUevent *e;
1549 struct nvptx_thread *nvthd = nvptx_thread ();
1550 pthread_t self = pthread_self ();
1551
1552 /* The stream doing the waiting. This could be the first mention of the
1553 stream, so create it if necessary. */
1554 waiting_stream
1555 = select_stream_for_async (async, pthread_self (), true, NULL);
1556
1557 /* Launches on the null stream already block on other streams in the
1558 context. */
1559 if (!waiting_stream || waiting_stream == nvthd->ptx_dev->null_stream)
1560 return;
1561
1562 event_gc (true);
1563
1564 pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);
1565
1566 for (other_stream = nvthd->ptx_dev->active_streams;
1567 other_stream != NULL;
1568 other_stream = other_stream->next)
1569 {
1570 if (!other_stream->multithreaded
1571 && !pthread_equal (other_stream->host_thread, self))
1572 continue;
1573
1574 e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
1575
1576 CUDA_CALL_ASSERT (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
1577
1578 /* Record an event on the waited-for stream. */
1579 CUDA_CALL_ASSERT (cuEventRecord, *e, other_stream->stream);
1580
1581 event_add (PTX_EVT_SYNC, e, NULL, 0);
1582
1583 CUDA_CALL_ASSERT (cuStreamWaitEvent, waiting_stream->stream, *e, 0);
1584 }
1585
1586 pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
1587 }
1588
1589 static void *
1590 nvptx_get_current_cuda_device (void)
1591 {
1592 struct nvptx_thread *nvthd = nvptx_thread ();
1593
1594 if (!nvthd || !nvthd->ptx_dev)
1595 return NULL;
1596
1597 return &nvthd->ptx_dev->dev;
1598 }
1599
1600 static void *
1601 nvptx_get_current_cuda_context (void)
1602 {
1603 struct nvptx_thread *nvthd = nvptx_thread ();
1604
1605 if (!nvthd || !nvthd->ptx_dev)
1606 return NULL;
1607
1608 return nvthd->ptx_dev->ctx;
1609 }
1610
1611 static void *
1612 nvptx_get_cuda_stream (int async)
1613 {
1614 struct ptx_stream *s;
1615 struct nvptx_thread *nvthd = nvptx_thread ();
1616
1617 if (!nvthd || !nvthd->ptx_dev)
1618 return NULL;
1619
1620 s = select_stream_for_async (async, pthread_self (), false, NULL);
1621
1622 return s ? s->stream : NULL;
1623 }
1624
1625 static int
1626 nvptx_set_cuda_stream (int async, void *stream)
1627 {
1628 struct ptx_stream *oldstream;
1629 pthread_t self = pthread_self ();
1630 struct nvptx_thread *nvthd = nvptx_thread ();
1631
1632 if (async < 0)
1633 GOMP_PLUGIN_fatal ("bad async %d", async);
1634
1635 pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);
1636
1637 /* We have a list of active streams and an array mapping async values to
1638 entries of that list. We need to take "ownership" of the passed-in stream,
1639 and add it to our list, removing the previous entry also (if there was one)
1640 in order to prevent resource leaks. Note the potential for surprise
1641 here: maybe we should keep track of passed-in streams and leave it up to
1642 the user to tidy those up, but that doesn't work for stream handles
1643 returned from acc_get_cuda_stream above... */
1644
1645 oldstream = select_stream_for_async (async, self, false, NULL);
1646
1647 if (oldstream)
1648 {
1649 if (nvthd->ptx_dev->active_streams == oldstream)
1650 nvthd->ptx_dev->active_streams = nvthd->ptx_dev->active_streams->next;
1651 else
1652 {
1653 struct ptx_stream *s = nvthd->ptx_dev->active_streams;
1654 while (s->next != oldstream)
1655 s = s->next;
1656 s->next = s->next->next;
1657 }
1658
1659 CUDA_CALL_ASSERT (cuStreamDestroy, oldstream->stream);
1660
1661 if (!map_fini (oldstream))
1662 GOMP_PLUGIN_fatal ("error when freeing host memory");
1663
1664 free (oldstream);
1665 }
1666
1667 pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
1668
1669 (void) select_stream_for_async (async, self, true, (CUstream) stream);
1670
1671 return 1;
1672 }
1673
1674 /* Plugin entry points. */
1675
1676 const char *
1677 GOMP_OFFLOAD_get_name (void)
1678 {
1679 return "nvptx";
1680 }
1681
1682 unsigned int
1683 GOMP_OFFLOAD_get_caps (void)
1684 {
1685 return GOMP_OFFLOAD_CAP_OPENACC_200 | GOMP_OFFLOAD_CAP_OPENMP_400;
1686 }
1687
1688 int
1689 GOMP_OFFLOAD_get_type (void)
1690 {
1691 return OFFLOAD_TARGET_TYPE_NVIDIA_PTX;
1692 }
1693
1694 int
1695 GOMP_OFFLOAD_get_num_devices (void)
1696 {
1697 return nvptx_get_num_devices ();
1698 }
1699
1700 bool
1701 GOMP_OFFLOAD_init_device (int n)
1702 {
1703 struct ptx_device *dev;
1704
1705 pthread_mutex_lock (&ptx_dev_lock);
1706
1707 if (!nvptx_init () || ptx_devices[n] != NULL)
1708 {
1709 pthread_mutex_unlock (&ptx_dev_lock);
1710 return false;
1711 }
1712
1713 dev = nvptx_open_device (n);
1714 if (dev)
1715 {
1716 ptx_devices[n] = dev;
1717 instantiated_devices++;
1718 }
1719
1720 pthread_mutex_unlock (&ptx_dev_lock);
1721
1722 return dev != NULL;
1723 }
1724
1725 bool
1726 GOMP_OFFLOAD_fini_device (int n)
1727 {
1728 pthread_mutex_lock (&ptx_dev_lock);
1729
1730 if (ptx_devices[n] != NULL)
1731 {
1732 if (!nvptx_attach_host_thread_to_device (n)
1733 || !nvptx_close_device (ptx_devices[n]))
1734 {
1735 pthread_mutex_unlock (&ptx_dev_lock);
1736 return false;
1737 }
1738 ptx_devices[n] = NULL;
1739 instantiated_devices--;
1740 }
1741
1742 pthread_mutex_unlock (&ptx_dev_lock);
1743 return true;
1744 }
1745
1746 /* Return the libgomp version number we're compatible with. There is
1747 no requirement for cross-version compatibility. */
1748
1749 unsigned
1750 GOMP_OFFLOAD_version (void)
1751 {
1752 return GOMP_VERSION;
1753 }
1754
1755 /* Initialize __nvptx_clocktick, if present in MODULE. */
1756
1757 static void
1758 nvptx_set_clocktick (CUmodule module, struct ptx_device *dev)
1759 {
1760 CUdeviceptr dptr;
1761 CUresult r = CUDA_CALL_NOCHECK (cuModuleGetGlobal, &dptr, NULL,
1762 module, "__nvptx_clocktick");
1763 if (r == CUDA_ERROR_NOT_FOUND)
1764 return;
1765 if (r != CUDA_SUCCESS)
1766 GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r));
1767 double __nvptx_clocktick = 1e-3 / dev->clock_khz;
1768 r = CUDA_CALL_NOCHECK (cuMemcpyHtoD, dptr, &__nvptx_clocktick,
1769 sizeof (__nvptx_clocktick));
1770 if (r != CUDA_SUCCESS)
1771 GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r));
1772 }
1773
1774 /* Load the (partial) program described by TARGET_DATA to device
1775 number ORD. Allocate and return TARGET_TABLE. */
1776
1777 int
1778 GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
1779 struct addr_pair **target_table)
1780 {
1781 CUmodule module;
1782 const char *const *var_names;
1783 const struct targ_fn_launch *fn_descs;
1784 unsigned int fn_entries, var_entries, i, j;
1785 struct targ_fn_descriptor *targ_fns;
1786 struct addr_pair *targ_tbl;
1787 const nvptx_tdata_t *img_header = (const nvptx_tdata_t *) target_data;
1788 struct ptx_image_data *new_image;
1789 struct ptx_device *dev;
1790
1791 if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
1792 {
1793 GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
1794 " (expected %u, received %u)",
1795 GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version));
1796 return -1;
1797 }
1798
1799 if (!nvptx_attach_host_thread_to_device (ord)
1800 || !link_ptx (&module, img_header->ptx_objs, img_header->ptx_num))
1801 return -1;
1802
1803 dev = ptx_devices[ord];
1804
1805 /* The mkoffload utility emits a struct of pointers/integers at the
1806 start of each offload image.  The array of kernel names and the
1807 function addresses form a one-to-one correspondence.  */
1808
1809 var_entries = img_header->var_num;
1810 var_names = img_header->var_names;
1811 fn_entries = img_header->fn_num;
1812 fn_descs = img_header->fn_descs;
1813
1814 targ_tbl = GOMP_PLUGIN_malloc (sizeof (struct addr_pair)
1815 * (fn_entries + var_entries));
1816 targ_fns = GOMP_PLUGIN_malloc (sizeof (struct targ_fn_descriptor)
1817 * fn_entries);
1818
1819 *target_table = targ_tbl;
1820
1821 new_image = GOMP_PLUGIN_malloc (sizeof (struct ptx_image_data));
1822 new_image->target_data = target_data;
1823 new_image->module = module;
1824 new_image->fns = targ_fns;
1825
1826 pthread_mutex_lock (&dev->image_lock);
1827 new_image->next = dev->images;
1828 dev->images = new_image;
1829 pthread_mutex_unlock (&dev->image_lock);
1830
1831 for (i = 0; i < fn_entries; i++, targ_fns++, targ_tbl++)
1832 {
1833 CUfunction function;
1834 int nregs, mthrs;
1835
1836 CUDA_CALL_ERET (-1, cuModuleGetFunction, &function, module,
1837 fn_descs[i].fn);
1838 CUDA_CALL_ERET (-1, cuFuncGetAttribute, &nregs,
1839 CU_FUNC_ATTRIBUTE_NUM_REGS, function);
1840 CUDA_CALL_ERET (-1, cuFuncGetAttribute, &mthrs,
1841 CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, function);
1842
1843 targ_fns->fn = function;
1844 targ_fns->launch = &fn_descs[i];
1845 targ_fns->regs_per_thread = nregs;
1846 targ_fns->max_threads_per_block = mthrs;
1847
1848 targ_tbl->start = (uintptr_t) targ_fns;
1849 targ_tbl->end = targ_tbl->start + 1;
1850 }
1851
1852 for (j = 0; j < var_entries; j++, targ_tbl++)
1853 {
1854 CUdeviceptr var;
1855 size_t bytes;
1856
1857 CUDA_CALL_ERET (-1, cuModuleGetGlobal,
1858 &var, &bytes, module, var_names[j]);
1859
1860 targ_tbl->start = (uintptr_t) var;
1861 targ_tbl->end = targ_tbl->start + bytes;
1862 }
1863
1864 nvptx_set_clocktick (module, dev);
1865
1866 return fn_entries + var_entries;
1867 }
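
/* The returned table is therefore FN_ENTRIES function descriptors (each with
   a dummy 1-byte address range) followed by VAR_ENTRIES variables carrying
   their real device address ranges.  */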
1868
1869 /* Unload the program described by TARGET_DATA.  DEV_DATA is the
1870 array of function descriptors allocated by G_O_load_image.  */
1871
1872 bool
1873 GOMP_OFFLOAD_unload_image (int ord, unsigned version, const void *target_data)
1874 {
1875 struct ptx_image_data *image, **prev_p;
1876 struct ptx_device *dev = ptx_devices[ord];
1877
1878 if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
1879 {
1880 GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
1881 " (expected %u, received %u)",
1882 GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version));
1883 return false;
1884 }
1885
1886 bool ret = true;
1887 pthread_mutex_lock (&dev->image_lock);
1888 for (prev_p = &dev->images; (image = *prev_p) != 0; prev_p = &image->next)
1889 if (image->target_data == target_data)
1890 {
1891 *prev_p = image->next;
1892 if (CUDA_CALL_NOCHECK (cuModuleUnload, image->module) != CUDA_SUCCESS)
1893 ret = false;
1894 free (image->fns);
1895 free (image);
1896 break;
1897 }
1898 pthread_mutex_unlock (&dev->image_lock);
1899 return ret;
1900 }
1901
1902 void *
1903 GOMP_OFFLOAD_alloc (int ord, size_t size)
1904 {
1905 if (!nvptx_attach_host_thread_to_device (ord))
1906 return NULL;
1907 return nvptx_alloc (size);
1908 }
1909
1910 bool
1911 GOMP_OFFLOAD_free (int ord, void *ptr)
1912 {
1913 return (nvptx_attach_host_thread_to_device (ord)
1914 && nvptx_free (ptr));
1915 }
1916
1917 bool
1918 GOMP_OFFLOAD_dev2host (int ord, void *dst, const void *src, size_t n)
1919 {
1920 return (nvptx_attach_host_thread_to_device (ord)
1921 && nvptx_dev2host (dst, src, n));
1922 }
1923
1924 bool
1925 GOMP_OFFLOAD_host2dev (int ord, void *dst, const void *src, size_t n)
1926 {
1927 return (nvptx_attach_host_thread_to_device (ord)
1928 && nvptx_host2dev (dst, src, n));
1929 }
1930
1931 bool
1932 GOMP_OFFLOAD_dev2dev (int ord, void *dst, const void *src, size_t n)
1933 {
1934 struct ptx_device *ptx_dev = ptx_devices[ord];
1935 CUDA_CALL (cuMemcpyDtoDAsync, (CUdeviceptr) dst, (CUdeviceptr) src, n,
1936 ptx_dev->null_stream->stream);
1937 return true;
1938 }
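
/* Note: the copy above is merely enqueued on the device's null stream.  With
   the legacy default-stream semantics (all streams here are created with
   CU_STREAM_DEFAULT, i.e. blocking), the null stream serializes against work
   on the other streams of the context, which is presumably why no explicit
   synchronization is done here.  */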
1939
1940 void (*device_run) (int n, void *fn_ptr, void *vars) = NULL;
1941
1942 void
1943 GOMP_OFFLOAD_openacc_exec (void (*fn) (void *), size_t mapnum,
1944 void **hostaddrs, void **devaddrs,
1945 int async, unsigned *dims, void *targ_mem_desc)
1946 {
1947 nvptx_exec (fn, mapnum, hostaddrs, devaddrs, async, dims, targ_mem_desc);
1948 }
1949
1950 void
1951 GOMP_OFFLOAD_openacc_register_async_cleanup (void *targ_mem_desc, int async)
1952 {
1953 struct nvptx_thread *nvthd = nvptx_thread ();
1954 CUevent *e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
1955
1956 CUDA_CALL_ASSERT (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
1957 CUDA_CALL_ASSERT (cuEventRecord, *e, nvthd->current_stream->stream);
1958 event_add (PTX_EVT_ASYNC_CLEANUP, e, targ_mem_desc, async);
1959 }
1960
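/* The following entry points map libgomp's OpenACC async-queue API directly
   onto the nvptx_* helpers above.  */
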
int
GOMP_OFFLOAD_openacc_async_test (int async)
{
  return nvptx_async_test (async);
}

int
GOMP_OFFLOAD_openacc_async_test_all (void)
{
  return nvptx_async_test_all ();
}

void
GOMP_OFFLOAD_openacc_async_wait (int async)
{
  nvptx_wait (async);
}

void
GOMP_OFFLOAD_openacc_async_wait_async (int async1, int async2)
{
  nvptx_wait_async (async1, async2);
}

void
GOMP_OFFLOAD_openacc_async_wait_all (void)
{
  nvptx_wait_all ();
}

void
GOMP_OFFLOAD_openacc_async_wait_all_async (int async)
{
  nvptx_wait_all_async (async);
}

void
GOMP_OFFLOAD_openacc_async_set_async (int async)
{
  nvptx_set_async (async);
}

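/* Set up per-host-thread state for device ORD: ensure the device's CUDA
   context is current on this thread, and bind the thread to the device's
   NULL stream.  */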
void *
GOMP_OFFLOAD_openacc_create_thread_data (int ord)
{
  struct ptx_device *ptx_dev;
  struct nvptx_thread *nvthd
    = GOMP_PLUGIN_malloc (sizeof (struct nvptx_thread));
  CUcontext thd_ctx;

  ptx_dev = ptx_devices[ord];

  assert (ptx_dev);

  CUDA_CALL_ASSERT (cuCtxGetCurrent, &thd_ctx);

  assert (ptx_dev->ctx);

  if (!thd_ctx)
    CUDA_CALL_ASSERT (cuCtxPushCurrent, ptx_dev->ctx);

  nvthd->current_stream = ptx_dev->null_stream;
  nvthd->ptx_dev = ptx_dev;

  return (void *) nvthd;
}

void
GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
{
  free (data);
}

void *
GOMP_OFFLOAD_openacc_cuda_get_current_device (void)
{
  return nvptx_get_current_cuda_device ();
}

void *
GOMP_OFFLOAD_openacc_cuda_get_current_context (void)
{
  return nvptx_get_current_cuda_context ();
}

/* NOTE: This returns a CUstream, not a ptx_stream pointer.  */

void *
GOMP_OFFLOAD_openacc_cuda_get_stream (int async)
{
  return nvptx_get_cuda_stream (async);
}

/* NOTE: This takes a CUstream, not a ptx_stream pointer.  */

int
GOMP_OFFLOAD_openacc_cuda_set_stream (int async, void *stream)
{
  return nvptx_set_cuda_stream (async, stream);
}
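
/* The two entry points above back the OpenACC interoperability routines
   acc_get_cuda_stream and acc_set_cuda_stream.  As an illustrative sketch
   of user-side usage (the cuBLAS handle and calls are assumptions, not part
   of this plugin), a program might hand the stream backing an async queue
   to a CUDA library:

     CUstream s = (CUstream) acc_get_cuda_stream (async);
     if (s)
       cublasSetStream (handle, (cudaStream_t) s);

   so that the library's work is ordered with respect to that queue.  */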

/* Adjust launch dimensions: pick sensible values for the number of blocks
   (teams) and warps (threads), and ensure that the number of warps exceeds
   neither the CUDA limits nor GCC's own limits.  */

static void
nvptx_adjust_launch_bounds (struct targ_fn_descriptor *fn,
                            struct ptx_device *ptx_dev,
                            int *teams_p, int *threads_p)
{
  int max_warps_block = fn->max_threads_per_block / 32;
  /* A maximum of 32 warps per block is an implementation limit in the NVPTX
     backend and libgcc, and matches the documented limit of all GPUs as of
     2015.  */
  if (max_warps_block > 32)
    max_warps_block = 32;
  if (*threads_p <= 0)
    *threads_p = 8;
  if (*threads_p > max_warps_block)
    *threads_p = max_warps_block;

  int regs_per_block = fn->regs_per_thread * 32 * *threads_p;
  /* This is an estimate of how many blocks the device can host
     simultaneously.  The actual limit, which may be lower, can be queried
     via the "occupancy control" driver interface (available since
     CUDA 6.0).  */
  int max_blocks = ptx_dev->regs_per_sm / regs_per_block * ptx_dev->num_sms;
  if (*teams_p <= 0 || *teams_p > max_blocks)
    *teams_p = max_blocks;
}

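/* For illustration (round numbers, not queried values): with the default of
   8 warps per block and a kernel using 32 registers per thread, a block
   needs 32 * 32 * 8 = 8192 registers; a device with 65536 registers per SM
   and 16 SMs then yields max_blocks = 65536 / 8192 * 16 = 128.  */
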
/* Return the size of per-warp stacks (see gcc -msoft-stack) to use for
   OpenMP target regions.  */

static size_t
nvptx_stacks_size ()
{
  return 128 * 1024;
}

/* Return contiguous storage for NUM stacks, each SIZE bytes.  */

static void *
nvptx_stacks_alloc (size_t size, int num)
{
  CUdeviceptr stacks;
  CUresult r = CUDA_CALL_NOCHECK (cuMemAlloc, &stacks, size * num);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemAlloc error: %s", cuda_error (r));
  return (void *) stacks;
}

/* Release storage previously allocated by nvptx_stacks_alloc.  */

static void
nvptx_stacks_free (void *p, int num)
{
  CUresult r = CUDA_CALL_NOCHECK (cuMemFree, (CUdeviceptr) p);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r));
}

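/* Run the target function described by TGT_FN synchronously on device ORD,
   passing TGT_VARS as the argument block.  ARGS is a null-terminated
   sequence of launch parameters from which the teams/threads geometry is
   decoded.  */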
void
GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
{
  CUfunction function = ((struct targ_fn_descriptor *) tgt_fn)->fn;
  CUresult r;
  struct ptx_device *ptx_dev = ptx_devices[ord];
  const char *maybe_abort_msg = "(perhaps abort was called)";
  int teams = 0, threads = 0;

  if (!args)
    GOMP_PLUGIN_fatal ("No target arguments provided");
  /* Decode the launch geometry from ARGS: each entry either carries its
     value in the high bits of the id word, or is followed by a separate
     value word.  Only device-agnostic entries are honored here.  */
  while (*args)
    {
      intptr_t id = (intptr_t) *args++, val;
      if (id & GOMP_TARGET_ARG_SUBSEQUENT_PARAM)
        val = (intptr_t) *args++;
      else
        val = id >> GOMP_TARGET_ARG_VALUE_SHIFT;
      if ((id & GOMP_TARGET_ARG_DEVICE_MASK) != GOMP_TARGET_ARG_DEVICE_ALL)
        continue;
      val = val > INT_MAX ? INT_MAX : val;
      id &= GOMP_TARGET_ARG_ID_MASK;
      if (id == GOMP_TARGET_ARG_NUM_TEAMS)
        teams = val;
      else if (id == GOMP_TARGET_ARG_THREAD_LIMIT)
        threads = val;
    }
  nvptx_adjust_launch_bounds (tgt_fn, ptx_dev, &teams, &threads);

  size_t stack_size = nvptx_stacks_size ();
  void *stacks = nvptx_stacks_alloc (stack_size, teams * threads);
  void *fn_args[] = {tgt_vars, stacks, (void *) stack_size};
  size_t fn_args_size = sizeof fn_args;
  void *config[] = {
    CU_LAUNCH_PARAM_BUFFER_POINTER, fn_args,
    CU_LAUNCH_PARAM_BUFFER_SIZE, &fn_args_size,
    CU_LAUNCH_PARAM_END
  };
  /* Launch TEAMS blocks, each made up of THREADS warps of 32 lanes.  */
  r = CUDA_CALL_NOCHECK (cuLaunchKernel, function, teams, 1, 1,
                         32, threads, 1, 0, ptx_dev->null_stream->stream,
                         NULL, config);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r));

  r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
  if (r == CUDA_ERROR_LAUNCH_FAILED)
    GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
                       maybe_abort_msg);
  else if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));
  nvptx_stacks_free (stacks, teams * threads);
}

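/* Asynchronous execution of a target region is not implemented for
   NVPTX.  */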
void
GOMP_OFFLOAD_async_run (int ord, void *tgt_fn, void *tgt_vars, void **args,
                        void *async_data)
{
  GOMP_PLUGIN_fatal ("GOMP_OFFLOAD_async_run unimplemented");
}