1 /* { dg-do run { target openacc_nvidia_accel_selected } } */
2 /* { dg-additional-options "-lcuda" } */
3 /* { dg-require-effective-target openacc_cuda } */
4 
5 #include <stdio.h>
6 #include <stdlib.h>
7 #include <unistd.h>
8 #include <openacc.h>
9 #include <cuda.h>
10 
11 int
main(int argc,char ** argv)12 main (int argc, char **argv)
13 {
14   CUdevice dev;
15   CUfunction delay2;
16   CUmodule module;
17   CUresult r;
18   int N;
19   int i;
20   CUstream *streams;
21   unsigned long **a, **d_a, *tid, ticks;
22   int nbytes;
23   void *kargs[3];
24   int clkrate;
25   int devnum, nprocs;
26 
27   acc_init (acc_device_nvidia);
28 
29   devnum = acc_get_device_num (acc_device_nvidia);
30 
31   r = cuDeviceGet (&dev, devnum);
32   if (r != CUDA_SUCCESS)
33     {
34       fprintf (stderr, "cuDeviceGet failed: %d\n", r);
35       abort ();
36     }
37 
38   r =
39     cuDeviceGetAttribute (&nprocs, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
40 			  dev);
41   if (r != CUDA_SUCCESS)
42     {
43       fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
44       abort ();
45     }
46 
47   r = cuDeviceGetAttribute (&clkrate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
48   if (r != CUDA_SUCCESS)
49     {
50       fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
51       abort ();
52     }
53 
54   r = cuModuleLoad (&module, "subr.ptx");
55   if (r != CUDA_SUCCESS)
56     {
57       fprintf (stderr, "cuModuleLoad failed: %d\n", r);
58       abort ();
59     }
60 
61   r = cuModuleGetFunction (&delay2, module, "delay2");
62   if (r != CUDA_SUCCESS)
63     {
64       fprintf (stderr, "cuModuleGetFunction failed: %d\n", r);
65       abort ();
66     }
67 
68   nbytes = sizeof (int);
69 
70   ticks = (unsigned long) (200.0 * clkrate);
71 
72   N = nprocs;
73 
74   streams = (CUstream *) malloc (N * sizeof (void *));
75 
76   a = (unsigned long **) malloc (N * sizeof (unsigned long *));
77   d_a = (unsigned long **) malloc (N * sizeof (unsigned long *));
78   tid = (unsigned long *) malloc (N * sizeof (unsigned long));
79 
80   for (i = 0; i < N; i++)
81     {
82       a[i] = (unsigned long *) malloc (sizeof (unsigned long));
83       *a[i] = N;
84       d_a[i] = (unsigned long *) acc_malloc (nbytes);
85       tid[i] = i;
86 
87       acc_map_data (a[i], d_a[i], nbytes);
88 
89       streams[i] = (CUstream) acc_get_cuda_stream (i);
90       if (streams[i] != NULL)
91         abort ();
92 
93       r = cuStreamCreate (&streams[i], CU_STREAM_DEFAULT);
94       if (r != CUDA_SUCCESS)
95         {
96           fprintf (stderr, "cuStreamCreate failed: %d\n", r);
97           abort ();
98         }
99 
100        if (!acc_set_cuda_stream (i, streams[i]))
101         abort ();
102     }
103 
104   for (i = 0; i < N; i++)
105     {
106       kargs[0] = (void *) &d_a[i];
107       kargs[1] = (void *) &ticks;
108       kargs[2] = (void *) &tid[i];
109 
110       r = cuLaunchKernel (delay2, 1, 1, 1, 1, 1, 1, 0, streams[i], kargs, 0);
111       if (r != CUDA_SUCCESS)
112 	{
113 	  fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
114 	  abort ();
115 	}
116 
117       ticks = (unsigned long) (50.0 * clkrate);
118     }
119 
120   acc_wait_all_async (0);
121 
122   for (i = 0; i < N; i++)
123     {
124       acc_memcpy_from_device (a[i], d_a[i], nbytes);
125       if (*a[i] != i)
126 	abort ();
127 
128       acc_unmap_data (a[i]);
129 
130       acc_free (d_a[i]);
131     }
132 
133   free (streams);
134 
135   for (i = 0; i < N; i++)
136     {
137       free (a[i]);
138     }
139 
140   free (a);
141   free (d_a);
142   free (tid);
143 
144   acc_shutdown (acc_device_nvidia);
145 
146   exit (0);
147 }
148 
149 /* { dg-output "" } */
150