1 /* { dg-do run { target openacc_nvidia_accel_selected } } */
2 /* { dg-additional-options "-lcuda" } */
3 
4 #include <stdio.h>
5 #include <stdlib.h>
6 #include <unistd.h>
7 #include <openacc.h>
8 #include <cuda.h>
9 
10 int
main(int argc,char ** argv)11 main (int argc, char **argv)
12 {
13   CUdevice dev;
14   CUfunction delay2;
15   CUmodule module;
16   CUresult r;
17   int N;
18   int i;
19   CUstream *streams;
20   unsigned long **a, **d_a, *tid, ticks;
21   int nbytes;
22   void *kargs[3];
23   int clkrate;
24   int devnum, nprocs;
25 
26   acc_init (acc_device_nvidia);
27 
28   devnum = acc_get_device_num (acc_device_nvidia);
29 
30   r = cuDeviceGet (&dev, devnum);
31   if (r != CUDA_SUCCESS)
32     {
33       fprintf (stderr, "cuDeviceGet failed: %d\n", r);
34       abort ();
35     }
36 
37   r =
38     cuDeviceGetAttribute (&nprocs, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
39 			  dev);
40   if (r != CUDA_SUCCESS)
41     {
42       fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
43       abort ();
44     }
45 
46   r = cuDeviceGetAttribute (&clkrate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
47   if (r != CUDA_SUCCESS)
48     {
49       fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
50       abort ();
51     }
52 
53   r = cuModuleLoad (&module, "subr.ptx");
54   if (r != CUDA_SUCCESS)
55     {
56       fprintf (stderr, "cuModuleLoad failed: %d\n", r);
57       abort ();
58     }
59 
60   r = cuModuleGetFunction (&delay2, module, "delay2");
61   if (r != CUDA_SUCCESS)
62     {
63       fprintf (stderr, "cuModuleGetFunction failed: %d\n", r);
64       abort ();
65     }
66 
67   nbytes = sizeof (int);
68 
69   ticks = (unsigned long) (200.0 * clkrate);
70 
71   N = nprocs;
72 
73   streams = (CUstream *) malloc (N * sizeof (void *));
74 
75   a = (unsigned long **) malloc (N * sizeof (unsigned long *));
76   d_a = (unsigned long **) malloc (N * sizeof (unsigned long *));
77   tid = (unsigned long *) malloc (N * sizeof (unsigned long));
78 
79   for (i = 0; i < N; i++)
80     {
81       a[i] = (unsigned long *) malloc (sizeof (unsigned long));
82       *a[i] = N;
83       d_a[i] = (unsigned long *) acc_malloc (nbytes);
84       tid[i] = i;
85 
86       acc_map_data (a[i], d_a[i], nbytes);
87 
88       streams[i] = (CUstream) acc_get_cuda_stream (i);
89       if (streams[i] != NULL)
90         abort ();
91 
92       r = cuStreamCreate (&streams[i], CU_STREAM_DEFAULT);
93       if (r != CUDA_SUCCESS)
94         {
95           fprintf (stderr, "cuStreamCreate failed: %d\n", r);
96           abort ();
97         }
98 
99        if (!acc_set_cuda_stream (i, streams[i]))
100         abort ();
101     }
102 
103   for (i = 0; i < N; i++)
104     {
105       kargs[0] = (void *) &d_a[i];
106       kargs[1] = (void *) &ticks;
107       kargs[2] = (void *) &tid[i];
108 
109       r = cuLaunchKernel (delay2, 1, 1, 1, 1, 1, 1, 0, streams[i], kargs, 0);
110       if (r != CUDA_SUCCESS)
111 	{
112 	  fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
113 	  abort ();
114 	}
115 
116       ticks = (unsigned long) (50.0 * clkrate);
117     }
118 
119   acc_wait_all_async (0);
120 
121   for (i = 0; i < N; i++)
122     {
123       acc_copyout (a[i], nbytes);
124       if (*a[i] != i)
125 	abort ();
126     }
127 
128   free (streams);
129 
130   for (i = 0; i < N; i++)
131     {
132       free (a[i]);
133     }
134 
135   free (a);
136   free (d_a);
137   free (tid);
138 
139   acc_shutdown (acc_device_nvidia);
140 
141   exit (0);
142 }
143 
144 /* { dg-output "" } */
145