1 /* { dg-do run { target openacc_nvidia_accel_selected } } */
2 /* { dg-additional-options "-lcuda" } */
3 /* { dg-require-effective-target openacc_cuda } */
4
5 #include <stdio.h>
6 #include <stdlib.h>
7 #include <unistd.h>
8 #include <openacc.h>
9 #include <cuda.h>
10
11 int
main(int argc,char ** argv)12 main (int argc, char **argv)
13 {
14 CUdevice dev;
15 CUfunction delay2;
16 CUmodule module;
17 CUresult r;
18 int N;
19 int i;
20 CUstream *streams;
21 unsigned long **a, **d_a, *tid, ticks;
22 int nbytes;
23 void *kargs[3];
24 int clkrate;
25 int devnum, nprocs;
26
27 acc_init (acc_device_nvidia);
28
29 devnum = acc_get_device_num (acc_device_nvidia);
30
31 r = cuDeviceGet (&dev, devnum);
32 if (r != CUDA_SUCCESS)
33 {
34 fprintf (stderr, "cuDeviceGet failed: %d\n", r);
35 abort ();
36 }
37
38 r =
39 cuDeviceGetAttribute (&nprocs, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
40 dev);
41 if (r != CUDA_SUCCESS)
42 {
43 fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
44 abort ();
45 }
46
47 r = cuDeviceGetAttribute (&clkrate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
48 if (r != CUDA_SUCCESS)
49 {
50 fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
51 abort ();
52 }
53
54 r = cuModuleLoad (&module, "subr.ptx");
55 if (r != CUDA_SUCCESS)
56 {
57 fprintf (stderr, "cuModuleLoad failed: %d\n", r);
58 abort ();
59 }
60
61 r = cuModuleGetFunction (&delay2, module, "delay2");
62 if (r != CUDA_SUCCESS)
63 {
64 fprintf (stderr, "cuModuleGetFunction failed: %d\n", r);
65 abort ();
66 }
67
68 nbytes = sizeof (int);
69
70 ticks = (unsigned long) (200.0 * clkrate);
71
72 N = nprocs;
73
74 streams = (CUstream *) malloc (N * sizeof (void *));
75
76 a = (unsigned long **) malloc (N * sizeof (unsigned long *));
77 d_a = (unsigned long **) malloc (N * sizeof (unsigned long *));
78 tid = (unsigned long *) malloc (N * sizeof (unsigned long));
79
80 for (i = 0; i < N; i++)
81 {
82 a[i] = (unsigned long *) malloc (sizeof (unsigned long));
83 *a[i] = N;
84 d_a[i] = (unsigned long *) acc_malloc (nbytes);
85 tid[i] = i;
86
87 acc_map_data (a[i], d_a[i], nbytes);
88
89 streams[i] = (CUstream) acc_get_cuda_stream (i);
90 if (streams[i] != NULL)
91 abort ();
92
93 r = cuStreamCreate (&streams[i], CU_STREAM_DEFAULT);
94 if (r != CUDA_SUCCESS)
95 {
96 fprintf (stderr, "cuStreamCreate failed: %d\n", r);
97 abort ();
98 }
99
100 if (!acc_set_cuda_stream (i, streams[i]))
101 abort ();
102 }
103
104 for (i = 0; i < N; i++)
105 {
106 kargs[0] = (void *) &d_a[i];
107 kargs[1] = (void *) &ticks;
108 kargs[2] = (void *) &tid[i];
109
110 r = cuLaunchKernel (delay2, 1, 1, 1, 1, 1, 1, 0, streams[i], kargs, 0);
111 if (r != CUDA_SUCCESS)
112 {
113 fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
114 abort ();
115 }
116
117 ticks = (unsigned long) (50.0 * clkrate);
118 }
119
120 acc_wait_all_async (0);
121
122 for (i = 0; i < N; i++)
123 {
124 acc_memcpy_from_device (a[i], d_a[i], nbytes);
125 if (*a[i] != i)
126 abort ();
127
128 acc_unmap_data (a[i]);
129
130 acc_free (d_a[i]);
131 }
132
133 free (streams);
134
135 for (i = 0; i < N; i++)
136 {
137 free (a[i]);
138 }
139
140 free (a);
141 free (d_a);
142 free (tid);
143
144 acc_shutdown (acc_device_nvidia);
145
146 exit (0);
147 }
148
149 /* { dg-output "" } */
150