1 /* { dg-do run { target openacc_nvidia_accel_selected } } */
2 /* { dg-additional-options "-lcuda -lcublas -lcudart" } */
3 /* { dg-require-effective-target openacc_cublas } */
4 /* { dg-require-effective-target openacc_cudart } */
5 
6 #include <stdio.h>
7 #include <stdlib.h>
8 #include <cuda.h>
9 #include <cuda_runtime_api.h>
10 #include <cublas_v2.h>
11 #include <openacc.h>
12 
13 void
saxpy(int n,float a,float * x,float * y)14 saxpy (int n, float a, float *x, float *y)
15 {
16     int i;
17 
18     for (i = 0; i < n; i++)
19     {
20         y[i] = a * x[i] + y[i];
21     }
22 }
23 
24 void
context_check(CUcontext ctx1)25 context_check (CUcontext ctx1)
26 {
27     CUcontext ctx2, ctx3;
28     CUresult r;
29 
30     r = cuCtxGetCurrent (&ctx2);
31     if (r != CUDA_SUCCESS)
32     {
33         fprintf (stderr, "cuCtxGetCurrent failed: %d\n", r);
34         exit (EXIT_FAILURE);
35     }
36 
37     if (ctx1 != ctx2)
38     {
39         fprintf (stderr, "new context established\n");
40         exit (EXIT_FAILURE);
41     }
42 
43     ctx3 = (CUcontext) acc_get_current_cuda_context ();
44 
45     if (ctx1 != ctx3)
46     {
47         fprintf (stderr, "acc_get_current_cuda_context returned wrong value\n");
48         exit (EXIT_FAILURE);
49     }
50 
51     return;
52 }
53 
54 int
main(int argc,char ** argv)55 main (int argc, char **argv)
56 {
57     cublasStatus_t s;
58     cublasHandle_t h;
59     CUcontext pctx;
60     CUresult r;
61     int i;
62     const int N = 256;
63     float *h_X, *h_Y1, *h_Y2;
64     float *d_X,*d_Y;
65     float alpha = 2.0f;
66     float error_norm;
67     float ref_norm;
68 
69     /* Test 4 - OpenACC creates, cuBLAS shares.  */
70 
71     acc_set_device_num (0, acc_device_nvidia);
72 
73     r = cuCtxGetCurrent (&pctx);
74     if (r != CUDA_SUCCESS)
75     {
76         fprintf (stderr, "cuCtxGetCurrent failed: %d\n", r);
77         exit (EXIT_FAILURE);
78     }
79 
80     h_X = (float *) malloc (N * sizeof (float));
81     if (h_X == 0)
82     {
83         fprintf (stderr, "malloc failed: for h_X\n");
84         exit (EXIT_FAILURE);
85     }
86 
87     h_Y1 = (float *) malloc (N * sizeof (float));
88     if (h_Y1 == 0)
89     {
90         fprintf (stderr, "malloc failed: for h_Y1\n");
91         exit (EXIT_FAILURE);
92     }
93 
94     h_Y2 = (float *) malloc (N * sizeof (float));
95     if (h_Y2 == 0)
96     {
97         fprintf (stderr, "malloc failed: for h_Y2\n");
98         exit (EXIT_FAILURE);
99     }
100 
101     for (i = 0; i < N; i++)
102     {
103         h_X[i] = rand () / (float) RAND_MAX;
104         h_Y2[i] = h_Y1[i] = rand () / (float) RAND_MAX;
105     }
106 
107 #pragma acc parallel copyin (h_X[0:N]), copy (h_Y2[0:N]) copy (alpha)
108     {
109         int i;
110 
111         for (i = 0; i < N; i++)
112         {
113             h_Y2[i] = alpha * h_X[i] + h_Y2[i];
114         }
115     }
116 
117     r = cuCtxGetCurrent (&pctx);
118     if (r != CUDA_SUCCESS)
119     {
120         fprintf (stderr, "cuCtxGetCurrent failed: %d\n", r);
121         exit (EXIT_FAILURE);
122     }
123 
124     d_X = (float *) acc_copyin (&h_X[0], N * sizeof (float));
125     if (d_X == NULL)
126     {
127         fprintf (stderr, "copyin error h_Y1\n");
128         exit (EXIT_FAILURE);
129     }
130 
131     d_Y = (float *) acc_copyin (&h_Y1[0], N * sizeof (float));
132     if (d_Y == NULL)
133     {
134         fprintf (stderr, "copyin error h_Y1\n");
135         exit (EXIT_FAILURE);
136     }
137 
138     s = cublasCreate (&h);
139     if (s != CUBLAS_STATUS_SUCCESS)
140     {
141         fprintf (stderr, "cublasCreate failed: %d\n", s);
142         exit (EXIT_FAILURE);
143     }
144 
145     context_check (pctx);
146 
147     s = cublasSaxpy (h, N, &alpha, d_X, 1, d_Y, 1);
148     if (s != CUBLAS_STATUS_SUCCESS)
149     {
150         fprintf (stderr, "cublasSaxpy failed: %d\n", s);
151         exit (EXIT_FAILURE);
152     }
153 
154     context_check (pctx);
155 
156     acc_memcpy_from_device (&h_Y1[0], d_Y, N * sizeof (float));
157 
158     context_check (pctx);
159 
160     error_norm = 0;
161     ref_norm = 0;
162 
163     for (i = 0; i < N; ++i)
164     {
165         float diff;
166 
167         diff = h_Y1[i] - h_Y2[i];
168         error_norm += diff * diff;
169         ref_norm += h_Y2[i] * h_Y2[i];
170     }
171 
172     error_norm = (float) sqrt ((double) error_norm);
173     ref_norm = (float) sqrt ((double) ref_norm);
174 
175     if ((fabs (ref_norm) < 1e-7) || ((error_norm / ref_norm) >= 1e-6f))
176     {
177         fprintf (stderr, "math error\n");
178         exit (EXIT_FAILURE);
179     }
180 
181     acc_delete (&h_X[0], N * sizeof (float));
182     acc_delete (&h_Y1[0], N * sizeof (float));
183 
184     free (h_X);
185     free (h_Y1);
186     free (h_Y2);
187 
188     context_check (pctx);
189 
190     s = cublasDestroy (h);
191     if (s != CUBLAS_STATUS_SUCCESS)
192     {
193         fprintf (stderr, "cublasDestroy failed: %d\n", s);
194         exit (EXIT_FAILURE);
195     }
196 
197     context_check (pctx);
198 
199     acc_shutdown (acc_device_nvidia);
200 
201     r = cuCtxGetCurrent (&pctx);
202     if (r != CUDA_SUCCESS)
203     {
204         fprintf (stderr, "cuCtxGetCurrent failed: %d\n", r);
205         exit (EXIT_FAILURE);
206     }
207 
208     if (pctx)
209     {
210         fprintf (stderr, "Unexpected context\n");
211         exit (EXIT_FAILURE);
212     }
213 
214     return EXIT_SUCCESS;
215 }
216