1 /* { dg-do run { target openacc_nvidia_accel_selected } } */
2 /* { dg-additional-options "-lcuda -lcublas -lcudart" } */
3 
4 #include <stdio.h>
5 #include <stdlib.h>
6 #include <cuda.h>
7 #include <cuda_runtime_api.h>
8 #include <cublas_v2.h>
9 #include <openacc.h>
10 
11 void
saxpy(int n,float a,float * x,float * y)12 saxpy (int n, float a, float *x, float *y)
13 {
14     int i;
15 
16     for (i = 0; i < n; i++)
17     {
18         y[i] = a * x[i] + y[i];
19     }
20 }
21 
22 void
context_check(CUcontext ctx1)23 context_check (CUcontext ctx1)
24 {
25     CUcontext ctx2, ctx3;
26     CUresult r;
27 
28     r = cuCtxGetCurrent (&ctx2);
29     if (r != CUDA_SUCCESS)
30     {
31         fprintf (stderr, "cuCtxGetCurrent failed: %d\n", r);
32         exit (EXIT_FAILURE);
33     }
34 
35     if (ctx1 != ctx2)
36     {
37         fprintf (stderr, "new context established\n");
38         exit (EXIT_FAILURE);
39     }
40 
41     ctx3 = (CUcontext) acc_get_current_cuda_context ();
42 
43     if (ctx1 != ctx3)
44     {
45         fprintf (stderr, "acc_get_current_cuda_context returned wrong value\n");
46         exit (EXIT_FAILURE);
47     }
48 
49     return;
50 }
51 
52 int
main(int argc,char ** argv)53 main (int argc, char **argv)
54 {
55     cublasStatus_t s;
56     cublasHandle_t h;
57     CUcontext pctx;
58     CUresult r;
59     int i;
60     const int N = 256;
61     float *h_X, *h_Y1, *h_Y2;
62     float *d_X,*d_Y;
63     float alpha = 2.0f;
64     float error_norm;
65     float ref_norm;
66 
67     /* Test 4 - OpenACC creates, cuBLAS shares.  */
68 
69     acc_set_device_num (0, acc_device_nvidia);
70 
71     r = cuCtxGetCurrent (&pctx);
72     if (r != CUDA_SUCCESS)
73     {
74         fprintf (stderr, "cuCtxGetCurrent failed: %d\n", r);
75         exit (EXIT_FAILURE);
76     }
77 
78     h_X = (float *) malloc (N * sizeof (float));
79     if (h_X == 0)
80     {
81         fprintf (stderr, "malloc failed: for h_X\n");
82         exit (EXIT_FAILURE);
83     }
84 
85     h_Y1 = (float *) malloc (N * sizeof (float));
86     if (h_Y1 == 0)
87     {
88         fprintf (stderr, "malloc failed: for h_Y1\n");
89         exit (EXIT_FAILURE);
90     }
91 
92     h_Y2 = (float *) malloc (N * sizeof (float));
93     if (h_Y2 == 0)
94     {
95         fprintf (stderr, "malloc failed: for h_Y2\n");
96         exit (EXIT_FAILURE);
97     }
98 
99     for (i = 0; i < N; i++)
100     {
101         h_X[i] = rand () / (float) RAND_MAX;
102         h_Y2[i] = h_Y1[i] = rand () / (float) RAND_MAX;
103     }
104 
105 #pragma acc parallel copyin (h_X[0:N]), copy (h_Y2[0:N]) copy (alpha)
106     {
107         int i;
108 
109         for (i = 0; i < N; i++)
110         {
111             h_Y2[i] = alpha * h_X[i] + h_Y2[i];
112         }
113     }
114 
115     r = cuCtxGetCurrent (&pctx);
116     if (r != CUDA_SUCCESS)
117     {
118         fprintf (stderr, "cuCtxGetCurrent failed: %d\n", r);
119         exit (EXIT_FAILURE);
120     }
121 
122     d_X = (float *) acc_copyin (&h_X[0], N * sizeof (float));
123     if (d_X == NULL)
124     {
125         fprintf (stderr, "copyin error h_Y1\n");
126         exit (EXIT_FAILURE);
127     }
128 
129     d_Y = (float *) acc_copyin (&h_Y1[0], N * sizeof (float));
130     if (d_Y == NULL)
131     {
132         fprintf (stderr, "copyin error h_Y1\n");
133         exit (EXIT_FAILURE);
134     }
135 
136     s = cublasCreate (&h);
137     if (s != CUBLAS_STATUS_SUCCESS)
138     {
139         fprintf (stderr, "cublasCreate failed: %d\n", s);
140         exit (EXIT_FAILURE);
141     }
142 
143     context_check (pctx);
144 
145     s = cublasSaxpy (h, N, &alpha, d_X, 1, d_Y, 1);
146     if (s != CUBLAS_STATUS_SUCCESS)
147     {
148         fprintf (stderr, "cublasSaxpy failed: %d\n", s);
149         exit (EXIT_FAILURE);
150     }
151 
152     context_check (pctx);
153 
154     acc_memcpy_from_device (&h_Y1[0], d_Y, N * sizeof (float));
155 
156     context_check (pctx);
157 
158     error_norm = 0;
159     ref_norm = 0;
160 
161     for (i = 0; i < N; ++i)
162     {
163         float diff;
164 
165         diff = h_Y1[i] - h_Y2[i];
166         error_norm += diff * diff;
167         ref_norm += h_Y2[i] * h_Y2[i];
168     }
169 
170     error_norm = (float) sqrt ((double) error_norm);
171     ref_norm = (float) sqrt ((double) ref_norm);
172 
173     if ((fabs (ref_norm) < 1e-7) || ((error_norm / ref_norm) >= 1e-6f))
174     {
175         fprintf (stderr, "math error\n");
176         exit (EXIT_FAILURE);
177     }
178 
179     acc_delete (&h_X[0], N * sizeof (float));
180     acc_delete (&h_Y1[0], N * sizeof (float));
181 
182     free (h_X);
183     free (h_Y1);
184     free (h_Y2);
185 
186     context_check (pctx);
187 
188     s = cublasDestroy (h);
189     if (s != CUBLAS_STATUS_SUCCESS)
190     {
191         fprintf (stderr, "cublasDestroy failed: %d\n", s);
192         exit (EXIT_FAILURE);
193     }
194 
195     context_check (pctx);
196 
197     acc_shutdown (acc_device_nvidia);
198 
199     r = cuCtxGetCurrent (&pctx);
200     if (r != CUDA_SUCCESS)
201     {
202         fprintf (stderr, "cuCtxGetCurrent failed: %d\n", r);
203         exit (EXIT_FAILURE);
204     }
205 
206     if (pctx)
207     {
208         fprintf (stderr, "Unexpected context\n");
209         exit (EXIT_FAILURE);
210     }
211 
212     return EXIT_SUCCESS;
213 }
214