/* Test asynchronous, unstructured data regions, runtime library variant.  */
2 /* See also data-2.c.  */
3 
4 #include <stdlib.h>
5 #undef NDEBUG
6 #include <assert.h>
7 #include <openacc.h>
8 
9 int
main(int argc,char ** argv)10 main (int argc, char **argv)
11 {
12   int N = 12345;
13   float *a, *b, *c, *d, *e;
14   void *d_a, *d_b, *d_c, *d_d;
15   int i;
16   int nbytes;
17 
18   nbytes = N * sizeof (float);
19 
20   a = (float *) malloc (nbytes);
21   b = (float *) malloc (nbytes);
22   c = (float *) malloc (nbytes);
23   d = (float *) malloc (nbytes);
24   e = (float *) malloc (nbytes);
25 
26   for (i = 0; i < N; i++)
27     {
28       a[i] = 3.0;
29       b[i] = 0.0;
30     }
31 
32   acc_copyin_async (a, nbytes, acc_async_noval);
33   acc_copyin_async (b, nbytes, acc_async_noval);
34   acc_copyin_async (&N, sizeof (int), acc_async_noval);
35 
36 #pragma acc parallel present (a[0:N], b[0:N], N) async
37 #pragma acc loop
38   for (i = 0; i < N; i++)
39     b[i] = a[i];
40 
41   d_a = acc_deviceptr (a);
42   acc_memcpy_from_device_async (a, d_a, nbytes, acc_async_noval);
43   d_b = acc_deviceptr (b);
44   acc_memcpy_from_device_async (b, d_b, nbytes, acc_async_noval);
45 
46   acc_wait (acc_async_noval);
47 
48   for (i = 0; i < N; i++)
49     {
50       assert (a[i] == 3.0);
51       assert (b[i] == 3.0);
52     }
53 
54   for (i = 0; i < N; i++)
55     {
56       a[i] = 2.0;
57       b[i] = 0.0;
58     }
59 
60   acc_update_device_async (a, nbytes, 1);
61   acc_update_device_async (b, nbytes, 1);
62 
63 #pragma acc parallel present (a[0:N], b[0:N], N) async (1)
64 #pragma acc loop
65   for (i = 0; i < N; i++)
66     b[i] = a[i];
67 
68   acc_memcpy_from_device_async (a, d_a, nbytes, 1);
69   acc_memcpy_from_device_async (b, d_b, nbytes, 1);
70 
71   acc_wait (1);
72   /* Test unseen async-argument.  */
73   acc_wait (10);
74 
75   for (i = 0; i < N; i++)
76     {
77       assert (a[i] == 2.0);
78       assert (b[i] == 2.0);
79     }
80 
81   for (i = 0; i < N; i++)
82     {
83       a[i] = 3.0;
84       b[i] = 0.0;
85       c[i] = 0.0;
86       d[i] = 0.0;
87     }
88 
89   acc_update_device_async (a, nbytes, 0);
90   acc_update_device_async (b, nbytes, 1);
91   acc_copyin_async (c, nbytes, 2);
92   acc_copyin_async (d, nbytes, 3);
93 
94 #pragma acc parallel present (a[0:N], b[0:N], N) wait (0) async (1)
95 #pragma acc loop
96   for (i = 0; i < N; i++)
97     b[i] = (a[i] * a[i] * a[i]) / a[i];
98 
99 #pragma acc parallel present (a[0:N], c[0:N], N) wait (0) async (2)
100 #pragma acc loop
101   for (i = 0; i < N; i++)
102     c[i] = (a[i] + a[i] + a[i] + a[i]) / a[i];
103 
104 #pragma acc parallel present (a[0:N], d[0:N], N) wait (0) async (3)
105 #pragma acc loop
106   for (i = 0; i < N; i++)
107     d[i] = ((a[i] * a[i] + a[i]) / a[i]) - a[i];
108 
109   acc_memcpy_from_device_async (a, d_a, nbytes, 0);
110   acc_memcpy_from_device_async (b, d_b, nbytes, 1);
111   d_c = acc_deviceptr (c);
112   acc_memcpy_from_device_async (c, d_c, nbytes, 2);
113   d_d = acc_deviceptr (d);
114   acc_memcpy_from_device_async (d, d_d, nbytes, 3);
115 
116   acc_wait_all_async (0);
117   acc_wait (0);
118 
119   for (i = 0; i < N; i++)
120     {
121       assert (a[i] == 3.0);
122       assert (b[i] == 9.0);
123       assert (c[i] == 4.0);
124       assert (d[i] == 1.0);
125     }
126 
127   for (i = 0; i < N; i++)
128     {
129       a[i] = 2.0;
130       b[i] = 0.0;
131       c[i] = 0.0;
132       d[i] = 0.0;
133       e[i] = 0.0;
134     }
135 
136   acc_update_device_async (a, nbytes, 10);
137   acc_update_device_async (b, nbytes, 11);
138   acc_update_device_async (c, nbytes, 12);
139   acc_update_device_async (d, nbytes, 13);
140   acc_copyin_async (e, nbytes, 14);
141 
142 #pragma acc parallel present (a[0:N], b[0:N], N) wait (10) async (11)
143   for (int ii = 0; ii < N; ii++)
144     b[ii] = (a[ii] * a[ii] * a[ii]) / a[ii];
145 
146 #pragma acc parallel present (a[0:N], c[0:N], N) wait (10) async (12)
147   for (int ii = 0; ii < N; ii++)
148     c[ii] = (a[ii] + a[ii] + a[ii] + a[ii]) / a[ii];
149 
150 #pragma acc parallel present (a[0:N], d[0:N], N) wait (10) async (13)
151   for (int ii = 0; ii < N; ii++)
152     d[ii] = ((a[ii] * a[ii] + a[ii]) / a[ii]) - a[ii];
153 
154 #pragma acc parallel present (a[0:N], b[0:N], c[0:N], d[0:N], e[0:N], N)  wait (11) wait (12) wait (13) async (14)
155   for (int ii = 0; ii < N; ii++)
156     e[ii] = a[ii] + b[ii] + c[ii] + d[ii];
157 
158   acc_copyout_async (a, nbytes, 10);
159   acc_copyout_async (b, nbytes, 11);
160   acc_copyout_async (c, nbytes, 12);
161   acc_copyout_async (d, nbytes, 13);
162   acc_copyout_async (e, nbytes, 14);
163   acc_delete_async (&N, sizeof (int), 15);
164   acc_wait_all ();
165 
166   for (i = 0; i < N; i++)
167     {
168       assert (a[i] == 2.0);
169       assert (b[i] == 4.0);
170       assert (c[i] == 4.0);
171       assert (d[i] == 1.0);
172       assert (e[i] == 11.0);
173     }
174 
175   return 0;
176 }
177