1 /* This code uses nvptx inline assembly guarded with acc_on_device, which is
2    not optimized away at -O0, and then confuses the target assembler.
3    { dg-skip-if "" { *-*-* } { "-O0" } { "" } } */
4 
5 /* { dg-additional-options "-fopenacc-dim=32" } */
6 
7 #include <stdio.h>
8 #include <openacc.h>
9 
check(const int * ary,int size,int gp,int wp,int vp)10 static int check (const int *ary, int size, int gp, int wp, int vp)
11 {
12   int exit = 0;
13   int ix;
14   int gangs[32], workers[32], vectors[32];
15 
16   for (ix = 0; ix < 32; ix++)
17     gangs[ix] = workers[ix] = vectors[ix] = 0;
18 
19   for (ix = 0; ix < size; ix++)
20     {
21       vectors[ary[ix] & 0xff]++;
22       workers[(ary[ix] >> 8) & 0xff]++;
23       gangs[(ary[ix] >> 16) & 0xff]++;
24     }
25 
26   for (ix = 0; ix < 32; ix++)
27     {
28       if (gp)
29 	{
30 	  int expect = gangs[0];
31 	  if (gangs[ix] != expect)
32 	    {
33 	      exit = 1;
34 	      printf ("gang %d not used %d times\n", ix, expect);
35 	    }
36 	}
37       else if (ix && gangs[ix])
38 	{
39 	  exit = 1;
40 	  printf ("gang %d unexpectedly used\n", ix);
41 	}
42 
43       if (wp)
44 	{
45 	  int expect = workers[0];
46 	  if (workers[ix] != expect)
47 	    {
48 	      exit = 1;
49 	      printf ("worker %d not used %d times\n", ix, expect);
50 	    }
51 	}
52       else if (ix && workers[ix])
53 	{
54 	  exit = 1;
55 	  printf ("worker %d unexpectedly used\n", ix);
56 	}
57 
58       if (vp)
59 	{
60 	  int expect = vectors[0];
61 	  if (vectors[ix] != expect)
62 	    {
63 	      exit = 1;
64 	      printf ("vector %d not used %d times\n", ix, expect);
65 	    }
66 	}
67       else if (ix && vectors[ix])
68 	{
69 	  exit = 1;
70 	  printf ("vector %d unexpectedly used\n", ix);
71 	}
72 
73     }
74   return exit;
75 }
76 
77 #pragma acc routine seq
place()78 static int __attribute__((noinline)) place ()
79 {
80   int r = 0;
81 
82   if (acc_on_device (acc_device_nvidia))
83     {
84       int g = 0, w = 0, v = 0;
85 
86       __asm__ volatile ("mov.u32 %0,%%ctaid.x;" : "=r" (g));
87       __asm__ volatile ("mov.u32 %0,%%tid.y;" : "=r" (w));
88       __asm__ volatile ("mov.u32 %0,%%tid.x;" : "=r" (v));
89       r = (g << 16) | (w << 8) | v;
90     }
91   return r;
92 }
93 
clear(int * ary,int size)94 static void clear (int *ary, int size)
95 {
96   int ix;
97 
98   for (ix = 0; ix < size; ix++)
99     ary[ix] = -1;
100 }
101 
gang_vector_1(int * ary,int size)102 int gang_vector_1 (int *ary, int size)
103 {
104   clear (ary, size);
105 #pragma acc parallel vector_length(32) num_gangs (32) copy (ary[0:size]) firstprivate (size)
106   {
107 #pragma acc loop tile(128) gang vector
108     for (int jx = 0; jx < size; jx++)
109       ary[jx] = place ();
110   }
111 
112   return check (ary, size, 1, 0, 1);
113 }
114 
gang_vector_2a(int * ary,int size)115 int gang_vector_2a (int *ary, int size)
116 {
117   if (size % 256)
118     return 1;
119 
120   clear (ary, size);
121 #pragma acc parallel vector_length(32) num_gangs (32) copy (ary[0:size]) firstprivate (size)
122   {
123 #pragma acc loop tile(64, 64) gang vector
124     for (int jx = 0; jx < size / 256; jx++)
125       for (int ix = 0; ix < 256; ix++)
126 	ary[jx * 256 + ix] = place ();
127   }
128 
129   return check (ary, size, 1, 0, 1);
130 }
131 
gang_vector_2b(int * ary,int size)132 int gang_vector_2b (int *ary, int size)
133 {
134   if (size % 256)
135     return 1;
136 
137   clear (ary, size);
138 #pragma acc parallel vector_length(32) num_gangs (32) copy (ary[0:size]) firstprivate (size)
139   {
140 #pragma acc loop tile(64, 64) gang vector
141     for (int jx = 0; jx < size; jx += 256)
142       for (int ix = 0; ix < 256; ix++)
143 	ary[jx + ix] = place ();
144   }
145 
146   return check (ary, size, 1, 0, 1);
147 }
148 
worker_vector_2a(int * ary,int size)149 int worker_vector_2a (int *ary, int size)
150 {
151   if (size % 256)
152     return 1;
153 
154   clear (ary, size);
155 #pragma acc parallel vector_length(32) num_workers (32) copy (ary[0:size]) firstprivate (size)
156   {
157 #pragma acc loop tile(64, 64) worker vector
158     for (int jx = 0; jx < size / 256; jx++)
159       for (int ix = 0; ix < 256; ix++)
160 	ary[jx * 256 + ix] = place ();
161   }
162 
163   return check (ary, size, 0, 1, 1);
164 }
165 
worker_vector_2b(int * ary,int size)166 int worker_vector_2b (int *ary, int size)
167 {
168   if (size % 256)
169     return 1;
170 
171   clear (ary, size);
172 #pragma acc parallel vector_length(32) num_workers (32) copy (ary[0:size]) firstprivate (size)
173   {
174 #pragma acc loop tile(64, 64) worker vector
175     for (int jx = 0; jx < size; jx += 256)
176       for (int ix = 0; ix < 256; ix++)
177 	ary[jx + ix] = place ();
178   }
179 
180   return check (ary, size, 0, 1, 1);
181 }
182 
gang_worker_vector_2a(int * ary,int size)183 int gang_worker_vector_2a (int *ary, int size)
184 {
185   if (size % 256)
186     return 1;
187   clear (ary, size);
188 #pragma acc parallel vector_length(32) num_workers (32) num_gangs(32) copy (ary[0:size]) firstprivate (size)
189   {
190 #pragma acc loop tile(32, 32)
191     for (int jx = 0; jx < size / 256; jx++)
192       for (int ix = 0; ix < 256; ix++)
193 	ary[jx * 256 + ix] = place ();
194   }
195 
196   return check (ary, size, 1, 1, 1);
197 }
198 
gang_worker_vector_2b(int * ary,int size)199 int gang_worker_vector_2b (int *ary, int size)
200 {
201   if (size % 256)
202     return 1;
203   clear (ary, size);
204 #pragma acc parallel vector_length(32) num_workers (32) num_gangs(32) copy (ary[0:size]) firstprivate (size)
205   {
206 #pragma acc loop tile(32, 32)
207     for (int jx = 0; jx < size; jx += 256)
208       for (int ix = 0; ix < 256; ix++)
209 	ary[jx + ix] = place ();
210   }
211 
212   return check (ary, size, 1, 1, 1);
213 }
214 
gang_worker_vector_star_2a(int * ary,int size)215 int gang_worker_vector_star_2a (int *ary, int size)
216 {
217   if (size % 256)
218     return 1;
219 
220   clear (ary, size);
221 #pragma acc parallel vector_length(32) num_workers (32) num_gangs(32) copy (ary[0:size]) firstprivate (size)
222   {
223 #pragma acc loop tile(*, *)
224     for (int jx = 0; jx < size / 256; jx++)
225       for (int ix = 0; ix < 256; ix++)
226 	ary[jx * 256 + ix] = place ();
227   }
228 
229   return check (ary, size, 1, 1, 1);
230 }
231 
gang_worker_vector_star_2b(int * ary,int size)232 int gang_worker_vector_star_2b (int *ary, int size)
233 {
234   if (size % 256)
235     return 1;
236 
237   clear (ary, size);
238 #pragma acc parallel vector_length(32) num_workers (32) num_gangs(32) copy (ary[0:size]) firstprivate (size)
239   {
240 #pragma acc loop tile(*, *)
241     for (int jx = 0; jx < size; jx +=256)
242       for (int ix = 0; ix < 256; ix++)
243 	ary[jx + ix] = place ();
244   }
245 
246   return check (ary, size, 1, 1, 1);
247 }
248 
249 #define N (32*32*32*8)
main()250 int main ()
251 {
252   int ondev = 0;
253 
254 #pragma acc parallel copy(ondev)
255   {
256     ondev = acc_on_device (acc_device_not_host);
257   }
258   if (!ondev)
259     return 0;
260 
261   int ary[N];
262   if (gang_vector_1 (ary, N))
263     return 1;
264   if (gang_vector_2a (ary, N))
265     return 1;
266   if (worker_vector_2a (ary, N))
267     return 1;
268   if (gang_worker_vector_2a (ary, N))
269     return 1;
270   if (gang_worker_vector_star_2a (ary, N))
271     return 1;
272   if (gang_vector_2b (ary, N))
273     return 1;
274   if (worker_vector_2b (ary, N))
275     return 1;
276   if (gang_worker_vector_2b (ary, N))
277     return 1;
278   if (gang_worker_vector_star_2b (ary, N))
279     return 1;
280   return 0;
281 }
282