1 /* OpenACC parallelism dimensions clauses: num_gangs, num_workers,
2    vector_length.  */
3 
4 /* See also '../libgomp.oacc-fortran/parallel-dims.f90'.  */
5 
6 #include <limits.h>
7 #include <openacc.h>
8 #include <gomp-constants.h>
9 
10 /* TODO: "(int) acc_device_*" casts because of the C++ acc_on_device wrapper
11    not behaving as expected for -O0.  */
12 #pragma acc routine seq
acc_gang()13 static unsigned int __attribute__ ((optimize ("O2"))) acc_gang ()
14 {
15   if (acc_on_device ((int) acc_device_host))
16     return 0;
17   else if (acc_on_device ((int) acc_device_nvidia)
18 	   || acc_on_device ((int) acc_device_radeon))
19     return __builtin_goacc_parlevel_id (GOMP_DIM_GANG);
20   else
21     __builtin_abort ();
22 }
23 
24 #pragma acc routine seq
acc_worker()25 static unsigned int __attribute__ ((optimize ("O2"))) acc_worker ()
26 {
27   if (acc_on_device ((int) acc_device_host))
28     return 0;
29   else if (acc_on_device ((int) acc_device_nvidia)
30 	   || acc_on_device ((int) acc_device_radeon))
31     return __builtin_goacc_parlevel_id (GOMP_DIM_WORKER);
32   else
33     __builtin_abort ();
34 }
35 
36 #pragma acc routine seq
acc_vector()37 static unsigned int __attribute__ ((optimize ("O2"))) acc_vector ()
38 {
39   if (acc_on_device ((int) acc_device_host))
40     return 0;
41   else if (acc_on_device ((int) acc_device_nvidia)
42 	   || acc_on_device ((int) acc_device_radeon))
43     return __builtin_goacc_parlevel_id (GOMP_DIM_VECTOR);
44   else
45     __builtin_abort ();
46 }
47 
48 
main()49 int main ()
50 {
51   acc_init (acc_device_default);
52 
53   /* OpenACC parallel construct.  */
54 
55   /* Non-positive value.  */
56 
57   /* GR, WS, VS.  */
58   {
59 #define GANGS 0 /* { dg-warning "'num_gangs' value must be positive" "" { target c } } */
60     int gangs_actual = GANGS;
61     int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
62     gangs_min = workers_min = vectors_min = INT_MAX;
63     gangs_max = workers_max = vectors_max = INT_MIN;
64 #pragma acc parallel copy (gangs_actual) \
65   reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max) \
66   num_gangs (GANGS) /* { dg-warning "'num_gangs' value must be positive" "" { target c++ } } */
67     {
68       /* We're actually executing with num_gangs (1).  */
69       gangs_actual = 1;
70       for (int i = 100 * gangs_actual; i > -100 * gangs_actual; --i)
71 	{
72 	  /* <https://gcc.gnu.org/PR80547>.  */
73 #if 0
74 	  gangs_min = gangs_max = acc_gang ();
75 	  workers_min = workers_max = acc_worker ();
76 	  vectors_min = vectors_max = acc_vector ();
77 #else
78 	  int gangs = acc_gang ();
79 	  gangs_min = (gangs_min < gangs) ? gangs_min : gangs;
80 	  gangs_max = (gangs_max > gangs) ? gangs_max : gangs;
81 	  int workers = acc_worker ();
82 	  workers_min = (workers_min < workers) ? workers_min : workers;
83 	  workers_max = (workers_max > workers) ? workers_max : workers;
84 	  int vectors = acc_vector ();
85 	  vectors_min = (vectors_min < vectors) ? vectors_min : vectors;
86 	  vectors_max = (vectors_max > vectors) ? vectors_max : vectors;
87 #endif
88 	}
89     }
90     if (gangs_actual != 1)
91       __builtin_abort ();
92     if (gangs_min != 0 || gangs_max != gangs_actual - 1
93 	|| workers_min != 0 || workers_max != 0
94 	|| vectors_min != 0 || vectors_max != 0)
95       __builtin_abort ();
96 #undef GANGS
97   }
98 
99   /* GP, WS, VS.  */
100   {
101 #define GANGS 0 /* { dg-warning "'num_gangs' value must be positive" "" { target c } } */
102     int gangs_actual = GANGS;
103     int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
104     gangs_min = workers_min = vectors_min = INT_MAX;
105     gangs_max = workers_max = vectors_max = INT_MIN;
106 #pragma acc parallel copy (gangs_actual) \
107   num_gangs (GANGS) /* { dg-warning "'num_gangs' value must be positive" "" { target c++ } } */
108     {
109       /* We're actually executing with num_gangs (1).  */
110       gangs_actual = 1;
111 #pragma acc loop gang reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
112       for (int i = 100 * gangs_actual; i > -100 * gangs_actual; --i)
113 	{
114 	  gangs_min = gangs_max = acc_gang ();
115 	  workers_min = workers_max = acc_worker ();
116 	  vectors_min = vectors_max = acc_vector ();
117 	}
118     }
119     if (gangs_actual != 1)
120       __builtin_abort ();
121     if (gangs_min != 0 || gangs_max != gangs_actual - 1
122 	|| workers_min != 0 || workers_max != 0
123 	|| vectors_min != 0 || vectors_max != 0)
124       __builtin_abort ();
125 #undef GANGS
126   }
127 
128   /* GR, WP, VS.  */
129   {
130 #define WORKERS 0 /* { dg-warning "'num_workers' value must be positive" "" { target c } } */
131     int workers_actual = WORKERS;
132     int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
133     gangs_min = workers_min = vectors_min = INT_MAX;
134     gangs_max = workers_max = vectors_max = INT_MIN;
135 #pragma acc parallel copy (workers_actual) \
136   num_workers (WORKERS) /* { dg-warning "'num_workers' value must be positive" "" { target c++ } } */
137     {
138       /* We're actually executing with num_workers (1).  */
139       workers_actual = 1;
140 #pragma acc loop worker reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
141       for (int i = 100 * workers_actual; i > -100 * workers_actual; --i)
142 	{
143 	  gangs_min = gangs_max = acc_gang ();
144 	  workers_min = workers_max = acc_worker ();
145 	  vectors_min = vectors_max = acc_vector ();
146 	}
147     }
148     if (workers_actual != 1)
149       __builtin_abort ();
150     if (gangs_min != 0 || gangs_max != 0
151 	|| workers_min != 0 || workers_max != workers_actual - 1
152 	|| vectors_min != 0 || vectors_max != 0)
153       __builtin_abort ();
154 #undef WORKERS
155   }
156 
157   /* GR, WS, VP.  */
158   {
159 #define VECTORS 0 /* { dg-warning "'vector_length' value must be positive" "" { target c } } */
160     int vectors_actual = VECTORS;
161     int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
162     gangs_min = workers_min = vectors_min = INT_MAX;
163     gangs_max = workers_max = vectors_max = INT_MIN;
164 #pragma acc parallel copy (vectors_actual) /* { dg-warning "using vector_length \\(32\\), ignoring 1" "" { target openacc_nvidia_accel_selected } } */ \
165   vector_length (VECTORS) /* { dg-warning "'vector_length' value must be positive" "" { target c++ } } */
166     {
167       /* We're actually executing with vector_length (1), just the GCC nvptx
168 	 back end enforces vector_length (32).  */
169       if (acc_on_device (acc_device_nvidia))
170 	vectors_actual = 32;
171       else
172 	vectors_actual = 1;
173 #pragma acc loop vector reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
174       for (int i = 100 * vectors_actual; i > -100 * vectors_actual; --i)
175 	{
176 	  gangs_min = gangs_max = acc_gang ();
177 	  workers_min = workers_max = acc_worker ();
178 	  vectors_min = vectors_max = acc_vector ();
179 	}
180     }
181     if (acc_get_device_type () == acc_device_nvidia)
182       {
183 	if (vectors_actual != 32)
184 	  __builtin_abort ();
185       }
186     else
187       if (vectors_actual != 1)
188 	__builtin_abort ();
189     if (gangs_min != 0 || gangs_max != 0
190 	|| workers_min != 0 || workers_max != 0
191 	|| vectors_min != 0 || vectors_max != vectors_actual - 1)
192       __builtin_abort ();
193 #undef VECTORS
194   }
195 
196 
197   /* High value.  */
198 
199   /* GR, WS, VS.  */
200   {
201     /* There is no actual limit for the number of gangs, so we try with a
202        rather high value.  */
203     int gangs = 12345;
204     int gangs_actual = gangs;
205     int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
206     gangs_min = workers_min = vectors_min = INT_MAX;
207     gangs_max = workers_max = vectors_max = INT_MIN;
208 #pragma acc parallel copy (gangs_actual) \
209   reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max) \
210   num_gangs (gangs)
211     {
212       if (acc_on_device (acc_device_host))
213 	{
214 	  /* We're actually executing with num_gangs (1).  */
215 	  gangs_actual = 1;
216 	}
217       /* As we're executing GR not GP, don't multiply with a "gangs_actual"
218 	 factor.  */
219       for (int i = 100 /* * gangs_actual */; i > -100 /* * gangs_actual */; --i)
220 	{
221 	  gangs_min = gangs_max = acc_gang ();
222 	  workers_min = workers_max = acc_worker ();
223 	  vectors_min = vectors_max = acc_vector ();
224 	}
225     }
226     if (gangs_actual < 1)
227       __builtin_abort ();
228     if (gangs_min != 0 || gangs_max != gangs_actual - 1
229 	|| workers_min != 0 || workers_max != 0
230 	|| vectors_min != 0 || vectors_max != 0)
231       __builtin_abort ();
232   }
233 
234   /* GP, WS, VS.  */
235   {
236     /* There is no actual limit for the number of gangs, so we try with a
237        rather high value.  */
238     int gangs = 12345;
239     int gangs_actual = gangs;
240     int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
241     gangs_min = workers_min = vectors_min = INT_MAX;
242     gangs_max = workers_max = vectors_max = INT_MIN;
243 #pragma acc parallel copy (gangs_actual) \
244   num_gangs (gangs)
245     {
246       if (acc_on_device (acc_device_host))
247 	{
248 	  /* We're actually executing with num_gangs (1).  */
249 	  gangs_actual = 1;
250 	}
251 #pragma acc loop gang reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
252       for (int i = 100 * gangs_actual; i > -100 * gangs_actual; --i)
253 	{
254 	  gangs_min = gangs_max = acc_gang ();
255 	  workers_min = workers_max = acc_worker ();
256 	  vectors_min = vectors_max = acc_vector ();
257 	}
258     }
259     if (gangs_actual < 1)
260       __builtin_abort ();
261     if (gangs_min != 0 || gangs_max != gangs_actual - 1
262 	|| workers_min != 0 || workers_max != 0
263 	|| vectors_min != 0 || vectors_max != 0)
264       __builtin_abort ();
265   }
266 
267   /* GR, WP, VS.  */
268   {
269     /* We try with an outrageously large value. */
270 #define WORKERS 2 << 20
271     int workers_actual = WORKERS;
272     int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
273     gangs_min = workers_min = vectors_min = INT_MAX;
274     gangs_max = workers_max = vectors_max = INT_MIN;
275 #pragma acc parallel copy (workers_actual) /* { dg-warning "using num_workers \\(32\\), ignoring 2097152" "" { target openacc_nvidia_accel_selected } } */ \
276   num_workers (WORKERS)
277     {
278       if (acc_on_device (acc_device_host))
279 	{
280 	  /* We're actually executing with num_workers (1).  */
281 	  workers_actual = 1;
282 	}
283       else if (acc_on_device (acc_device_nvidia))
284 	{
285 	  /* The GCC nvptx back end enforces num_workers (32).  */
286 	  workers_actual = 32;
287 	}
288       else if (acc_on_device (acc_device_radeon))
289 	{
290 	  /* The GCC GCN back end is limited to num_workers (16).
291 	     Temporarily set this to 1 until multiple workers are permitted. */
292 	  workers_actual = 1; // 16;
293 	}
294       else
295 	__builtin_abort ();
296 #pragma acc loop worker reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
297       for (int i = 100 * workers_actual; i > -100 * workers_actual; --i)
298 	{
299 	  gangs_min = gangs_max = acc_gang ();
300 	  workers_min = workers_max = acc_worker ();
301 	  vectors_min = vectors_max = acc_vector ();
302 	}
303     }
304     if (workers_actual < 1)
305       __builtin_abort ();
306     if (gangs_min != 0 || gangs_max != 0
307 	|| workers_min != 0 || workers_max != workers_actual - 1
308 	|| vectors_min != 0 || vectors_max != 0)
309       __builtin_abort ();
310 #undef WORKERS
311   }
312 
313   /* GR, WP, VS.  */
314   {
315     /* We try with an outrageously large value. */
316     int workers = 2 << 20;
317     /* For nvptx offloading, this one will not result in "using num_workers
318        (32), ignoring runtime setting", and will in fact try to launch with
319        "num_workers (workers)", which will run into "libgomp: cuLaunchKernel
320        error: invalid argument".  So, limit ourselves here.  */
321     if (acc_get_device_type () == acc_device_nvidia)
322       workers = 32;
323     int workers_actual = workers;
324     int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
325     gangs_min = workers_min = vectors_min = INT_MAX;
326     gangs_max = workers_max = vectors_max = INT_MIN;
327 #pragma acc parallel copy (workers_actual) \
328   num_workers (workers)
329     {
330       if (acc_on_device (acc_device_host))
331 	{
332 	  /* We're actually executing with num_workers (1).  */
333 	  workers_actual = 1;
334 	}
335       else if (acc_on_device (acc_device_nvidia))
336 	{
337 	  /* We're actually executing with num_workers (32).  */
338 	  /* workers_actual = 32; */
339 	}
340       else if (acc_on_device (acc_device_radeon))
341 	{
342 	  /* The GCC GCN back end is limited to num_workers (16).  */
343 	  workers_actual = 16;
344 	}
345       else
346 	__builtin_abort ();
347 #pragma acc loop worker reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
348       for (int i = 100 * workers_actual; i > -100 * workers_actual; --i)
349 	{
350 	  gangs_min = gangs_max = acc_gang ();
351 	  workers_min = workers_max = acc_worker ();
352 	  vectors_min = vectors_max = acc_vector ();
353 	}
354     }
355     if (workers_actual < 1)
356       __builtin_abort ();
357     if (gangs_min != 0 || gangs_max != 0
358 	|| workers_min != 0 || workers_max != workers_actual - 1
359 	|| vectors_min != 0 || vectors_max != 0)
360       __builtin_abort ();
361   }
362 
363   /* GR, WS, VP.  */
364   {
365     /* We try with an outrageously large value. */
366 #define VECTORS 2 << 20
367     int vectors_actual = VECTORS;
368     int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
369     gangs_min = workers_min = vectors_min = INT_MAX;
370     gangs_max = workers_max = vectors_max = INT_MIN;
371 #pragma acc parallel copy (vectors_actual) /* { dg-warning "using vector_length \\(1024\\), ignoring 2097152" "" { target openacc_nvidia_accel_selected } } */ \
372   vector_length (VECTORS)
373     {
374       if (acc_on_device (acc_device_host))
375 	{
376 	  /* We're actually executing with vector_length (1).  */
377 	  vectors_actual = 1;
378 	}
379       else if (acc_on_device (acc_device_nvidia))
380 	{
381 	  /* The GCC nvptx back end enforces vector_length (32).  */
382 	  vectors_actual = 1024;
383 	}
384       else if (acc_on_device (acc_device_radeon))
385 	{
386 	  /* The GCC GCN back end enforces vector_length (1): autovectorize. */
387 	  vectors_actual = 1;
388 	}
389       else
390 	__builtin_abort ();
391 #pragma acc loop vector reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
392       for (int i = 100 * vectors_actual; i > -100 * vectors_actual; --i)
393 	{
394 	  gangs_min = gangs_max = acc_gang ();
395 	  workers_min = workers_max = acc_worker ();
396 	  vectors_min = vectors_max = acc_vector ();
397 	}
398     }
399     if (vectors_actual < 1)
400       __builtin_abort ();
401     if (gangs_min != 0 || gangs_max != 0
402 	|| workers_min != 0 || workers_max != 0
403 	|| vectors_min != 0 || vectors_max != vectors_actual - 1)
404       __builtin_abort ();
405 #undef VECTORS
406   }
407 
408   /* GR, WS, VP.  */
409   {
410     /* We try with an outrageously large value. */
411     int vectors = 2 << 20;
412     int vectors_actual = vectors;
413     int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
414     gangs_min = workers_min = vectors_min = INT_MAX;
415     gangs_max = workers_max = vectors_max = INT_MIN;
416 #pragma acc parallel copy (vectors_actual) /* { dg-warning "using vector_length \\(32\\), ignoring runtime setting" "" { target openacc_nvidia_accel_selected } } */ \
417   vector_length (vectors)
418     {
419       if (acc_on_device (acc_device_host))
420 	{
421 	  /* We're actually executing with vector_length (1).  */
422 	  vectors_actual = 1;
423 	}
424       else if (acc_on_device (acc_device_nvidia))
425 	{
426 	  /* The GCC nvptx back end enforces vector_length (32).  */
427 	  vectors_actual = 32;
428 	}
429       else if (acc_on_device (acc_device_radeon))
430 	{
431 	  /* Because of the way vectors are implemented for GCN, a vector loop
432 	     containing a seq routine call will not vectorize calls to that
433 	     routine.  Hence, we'll only get one "vector".  */
434 	  vectors_actual = 1;
435 	}
436       else
437 	__builtin_abort ();
438 #pragma acc loop vector reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
439       for (int i = 100 * vectors_actual; i > -100 * vectors_actual; --i)
440 	{
441 	  gangs_min = gangs_max = acc_gang ();
442 	  workers_min = workers_max = acc_worker ();
443 	  vectors_min = vectors_max = acc_vector ();
444 	}
445     }
446     if (vectors_actual < 1)
447       __builtin_abort ();
448     if (gangs_min != 0 || gangs_max != 0
449 	|| workers_min != 0 || workers_max != 0
450 	|| vectors_min != 0 || vectors_max != vectors_actual - 1)
451       __builtin_abort ();
452   }
453 
454 
455   /* Composition of GP, WP, VP.  */
456   {
457     int gangs = 12345;
458     /* With nvptx offloading, multi-level reductions apparently are very slow
459        in the following case.  So, limit ourselves here.  */
460     if (acc_get_device_type () == acc_device_nvidia)
461       gangs = 3;
462     /* Similar appears to be true for GCN.  */
463     if (acc_get_device_type () == acc_device_radeon)
464       gangs = 3;
465     int gangs_actual = gangs;
466 #define WORKERS 3
467     int workers_actual = WORKERS;
468 #define VECTORS 11
469     int vectors_actual = VECTORS;
470     int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
471     gangs_min = workers_min = vectors_min = INT_MAX;
472     gangs_max = workers_max = vectors_max = INT_MIN;
473 #pragma acc parallel copy (gangs_actual, workers_actual, vectors_actual) /* { dg-warning "using vector_length \\(32\\), ignoring 11" "" { target openacc_nvidia_accel_selected } } */ \
474   num_gangs (gangs) \
475   num_workers (WORKERS) \
476   vector_length (VECTORS)
477     {
478       if (acc_on_device (acc_device_host))
479 	{
480 	  /* We're actually executing with num_gangs (1), num_workers (1),
481 	     vector_length (1).  */
482 	  gangs_actual = 1;
483 	  workers_actual = 1;
484 	  vectors_actual = 1;
485 	}
486       else if (acc_on_device (acc_device_nvidia))
487 	{
488 	  /* The GCC nvptx back end enforces vector_length (32).  */
489 	  vectors_actual = 32;
490 	}
491       else if (acc_on_device (acc_device_radeon))
492 	{
493 	  /* Temporary setting, until multiple workers are permitted.  */
494 	  workers_actual = 1;
495 	  /* See above comments about GCN vectors_actual.  */
496 	  vectors_actual = 1;
497 	}
498       else
499 	__builtin_abort ();
500 #pragma acc loop gang reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
501       for (int i = 100 * gangs_actual; i > -100 * gangs_actual; --i)
502 #pragma acc loop worker reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
503 	for (int j = 100 * workers_actual; j > -100 * workers_actual; --j)
504 #pragma acc loop vector reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
505 	  for (int k = 100 * vectors_actual; k > -100 * vectors_actual; --k)
506 	    {
507 	      gangs_min = gangs_max = acc_gang ();
508 	      workers_min = workers_max = acc_worker ();
509 	      vectors_min = vectors_max = acc_vector ();
510 	    }
511     }
512     if (gangs_min != 0 || gangs_max != gangs_actual - 1
513 	|| workers_min != 0 || workers_max != workers_actual - 1
514 	|| vectors_min != 0 || vectors_max != vectors_actual - 1)
515       __builtin_abort ();
516 #undef VECTORS
517 #undef WORKERS
518   }
519 
520 
521   /* OpenACC kernels construct.  */
522 
523   /* We can't test parallelized OpenACC kernels constructs in this way: use of
524      the acc_gang, acc_worker, acc_vector functions will make the construct
525      unparallelizable.  */
526 
527 
528   /* Unparallelized OpenACC kernels constructs must get launched as 1 x 1 x 1
529      kernels.  */
530   {
531     int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
532     gangs_min = workers_min = vectors_min = INT_MAX;
533     gangs_max = workers_max = vectors_max = INT_MIN;
534 #pragma acc kernels
535     {
536       /* This is to make the OpenACC kernels construct unparallelizable.  */
537       asm volatile ("" : : : "memory");
538 
539 #pragma acc loop reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
540       for (int i = 100; i > -100; --i)
541 	{
542 	  gangs_min = gangs_max = acc_gang ();
543 	  workers_min = workers_max = acc_worker ();
544 	  vectors_min = vectors_max = acc_vector ();
545 	}
546     }
547     if (gangs_min != 0 || gangs_max != 1 - 1
548 	|| workers_min != 0 || workers_max != 1 - 1
549 	|| vectors_min != 0 || vectors_max != 1 - 1)
550       __builtin_abort ();
551   }
552 
553 
554   /* Unparallelized OpenACC kernels constructs must get launched as 1 x 1 x 1
555      kernels even when there are explicit num_gangs, num_workers, or
556      vector_length clauses.  */
557   {
558     int gangs = 5;
559 #define WORKERS 5
560 #define VECTORS 13
561     int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
562     gangs_min = workers_min = vectors_min = INT_MAX;
563     gangs_max = workers_max = vectors_max = INT_MIN;
564 #pragma acc kernels \
565   num_gangs (gangs) \
566   num_workers (WORKERS) \
567   vector_length (VECTORS)
568     {
569       /* This is to make the OpenACC kernels construct unparallelizable.  */
570       asm volatile ("" : : : "memory");
571 
572 #pragma acc loop reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
573       for (int i = 100; i > -100; --i)
574 	{
575 	  gangs_min = gangs_max = acc_gang ();
576 	  workers_min = workers_max = acc_worker ();
577 	  vectors_min = vectors_max = acc_vector ();
578 	}
579     }
580     if (gangs_min != 0 || gangs_max != 1 - 1
581 	|| workers_min != 0 || workers_max != 1 - 1
582 	|| vectors_min != 0 || vectors_max != 1 - 1)
583       __builtin_abort ();
584 #undef VECTORS
585 #undef WORKERS
586   }
587 
588 
589   /* OpenACC serial construct.  */
590 
591   /* GR, WS, VS.  */
592   {
593     int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
594     gangs_min = workers_min = vectors_min = INT_MAX;
595     gangs_max = workers_max = vectors_max = INT_MIN;
596 #pragma acc serial /* { dg-warning "using vector_length \\(32\\), ignoring 1" "" { target openacc_nvidia_accel_selected } } */ \
597   reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
598     {
599       for (int i = 100; i > -100; i--)
600 	{
601 	  gangs_min = gangs_max = acc_gang ();
602 	  workers_min = workers_max = acc_worker ();
603 	  vectors_min = vectors_max = acc_vector ();
604 	}
605     }
606     if (gangs_min != 0 || gangs_max != 1 - 1
607 	|| workers_min != 0 || workers_max != 1 - 1
608 	|| vectors_min != 0 || vectors_max != 1 - 1)
609       __builtin_abort ();
610   }
611 
612   /* Composition of GP, WP, VP.  */
613   {
614     int vectors_actual = 1;  /* Implicit 'vector_length (1)' clause.  */
615     int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
616     gangs_min = workers_min = vectors_min = INT_MAX;
617     gangs_max = workers_max = vectors_max = INT_MIN;
618 #pragma acc serial copy (vectors_actual) /* { dg-warning "using vector_length \\(32\\), ignoring 1" "" { target openacc_nvidia_accel_selected } } */ \
619   copy (gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max)
620     {
621       if (acc_on_device (acc_device_nvidia))
622 	{
623 	  /* The GCC nvptx back end enforces vector_length (32).  */
624 	  /* It's unclear if that's actually permissible here;
625 	     <https://github.com/OpenACC/openacc-spec/issues/238> "OpenACC
626 	     'serial' construct might not actually be serial".  */
627 	  vectors_actual = 32;
628 	}
629 #pragma acc loop gang reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
630       for (int i = 100; i > -100; i--)
631 #pragma acc loop worker reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
632 	for (int j = 100; j > -100; j--)
633 #pragma acc loop vector reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
634 	  for (int k = 100 * vectors_actual; k > -100 * vectors_actual; k--)
635 	    {
636 	      gangs_min = gangs_max = acc_gang ();
637 	      workers_min = workers_max = acc_worker ();
638 	      vectors_min = vectors_max = acc_vector ();
639 	    }
640     }
641     if (acc_get_device_type () == acc_device_nvidia)
642       {
643 	if (vectors_actual != 32)
644 	  __builtin_abort ();
645       }
646     else
647       if (vectors_actual != 1)
648 	__builtin_abort ();
649     if (gangs_min != 0 || gangs_max != 1 - 1
650 	|| workers_min != 0 || workers_max != 1 - 1
651 	|| vectors_min != 0 || vectors_max != vectors_actual - 1)
652       __builtin_abort ();
653   }
654 
655 
656   return 0;
657 }
658