1 /* This code uses nvptx inline assembly guarded with acc_on_device, which is
2    not optimized away at -O0, and then confuses the target assembler.
3    { dg-skip-if "" { *-*-* } { "-O0" } { "" } } */
4 /* { dg-additional-options "-fopenacc-dim=16:16" } */
5 
6 #include <openacc.h>
7 #include <alloca.h>
8 #include <string.h>
9 #include <stdio.h>
10 
11 #pragma acc routine
coord()12 static int __attribute__ ((noinline)) coord ()
13 {
14   int res = 0;
15 
16   if (acc_on_device (acc_device_nvidia))
17     {
18       int g = 0, w = 0, v = 0;
19 
20       __asm__ volatile ("mov.u32 %0,%%ctaid.x;" : "=r" (g));
21       __asm__ volatile ("mov.u32 %0,%%tid.y;" : "=r" (w));
22       __asm__ volatile ("mov.u32 %0,%%tid.x;" : "=r" (v));
23       res = (1 << 24) | (g << 16) | (w << 8) | v;
24     }
25   return res;
26 }
27 
28 
/* Validate the SIZE coordinate words stored in ARY against a partitioning
   of GP gangs, WP workers and VP vector lanes.

   Every word is decoded using the layout produced by coord (): gang index
   in bits 16-23, worker index in bits 8-15, vector index in bits 0-7, and
   an "ran on the device" marker from bit 24 upwards.

   Returns
     0  if no word carries the device marker (any out-of-range report made
	along the way is then ignored), or if every check passed;
     1  if the marker count differs from SIZE, if any index was outside its
	dimension, or if the indices of some dimension were not all used
	equally often.
   A diagnostic is printed for each individual failure.  */
int check (const int *ary, int size, int gp, int wp, int vp)
{
  int status = 0;
  int on_device = 0;
  int i;

  /* One usage counter per gang, worker and vector lane.  */
  int *gang_hits = (int *) alloca (gp * sizeof (int));
  int *worker_hits = (int *) alloca (wp * sizeof (int));
  int *vector_hits = (int *) alloca (vp * sizeof (int));

  memset (gang_hits, 0, gp * sizeof (int));
  memset (worker_hits, 0, wp * sizeof (int));
  memset (vector_hits, 0, vp * sizeof (int));

  for (i = 0; i < size; i++)
    {
      const int word = ary[i];
      const int g = (word >> 16) & 0xff;
      const int w = (word >> 8) & 0xff;
      const int v = word & 0xff;

      if (g < gp && w < wp && v < vp)
	{
	  gang_hits[g]++;
	  worker_hits[w]++;
	  vector_hits[v]++;
	}
      else
	{
	  printf ("unexpected cpu %#x used\n", word);
	  status = 1;
	}

      on_device += word >> 24;
    }

  /* Nothing carried the device marker: there is no partitioning to
     verify.  */
  if (on_device == 0)
    return 0;

  if (on_device != size)
    {
      printf ("offloaded %d times,  expected %d\n", on_device, size);
      return 1;
    }

  /* Within each dimension every index must have been used as often as
     index 0; index 0 itself trivially matches and is skipped.  */
  for (i = 1; i < gp; i++)
    {
      if (gang_hits[i] != gang_hits[0])
	{
	  printf ("gang %d not used %d times\n", i, gang_hits[0]);
	  status = 1;
	}
    }

  for (i = 1; i < wp; i++)
    {
      if (worker_hits[i] != worker_hits[0])
	{
	  printf ("worker %d not used %d times\n", i, worker_hits[0]);
	  status = 1;
	}
    }

  for (i = 1; i < vp; i++)
    {
      if (vector_hits[i] != vector_hits[0])
	{
	  printf ("vector %d not used %d times\n", i, vector_hits[0]);
	  status = 1;
	}
    }

  return status;
}
94 
/* Number of loop iterations, and of elements in the result array.  */
#define N (32 * 32 * 32)

/* Fill an N-element array with coord () three times, partitioning the loop
   over gangs, then workers, then vector lanes, and validate each result
   with check ().  For every run only the partitioned dimension is given
   its expected size (GP, WP or VP); the other two are passed as 1.

   Returns non-zero if any of the three validations failed.  */
int test_1 (int gp, int wp, int vp)
{
  int ary[N];
  int gang_rc, worker_rc, vector_rc;

  /* Gang partitioning.  */
#pragma acc parallel copyout (ary)
  {
#pragma acc loop gang (static:1)
    for (int i = 0; i < N; i++)
      ary[i] = coord ();
  }
  gang_rc = check (ary, N, gp, 1, 1);

  /* Worker partitioning.  */
#pragma acc parallel copyout (ary)
  {
#pragma acc loop worker
    for (int i = 0; i < N; i++)
      ary[i] = coord ();
  }
  worker_rc = check (ary, N, 1, wp, 1);

  /* Vector partitioning.  */
#pragma acc parallel copyout (ary)
  {
#pragma acc loop vector
    for (int i = 0; i < N; i++)
      ary[i] = coord ();
  }
  vector_rc = check (ary, N, 1, 1, vp);

  return gang_rc | worker_rc | vector_rc;
}
131 
/* Run the partitioning test and use its result as the exit status.

   NOTE(review): the gang and worker counts look chosen to match the
   -fopenacc-dim=16:16 option requested at the top of the file; the vector
   length of 32 is not set there and is presumably the target default --
   confirm.  */
int main ()
{
  const int gangs = 16;
  const int workers = 16;
  const int vector_lanes = 32;

  return test_1 (gangs, workers, vector_lanes);
}
136