1 /* This code uses nvptx inline assembly guarded with acc_on_device, which is
2 not optimized away at -O0, and then confuses the target assembler.
3 { dg-skip-if "" { *-*-* } { "-O0" } { "" } } */
4 /* { dg-additional-options "-fopenacc-dim=16:16" } */
5
6 #include <openacc.h>
7 #include <alloca.h>
8 #include <string.h>
9 #include <stdio.h>
10
11 #pragma acc routine
coord()12 static int __attribute__ ((noinline)) coord ()
13 {
14 int res = 0;
15
16 if (acc_on_device (acc_device_nvidia))
17 {
18 int g = 0, w = 0, v = 0;
19
20 __asm__ volatile ("mov.u32 %0,%%ctaid.x;" : "=r" (g));
21 __asm__ volatile ("mov.u32 %0,%%tid.y;" : "=r" (w));
22 __asm__ volatile ("mov.u32 %0,%%tid.x;" : "=r" (v));
23 res = (1 << 24) | (g << 16) | (w << 8) | v;
24 }
25 return res;
26 }
27
28
/* Verify the SIZE coordinate words stored in ARY.

   Each word is decoded as: bits 16-23 gang index, bits 8-15 worker
   index, bits 0-7 vector index, and bits 24 and up a count that is
   summed to tell how many elements were produced on the device.

   GP, WP and VP are the number of gangs, workers and vectors expected
   to be in use; any decoded index at or beyond its limit is reported.

   Returns 0 when nothing was offloaded at all, or when every element
   was offloaded, every index was in range and each gang, worker and
   vector index was used equally often.  Returns 1 otherwise, after
   printing a diagnostic for each problem found.  */
int check (const int *ary, int size, int gp, int wp, int vp)
{
  int status = 0;
  int on_device = 0;

  /* One usage counter per expected gang, worker and vector index.  */
  int *gangs = (int *) alloca (gp * sizeof (int));
  int *workers = (int *) alloca (wp * sizeof (int));
  int *vectors = (int *) alloca (vp * sizeof (int));

  memset (gangs, 0, gp * sizeof (int));
  memset (workers, 0, wp * sizeof (int));
  memset (vectors, 0, vp * sizeof (int));

  for (int i = 0; i < size; i++)
    {
      const int word = ary[i];
      const int g = (word >> 16) & 0xff;
      const int w = (word >> 8) & 0xff;
      const int v = word & 0xff;

      /* Counted for every element, whether or not it is in range.  */
      on_device += word >> 24;

      if (g < gp && w < wp && v < vp)
	{
	  gangs[g]++;
	  workers[w]++;
	  vectors[v]++;
	  continue;
	}

      printf ("unexpected cpu %#x used\n", word);
      status = 1;
    }

  /* Nothing was offloaded, so there is no distribution to verify.  */
  if (on_device == 0)
    return 0;

  if (on_device != size)
    {
      printf ("offloaded %d times, expected %d\n", on_device, size);
      return 1;
    }

  /* Every index must have been used as often as index 0.  Index 0
     trivially matches itself, so the scans start at 1.  */
  for (int i = 1; i < gp; i++)
    {
      if (gangs[i] != gangs[0])
	{
	  printf ("gang %d not used %d times\n", i, gangs[0]);
	  status = 1;
	}
    }

  for (int i = 1; i < wp; i++)
    {
      if (workers[i] != workers[0])
	{
	  printf ("worker %d not used %d times\n", i, workers[0]);
	  status = 1;
	}
    }

  for (int i = 1; i < vp; i++)
    {
      if (vectors[i] != vectors[0])
	{
	  printf ("vector %d not used %d times\n", i, vectors[0]);
	  status = 1;
	}
    }

  return status;
}
94
/* Number of loop iterations, and of elements in the result array, used
   by test_1: 32 * 32 * 32 = 32768.  */
#define N (32 * 32 * 32)
96
test_1(int gp,int wp,int vp)97 int test_1 (int gp, int wp, int vp)
98 {
99 int ary[N];
100 int exit = 0;
101
102 #pragma acc parallel copyout (ary)
103 {
104 #pragma acc loop gang (static:1)
105 for (int ix = 0; ix < N; ix++)
106 ary[ix] = coord ();
107 }
108
109 exit |= check (ary, N, gp, 1, 1);
110
111 #pragma acc parallel copyout (ary)
112 {
113 #pragma acc loop worker
114 for (int ix = 0; ix < N; ix++)
115 ary[ix] = coord ();
116 }
117
118 exit |= check (ary, N, 1, wp, 1);
119
120 #pragma acc parallel copyout (ary)
121 {
122 #pragma acc loop vector
123 for (int ix = 0; ix < N; ix++)
124 ary[ix] = coord ();
125 }
126
127 exit |= check (ary, N, 1, 1, vp);
128
129 return exit;
130 }
131
/* Run the test expecting 16 gangs, 16 workers and 32 vectors; the exit
   status is nonzero on failure.  The gang and worker counts match the
   16:16 passed via -fopenacc-dim in the file header.  The vector count
   is not set there -- presumably 32 is the target's default vector
   length (TODO confirm).  */
int main ()
{
  const int status = test_1 (16, 16, 32);

  return status;
}
136