; RUN: opt -S -mtriple=amdgcn-unknown-unknown -amdgpu-promote-alloca -disable-promote-alloca-to-vector < %s | FileCheck --check-prefix=SI --check-prefix=ALL %s
; RUN: opt -S -mcpu=tonga -mtriple=amdgcn-unknown-unknown -amdgpu-promote-alloca -disable-promote-alloca-to-vector < %s | FileCheck --check-prefix=CI --check-prefix=ALL %s

; @promote_alloca_size_63 uses attribute group #0
; ("amdgpu-max-work-group-size"="63").  The kernel stores 4 and 5 into a
; private [5 x i32] alloca at two indices loaded from %in, then copies
; elements 0 and 1 of the alloca to %out.
; Expectation: the run checked with the CI prefix (-mcpu=tonga) emits a
; [63 x [5 x i32]] addrspace(3) global for the alloca; the run checked with the
; SI prefix must not.
; NOTE(review): the expected global name embeds the function name and the value
; name %stack, so neither can be renamed without updating the checks below.
; SI-NOT: @promote_alloca_size_63.stack = internal unnamed_addr addrspace(3) global [63 x [5 x i32]] undef, align 4
; CI: @promote_alloca_size_63.stack = internal unnamed_addr addrspace(3) global [63 x [5 x i32]] undef, align 4

define amdgpu_kernel void @promote_alloca_size_63(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
entry:
  %stack = alloca [5 x i32], align 4
  ; Two stores into the alloca at indices read from %in[0] and %in[1].
  %0 = load i32, i32 addrspace(1)* %in, align 4
  %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %0
  store i32 4, i32* %arrayidx1, align 4
  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
  %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
  %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %1
  store i32 5, i32* %arrayidx3, align 4
  ; Read back alloca elements 0 and 1 and write them to %out[0] and %out[1].
  %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0
  %2 = load i32, i32* %arrayidx10, align 4
  store i32 %2, i32 addrspace(1)* %out, align 4
  %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1
  %3 = load i32, i32* %arrayidx12
  %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
  store i32 %3, i32 addrspace(1)* %arrayidx13
  ret void
}

; @promote_alloca_size_256 uses attribute group #1
; ("amdgpu-waves-per-eu"="1,3", "amdgpu-flat-work-group-size"="256,256").
; Same body as @promote_alloca_size_63: a private [5 x i32] alloca written at
; two loaded indices and read back at constant indices 0 and 1.
; Expectation: both runs emit a [256 x [5 x i32]] addrspace(3) global for the
; alloca.
; NOTE(review): the expected global name embeds the function name and the value
; name %stack, so neither can be renamed without updating the check below.
; ALL: @promote_alloca_size_256.stack = internal unnamed_addr addrspace(3) global [256 x [5 x i32]] undef, align 4

define amdgpu_kernel void @promote_alloca_size_256(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #1 {
entry:
  %stack = alloca [5 x i32], align 4
  ; Two stores into the alloca at indices read from %in[0] and %in[1].
  %0 = load i32, i32 addrspace(1)* %in, align 4
  %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %0
  store i32 4, i32* %arrayidx1, align 4
  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
  %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
  %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %1
  store i32 5, i32* %arrayidx3, align 4
  ; Read back alloca elements 0 and 1 and write them to %out[0] and %out[1].
  %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0
  %2 = load i32, i32* %arrayidx10, align 4
  store i32 %2, i32 addrspace(1)* %out, align 4
  %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1
  %3 = load i32, i32* %arrayidx12
  %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
  store i32 %3, i32 addrspace(1)* %arrayidx13
  ret void
}

; @promote_alloca_size_1600 uses attribute group #2
; ("amdgpu-waves-per-eu"="1,1", "amdgpu-flat-work-group-size"="1600,1600").
; Same body as the other promote_alloca_size_* kernels: a private [5 x i32]
; alloca written at two loaded indices and read back at constant indices 0, 1.
; Expectation: both runs emit a [1600 x [5 x i32]] addrspace(3) global for the
; alloca.
; NOTE(review): the expected global name embeds the function name and the value
; name %stack, so neither can be renamed without updating the check below.
; ALL: @promote_alloca_size_1600.stack = internal unnamed_addr addrspace(3) global [1600 x [5 x i32]] undef, align 4

define amdgpu_kernel void @promote_alloca_size_1600(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #2 {
entry:
  %stack = alloca [5 x i32], align 4
  ; Two stores into the alloca at indices read from %in[0] and %in[1].
  %0 = load i32, i32 addrspace(1)* %in, align 4
  %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %0
  store i32 4, i32* %arrayidx1, align 4
  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
  %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
  %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %1
  store i32 5, i32* %arrayidx3, align 4
  ; Read back alloca elements 0 and 1 and write them to %out[0] and %out[1].
  %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0
  %2 = load i32, i32* %arrayidx10, align 4
  store i32 %2, i32 addrspace(1)* %out, align 4
  %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1
  %3 = load i32, i32* %arrayidx12
  %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
  store i32 %3, i32 addrspace(1)* %arrayidx13
  ret void
}

; @occupancy_0 uses attribute group #3 ("amdgpu-waves-per-eu"="1,10") and sets
; no work-group-size attribute.  The body is the [5 x i32] alloca pattern used
; by the promote_alloca_size_* kernels.
; Expectation: the alloca is still present in the run checked with the SI
; prefix and is gone in the run checked with the CI prefix (-mcpu=tonga).
; NOTE(review): groups #3 and #4 are textually identical, so this kernel and
; @occupancy_max currently exercise the same configuration - confirm that is
; intended.
; ALL-LABEL: @occupancy_0(
; CI-NOT: alloca [5 x i32]
; SI: alloca [5 x i32]
define amdgpu_kernel void @occupancy_0(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #3 {
entry:
  %stack = alloca [5 x i32], align 4
  ; Two stores into the alloca at indices read from %in[0] and %in[1].
  %0 = load i32, i32 addrspace(1)* %in, align 4
  %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %0
  store i32 4, i32* %arrayidx1, align 4
  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
  %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
  %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %1
  store i32 5, i32* %arrayidx3, align 4
  ; Read back alloca elements 0 and 1 and write them to %out[0] and %out[1].
  %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0
  %2 = load i32, i32* %arrayidx10, align 4
  store i32 %2, i32 addrspace(1)* %out, align 4
  %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1
  %3 = load i32, i32* %arrayidx12
  %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
  store i32 %3, i32 addrspace(1)* %arrayidx13
  ret void
}

; @occupancy_max uses attribute group #4 ("amdgpu-waves-per-eu"="1,10") and
; sets no work-group-size attribute.  The body is the [5 x i32] alloca pattern
; used by the promote_alloca_size_* kernels.
; Expectation: the alloca is still present in the run checked with the SI
; prefix and is gone in the run checked with the CI prefix (-mcpu=tonga).
; NOTE(review): groups #3 and #4 are textually identical, so this kernel and
; @occupancy_0 currently exercise the same configuration - confirm that is
; intended.
; ALL-LABEL: @occupancy_max(
; CI-NOT: alloca [5 x i32]
; SI: alloca [5 x i32]
define amdgpu_kernel void @occupancy_max(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #4 {
entry:
  %stack = alloca [5 x i32], align 4
  ; Two stores into the alloca at indices read from %in[0] and %in[1].
  %0 = load i32, i32 addrspace(1)* %in, align 4
  %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %0
  store i32 4, i32* %arrayidx1, align 4
  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
  %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
  %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %1
  store i32 5, i32* %arrayidx3, align 4
  ; Read back alloca elements 0 and 1 and write them to %out[0] and %out[1].
  %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0
  %2 = load i32, i32* %arrayidx10, align 4
  store i32 %2, i32 addrspace(1)* %out, align 4
  %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1
  %3 = load i32, i32* %arrayidx12
  %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
  store i32 %3, i32 addrspace(1)* %arrayidx13
  ret void
}

; @occupancy_6 uses attribute group #5
; ("amdgpu-waves-per-eu"="1,6", "amdgpu-flat-work-group-size"="64,64") with a
; 42-byte [42 x i8] alloca.  Indices are i8 values loaded from %in and
; sign-extended to i64.
; Expectation: the alloca is still present in the run checked with the SI
; prefix and is gone in the run checked with the CI prefix (-mcpu=tonga).
; Both runs enable the ALL prefix, so one shared label directive is used here,
; as in the other tests of this file.
; ALL-LABEL: @occupancy_6(
; SI: alloca
; CI-NOT: alloca
define amdgpu_kernel void @occupancy_6(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #5 {
entry:
  %stack = alloca [42 x i8], align 4
  ; Two stores into the alloca at sign-extended indices read from %in[0], %in[1].
  %tmp = load i8, i8 addrspace(1)* %in, align 1
  %tmp4 = sext i8 %tmp to i64
  %arrayidx1 = getelementptr inbounds [42 x i8], [42 x i8]* %stack, i64 0, i64 %tmp4
  store i8 4, i8* %arrayidx1, align 1
  %arrayidx2 = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 1
  %tmp1 = load i8, i8 addrspace(1)* %arrayidx2, align 1
  %tmp5 = sext i8 %tmp1 to i64
  %arrayidx3 = getelementptr inbounds [42 x i8], [42 x i8]* %stack, i64 0, i64 %tmp5
  store i8 5, i8* %arrayidx3, align 1
  ; Read back alloca elements 0 and 1 and write them to %out[0] and %out[1].
  %arrayidx10 = getelementptr inbounds [42 x i8], [42 x i8]* %stack, i64 0, i64 0
  %tmp2 = load i8, i8* %arrayidx10, align 1
  store i8 %tmp2, i8 addrspace(1)* %out, align 1
  %arrayidx12 = getelementptr inbounds [42 x i8], [42 x i8]* %stack, i64 0, i64 1
  %tmp3 = load i8, i8* %arrayidx12, align 1
  %arrayidx13 = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 1
  store i8 %tmp3, i8 addrspace(1)* %arrayidx13, align 1
  ret void
}

; @occupancy_6_over uses the same attribute group (#5) as @occupancy_6 but its
; alloca is one byte larger ([43 x i8] instead of [42 x i8]).
; Expectation: the alloca is still present in both runs.
; ALL-LABEL: @occupancy_6_over(
; ALL: alloca [43 x i8]
define amdgpu_kernel void @occupancy_6_over(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #5 {
entry:
  %stack = alloca [43 x i8], align 4
  ; Two stores into the alloca at sign-extended indices read from %in[0], %in[1].
  %tmp = load i8, i8 addrspace(1)* %in, align 1
  %tmp4 = sext i8 %tmp to i64
  %arrayidx1 = getelementptr inbounds [43 x i8], [43 x i8]* %stack, i64 0, i64 %tmp4
  store i8 4, i8* %arrayidx1, align 1
  %arrayidx2 = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 1
  %tmp1 = load i8, i8 addrspace(1)* %arrayidx2, align 1
  %tmp5 = sext i8 %tmp1 to i64
  %arrayidx3 = getelementptr inbounds [43 x i8], [43 x i8]* %stack, i64 0, i64 %tmp5
  store i8 5, i8* %arrayidx3, align 1
  ; Read back alloca elements 0 and 1 and write them to %out[0] and %out[1].
  %arrayidx10 = getelementptr inbounds [43 x i8], [43 x i8]* %stack, i64 0, i64 0
  %tmp2 = load i8, i8* %arrayidx10, align 1
  store i8 %tmp2, i8 addrspace(1)* %out, align 1
  %arrayidx12 = getelementptr inbounds [43 x i8], [43 x i8]* %stack, i64 0, i64 1
  %tmp3 = load i8, i8* %arrayidx12, align 1
  %arrayidx13 = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 1
  store i8 %tmp3, i8 addrspace(1)* %arrayidx13, align 1
  ret void
}

; @occupancy_8 uses attribute group #6
; ("amdgpu-waves-per-eu"="1,8", "amdgpu-flat-work-group-size"="64,64") with a
; 32-byte [32 x i8] alloca.  Indices are i8 values loaded from %in and
; sign-extended to i64.
; Expectation: the alloca is still present in the run checked with the SI
; prefix and is gone in the run checked with the CI prefix (-mcpu=tonga).
; Both runs enable the ALL prefix, so one shared label directive is used here,
; as in the other tests of this file.
; ALL-LABEL: @occupancy_8(
; SI: alloca
; CI-NOT: alloca
define amdgpu_kernel void @occupancy_8(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #6 {
entry:
  %stack = alloca [32 x i8], align 4
  ; Two stores into the alloca at sign-extended indices read from %in[0], %in[1].
  %tmp = load i8, i8 addrspace(1)* %in, align 1
  %tmp4 = sext i8 %tmp to i64
  %arrayidx1 = getelementptr inbounds [32 x i8], [32 x i8]* %stack, i64 0, i64 %tmp4
  store i8 4, i8* %arrayidx1, align 1
  %arrayidx2 = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 1
  %tmp1 = load i8, i8 addrspace(1)* %arrayidx2, align 1
  %tmp5 = sext i8 %tmp1 to i64
  %arrayidx3 = getelementptr inbounds [32 x i8], [32 x i8]* %stack, i64 0, i64 %tmp5
  store i8 5, i8* %arrayidx3, align 1
  ; Read back alloca elements 0 and 1 and write them to %out[0] and %out[1].
  %arrayidx10 = getelementptr inbounds [32 x i8], [32 x i8]* %stack, i64 0, i64 0
  %tmp2 = load i8, i8* %arrayidx10, align 1
  store i8 %tmp2, i8 addrspace(1)* %out, align 1
  %arrayidx12 = getelementptr inbounds [32 x i8], [32 x i8]* %stack, i64 0, i64 1
  %tmp3 = load i8, i8* %arrayidx12, align 1
  %arrayidx13 = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 1
  store i8 %tmp3, i8 addrspace(1)* %arrayidx13, align 1
  ret void
}

; @occupancy_8_over uses the same attribute group (#6) as @occupancy_8 but its
; alloca is one byte larger ([33 x i8] instead of [32 x i8]).
; Expectation: the alloca is still present in both runs.
; ALL-LABEL: @occupancy_8_over(
; ALL: alloca [33 x i8]
define amdgpu_kernel void @occupancy_8_over(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #6 {
entry:
  %stack = alloca [33 x i8], align 4
  ; Two stores into the alloca at sign-extended indices read from %in[0], %in[1].
  %tmp = load i8, i8 addrspace(1)* %in, align 1
  %tmp4 = sext i8 %tmp to i64
  %arrayidx1 = getelementptr inbounds [33 x i8], [33 x i8]* %stack, i64 0, i64 %tmp4
  store i8 4, i8* %arrayidx1, align 1
  %arrayidx2 = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 1
  %tmp1 = load i8, i8 addrspace(1)* %arrayidx2, align 1
  %tmp5 = sext i8 %tmp1 to i64
  %arrayidx3 = getelementptr inbounds [33 x i8], [33 x i8]* %stack, i64 0, i64 %tmp5
  store i8 5, i8* %arrayidx3, align 1
  ; Read back alloca elements 0 and 1 and write them to %out[0] and %out[1].
  %arrayidx10 = getelementptr inbounds [33 x i8], [33 x i8]* %stack, i64 0, i64 0
  %tmp2 = load i8, i8* %arrayidx10, align 1
  store i8 %tmp2, i8 addrspace(1)* %out, align 1
  %arrayidx12 = getelementptr inbounds [33 x i8], [33 x i8]* %stack, i64 0, i64 1
  %tmp3 = load i8, i8* %arrayidx12, align 1
  %arrayidx13 = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 1
  store i8 %tmp3, i8 addrspace(1)* %arrayidx13, align 1
  ret void
}

; @occupancy_9 uses attribute group #7
; ("amdgpu-waves-per-eu"="1,9", "amdgpu-flat-work-group-size"="64,64") with a
; 28-byte [28 x i8] alloca.  Indices are i8 values loaded from %in and
; sign-extended to i64.
; Expectation: the alloca is still present in the run checked with the SI
; prefix and is gone in the run checked with the CI prefix (-mcpu=tonga).
; Both runs enable the ALL prefix, so one shared label directive is used here,
; as in the other tests of this file.
; ALL-LABEL: @occupancy_9(
; SI: alloca
; CI-NOT: alloca
define amdgpu_kernel void @occupancy_9(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #7 {
entry:
  %stack = alloca [28 x i8], align 4
  ; Two stores into the alloca at sign-extended indices read from %in[0], %in[1].
  %tmp = load i8, i8 addrspace(1)* %in, align 1
  %tmp4 = sext i8 %tmp to i64
  %arrayidx1 = getelementptr inbounds [28 x i8], [28 x i8]* %stack, i64 0, i64 %tmp4
  store i8 4, i8* %arrayidx1, align 1
  %arrayidx2 = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 1
  %tmp1 = load i8, i8 addrspace(1)* %arrayidx2, align 1
  %tmp5 = sext i8 %tmp1 to i64
  %arrayidx3 = getelementptr inbounds [28 x i8], [28 x i8]* %stack, i64 0, i64 %tmp5
  store i8 5, i8* %arrayidx3, align 1
  ; Read back alloca elements 0 and 1 and write them to %out[0] and %out[1].
  %arrayidx10 = getelementptr inbounds [28 x i8], [28 x i8]* %stack, i64 0, i64 0
  %tmp2 = load i8, i8* %arrayidx10, align 1
  store i8 %tmp2, i8 addrspace(1)* %out, align 1
  %arrayidx12 = getelementptr inbounds [28 x i8], [28 x i8]* %stack, i64 0, i64 1
  %tmp3 = load i8, i8* %arrayidx12, align 1
  %arrayidx13 = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 1
  store i8 %tmp3, i8 addrspace(1)* %arrayidx13, align 1
  ret void
}

; @occupancy_9_over uses the same attribute group (#7) as @occupancy_9 but its
; alloca is one byte larger ([29 x i8] instead of [28 x i8]).
; Expectation: the alloca is still present in both runs.
; ALL-LABEL: @occupancy_9_over(
; ALL: alloca [29 x i8]
define amdgpu_kernel void @occupancy_9_over(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #7 {
entry:
  %stack = alloca [29 x i8], align 4
  ; Two stores into the alloca at sign-extended indices read from %in[0], %in[1].
  %tmp = load i8, i8 addrspace(1)* %in, align 1
  %tmp4 = sext i8 %tmp to i64
  %arrayidx1 = getelementptr inbounds [29 x i8], [29 x i8]* %stack, i64 0, i64 %tmp4
  store i8 4, i8* %arrayidx1, align 1
  %arrayidx2 = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 1
  %tmp1 = load i8, i8 addrspace(1)* %arrayidx2, align 1
  %tmp5 = sext i8 %tmp1 to i64
  %arrayidx3 = getelementptr inbounds [29 x i8], [29 x i8]* %stack, i64 0, i64 %tmp5
  store i8 5, i8* %arrayidx3, align 1
  ; Read back alloca elements 0 and 1 and write them to %out[0] and %out[1].
  %arrayidx10 = getelementptr inbounds [29 x i8], [29 x i8]* %stack, i64 0, i64 0
  %tmp2 = load i8, i8* %arrayidx10, align 1
  store i8 %tmp2, i8 addrspace(1)* %out, align 1
  %arrayidx12 = getelementptr inbounds [29 x i8], [29 x i8]* %stack, i64 0, i64 1
  %tmp3 = load i8, i8* %arrayidx12, align 1
  %arrayidx13 = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 1
  store i8 %tmp3, i8 addrspace(1)* %arrayidx13, align 1
  ret void
}

; Attribute groups referenced by the kernels above.
;   #0      - @promote_alloca_size_63
;   #1      - @promote_alloca_size_256
;   #2      - @promote_alloca_size_1600
;   #3, #4  - @occupancy_0 and @occupancy_max (the two groups are identical)
;   #5      - @occupancy_6 and @occupancy_6_over
;   #6      - @occupancy_8 and @occupancy_8_over
;   #7      - @occupancy_9 and @occupancy_9_over
attributes #0 = { nounwind "amdgpu-max-work-group-size"="63" }
attributes #1 = { nounwind "amdgpu-waves-per-eu"="1,3" "amdgpu-flat-work-group-size"="256,256" }
attributes #2 = { nounwind "amdgpu-waves-per-eu"="1,1" "amdgpu-flat-work-group-size"="1600,1600" }
attributes #3 = { nounwind "amdgpu-waves-per-eu"="1,10" }
attributes #4 = { nounwind "amdgpu-waves-per-eu"="1,10" }
attributes #5 = { nounwind "amdgpu-waves-per-eu"="1,6" "amdgpu-flat-work-group-size"="64,64" }
attributes #6 = { nounwind "amdgpu-waves-per-eu"="1,8" "amdgpu-flat-work-group-size"="64,64" }
attributes #7 = { nounwind "amdgpu-waves-per-eu"="1,9" "amdgpu-flat-work-group-size"="64,64" }