1; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX908 %s
2; RUN: not llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefixes=GCN,GFX900 %s
3
4; GCN-LABEL: {{^}}max_10_vgprs:
5; GFX900-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
6; GFX900-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
7; GFX908-NOT: SCRATCH_RSRC
8; GFX908-DAG: v_accvgpr_write_b32 a0, v{{[0-9]}} ; Reload Reuse
9; GFX908-DAG: v_accvgpr_write_b32 a1, v{{[0-9]}} ; Reload Reuse
10; GFX900:     buffer_store_dword v{{[0-9]}},
11; GFX900:     buffer_store_dword v{{[0-9]}},
12; GFX900:     buffer_load_dword v{{[0-9]}},
13; GFX900:     buffer_load_dword v{{[0-9]}},
14; GFX908-NOT: buffer_
15; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a0 ; Reload Reuse
16; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a1 ; Reload Reuse
17
18; GCN:    NumVgprs: 10
19; GFX900: ScratchSize: 12
20; GFX908: ScratchSize: 0
21; GCN:    VGPRBlocks: 2
22; GCN:    NumVGPRsForWavesPerEU: 10
; Ten i32 values are simultaneously live across the inline asm while the
; kernel is capped at 10 VGPRs (attribute #0), so the allocator must spill.
; Per the CHECK lines above: gfx908 reuses AGPRs for the spills
; (v_accvgpr_write/read, "Reload Reuse") with zero scratch, while gfx900,
; which has no AGPRs, spills to scratch via buffer_store/load.
define amdgpu_kernel void @max_10_vgprs(i32 addrspace(1)* %p) #0 {
  ; Volatile load produces an opaque runtime index so the address
  ; arithmetic below cannot be constant-folded away.
  %tid = load volatile i32, i32 addrspace(1)* undef
  %p1 = getelementptr inbounds i32, i32 addrspace(1)* %p, i32 %tid
  %p2 = getelementptr inbounds i32, i32 addrspace(1)* %p1, i32 4
  %p3 = getelementptr inbounds i32, i32 addrspace(1)* %p2, i32 8
  %p4 = getelementptr inbounds i32, i32 addrspace(1)* %p3, i32 12
  %p5 = getelementptr inbounds i32, i32 addrspace(1)* %p4, i32 16
  %p6 = getelementptr inbounds i32, i32 addrspace(1)* %p5, i32 20
  %p7 = getelementptr inbounds i32, i32 addrspace(1)* %p6, i32 24
  %p8 = getelementptr inbounds i32, i32 addrspace(1)* %p7, i32 28
  %p9 = getelementptr inbounds i32, i32 addrspace(1)* %p8, i32 32
  %p10 = getelementptr inbounds i32, i32 addrspace(1)* %p9, i32 36
  ; Volatile loads cannot be reordered or eliminated, keeping all ten
  ; results live at the same time.
  %v1 = load volatile i32, i32 addrspace(1)* %p1
  %v2 = load volatile i32, i32 addrspace(1)* %p2
  %v3 = load volatile i32, i32 addrspace(1)* %p3
  %v4 = load volatile i32, i32 addrspace(1)* %p4
  %v5 = load volatile i32, i32 addrspace(1)* %p5
  %v6 = load volatile i32, i32 addrspace(1)* %p6
  %v7 = load volatile i32, i32 addrspace(1)* %p7
  %v8 = load volatile i32, i32 addrspace(1)* %p8
  %v9 = load volatile i32, i32 addrspace(1)* %p9
  %v10 = load volatile i32, i32 addrspace(1)* %p10
  ; The "v" constraints force each value into its own VGPR at this point.
  call void asm sideeffect "", "v,v,v,v,v,v,v,v,v,v"(i32 %v1, i32 %v2, i32 %v3, i32 %v4, i32 %v5, i32 %v6, i32 %v7, i32 %v8, i32 %v9, i32 %v10)
  ; Volatile stores keep every value live past the asm, preventing
  ; rematerialization instead of spilling.
  store volatile i32 %v1, i32 addrspace(1)* undef
  store volatile i32 %v2, i32 addrspace(1)* undef
  store volatile i32 %v3, i32 addrspace(1)* undef
  store volatile i32 %v4, i32 addrspace(1)* undef
  store volatile i32 %v5, i32 addrspace(1)* undef
  store volatile i32 %v6, i32 addrspace(1)* undef
  store volatile i32 %v7, i32 addrspace(1)* undef
  store volatile i32 %v8, i32 addrspace(1)* undef
  store volatile i32 %v9, i32 addrspace(1)* undef
  store volatile i32 %v10, i32 addrspace(1)* undef
  ret void
}
58
59; GCN-LABEL: {{^}}max_10_vgprs_used_9a:
60; GFX908-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
61; GFX908-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
62; GFX908-DAG: v_accvgpr_write_b32 a9, v{{[0-9]}} ; Reload Reuse
63; GFX908:     buffer_store_dword v{{[0-9]}},
64; GFX908-NOT: buffer_
65; GFX908:     v_accvgpr_read_b32 v{{[0-9]}}, a9 ; Reload Reuse
66; GFX908:     buffer_load_dword v{{[0-9]}},
67; GFX908-NOT: buffer_
68
69; GFX900:     couldn't allocate input reg for constraint 'a'
70
71; GFX908: NumVgprs: 10
72; GFX908: ScratchSize: 8
73; GFX908: VGPRBlocks: 2
74; GFX908: NumVGPRsForWavesPerEU: 10
; Same shape as @max_10_vgprs, but the first inline asm consumes nine AGPRs
; via "a" constraints. Per the CHECK lines above, on gfx908 only one AGPR
; (a9) remains for spill reuse, so the other spill goes to scratch
; (ScratchSize: 8). On gfx900 the "a" constraint cannot be satisfied at all
; and llc is expected to fail ("not llc" RUN line) with the checked
; "couldn't allocate input reg" diagnostic.
define amdgpu_kernel void @max_10_vgprs_used_9a(i32 addrspace(1)* %p) #0 {
  ; Opaque runtime index; see @max_10_vgprs.
  %tid = load volatile i32, i32 addrspace(1)* undef
  ; Occupies nine AGPRs so they are unavailable for spill reuse below.
  call void asm sideeffect "", "a,a,a,a,a,a,a,a,a"(i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9)
  %p1 = getelementptr inbounds i32, i32 addrspace(1)* %p, i32 %tid
  %p2 = getelementptr inbounds i32, i32 addrspace(1)* %p1, i32 4
  %p3 = getelementptr inbounds i32, i32 addrspace(1)* %p2, i32 8
  %p4 = getelementptr inbounds i32, i32 addrspace(1)* %p3, i32 12
  %p5 = getelementptr inbounds i32, i32 addrspace(1)* %p4, i32 16
  %p6 = getelementptr inbounds i32, i32 addrspace(1)* %p5, i32 20
  %p7 = getelementptr inbounds i32, i32 addrspace(1)* %p6, i32 24
  %p8 = getelementptr inbounds i32, i32 addrspace(1)* %p7, i32 28
  %p9 = getelementptr inbounds i32, i32 addrspace(1)* %p8, i32 32
  %p10 = getelementptr inbounds i32, i32 addrspace(1)* %p9, i32 36
  ; Ten simultaneously-live volatile loads against a 10-VGPR budget.
  %v1 = load volatile i32, i32 addrspace(1)* %p1
  %v2 = load volatile i32, i32 addrspace(1)* %p2
  %v3 = load volatile i32, i32 addrspace(1)* %p3
  %v4 = load volatile i32, i32 addrspace(1)* %p4
  %v5 = load volatile i32, i32 addrspace(1)* %p5
  %v6 = load volatile i32, i32 addrspace(1)* %p6
  %v7 = load volatile i32, i32 addrspace(1)* %p7
  %v8 = load volatile i32, i32 addrspace(1)* %p8
  %v9 = load volatile i32, i32 addrspace(1)* %p9
  %v10 = load volatile i32, i32 addrspace(1)* %p10
  ; Forces all ten values into distinct VGPRs at once.
  call void asm sideeffect "", "v,v,v,v,v,v,v,v,v,v"(i32 %v1, i32 %v2, i32 %v3, i32 %v4, i32 %v5, i32 %v6, i32 %v7, i32 %v8, i32 %v9, i32 %v10)
  ; Volatile stores keep the values live past the asm.
  store volatile i32 %v1, i32 addrspace(1)* undef
  store volatile i32 %v2, i32 addrspace(1)* undef
  store volatile i32 %v3, i32 addrspace(1)* undef
  store volatile i32 %v4, i32 addrspace(1)* undef
  store volatile i32 %v5, i32 addrspace(1)* undef
  store volatile i32 %v6, i32 addrspace(1)* undef
  store volatile i32 %v7, i32 addrspace(1)* undef
  store volatile i32 %v8, i32 addrspace(1)* undef
  store volatile i32 %v9, i32 addrspace(1)* undef
  store volatile i32 %v10, i32 addrspace(1)* undef
  ret void
}
111
112; GCN-LABEL: {{^}}max_10_vgprs_used_1a_partial_spill:
113; GCN-DAG:    s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
114; GCN-DAG:    s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
115; GFX908-DAG: v_accvgpr_write_b32 a0, 1
116; GFX908-DAG: v_accvgpr_write_b32 a1, v{{[0-9]}} ; Reload Reuse
117; GFX908-DAG: v_accvgpr_write_b32 a2, v{{[0-9]}} ; Reload Reuse
118; GFX908-DAG: v_accvgpr_write_b32 a3, v{{[0-9]}} ; Reload Reuse
119; GFX908-DAG: v_accvgpr_write_b32 a4, v{{[0-9]}} ; Reload Reuse
120; GFX908-DAG: v_accvgpr_write_b32 a5, v{{[0-9]}} ; Reload Reuse
121; GFX908-DAG: v_accvgpr_write_b32 a6, v{{[0-9]}} ; Reload Reuse
122; GFX908-DAG: v_accvgpr_write_b32 a7, v{{[0-9]}} ; Reload Reuse
123; GFX908-DAG: v_accvgpr_write_b32 a8, v{{[0-9]}} ; Reload Reuse
124; GFX908-DAG: v_accvgpr_write_b32 a9, v{{[0-9]}} ; Reload Reuse
125; GFX900:     buffer_store_dword v{{[0-9]}},
126; GCN-DAG:    buffer_store_dword v{{[0-9]}},
127; GFX900:     buffer_load_dword v{{[0-9]}},
128; GCN-DAG:    buffer_load_dword v{{[0-9]}},
129; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a1 ; Reload Reuse
130; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a2 ; Reload Reuse
131; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a3 ; Reload Reuse
132; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a4 ; Reload Reuse
133; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a5 ; Reload Reuse
134; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a6 ; Reload Reuse
135; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a7 ; Reload Reuse
136; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a8 ; Reload Reuse
137; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a9 ; Reload Reuse
138
139; GCN:    NumVgprs: 10
140; GFX900: ScratchSize: 44
141; GFX908: ScratchSize: 12
142; GCN:    VGPRBlocks: 2
143; GCN:    NumVGPRsForWavesPerEU: 10
; Five simultaneously-live i64 values (VGPR pairs) under a 10-VGPR cap,
; with one AGPR taken by an "a"-constraint asm input. Per the CHECK lines
; above, gfx908 reuses a1-a9 for spills but still needs some scratch
; (ScratchSize: 12), i.e. a partial AGPR spill; gfx900 spills everything
; to scratch (ScratchSize: 44).
define amdgpu_kernel void @max_10_vgprs_used_1a_partial_spill(i64 addrspace(1)* %p) #0 {
  ; Opaque runtime index; see @max_10_vgprs.
  %tid = load volatile i32, i32 addrspace(1)* undef
  ; Consumes one AGPR (checked as "v_accvgpr_write_b32 a0, 1" on gfx908).
  call void asm sideeffect "", "a"(i32 1)
  %p1 = getelementptr inbounds i64, i64 addrspace(1)* %p, i32 %tid
  %p2 = getelementptr inbounds i64, i64 addrspace(1)* %p1, i32 8
  %p3 = getelementptr inbounds i64, i64 addrspace(1)* %p2, i32 16
  %p4 = getelementptr inbounds i64, i64 addrspace(1)* %p3, i32 24
  %p5 = getelementptr inbounds i64, i64 addrspace(1)* %p4, i32 32
  ; Five 64-bit volatile loads: ten VGPRs of payload plus address registers.
  %v1 = load volatile i64, i64 addrspace(1)* %p1
  %v2 = load volatile i64, i64 addrspace(1)* %p2
  %v3 = load volatile i64, i64 addrspace(1)* %p3
  %v4 = load volatile i64, i64 addrspace(1)* %p4
  %v5 = load volatile i64, i64 addrspace(1)* %p5
  ; Forces all five pairs into VGPRs simultaneously.
  call void asm sideeffect "", "v,v,v,v,v"(i64 %v1, i64 %v2, i64 %v3, i64 %v4, i64 %v5)
  ; Stores go through the %pN pointers (rotated), keeping both the values
  ; and the addresses live past the asm.
  store volatile i64 %v1, i64 addrspace(1)* %p2
  store volatile i64 %v2, i64 addrspace(1)* %p3
  store volatile i64 %v3, i64 addrspace(1)* %p4
  store volatile i64 %v4, i64 addrspace(1)* %p5
  store volatile i64 %v5, i64 addrspace(1)* %p1
  ret void
}
165
166; GCN-LABEL: {{^}}max_10_vgprs_spill_v32:
167; GCN-DAG:    s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
168; GCN-DAG:    s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
169; GFX908-DAG: v_accvgpr_write_b32 a0, v{{[0-9]}} ; Reload Reuse
170; GFX908-DAG: v_accvgpr_write_b32 a9, v{{[0-9]}} ; Reload Reuse
171; GCN-NOT:    a10
172; GCN:        buffer_store_dword v{{[0-9]}},
173
174; GFX908: NumVgprs: 10
175; GFX900: ScratchSize: 100
176; GFX908: ScratchSize: 68
177; GFX908: VGPRBlocks: 2
178; GFX908: NumVGPRsForWavesPerEU: 10
; A single live <32 x float> (32 VGPRs worth of data) under a 10-VGPR cap.
; Per the CHECK lines above, gfx908 reuses a0-a9 for part of the spill but
; the remainder still goes to scratch on both targets (GCN buffer_store
; check; a10 must not appear since only 10 AGPRs are available under the
; same budget).
define amdgpu_kernel void @max_10_vgprs_spill_v32(<32 x float> addrspace(1)* %p) #0 {
  ; Divergent per-lane index keeps the address in VGPRs.
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p, i32 %tid
  ; Volatile load/store pair keeps the whole vector live in between.
  %v = load volatile <32 x float>, <32 x float> addrspace(1)* %gep
  store volatile <32 x float> %v, <32 x float> addrspace(1)* undef
  ret void
}
186
187; GCN-LABEL: {{^}}max_256_vgprs_spill_9x32:
188; GFX900-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
189; GFX900-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
190; GFX908-NOT: SCRATCH_RSRC
191; GFX908-DAG: v_accvgpr_write_b32 a0, v
192; GFX900:     buffer_store_dword v
193; GFX900:     buffer_load_dword v
194; GFX908-NOT: buffer_
195; GFX908-DAG: v_accvgpr_read_b32
196
197; GCN:    NumVgprs: 256
198; GFX900: ScratchSize: 148
199; GFX908: ScratchSize: 0
200; GCN:    VGPRBlocks: 63
201; GCN:    NumVGPRsForWavesPerEU: 256
; Nine simultaneously-live <32 x float> values (288 registers of data)
; exceed the full 256-VGPR budget allowed by attribute #1. Per the CHECK
; lines above, gfx908 covers the entire spill with AGPRs (no SCRATCH_RSRC
; setup, ScratchSize: 0) while gfx900 spills to scratch (ScratchSize: 148).
define amdgpu_kernel void @max_256_vgprs_spill_9x32(<32 x float> addrspace(1)* %p) #1 {
  ; Divergent index; each gep re-offsets by %tid to make the nine pointers
  ; distinct, non-foldable addresses.
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %p1 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p, i32 %tid
  %p2 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p1, i32 %tid
  %p3 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p2, i32 %tid
  %p4 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p3, i32 %tid
  %p5 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p4, i32 %tid
  %p6 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p5, i32 %tid
  %p7 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p6, i32 %tid
  %p8 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p7, i32 %tid
  %p9 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p8, i32 %tid
  ; Nine volatile wide loads whose results all stay live until the stores.
  %v1 = load volatile <32 x float>, <32 x float> addrspace(1)* %p1
  %v2 = load volatile <32 x float>, <32 x float> addrspace(1)* %p2
  %v3 = load volatile <32 x float>, <32 x float> addrspace(1)* %p3
  %v4 = load volatile <32 x float>, <32 x float> addrspace(1)* %p4
  %v5 = load volatile <32 x float>, <32 x float> addrspace(1)* %p5
  %v6 = load volatile <32 x float>, <32 x float> addrspace(1)* %p6
  %v7 = load volatile <32 x float>, <32 x float> addrspace(1)* %p7
  %v8 = load volatile <32 x float>, <32 x float> addrspace(1)* %p8
  %v9 = load volatile <32 x float>, <32 x float> addrspace(1)* %p9
  store volatile <32 x float> %v1, <32 x float> addrspace(1)* undef
  store volatile <32 x float> %v2, <32 x float> addrspace(1)* undef
  store volatile <32 x float> %v3, <32 x float> addrspace(1)* undef
  store volatile <32 x float> %v4, <32 x float> addrspace(1)* undef
  store volatile <32 x float> %v5, <32 x float> addrspace(1)* undef
  store volatile <32 x float> %v6, <32 x float> addrspace(1)* undef
  store volatile <32 x float> %v7, <32 x float> addrspace(1)* undef
  store volatile <32 x float> %v8, <32 x float> addrspace(1)* undef
  store volatile <32 x float> %v9, <32 x float> addrspace(1)* undef
  ret void
}
233
234; FIXME: adding an AReg_1024 register class for v32f32 and v32i32
235;        produces unnecessary copies and we still have some amount
236;        of conventional spilling.
237
238; GCN-LABEL: {{^}}max_256_vgprs_spill_9x32_2bb:
239; GFX900-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
240; GFX900-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
241; GFX908-FIXME-NOT: SCRATCH_RSRC
242; GFX908-DAG: v_accvgpr_write_b32 a0, v
243; GFX900:     buffer_store_dword v
244; GFX900:     buffer_load_dword v
245; GFX908-FIXME-NOT: buffer_
246; GFX908-DAG: v_accvgpr_read_b32
247
248; GCN:    NumVgprs: 256
249; GFX900: ScratchSize: 1796
250; GFX908-FIXME: ScratchSize: 0
251; GCN:    VGPRBlocks: 63
252; GCN:    NumVGPRsForWavesPerEU: 256
; Same pressure as @max_256_vgprs_spill_9x32 but with the live ranges
; crossing a basic-block boundary (loads in the entry block, stores after
; a branch). Per the FIXME and CHECK lines above, gfx908 currently still
; emits some conventional scratch spilling for the cross-block values
; (gfx900 ScratchSize: 1796; the gfx908 zero-scratch checks are disabled
; via the -FIXME prefixes).
define amdgpu_kernel void @max_256_vgprs_spill_9x32_2bb(<32 x float> addrspace(1)* %p) #1 {
  ; Divergent index; chained geps make nine distinct addresses.
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %p1 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p, i32 %tid
  %p2 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p1, i32 %tid
  %p3 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p2, i32 %tid
  %p4 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p3, i32 %tid
  %p5 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p4, i32 %tid
  %p6 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p5, i32 %tid
  %p7 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p6, i32 %tid
  %p8 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p7, i32 %tid
  %p9 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p8, i32 %tid
  ; All nine wide values become live here ...
  %v1 = load volatile <32 x float>, <32 x float> addrspace(1)* %p1
  %v2 = load volatile <32 x float>, <32 x float> addrspace(1)* %p2
  %v3 = load volatile <32 x float>, <32 x float> addrspace(1)* %p3
  %v4 = load volatile <32 x float>, <32 x float> addrspace(1)* %p4
  %v5 = load volatile <32 x float>, <32 x float> addrspace(1)* %p5
  %v6 = load volatile <32 x float>, <32 x float> addrspace(1)* %p6
  %v7 = load volatile <32 x float>, <32 x float> addrspace(1)* %p7
  %v8 = load volatile <32 x float>, <32 x float> addrspace(1)* %p8
  %v9 = load volatile <32 x float>, <32 x float> addrspace(1)* %p9
  ; ... and must survive this block boundary.
  br label %st

st:
  store volatile <32 x float> %v1, <32 x float> addrspace(1)* undef
  store volatile <32 x float> %v2, <32 x float> addrspace(1)* undef
  store volatile <32 x float> %v3, <32 x float> addrspace(1)* undef
  store volatile <32 x float> %v4, <32 x float> addrspace(1)* undef
  store volatile <32 x float> %v5, <32 x float> addrspace(1)* undef
  store volatile <32 x float> %v6, <32 x float> addrspace(1)* undef
  store volatile <32 x float> %v7, <32 x float> addrspace(1)* undef
  store volatile <32 x float> %v8, <32 x float> addrspace(1)* undef
  store volatile <32 x float> %v9, <32 x float> addrspace(1)* undef
  ret void
}
287
; Per-lane workitem id, used above to form divergent (per-thread) addresses.
declare i32 @llvm.amdgcn.workitem.id.x()
289
; #0 caps the kernel at 10 VGPRs to force spilling in the small tests.
; #1 bounds the flat workgroup size at 256, under which llc uses the full
; 256-VGPR budget (NumVgprs: 256 checked above).
attributes #0 = { nounwind "amdgpu-num-vgpr"="10" }
attributes #1 = { "amdgpu-flat-work-group-size"="1,256" }
292