1; RUN: llc -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs -asm-verbose=0 < %s | FileCheck --check-prefixes=GCN,GFX10,GFX10-ASM %s
2; RUN: llc -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s -filetype=obj | llvm-objdump -d --arch-name=amdgcn --mcpu=gfx1030 - | FileCheck --check-prefixes=GCN,GFX10,GFX10-DIS %s
3; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefix=GFX8 %s
4
5; GFX8-NOT: s_inst_prefetch
6; GFX8-NOT: .p2align 6
7
8; GCN-LABEL: test_loop_64
9; GFX10:          s_movk_i32 s{{[0-9]+}}, 0x400
10; GFX10-DIS-NEXT: {{^$}}
11; GFX10-ASM-NEXT: [[L1:BB[0-9_]+]]:
12; GFX10-DIS-NEXT: <[[L1:BB[0-9_]+]]>:
13; GFX10:          s_sleep 0
14; GFX10:          s_cbranch_scc0 [[L1]]
15; GFX10-NEXT:     s_endpgm
; Kernel with one self-looping block (%bb2). The counter starts at 0, is
; incremented once per trip, and the loop exits to %bb1 when the incremented
; value equals 1024. The loop body contains a single s_sleep intrinsic call.
; %arg is not used by the body.
; NOTE(review): the "_64" suffix presumably names the intended loop code size
; in bytes - confirm against the backend's alignment/prefetch thresholds.
16define amdgpu_kernel void @test_loop_64(i32 addrspace(1)* nocapture %arg) {
17bb:
18  br label %bb2
19
20bb1:                                              ; preds = %bb2
21  ret void
22
23bb2:                                              ; preds = %bb2, %bb
  ; %tmp1 is the trip counter; %tmp3 is the exit condition tested below.
24  %tmp1 = phi i32 [ 0, %bb ], [ %tmp2, %bb2 ]
25  %tmp2 = add nuw nsw i32 %tmp1, 1
26  %tmp3 = icmp eq i32 %tmp2, 1024
27  tail call void @llvm.amdgcn.s.sleep(i32 0)
28  br i1 %tmp3, label %bb1, label %bb2
29}
30
31; GCN-LABEL: test_loop_128
32; GFX10:          s_movk_i32 s{{[0-9]+}}, 0x400
33; GFX10-ASM-NEXT: .p2align 6
34; GFX10-DIS-NEXT: s_nop 0
35; GFX10-NOT:      s_inst_prefetch
36; GFX10-ASM:      [[L1:BB[0-9_]+]]:
37; GFX10-DIS:      <[[L1:BB[0-9_]+]]>:
38; GFX10:          s_sleep 0
39; GFX10:          s_cbranch_scc0 [[L1]]
40; GFX10-NEXT:     s_endpgm
; Same single-loop shape as the other test_loop_* kernels (counter from 0,
; exit when the incremented counter equals 1024), but the loop body holds
; 16 consecutive s_sleep intrinsic calls. %arg is not used by the body.
; NOTE(review): the "_128" suffix presumably names the intended loop code size
; in bytes - confirm against the backend's alignment/prefetch thresholds.
41define amdgpu_kernel void @test_loop_128(i32 addrspace(1)* nocapture %arg) {
42bb:
43  br label %bb2
44
45bb1:                                              ; preds = %bb2
46  ret void
47
48bb2:                                              ; preds = %bb2, %bb
  ; %tmp1 is the trip counter; %tmp3 is the exit condition tested at the end.
49  %tmp1 = phi i32 [ 0, %bb ], [ %tmp2, %bb2 ]
50  %tmp2 = add nuw nsw i32 %tmp1, 1
51  %tmp3 = icmp eq i32 %tmp2, 1024
  ; 16 identical calls used purely as loop-body filler.
52  tail call void @llvm.amdgcn.s.sleep(i32 0)
53  tail call void @llvm.amdgcn.s.sleep(i32 0)
54  tail call void @llvm.amdgcn.s.sleep(i32 0)
55  tail call void @llvm.amdgcn.s.sleep(i32 0)
56  tail call void @llvm.amdgcn.s.sleep(i32 0)
57  tail call void @llvm.amdgcn.s.sleep(i32 0)
58  tail call void @llvm.amdgcn.s.sleep(i32 0)
59  tail call void @llvm.amdgcn.s.sleep(i32 0)
60  tail call void @llvm.amdgcn.s.sleep(i32 0)
61  tail call void @llvm.amdgcn.s.sleep(i32 0)
62  tail call void @llvm.amdgcn.s.sleep(i32 0)
63  tail call void @llvm.amdgcn.s.sleep(i32 0)
64  tail call void @llvm.amdgcn.s.sleep(i32 0)
65  tail call void @llvm.amdgcn.s.sleep(i32 0)
66  tail call void @llvm.amdgcn.s.sleep(i32 0)
67  tail call void @llvm.amdgcn.s.sleep(i32 0)
68  br i1 %tmp3, label %bb1, label %bb2
69}
70
71; GCN-LABEL: test_loop_192
72; GFX10:          s_movk_i32 s{{[0-9]+}}, 0x400
73; GFX10-NEXT:     s_inst_prefetch 0x1
74; GFX10-ASM-NEXT: .p2align 6
75; GFX10-DIS-NEXT: s_nop 0
76; GFX10-NOT:      s_inst_prefetch
77; GFX10-ASM:      [[L1:BB[0-9_]+]]:
78; GFX10-DIS:      <[[L1:BB[0-9_]+]]>:
79; GFX10:          s_sleep 0
80; GFX10:          s_cbranch_scc0 [[L1]]
81; GFX10-NEXT:     s_inst_prefetch 0x2
82; GFX10-NEXT:     s_endpgm
; Single-loop kernel (counter from 0, exit when the incremented counter equals
; 1024) whose loop body holds 34 consecutive s_sleep intrinsic calls.
; %arg is not used by the body.
; NOTE(review): the "_192" suffix presumably names the intended loop code size
; in bytes - confirm against the backend's alignment/prefetch thresholds.
83define amdgpu_kernel void @test_loop_192(i32 addrspace(1)* nocapture %arg) {
84bb:
85  br label %bb2
86
87bb1:                                              ; preds = %bb2
88  ret void
89
90bb2:                                              ; preds = %bb2, %bb
  ; %tmp1 is the trip counter; %tmp3 is the exit condition tested at the end.
91  %tmp1 = phi i32 [ 0, %bb ], [ %tmp2, %bb2 ]
92  %tmp2 = add nuw nsw i32 %tmp1, 1
93  %tmp3 = icmp eq i32 %tmp2, 1024
  ; 34 identical calls used purely as loop-body filler.
94  tail call void @llvm.amdgcn.s.sleep(i32 0)
95  tail call void @llvm.amdgcn.s.sleep(i32 0)
96  tail call void @llvm.amdgcn.s.sleep(i32 0)
97  tail call void @llvm.amdgcn.s.sleep(i32 0)
98  tail call void @llvm.amdgcn.s.sleep(i32 0)
99  tail call void @llvm.amdgcn.s.sleep(i32 0)
100  tail call void @llvm.amdgcn.s.sleep(i32 0)
101  tail call void @llvm.amdgcn.s.sleep(i32 0)
102  tail call void @llvm.amdgcn.s.sleep(i32 0)
103  tail call void @llvm.amdgcn.s.sleep(i32 0)
104  tail call void @llvm.amdgcn.s.sleep(i32 0)
105  tail call void @llvm.amdgcn.s.sleep(i32 0)
106  tail call void @llvm.amdgcn.s.sleep(i32 0)
107  tail call void @llvm.amdgcn.s.sleep(i32 0)
108  tail call void @llvm.amdgcn.s.sleep(i32 0)
109  tail call void @llvm.amdgcn.s.sleep(i32 0)
110  tail call void @llvm.amdgcn.s.sleep(i32 0)
111  tail call void @llvm.amdgcn.s.sleep(i32 0)
112  tail call void @llvm.amdgcn.s.sleep(i32 0)
113  tail call void @llvm.amdgcn.s.sleep(i32 0)
114  tail call void @llvm.amdgcn.s.sleep(i32 0)
115  tail call void @llvm.amdgcn.s.sleep(i32 0)
116  tail call void @llvm.amdgcn.s.sleep(i32 0)
117  tail call void @llvm.amdgcn.s.sleep(i32 0)
118  tail call void @llvm.amdgcn.s.sleep(i32 0)
119  tail call void @llvm.amdgcn.s.sleep(i32 0)
120  tail call void @llvm.amdgcn.s.sleep(i32 0)
121  tail call void @llvm.amdgcn.s.sleep(i32 0)
122  tail call void @llvm.amdgcn.s.sleep(i32 0)
123  tail call void @llvm.amdgcn.s.sleep(i32 0)
124  tail call void @llvm.amdgcn.s.sleep(i32 0)
125  tail call void @llvm.amdgcn.s.sleep(i32 0)
126  tail call void @llvm.amdgcn.s.sleep(i32 0)
127  tail call void @llvm.amdgcn.s.sleep(i32 0)
128  br i1 %tmp3, label %bb1, label %bb2
129}
130
131; GCN-LABEL: test_loop_256
132; GFX10:          s_movk_i32 s{{[0-9]+}}, 0x400
133; GFX10-DIS-NEXT: {{^$}}
134; GFX10-ASM-NEXT: [[L1:BB[0-9_]+]]:
135; GFX10-DIS-NEXT: <[[L1:BB[0-9_]+]]>:
136; GFX10:          s_sleep 0
137; GFX10:          s_cbranch_scc0 [[L1]]
138; GFX10-NEXT:     s_endpgm
; Single-loop kernel (counter from 0, exit when the incremented counter equals
; 1024) whose loop body holds 50 consecutive s_sleep intrinsic calls.
; %arg is not used by the body.
; NOTE(review): the "_256" suffix presumably names the intended loop code size
; in bytes - confirm against the backend's alignment/prefetch thresholds.
139define amdgpu_kernel void @test_loop_256(i32 addrspace(1)* nocapture %arg) {
140bb:
141  br label %bb2
142
143bb1:                                              ; preds = %bb2
144  ret void
145
146bb2:                                              ; preds = %bb2, %bb
  ; %tmp1 is the trip counter; %tmp3 is the exit condition tested at the end.
147  %tmp1 = phi i32 [ 0, %bb ], [ %tmp2, %bb2 ]
148  %tmp2 = add nuw nsw i32 %tmp1, 1
149  %tmp3 = icmp eq i32 %tmp2, 1024
  ; 50 identical calls used purely as loop-body filler.
150  tail call void @llvm.amdgcn.s.sleep(i32 0)
151  tail call void @llvm.amdgcn.s.sleep(i32 0)
152  tail call void @llvm.amdgcn.s.sleep(i32 0)
153  tail call void @llvm.amdgcn.s.sleep(i32 0)
154  tail call void @llvm.amdgcn.s.sleep(i32 0)
155  tail call void @llvm.amdgcn.s.sleep(i32 0)
156  tail call void @llvm.amdgcn.s.sleep(i32 0)
157  tail call void @llvm.amdgcn.s.sleep(i32 0)
158  tail call void @llvm.amdgcn.s.sleep(i32 0)
159  tail call void @llvm.amdgcn.s.sleep(i32 0)
160  tail call void @llvm.amdgcn.s.sleep(i32 0)
161  tail call void @llvm.amdgcn.s.sleep(i32 0)
162  tail call void @llvm.amdgcn.s.sleep(i32 0)
163  tail call void @llvm.amdgcn.s.sleep(i32 0)
164  tail call void @llvm.amdgcn.s.sleep(i32 0)
165  tail call void @llvm.amdgcn.s.sleep(i32 0)
166  tail call void @llvm.amdgcn.s.sleep(i32 0)
167  tail call void @llvm.amdgcn.s.sleep(i32 0)
168  tail call void @llvm.amdgcn.s.sleep(i32 0)
169  tail call void @llvm.amdgcn.s.sleep(i32 0)
170  tail call void @llvm.amdgcn.s.sleep(i32 0)
171  tail call void @llvm.amdgcn.s.sleep(i32 0)
172  tail call void @llvm.amdgcn.s.sleep(i32 0)
173  tail call void @llvm.amdgcn.s.sleep(i32 0)
174  tail call void @llvm.amdgcn.s.sleep(i32 0)
175  tail call void @llvm.amdgcn.s.sleep(i32 0)
176  tail call void @llvm.amdgcn.s.sleep(i32 0)
177  tail call void @llvm.amdgcn.s.sleep(i32 0)
178  tail call void @llvm.amdgcn.s.sleep(i32 0)
179  tail call void @llvm.amdgcn.s.sleep(i32 0)
180  tail call void @llvm.amdgcn.s.sleep(i32 0)
181  tail call void @llvm.amdgcn.s.sleep(i32 0)
182  tail call void @llvm.amdgcn.s.sleep(i32 0)
183  tail call void @llvm.amdgcn.s.sleep(i32 0)
184  tail call void @llvm.amdgcn.s.sleep(i32 0)
185  tail call void @llvm.amdgcn.s.sleep(i32 0)
186  tail call void @llvm.amdgcn.s.sleep(i32 0)
187  tail call void @llvm.amdgcn.s.sleep(i32 0)
188  tail call void @llvm.amdgcn.s.sleep(i32 0)
189  tail call void @llvm.amdgcn.s.sleep(i32 0)
190  tail call void @llvm.amdgcn.s.sleep(i32 0)
191  tail call void @llvm.amdgcn.s.sleep(i32 0)
192  tail call void @llvm.amdgcn.s.sleep(i32 0)
193  tail call void @llvm.amdgcn.s.sleep(i32 0)
194  tail call void @llvm.amdgcn.s.sleep(i32 0)
195  tail call void @llvm.amdgcn.s.sleep(i32 0)
196  tail call void @llvm.amdgcn.s.sleep(i32 0)
197  tail call void @llvm.amdgcn.s.sleep(i32 0)
198  tail call void @llvm.amdgcn.s.sleep(i32 0)
199  tail call void @llvm.amdgcn.s.sleep(i32 0)
200  br i1 %tmp3, label %bb1, label %bb2
201}
202
203; GCN-LABEL: test_loop_prefetch_inner_outer
204; GFX10:          s_inst_prefetch 0x1
205; GFX10-ASM-NEXT: .p2align 6
206; GFX10-DIS-NEXT: s_nop 0
207; GFX10-NOT:      s_inst_prefetch
208; GFX10-ASM:      [[L1:BB[0-9_]+]]:
209; GFX10-DIS:      <[[L1:BB[0-9_]+]]>:
210; GFX10-NOT:      s_inst_prefetch
211; GFX10-ASM:      .p2align 6
212; GFX10-DIS:      s_nop 0
213; GFX10-NOT:      s_inst_prefetch
214; GFX10-ASM:      [[L2:BB[0-9_]+]]:
215; GFX10-DIS:      <[[L2:BB[0-9_]+]]>:
216; GFX10-NOT:      s_inst_prefetch
217; GFX10:          s_sleep 0
218; GFX10:          s_cbranch_scc{{[01]}} [[L2]]
219; GFX10-NOT:      s_inst_prefetch
220; GFX10:          s_cbranch_scc{{[01]}} [[L1]]
221; GFX10-NEXT:     s_inst_prefetch 0x2
222; GFX10-NEXT:     s_endpgm
; Kernel with a two-level loop nest:
;   - outer loop: header %bb2, latch %bb4, counter %tmp1, exits to %bb1 when
;     the incremented counter equals 1024;
;   - inner loop: self-looping %bb3, counter %tmp4, exits to %bb4 when the
;     incremented counter equals 1024; its body holds 34 s_sleep calls.
; %arg is not used by the body.
223define amdgpu_kernel void @test_loop_prefetch_inner_outer(i32 addrspace(1)* nocapture %arg) {
224bb:
225  br label %bb2
226
227bb1:
228  ret void
229
230bb2:
  ; Outer-loop header: the exit condition %tmp3 is computed here but only
  ; consumed by the branch in %bb4.
231  %tmp1 = phi i32 [ 0, %bb ], [ %tmp2, %bb4 ]
232  %tmp2 = add nuw nsw i32 %tmp1, 1
233  %tmp3 = icmp eq i32 %tmp2, 1024
234  br label %bb3
235
236bb3:
  ; Inner loop: counter restarts at 0 on every entry from %bb2.
237  %tmp4 = phi i32 [ 0, %bb2 ], [ %tmp5, %bb3 ]
238  %tmp5 = add nuw nsw i32 %tmp4, 1
239  %tmp6 = icmp eq i32 %tmp5, 1024
  ; 34 identical calls used purely as loop-body filler.
240  tail call void @llvm.amdgcn.s.sleep(i32 0)
241  tail call void @llvm.amdgcn.s.sleep(i32 0)
242  tail call void @llvm.amdgcn.s.sleep(i32 0)
243  tail call void @llvm.amdgcn.s.sleep(i32 0)
244  tail call void @llvm.amdgcn.s.sleep(i32 0)
245  tail call void @llvm.amdgcn.s.sleep(i32 0)
246  tail call void @llvm.amdgcn.s.sleep(i32 0)
247  tail call void @llvm.amdgcn.s.sleep(i32 0)
248  tail call void @llvm.amdgcn.s.sleep(i32 0)
249  tail call void @llvm.amdgcn.s.sleep(i32 0)
250  tail call void @llvm.amdgcn.s.sleep(i32 0)
251  tail call void @llvm.amdgcn.s.sleep(i32 0)
252  tail call void @llvm.amdgcn.s.sleep(i32 0)
253  tail call void @llvm.amdgcn.s.sleep(i32 0)
254  tail call void @llvm.amdgcn.s.sleep(i32 0)
255  tail call void @llvm.amdgcn.s.sleep(i32 0)
256  tail call void @llvm.amdgcn.s.sleep(i32 0)
257  tail call void @llvm.amdgcn.s.sleep(i32 0)
258  tail call void @llvm.amdgcn.s.sleep(i32 0)
259  tail call void @llvm.amdgcn.s.sleep(i32 0)
260  tail call void @llvm.amdgcn.s.sleep(i32 0)
261  tail call void @llvm.amdgcn.s.sleep(i32 0)
262  tail call void @llvm.amdgcn.s.sleep(i32 0)
263  tail call void @llvm.amdgcn.s.sleep(i32 0)
264  tail call void @llvm.amdgcn.s.sleep(i32 0)
265  tail call void @llvm.amdgcn.s.sleep(i32 0)
266  tail call void @llvm.amdgcn.s.sleep(i32 0)
267  tail call void @llvm.amdgcn.s.sleep(i32 0)
268  tail call void @llvm.amdgcn.s.sleep(i32 0)
269  tail call void @llvm.amdgcn.s.sleep(i32 0)
270  tail call void @llvm.amdgcn.s.sleep(i32 0)
271  tail call void @llvm.amdgcn.s.sleep(i32 0)
272  tail call void @llvm.amdgcn.s.sleep(i32 0)
273  tail call void @llvm.amdgcn.s.sleep(i32 0)
274  br i1 %tmp6, label %bb4, label %bb3
275
276bb4:
  ; Outer-loop latch: branches on the condition computed in %bb2.
277  br i1 %tmp3, label %bb1, label %bb2
278}
279
280; GCN-LABEL: test_loop_prefetch_inner_outer_noouter
281; GFX10-NOT:      .p2align 6
282; GFX10-NOT:      s_nop
283; GFX10-NOT:      s_inst_prefetch
284; GFX10-ASM:      [[L0:BB[0-9_]+]]:
285; GFX10-DIS:      <[[L0:BB[0-9_]+]]>:
286; GFX10:          s_inst_prefetch 0x1
287; GFX10-ASM-NEXT: .p2align 6
288; GFX10-DIS-NEXT: s_nop 0
289; GFX10-NOT:      s_inst_prefetch
290; GFX10-ASM:      [[L1:BB[0-9_]+]]:
291; GFX10-DIS:      <[[L1:BB[0-9_]+]]>:
292; GFX10-NOT:      s_inst_prefetch
293; GFX10-ASM:      .p2align 6
294; GFX10-DIS:      s_nop 0
295; GFX10-NOT:      s_inst_prefetch
296; GFX10-ASM:      [[L2:BB[0-9_]+]]:
297; GFX10-DIS:      <[[L2:BB[0-9_]+]]>:
298; GFX10-NOT:      s_inst_prefetch
299; GFX10:          s_sleep 0
300; GFX10:          s_cbranch_scc{{[01]}} [[L2]]
301; GFX10-NOT:      s_inst_prefetch
302; GFX10:          s_cbranch_scc{{[01]}} [[L1]]
303; GFX10-NEXT:     s_inst_prefetch 0x2
304; GFX10:          s_cbranch_scc{{[01]}} [[L0]]
305; GFX10-NEXT:     s_endpgm
; Kernel with a three-level loop nest:
;   - outermost loop: header %bb2, latch %bb6 (which also holds 16 s_sleep
;     calls), counter %tmp1, exits to %bb1;
;   - middle loop: header %bb3, latch %bb5, counter %tmp4, exits to %bb6;
;   - innermost loop: self-looping %bb4, counter %tmp7, exits to %bb5; its
;     body holds 34 s_sleep calls.
; Every level exits when its incremented counter equals 1024.
; %arg is not used by the body.
; NOTE(review): the "noouter" suffix presumably means the outermost loop is
; expected to be left out of the prefetch handling - confirm against the
; check lines preceding this function and the backend pass.
306define amdgpu_kernel void @test_loop_prefetch_inner_outer_noouter(i32 addrspace(1)* nocapture %arg) {
307bb:
308  br label %bb2
309
310bb1:
311  ret void
312
313bb2:
  ; Outermost-loop header: %tmp3 is consumed by the branch in %bb6.
314  %tmp1 = phi i32 [ 0, %bb ], [ %tmp2, %bb6 ]
315  %tmp2 = add nuw nsw i32 %tmp1, 1
316  %tmp3 = icmp eq i32 %tmp2, 1024
317  br label %bb3
318
319bb3:
  ; Middle-loop header: %tmp6 is consumed by the branch in %bb5.
320  %tmp4 = phi i32 [ 0, %bb2 ], [ %tmp5, %bb5 ]
321  %tmp5 = add nuw nsw i32 %tmp4, 1
322  %tmp6 = icmp eq i32 %tmp5, 1024
323  br label %bb4
324
325bb4:
  ; Innermost loop: counter restarts at 0 on every entry from %bb3.
326  %tmp7 = phi i32 [ 0, %bb3 ], [ %tmp8, %bb4 ]
327  %tmp8 = add nuw nsw i32 %tmp7, 1
328  %tmp9 = icmp eq i32 %tmp8, 1024
  ; 34 identical calls used purely as loop-body filler.
329  tail call void @llvm.amdgcn.s.sleep(i32 0)
330  tail call void @llvm.amdgcn.s.sleep(i32 0)
331  tail call void @llvm.amdgcn.s.sleep(i32 0)
332  tail call void @llvm.amdgcn.s.sleep(i32 0)
333  tail call void @llvm.amdgcn.s.sleep(i32 0)
334  tail call void @llvm.amdgcn.s.sleep(i32 0)
335  tail call void @llvm.amdgcn.s.sleep(i32 0)
336  tail call void @llvm.amdgcn.s.sleep(i32 0)
337  tail call void @llvm.amdgcn.s.sleep(i32 0)
338  tail call void @llvm.amdgcn.s.sleep(i32 0)
339  tail call void @llvm.amdgcn.s.sleep(i32 0)
340  tail call void @llvm.amdgcn.s.sleep(i32 0)
341  tail call void @llvm.amdgcn.s.sleep(i32 0)
342  tail call void @llvm.amdgcn.s.sleep(i32 0)
343  tail call void @llvm.amdgcn.s.sleep(i32 0)
344  tail call void @llvm.amdgcn.s.sleep(i32 0)
345  tail call void @llvm.amdgcn.s.sleep(i32 0)
346  tail call void @llvm.amdgcn.s.sleep(i32 0)
347  tail call void @llvm.amdgcn.s.sleep(i32 0)
348  tail call void @llvm.amdgcn.s.sleep(i32 0)
349  tail call void @llvm.amdgcn.s.sleep(i32 0)
350  tail call void @llvm.amdgcn.s.sleep(i32 0)
351  tail call void @llvm.amdgcn.s.sleep(i32 0)
352  tail call void @llvm.amdgcn.s.sleep(i32 0)
353  tail call void @llvm.amdgcn.s.sleep(i32 0)
354  tail call void @llvm.amdgcn.s.sleep(i32 0)
355  tail call void @llvm.amdgcn.s.sleep(i32 0)
356  tail call void @llvm.amdgcn.s.sleep(i32 0)
357  tail call void @llvm.amdgcn.s.sleep(i32 0)
358  tail call void @llvm.amdgcn.s.sleep(i32 0)
359  tail call void @llvm.amdgcn.s.sleep(i32 0)
360  tail call void @llvm.amdgcn.s.sleep(i32 0)
361  tail call void @llvm.amdgcn.s.sleep(i32 0)
362  tail call void @llvm.amdgcn.s.sleep(i32 0)
363  br i1 %tmp9, label %bb5, label %bb4
364
365bb5:
  ; Middle-loop latch: branches on the condition computed in %bb3.
366  br i1 %tmp6, label %bb6, label %bb3
367
368bb6:
  ; Outermost-loop latch: 16 filler calls that sit in the outermost loop but
  ; outside the two inner loops, then the branch on the %bb2 condition.
369  tail call void @llvm.amdgcn.s.sleep(i32 0)
370  tail call void @llvm.amdgcn.s.sleep(i32 0)
371  tail call void @llvm.amdgcn.s.sleep(i32 0)
372  tail call void @llvm.amdgcn.s.sleep(i32 0)
373  tail call void @llvm.amdgcn.s.sleep(i32 0)
374  tail call void @llvm.amdgcn.s.sleep(i32 0)
375  tail call void @llvm.amdgcn.s.sleep(i32 0)
376  tail call void @llvm.amdgcn.s.sleep(i32 0)
377  tail call void @llvm.amdgcn.s.sleep(i32 0)
378  tail call void @llvm.amdgcn.s.sleep(i32 0)
379  tail call void @llvm.amdgcn.s.sleep(i32 0)
380  tail call void @llvm.amdgcn.s.sleep(i32 0)
381  tail call void @llvm.amdgcn.s.sleep(i32 0)
382  tail call void @llvm.amdgcn.s.sleep(i32 0)
383  tail call void @llvm.amdgcn.s.sleep(i32 0)
384  tail call void @llvm.amdgcn.s.sleep(i32 0)
385  br i1 %tmp3, label %bb1, label %bb2
386}
387
; Declaration of the s_sleep intrinsic that every kernel in this file calls
; (always with the constant argument 0) to fill its loop bodies.
388declare void @llvm.amdgcn.s.sleep(i32)
389