; RUN: opt -mtriple=amdgcn-- -S -amdgpu-unify-divergent-exit-nodes -verify -structurizecfg -verify -si-annotate-control-flow %s | FileCheck -check-prefix=IR %s
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s

; Add extra verifier runs. There were some cases where invalid IR
; was produced but happened to be fixed by later passes.

; Make sure divergent control flow with multiple exits from a region
; is properly handled. UnifyFunctionExitNodes should be run before
; StructurizeCFG.

; IR-LABEL: @multi_divergent_region_exit_ret_ret(
; IR: %1 = call { i1, i64 } @llvm.amdgcn.if(i1 %0)
; IR: %2 = extractvalue { i1, i64 } %1, 0
; IR: %3 = extractvalue { i1, i64 } %1, 1
; IR: br i1 %2, label %LeafBlock1, label %Flow

; IR: Flow:
; IR: %4 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ]
; IR: %5 = phi i1 [ %10, %LeafBlock1 ], [ false, %entry ]
; IR: %6 = call { i1, i64 } @llvm.amdgcn.else(i64 %3)
; IR: %7 = extractvalue { i1, i64 } %6, 0
; IR: %8 = extractvalue { i1, i64 } %6, 1
; IR: br i1 %7, label %LeafBlock, label %Flow1

; IR: LeafBlock:
; IR: br label %Flow1

; IR: LeafBlock1:
; IR: br label %Flow{{$}}

; IR:  Flow2:
; IR: %11 = phi i1 [ false, %exit1 ], [ %15, %Flow1 ]
; IR: call void @llvm.amdgcn.end.cf(i64 %19)
; IR: %12 = call { i1, i64 } @llvm.amdgcn.if(i1 %11)
; IR: %13 = extractvalue { i1, i64 } %12, 0
; IR: %14 = extractvalue { i1, i64 } %12, 1
; IR: br i1 %13, label %exit0, label %UnifiedReturnBlock

; IR: exit0:
; IR: store volatile i32 9, i32 addrspace(1)* undef
; IR: br label %UnifiedReturnBlock

; IR: Flow1:
; IR: %15 = phi i1 [ %SwitchLeaf, %LeafBlock ], [ %4, %Flow ]
; IR: %16 = phi i1 [ %9, %LeafBlock ], [ %5, %Flow ]
; IR: call void @llvm.amdgcn.end.cf(i64 %8)
; IR: %17 = call { i1, i64 } @llvm.amdgcn.if(i1 %16)
; IR: %18 = extractvalue { i1, i64 } %17, 0
; IR: %19 = extractvalue { i1, i64 } %17, 1
; IR: br i1 %18, label %exit1, label %Flow2

; IR: exit1:
; IR: store volatile i32 17, i32 addrspace(3)* undef
; IR:  br label %Flow2

; IR: UnifiedReturnBlock:
; IR: call void @llvm.amdgcn.end.cf(i64 %14)
; IR: ret void


; GCN-LABEL: {{^}}multi_divergent_region_exit_ret_ret:
; GCN: v_cmp_lt_i32_e32 vcc, 1
; GCN: s_and_saveexec_b64
; GCN: s_xor_b64


; FIXME: Why is this compare essentially repeated?
; GCN: v_cmp_eq_u32_e32 vcc, 1, [[REG:v[0-9]+]]
; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, -1, vcc
; GCN: v_cmp_ne_u32_e32 vcc, 1, [[REG]]
; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, -1, vcc

; GCN: ; %Flow4
; GCN-NEXT: s_or_b64 exec, exec
; GCN: v_cmp_ne_u32_e32 vcc, 0

; GCN: ; %exit1
; GCN: ds_write_b32

; GCN: %Flow5
; GCN-NEXT: s_or_b64 exec, exec
; GCN: v_cmp_ne_u32_e32 vcc, 0
; GCN-NEXT: s_and_saveexec_b64

; GCN: ; %exit0
; GCN: buffer_store_dword

; GCN: ; %UnifiedReturnBlock
; GCN-NEXT: s_endpgm
define amdgpu_kernel void @multi_divergent_region_exit_ret_ret(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2) #0 {
entry:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %tmp1 = add i32 0, %tmp
  %tmp2 = zext i32 %tmp1 to i64
  %tmp3 = add i64 0, %tmp2
  %tmp4 = shl i64 %tmp3, 32
  %tmp5 = ashr exact i64 %tmp4, 32
  %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5
  %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4
  %tmp8 = sext i32 %tmp7 to i64
  %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8
  %tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4
  %tmp13 = zext i32 %tmp10 to i64
  %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13
  %tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16
  %Pivot = icmp slt i32 %tmp16, 2
  br i1 %Pivot, label %LeafBlock, label %LeafBlock1

LeafBlock:                                        ; preds = %entry
  %SwitchLeaf = icmp eq i32 %tmp16, 1
  br i1 %SwitchLeaf, label %exit0, label %exit1

LeafBlock1:                                       ; preds = %entry
  %SwitchLeaf2 = icmp eq i32 %tmp16, 2
  br i1 %SwitchLeaf2, label %exit0, label %exit1

exit0:                                     ; preds = %LeafBlock, %LeafBlock1
  store volatile i32 9, i32 addrspace(1)* undef
  ret void

exit1:                                     ; preds = %LeafBlock, %LeafBlock1
  store volatile i32 17, i32 addrspace(3)* undef
  ret void
}

; IR-LABEL: @multi_divergent_region_exit_unreachable_unreachable(
; IR: %1 = call { i1, i64 } @llvm.amdgcn.if(i1 %0)

; IR: %6 = call { i1, i64 } @llvm.amdgcn.else(i64 %3)

; IR: %11 = phi i1 [ false, %exit1 ], [ %15, %Flow1 ]
; IR: call void @llvm.amdgcn.end.cf(i64 %19)
; IR: %12 = call { i1, i64 } @llvm.amdgcn.if(i1 %11)
; IR: br i1 %13, label %exit0, label %UnifiedUnreachableBlock


; IR: UnifiedUnreachableBlock:
; IR-NEXT: unreachable


; FIXME: Probably should insert an s_endpgm anyway.
; GCN-LABEL: {{^}}multi_divergent_region_exit_unreachable_unreachable:
; GCN: ; %UnifiedUnreachableBlock
; GCN-NEXT: .Lfunc_end
define amdgpu_kernel void @multi_divergent_region_exit_unreachable_unreachable(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2) #0 {
entry:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %tmp1 = add i32 0, %tmp
  %tmp2 = zext i32 %tmp1 to i64
  %tmp3 = add i64 0, %tmp2
  %tmp4 = shl i64 %tmp3, 32
  %tmp5 = ashr exact i64 %tmp4, 32
  %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5
  %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4
  %tmp8 = sext i32 %tmp7 to i64
  %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8
  %tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4
  %tmp13 = zext i32 %tmp10 to i64
  %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13
  %tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16
  %Pivot = icmp slt i32 %tmp16, 2
  br i1 %Pivot, label %LeafBlock, label %LeafBlock1

LeafBlock:                                        ; preds = %entry
  %SwitchLeaf = icmp eq i32 %tmp16, 1
  br i1 %SwitchLeaf, label %exit0, label %exit1

LeafBlock1:                                       ; preds = %entry
  %SwitchLeaf2 = icmp eq i32 %tmp16, 2
  br i1 %SwitchLeaf2, label %exit0, label %exit1

exit0:                                     ; preds = %LeafBlock, %LeafBlock1
  store volatile i32 9, i32 addrspace(1)* undef
  unreachable

exit1:                                     ; preds = %LeafBlock, %LeafBlock1
  store volatile i32 17, i32 addrspace(3)* undef
  unreachable
}

; IR-LABEL: @multi_exit_region_divergent_ret_uniform_ret(
; IR: %divergent.cond0 = icmp slt i32 %tmp16, 2
; IR: llvm.amdgcn.if
; IR: br i1

; IR: {{^}}Flow:
; IR: %4 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ]
; IR: %5 = phi i1 [ %10, %LeafBlock1 ], [ false, %entry ]
; IR: %6 = call { i1, i64 } @llvm.amdgcn.else(i64 %3)
; IR: br i1 %7, label %LeafBlock, label %Flow1

; IR: {{^}}LeafBlock:
; IR: %divergent.cond1 = icmp eq i32 %tmp16, 1
; IR: %9 = xor i1 %divergent.cond1, true
; IR: br label %Flow1

; IR: LeafBlock1:
; IR: %uniform.cond0 = icmp eq i32 %arg3, 2
; IR: %10 = xor i1 %uniform.cond0, true
; IR: br label %Flow

; IR: Flow2:
; IR: %11 = phi i1 [ false, %exit1 ], [ %15, %Flow1 ]
; IR: call void @llvm.amdgcn.end.cf(i64 %19)
; IR: %12 = call { i1, i64 } @llvm.amdgcn.if(i1 %11)
; IR: br i1 %13, label %exit0, label %UnifiedReturnBlock

; IR: exit0:
; IR: store volatile i32 9, i32 addrspace(1)* undef
; IR: br label %UnifiedReturnBlock

; IR: {{^}}Flow1:
; IR: %15 = phi i1 [ %divergent.cond1, %LeafBlock ], [ %4, %Flow ]
; IR: %16 = phi i1 [ %9, %LeafBlock ], [ %5, %Flow ]
; IR: call void @llvm.amdgcn.end.cf(i64 %8)
; IR: %17 = call { i1, i64 } @llvm.amdgcn.if(i1 %16)
; IR: %18 = extractvalue { i1, i64 } %17, 0
; IR: %19 = extractvalue { i1, i64 } %17, 1
; IR: br i1 %18, label %exit1, label %Flow2

; IR: exit1:
; IR: store volatile i32 17, i32 addrspace(3)* undef
; IR: br label %Flow2

; IR: UnifiedReturnBlock:
; IR: call void @llvm.amdgcn.end.cf(i64 %14)
; IR: ret void
define amdgpu_kernel void @multi_exit_region_divergent_ret_uniform_ret(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2, i32 %arg3) #0 {
entry:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %tmp1 = add i32 0, %tmp
  %tmp2 = zext i32 %tmp1 to i64
  %tmp3 = add i64 0, %tmp2
  %tmp4 = shl i64 %tmp3, 32
  %tmp5 = ashr exact i64 %tmp4, 32
  %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5
  %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4
  %tmp8 = sext i32 %tmp7 to i64
  %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8
  %tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4
  %tmp13 = zext i32 %tmp10 to i64
  %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13
  %tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16
  %divergent.cond0 = icmp slt i32 %tmp16, 2
  br i1 %divergent.cond0, label %LeafBlock, label %LeafBlock1

LeafBlock:                                        ; preds = %entry
  %divergent.cond1 = icmp eq i32 %tmp16, 1
  br i1 %divergent.cond1, label %exit0, label %exit1

LeafBlock1:                                       ; preds = %entry
  %uniform.cond0 = icmp eq i32 %arg3, 2
  br i1 %uniform.cond0, label %exit0, label %exit1

exit0:                                     ; preds = %LeafBlock, %LeafBlock1
  store volatile i32 9, i32 addrspace(1)* undef
  ret void

exit1:                                     ; preds = %LeafBlock, %LeafBlock1
  store volatile i32 17, i32 addrspace(3)* undef
  ret void
}

; IR-LABEL: @multi_exit_region_uniform_ret_divergent_ret(
; IR: %1 = call { i1, i64 } @llvm.amdgcn.if(i1 %0)
; IR: br i1 %2, label %LeafBlock1, label %Flow

; IR: Flow:
; IR: %4 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ]
; IR: %5 = phi i1 [ %10, %LeafBlock1 ], [ false, %entry ]
; IR: %6 = call { i1, i64 } @llvm.amdgcn.else(i64 %3)

; IR: %11 = phi i1 [ false, %exit1 ], [ %15, %Flow1 ]
; IR: call void @llvm.amdgcn.end.cf(i64 %19)
; IR: %12 = call { i1, i64 } @llvm.amdgcn.if(i1 %11)

define amdgpu_kernel void @multi_exit_region_uniform_ret_divergent_ret(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2, i32 %arg3) #0 {
entry:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %tmp1 = add i32 0, %tmp
  %tmp2 = zext i32 %tmp1 to i64
  %tmp3 = add i64 0, %tmp2
  %tmp4 = shl i64 %tmp3, 32
  %tmp5 = ashr exact i64 %tmp4, 32
  %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5
  %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4
  %tmp8 = sext i32 %tmp7 to i64
  %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8
  %tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4
  %tmp13 = zext i32 %tmp10 to i64
  %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13
  %tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16
  %Pivot = icmp slt i32 %tmp16, 2
  br i1 %Pivot, label %LeafBlock, label %LeafBlock1

LeafBlock:                                        ; preds = %entry
  %SwitchLeaf = icmp eq i32 %arg3, 1
  br i1 %SwitchLeaf, label %exit0, label %exit1

LeafBlock1:                                       ; preds = %entry
  %SwitchLeaf2 = icmp eq i32 %tmp16, 2
  br i1 %SwitchLeaf2, label %exit0, label %exit1

exit0:                                     ; preds = %LeafBlock, %LeafBlock1
  store volatile i32 9, i32 addrspace(1)* undef
  ret void

exit1:                                     ; preds = %LeafBlock, %LeafBlock1
  store volatile i32 17, i32 addrspace(3)* undef
  ret void
}

; IR-LABEL: @multi_divergent_region_exit_ret_ret_return_value(
; IR: Flow2:
; IR: %11 = phi float [ 2.000000e+00, %exit1 ], [ undef, %Flow1 ]
; IR: %12 = phi i1 [ false, %exit1 ], [ %16, %Flow1 ]
; IR: call void @llvm.amdgcn.end.cf(i64 %20)

; IR: UnifiedReturnBlock:
; IR: %UnifiedRetVal = phi float [ %11, %Flow2 ], [ 1.000000e+00, %exit0 ]
; IR: call void @llvm.amdgcn.end.cf(i64 %15)
; IR: ret float %UnifiedRetVal
define amdgpu_ps float @multi_divergent_region_exit_ret_ret_return_value(i32 %vgpr) #0 {
entry:
  %Pivot = icmp slt i32 %vgpr, 2
  br i1 %Pivot, label %LeafBlock, label %LeafBlock1

LeafBlock:                                        ; preds = %entry
  %SwitchLeaf = icmp eq i32 %vgpr, 1
  br i1 %SwitchLeaf, label %exit0, label %exit1

LeafBlock1:                                       ; preds = %entry
  %SwitchLeaf2 = icmp eq i32 %vgpr, 2
  br i1 %SwitchLeaf2, label %exit0, label %exit1

exit0:                                     ; preds = %LeafBlock, %LeafBlock1
  store i32 9, i32 addrspace(1)* undef
  ret float 1.0

exit1:                                     ; preds = %LeafBlock, %LeafBlock1
  store i32 17, i32 addrspace(3)* undef
  ret float 2.0
}

; IR-LABEL: @uniform_branch_to_multi_divergent_region_exit_ret_ret_return_value(

; GCN-LABEL: {{^}}uniform_branch_to_multi_divergent_region_exit_ret_ret_return_value:
; GCN: s_cmp_gt_i32 s0, 1
; GCN: s_cbranch_scc0 [[FLOW:BB[0-9]+_[0-9]+]]

; GCN: v_cmp_ne_u32_e32 vcc, 7, v0

; GCN: {{^}}[[FLOW]]:
; GCN: s_cbranch_vccnz [[FLOW1:BB[0-9]+]]

; GCN: v_mov_b32_e32 v0, 2.0
; GCN: s_or_b64 exec, exec
; GCN-NOT: s_and_b64 exec, exec
; GCN: v_mov_b32_e32 v0, 1.0

; GCN: {{^BB[0-9]+_[0-9]+}}: ; %UnifiedReturnBlock
; GCN-NEXT: s_or_b64 exec, exec
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN-NEXT: ; return

define amdgpu_ps float @uniform_branch_to_multi_divergent_region_exit_ret_ret_return_value(i32 inreg %sgpr, i32 %vgpr) #0 {
entry:
  %uniform.cond = icmp slt i32 %sgpr, 2
  br i1 %uniform.cond, label %LeafBlock, label %LeafBlock1

LeafBlock:                                        ; preds = %entry
  %divergent.cond0 = icmp eq i32 %vgpr, 3
  br i1 %divergent.cond0, label %exit0, label %exit1

LeafBlock1:                                       ; preds = %entry
  %divergent.cond1 = icmp eq i32 %vgpr, 7
  br i1 %divergent.cond1, label %exit0, label %exit1

exit0:                                     ; preds = %LeafBlock, %LeafBlock1
  store i32 9, i32 addrspace(1)* undef
  ret float 1.0

exit1:                                     ; preds = %LeafBlock, %LeafBlock1
  store i32 17, i32 addrspace(3)* undef
  ret float 2.0
}

; IR-LABEL: @multi_divergent_region_exit_ret_unreachable(
; IR: %1 = call { i1, i64 } @llvm.amdgcn.if(i1 %0)

; IR: Flow:
; IR: %4 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ]
; IR: %5 = phi i1 [ %10, %LeafBlock1 ], [ false, %entry ]
; IR: %6 = call { i1, i64 } @llvm.amdgcn.else(i64 %3)

; IR: Flow2:
; IR: %11 = phi i1 [ false, %exit1 ], [ %15, %Flow1 ]
; IR: call void @llvm.amdgcn.end.cf(i64 %19)
; IR: %12 = call { i1, i64 } @llvm.amdgcn.if(i1 %11)
; IR: br i1 %13, label %exit0, label %UnifiedReturnBlock

; IR: exit0:
; IR-NEXT: store volatile i32 17, i32 addrspace(3)* undef
; IR-NEXT: br label %UnifiedReturnBlock

; IR: Flow1:
; IR: %15 = phi i1 [ %SwitchLeaf, %LeafBlock ], [ %4, %Flow ]
; IR: %16 = phi i1 [ %9, %LeafBlock ], [ %5, %Flow ]
; IR: call void @llvm.amdgcn.end.cf(i64 %8)
; IR: %17 = call { i1, i64 } @llvm.amdgcn.if(i1 %16)
; IR: %18 = extractvalue { i1, i64 } %17, 0
; IR: %19 = extractvalue { i1, i64 } %17, 1
; IR: br i1 %18, label %exit1, label %Flow2

; IR: exit1:
; IR-NEXT: store volatile i32 9, i32 addrspace(1)* undef
; IR-NEXT: call void @llvm.amdgcn.unreachable()
; IR-NEXT: br label %Flow2

; IR: UnifiedReturnBlock:
; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %14)
; IR-NEXT: ret void
define amdgpu_kernel void @multi_divergent_region_exit_ret_unreachable(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2) #0 {
entry:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %tmp1 = add i32 0, %tmp
  %tmp2 = zext i32 %tmp1 to i64
  %tmp3 = add i64 0, %tmp2
  %tmp4 = shl i64 %tmp3, 32
  %tmp5 = ashr exact i64 %tmp4, 32
  %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5
  %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4
  %tmp8 = sext i32 %tmp7 to i64
  %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8
  %tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4
  %tmp13 = zext i32 %tmp10 to i64
  %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13
  %tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16
  %Pivot = icmp slt i32 %tmp16, 2
  br i1 %Pivot, label %LeafBlock, label %LeafBlock1

LeafBlock:                                        ; preds = %entry
  %SwitchLeaf = icmp eq i32 %tmp16, 1
  br i1 %SwitchLeaf, label %exit0, label %exit1

LeafBlock1:                                       ; preds = %entry
  %SwitchLeaf2 = icmp eq i32 %tmp16, 2
  br i1 %SwitchLeaf2, label %exit0, label %exit1

exit0:                                     ; preds = %LeafBlock, %LeafBlock1
  store volatile i32 17, i32 addrspace(3)* undef
  ret void

exit1:                                     ; preds = %LeafBlock, %LeafBlock1
  store volatile i32 9, i32 addrspace(1)* undef
  unreachable
}

; The non-uniformity of the branch to the exiting blocks requires
; looking at transitive predecessors.

; IR-LABEL: @indirect_multi_divergent_region_exit_ret_unreachable(

; IR: exit0:                                            ; preds = %Flow2
; IR-NEXT: store volatile i32 17, i32 addrspace(3)* undef
; IR-NEXT: br label %UnifiedReturnBlock


; IR: indirect.exit1:
; IR: %load = load volatile i32, i32 addrspace(1)* undef
; IR: store volatile i32 %load, i32 addrspace(1)* undef
; IR: store volatile i32 9, i32 addrspace(1)* undef
; IR: call void @llvm.amdgcn.unreachable()
; IR-NEXT: br label %Flow2

; IR: UnifiedReturnBlock:                               ; preds = %exit0, %Flow2
; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %14)
; IR-NEXT: ret void
define amdgpu_kernel void @indirect_multi_divergent_region_exit_ret_unreachable(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2) #0 {
entry:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %tmp1 = add i32 0, %tmp
  %tmp2 = zext i32 %tmp1 to i64
  %tmp3 = add i64 0, %tmp2
  %tmp4 = shl i64 %tmp3, 32
  %tmp5 = ashr exact i64 %tmp4, 32
  %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5
  %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4
  %tmp8 = sext i32 %tmp7 to i64
  %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8
  %tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4
  %tmp13 = zext i32 %tmp10 to i64
  %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13
  %tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16
  %Pivot = icmp slt i32 %tmp16, 2
  br i1 %Pivot, label %LeafBlock, label %LeafBlock1

LeafBlock:                                        ; preds = %entry
  %SwitchLeaf = icmp eq i32 %tmp16, 1
  br i1 %SwitchLeaf, label %exit0, label %indirect.exit1

LeafBlock1:                                       ; preds = %entry
  %SwitchLeaf2 = icmp eq i32 %tmp16, 2
  br i1 %SwitchLeaf2, label %exit0, label %indirect.exit1

exit0:                                     ; preds = %LeafBlock, %LeafBlock1
  store volatile i32 17, i32 addrspace(3)* undef
  ret void

indirect.exit1:
  %load = load volatile i32, i32 addrspace(1)* undef
  store volatile i32 %load, i32 addrspace(1)* undef
  br label %exit1

exit1:                                     ; preds = %indirect.exit1
  store volatile i32 9, i32 addrspace(1)* undef
  unreachable
}

; IR-LABEL: @multi_divergent_region_exit_ret_switch(
define amdgpu_kernel void @multi_divergent_region_exit_ret_switch(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2) #0 {
entry:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %tmp1 = add i32 0, %tmp
  %tmp2 = zext i32 %tmp1 to i64
  %tmp3 = add i64 0, %tmp2
  %tmp4 = shl i64 %tmp3, 32
  %tmp5 = ashr exact i64 %tmp4, 32
  %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5
  %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4
  %tmp8 = sext i32 %tmp7 to i64
  %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8
  %tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4
  %tmp13 = zext i32 %tmp10 to i64
  %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13
  %tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16
  switch i32 %tmp16, label %exit1
    [ i32 1, label %LeafBlock
      i32 2, label %LeafBlock1
      i32 3, label %exit0 ]

LeafBlock:                                        ; preds = %entry
  %SwitchLeaf = icmp eq i32 %tmp16, 1
  br i1 %SwitchLeaf, label %exit0, label %exit1

LeafBlock1:                                       ; preds = %entry
  %SwitchLeaf2 = icmp eq i32 %tmp16, 2
  br i1 %SwitchLeaf2, label %exit0, label %exit1

exit0:                                     ; preds = %entry, %LeafBlock, %LeafBlock1
  store volatile i32 17, i32 addrspace(3)* undef
  ret void

exit1:                                     ; preds = %entry, %LeafBlock, %LeafBlock1
  store volatile i32 9, i32 addrspace(1)* undef
  unreachable
}

; IR-LABEL: @divergent_multi_ret_nest_in_uniform_triangle(
define amdgpu_kernel void @divergent_multi_ret_nest_in_uniform_triangle(i32 %arg0) #0 {
entry:
  %uniform.cond0 = icmp eq i32 %arg0, 4
  br i1 %uniform.cond0, label %divergent.multi.exit.region, label %uniform.ret

divergent.multi.exit.region:
  %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
  %divergent.cond0 = icmp eq i32 %id.x, 0
  br i1 %divergent.cond0, label %divergent.ret0, label %divergent.ret1

divergent.ret0:
  store volatile i32 11, i32 addrspace(3)* undef
  ret void

divergent.ret1:
  store volatile i32 42, i32 addrspace(3)* undef
  ret void

uniform.ret:
  store volatile i32 9, i32 addrspace(1)* undef
  ret void
}

; IR-LABEL: @divergent_complex_multi_ret_nest_in_uniform_triangle(
define amdgpu_kernel void @divergent_complex_multi_ret_nest_in_uniform_triangle(i32 %arg0) #0 {
entry:
  %uniform.cond0 = icmp eq i32 %arg0, 4
  br i1 %uniform.cond0, label %divergent.multi.exit.region, label %uniform.ret

divergent.multi.exit.region:
  %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
  %divergent.cond0 = icmp eq i32 %id.x, 0
  br i1 %divergent.cond0, label %divergent.if, label %divergent.ret1

divergent.if:
  %vgpr0 = load volatile float, float addrspace(1)* undef
  %divergent.cond1 = fcmp ogt float %vgpr0, 1.0
  br i1 %divergent.cond1, label %divergent.then, label %divergent.endif

divergent.then:
  %vgpr1 = load volatile float, float addrspace(1)* undef
  %divergent.cond2 = fcmp olt float %vgpr1, 4.0
  store volatile i32 33, i32 addrspace(1)* undef
  br i1 %divergent.cond2, label %divergent.ret0, label %divergent.endif

divergent.endif:
  store volatile i32 38, i32 addrspace(1)* undef
  br label %divergent.ret0

divergent.ret0:
  store volatile i32 11, i32 addrspace(3)* undef
  ret void

divergent.ret1:
  store volatile i32 42, i32 addrspace(3)* undef
  ret void

uniform.ret:
  store volatile i32 9, i32 addrspace(1)* undef
  ret void
}

; IR-LABEL: @uniform_complex_multi_ret_nest_in_divergent_triangle(
; IR: Flow1:                                            ; preds = %uniform.ret1, %uniform.multi.exit.region
; IR: %8 = phi i1 [ false, %uniform.ret1 ], [ true, %uniform.multi.exit.region ]
; IR: br i1 %8, label %uniform.if, label %Flow2

; IR: Flow:                                             ; preds = %uniform.then, %uniform.if
; IR: %11 = phi i1 [ %10, %uniform.then ], [ %9, %uniform.if ]
; IR: br i1 %11, label %uniform.endif, label %uniform.ret0

; IR: UnifiedReturnBlock:                               ; preds = %Flow3, %Flow2
; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %6)
; IR-NEXT: ret void
define amdgpu_kernel void @uniform_complex_multi_ret_nest_in_divergent_triangle(i32 %arg0) #0 {
entry:
  %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
  %divergent.cond0 = icmp eq i32 %id.x, 0
  br i1 %divergent.cond0, label %uniform.multi.exit.region, label %divergent.ret

uniform.multi.exit.region:
  %uniform.cond0 = icmp eq i32 %arg0, 4
  br i1 %uniform.cond0, label %uniform.if, label %uniform.ret1

uniform.if:
  %sgpr0 = load volatile i32, i32 addrspace(4)* undef
  %uniform.cond1 = icmp slt i32 %sgpr0, 1
  br i1 %uniform.cond1, label %uniform.then, label %uniform.endif

uniform.then:
  %sgpr1 = load volatile i32, i32 addrspace(4)* undef
  %uniform.cond2 = icmp sge i32 %sgpr1, 4
  store volatile i32 33, i32 addrspace(1)* undef
  br i1 %uniform.cond2, label %uniform.ret0, label %uniform.endif

uniform.endif:
  store volatile i32 38, i32 addrspace(1)* undef
  br label %uniform.ret0

uniform.ret0:
  store volatile i32 11, i32 addrspace(3)* undef
  ret void

uniform.ret1:
  store volatile i32 42, i32 addrspace(3)* undef
  ret void

divergent.ret:
  store volatile i32 9, i32 addrspace(1)* undef
  ret void
}

; IR-LABEL: @multi_divergent_unreachable_exit(
; IR: UnifiedUnreachableBlock:
; IR-NEXT: call void @llvm.amdgcn.unreachable()
; IR-NEXT: br label %UnifiedReturnBlock

; IR: UnifiedReturnBlock:
; IR-NEXT: call void @llvm.amdgcn.end.cf(i64
; IR-NEXT: ret void
define amdgpu_kernel void @multi_divergent_unreachable_exit() #0 {
bb:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  switch i32 %tmp, label %bb3 [
    i32 2, label %bb1
    i32 0, label %bb2
  ]

bb1:                                              ; preds = %bb
  unreachable

bb2:                                              ; preds = %bb
  unreachable

bb3:                                              ; preds = %bb
  switch i32 undef, label %bb5 [
    i32 2, label %bb4
  ]

bb4:                                              ; preds = %bb3
  ret void

bb5:                                              ; preds = %bb3
  unreachable
}

declare i32 @llvm.amdgcn.workitem.id.x() #1

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }