1; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s
2
3; CHECK-LABEL: {{^}}test_kill_depth_0_imm_pos:
4; CHECK-NEXT: ; %bb.0:
5; CHECK-NEXT: s_endpgm
6define amdgpu_ps void @test_kill_depth_0_imm_pos() #0 {
7  call void @llvm.AMDGPU.kill(float 0.0)
8  ret void
9}
10
11; CHECK-LABEL: {{^}}test_kill_depth_0_imm_neg:
12; CHECK-NEXT: ; %bb.0:
13; CHECK-NEXT: s_mov_b64 exec, 0
14; CHECK-NEXT: ; %bb.1:
15; CHECK-NEXT: s_endpgm
16define amdgpu_ps void @test_kill_depth_0_imm_neg() #0 {
17  call void @llvm.AMDGPU.kill(float -0.0)
18  ret void
19}
20
21; FIXME: Ideally only one would be emitted
22; CHECK-LABEL: {{^}}test_kill_depth_0_imm_neg_x2:
23; CHECK-NEXT: ; %bb.0:
24; CHECK-NEXT: s_mov_b64 exec, 0
25; CHECK-NEXT: ; %bb.1:
26; CHECK-NEXT: s_mov_b64 exec, 0
27; CHECK-NEXT: ; %bb.2:
28; CHECK-NEXT: s_endpgm
29define amdgpu_ps void @test_kill_depth_0_imm_neg_x2() #0 {
30  call void @llvm.AMDGPU.kill(float -0.0)
31  call void @llvm.AMDGPU.kill(float -1.0)
32  ret void
33}
34
35; CHECK-LABEL: {{^}}test_kill_depth_var:
36; CHECK-NEXT: ; %bb.0:
37; CHECK-NEXT: v_cmpx_le_f32_e32 vcc, 0, v0
38; CHECK-NEXT: ; %bb.1:
39; CHECK-NEXT: s_endpgm
40define amdgpu_ps void @test_kill_depth_var(float %x) #0 {
41  call void @llvm.AMDGPU.kill(float %x)
42  ret void
43}
44
45; FIXME: Ideally only one would be emitted
46; CHECK-LABEL: {{^}}test_kill_depth_var_x2_same:
47; CHECK-NEXT: ; %bb.0:
48; CHECK-NEXT: v_cmpx_le_f32_e32 vcc, 0, v0
49; CHECK-NEXT: ; %bb.1:
50; CHECK-NEXT: v_cmpx_le_f32_e32 vcc, 0, v0
51; CHECK-NEXT: ; %bb.2:
52; CHECK-NEXT: s_endpgm
53define amdgpu_ps void @test_kill_depth_var_x2_same(float %x) #0 {
54  call void @llvm.AMDGPU.kill(float %x)
55  call void @llvm.AMDGPU.kill(float %x)
56  ret void
57}
58
59; CHECK-LABEL: {{^}}test_kill_depth_var_x2:
60; CHECK-NEXT: ; %bb.0:
61; CHECK-NEXT: v_cmpx_le_f32_e32 vcc, 0, v0
62; CHECK-NEXT: ; %bb.1:
63; CHECK-NEXT: v_cmpx_le_f32_e32 vcc, 0, v1
64; CHECK-NEXT: ; %bb.2:
65; CHECK-NEXT: s_endpgm
66define amdgpu_ps void @test_kill_depth_var_x2(float %x, float %y) #0 {
67  call void @llvm.AMDGPU.kill(float %x)
68  call void @llvm.AMDGPU.kill(float %y)
69  ret void
70}
71
72; CHECK-LABEL: {{^}}test_kill_depth_var_x2_instructions:
73; CHECK-NEXT: ; %bb.0:
74; CHECK-NEXT: v_cmpx_le_f32_e32 vcc, 0, v0
75; CHECK-NEXT: s_cbranch_execnz BB6_2
76; CHECK-NEXT: ; %bb.1:
77; CHECK-NEXT: exp
78; CHECK-NEXT: s_endpgm
79; CHECK-NEXT: BB6_2:
80; CHECK: v_mov_b32_e64 v7, -1
81; CHECK: v_cmpx_le_f32_e32 vcc, 0, v7
82; CHECK-NEXT: s_cbranch_execnz BB6_4
83; CHECK-NEXT: ; %bb.3:
84; CHECK-NEXT: exp
85; CHECK-NEXT: s_endpgm
86; CHECK-NEXT: BB6_4:
87; CHECK-NEXT: s_endpgm
88define amdgpu_ps void @test_kill_depth_var_x2_instructions(float %x) #0 {
89  call void @llvm.AMDGPU.kill(float %x)
90  %y = call float asm sideeffect "v_mov_b32_e64 v7, -1", "={v7}"()
91  call void @llvm.AMDGPU.kill(float %y)
92  ret void
93}
94
95; FIXME: why does the skip depend on the asm length in the same block?
96
97; CHECK-LABEL: {{^}}test_kill_control_flow:
98; CHECK: s_cmp_lg_u32 s{{[0-9]+}}, 0
99; CHECK: s_cbranch_scc1 [[RETURN_BB:BB[0-9]+_[0-9]+]]
100
101; CHECK-NEXT: ; %bb.1:
102; CHECK: v_mov_b32_e64 v7, -1
103; CHECK: v_nop_e64
104; CHECK: v_nop_e64
105; CHECK: v_nop_e64
106; CHECK: v_nop_e64
107; CHECK: v_nop_e64
108; CHECK: v_nop_e64
109; CHECK: v_nop_e64
110; CHECK: v_nop_e64
111; CHECK: v_nop_e64
112; CHECK: v_nop_e64
113
114; CHECK: v_cmpx_le_f32_e32 vcc, 0, v7
115; CHECK-NEXT: s_cbranch_execnz [[SPLIT_BB:BB[0-9]+_[0-9]+]]
116; CHECK-NEXT: ; %bb.2:
117; CHECK-NEXT: exp null off, off, off, off done vm
118; CHECK-NEXT: s_endpgm
119
120; CHECK-NEXT: {{^}}[[SPLIT_BB]]:
121; CHECK-NEXT: s_endpgm
122define amdgpu_ps void @test_kill_control_flow(i32 inreg %arg) #0 {
123entry:
124  %cmp = icmp eq i32 %arg, 0
125  br i1 %cmp, label %bb, label %exit
126
127bb:
128  %var = call float asm sideeffect "
129    v_mov_b32_e64 v7, -1
130    v_nop_e64
131    v_nop_e64
132    v_nop_e64
133    v_nop_e64
134    v_nop_e64
135    v_nop_e64
136    v_nop_e64
137    v_nop_e64
138    v_nop_e64
139    v_nop_e64", "={v7}"()
140  call void @llvm.AMDGPU.kill(float %var)
141  br label %exit
142
143exit:
144  ret void
145}
146
147; CHECK-LABEL: {{^}}test_kill_control_flow_remainder:
148; CHECK: s_cmp_lg_u32 s{{[0-9]+}}, 0
149; CHECK-NEXT: v_mov_b32_e32 v{{[0-9]+}}, 0
150; CHECK-NEXT: s_cbranch_scc1 [[RETURN_BB:BB[0-9]+_[0-9]+]]
151
152; CHECK-NEXT: ; %bb.1: ; %bb
153; CHECK: v_mov_b32_e64 v7, -1
154; CHECK: v_nop_e64
155; CHECK: v_nop_e64
156; CHECK: v_nop_e64
157; CHECK: v_nop_e64
158; CHECK: v_nop_e64
159; CHECK: v_nop_e64
160; CHECK: v_nop_e64
161; CHECK: v_nop_e64
162; CHECK: ;;#ASMEND
163; CHECK: v_mov_b32_e64 v8, -1
164; CHECK: ;;#ASMEND
165; CHECK: v_cmpx_le_f32_e32 vcc, 0, v7
166; CHECK-NEXT: s_cbranch_execnz [[SPLIT_BB:BB[0-9]+_[0-9]+]]
167
168; CHECK-NEXT: ; %bb.2:
169; CHECK-NEXT: exp null off, off, off, off done vm
170; CHECK-NEXT: s_endpgm
171
172; CHECK-NEXT: {{^}}[[SPLIT_BB]]:
173; CHECK: buffer_store_dword v8
174; CHECK: v_mov_b32_e64 v9, -2
175
176; CHECK: {{^}}BB{{[0-9]+_[0-9]+}}:
177; CHECK: buffer_store_dword v9
178; CHECK-NEXT: s_endpgm
179define amdgpu_ps void @test_kill_control_flow_remainder(i32 inreg %arg) #0 {
180entry:
181  %cmp = icmp eq i32 %arg, 0
182  br i1 %cmp, label %bb, label %exit
183
184bb:
185  %var = call float asm sideeffect "
186    v_mov_b32_e64 v7, -1
187    v_nop_e64
188    v_nop_e64
189    v_nop_e64
190    v_nop_e64
191    v_nop_e64
192    v_nop_e64
193    v_nop_e64
194    v_nop_e64
195    v_nop_e64
196    v_nop_e64
197    v_nop_e64", "={v7}"()
198  %live.across = call float asm sideeffect "v_mov_b32_e64 v8, -1", "={v8}"()
199  call void @llvm.AMDGPU.kill(float %var)
200  store volatile float %live.across, float addrspace(1)* undef
201  %live.out = call float asm sideeffect "v_mov_b32_e64 v9, -2", "={v9}"()
202  br label %exit
203
204exit:
205  %phi = phi float [ 0.0, %entry ], [ %live.out, %bb ]
206  store float %phi, float addrspace(1)* undef
207  ret void
208}
209
210; CHECK-LABEL: {{^}}test_kill_divergent_loop:
211; CHECK: v_cmp_eq_u32_e32 vcc, 0, v0
212; CHECK-NEXT: s_and_saveexec_b64 [[SAVEEXEC:s\[[0-9]+:[0-9]+\]]], vcc
213; CHECK-NEXT: s_xor_b64 [[SAVEEXEC]], exec, [[SAVEEXEC]]
214; CHECK-NEXT: ; mask branch [[EXIT:BB[0-9]+_[0-9]+]]
215; CHECK-NEXT: s_cbranch_execz [[EXIT]]
216
217; CHECK: {{BB[0-9]+_[0-9]+}}: ; %bb.preheader
218; CHECK: s_mov_b32
219
220; CHECK: [[LOOP_BB:BB[0-9]+_[0-9]+]]:
221
222; CHECK: v_mov_b32_e64 v7, -1
223; CHECK: v_nop_e64
224; CHECK: v_cmpx_le_f32_e32 vcc, 0, v7
225
226; CHECK-NEXT: ; %bb.3:
227; CHECK: buffer_load_dword [[LOAD:v[0-9]+]]
228; CHECK: v_cmp_eq_u32_e32 vcc, 0, [[LOAD]]
229; CHECK-NEXT: s_and_b64 vcc, exec, vcc
230; CHECK-NEXT: s_cbranch_vccnz [[LOOP_BB]]
231
232; CHECK-NEXT: {{^}}[[EXIT]]:
233; CHECK: s_or_b64 exec, exec, [[SAVEEXEC]]
234; CHECK: buffer_store_dword
235; CHECK: s_endpgm
236define amdgpu_ps void @test_kill_divergent_loop(i32 %arg) #0 {
237entry:
238  %cmp = icmp eq i32 %arg, 0
239  br i1 %cmp, label %bb, label %exit
240
241bb:
242  %var = call float asm sideeffect "
243    v_mov_b32_e64 v7, -1
244    v_nop_e64
245    v_nop_e64
246    v_nop_e64
247    v_nop_e64
248    v_nop_e64
249    v_nop_e64
250    v_nop_e64
251    v_nop_e64
252    v_nop_e64
253    v_nop_e64", "={v7}"()
254  call void @llvm.AMDGPU.kill(float %var)
255  %vgpr = load volatile i32, i32 addrspace(1)* undef
256  %loop.cond = icmp eq i32 %vgpr, 0
257  br i1 %loop.cond, label %bb, label %exit
258
259exit:
260  store volatile i32 8, i32 addrspace(1)* undef
261  ret void
262}
263
264; bug 28550
265; CHECK-LABEL: {{^}}phi_use_def_before_kill:
266; CHECK: v_cndmask_b32_e64 [[PHIREG:v[0-9]+]], 0, -1.0,
267; CHECK: v_cmpx_le_f32_e32 vcc, 0,
268; CHECK-NEXT: s_cbranch_execnz [[BB4:BB[0-9]+_[0-9]+]]
269
270; CHECK: exp
271; CHECK-NEXT: s_endpgm
272
273; CHECK: [[KILLBB:BB[0-9]+_[0-9]+]]:
274; CHECK-NEXT: s_cbranch_scc0 [[PHIBB:BB[0-9]+_[0-9]+]]
275
276; CHECK: [[PHIBB]]:
277; CHECK: v_cmp_eq_f32_e32 vcc, 0, [[PHIREG]]
278; CHECK: s_cbranch_vccz [[ENDBB:BB[0-9]+_[0-9]+]]
279
280; CHECK: ; %bb10
281; CHECK: v_mov_b32_e32 v{{[0-9]+}}, 9
282; CHECK: buffer_store_dword
283
284; CHECK: [[ENDBB]]:
285; CHECK-NEXT: s_endpgm
286define amdgpu_ps void @phi_use_def_before_kill(float inreg %x) #0 {
287bb:
288  %tmp = fadd float %x, 1.000000e+00
289  %tmp1 = fcmp olt float 0.000000e+00, %tmp
290  %tmp2 = select i1 %tmp1, float -1.000000e+00, float 0.000000e+00
291  call void @llvm.AMDGPU.kill(float %tmp2)
292  br i1 undef, label %phibb, label %bb8
293
294phibb:
295  %tmp5 = phi float [ %tmp2, %bb ], [ 4.0, %bb8 ]
296  %tmp6 = fcmp oeq float %tmp5, 0.000000e+00
297  br i1 %tmp6, label %bb10, label %end
298
299bb8:
300  store volatile i32 8, i32 addrspace(1)* undef
301  br label %phibb
302
303bb10:
304  store volatile i32 9, i32 addrspace(1)* undef
305  br label %end
306
307end:
308  ret void
309}
310
311; CHECK-LABEL: {{^}}no_skip_no_successors:
312; CHECK: v_cmp_nge_f32
313; CHECK: s_cbranch_vccz [[SKIPKILL:BB[0-9]+_[0-9]+]]
314
315; CHECK: ; %bb6
316; CHECK: s_mov_b64 exec, 0
317
318; CHECK: [[SKIPKILL]]:
319; CHECK: v_cmp_nge_f32_e32 vcc
320; CHECK: %bb.3: ; %bb5
321; CHECK-NEXT: .Lfunc_end{{[0-9]+}}
322define amdgpu_ps void @no_skip_no_successors(float inreg %arg, float inreg %arg1) #0 {
323bb:
324  %tmp = fcmp ult float %arg1, 0.000000e+00
325  %tmp2 = fcmp ult float %arg, 0x3FCF5C2900000000
326  br i1 %tmp, label %bb6, label %bb3
327
328bb3:                                              ; preds = %bb
329  br i1 %tmp2, label %bb5, label %bb4
330
331bb4:                                              ; preds = %bb3
332  br i1 true, label %bb5, label %bb7
333
334bb5:                                              ; preds = %bb4, %bb3
335  unreachable
336
337bb6:                                              ; preds = %bb
338  call void @llvm.AMDGPU.kill(float -1.000000e+00)
339  unreachable
340
341bb7:                                              ; preds = %bb4
342  ret void
343}
344
345; CHECK-LABEL: {{^}}if_after_kill_block:
346; CHECK: ; %bb.0:
347; CHECK: s_and_saveexec_b64
348; CHECK: s_xor_b64
349; CHECK-NEXT: mask branch [[BB4:BB[0-9]+_[0-9]+]]
350
351; CHECK: v_cmpx_le_f32_e32 vcc, 0,
352; CHECK: [[BB4]]:
353; CHECK: s_or_b64 exec, exec
354; CHECK: image_sample_c
355
356; CHECK: v_cmp_neq_f32_e32 vcc, 0,
357; CHECK: s_and_saveexec_b64 s{{\[[0-9]+:[0-9]+\]}}, vcc
358; CHECK: mask branch [[END:BB[0-9]+_[0-9]+]]
359; CHECK-NOT: branch
360
361; CHECK: BB{{[0-9]+_[0-9]+}}: ; %bb8
362; CHECK: buffer_store_dword
363
364; CHECK: [[END]]:
365; CHECK: s_endpgm
366define amdgpu_ps void @if_after_kill_block(float %arg, float %arg1, float %arg2, float %arg3) #0 {
367bb:
368  %tmp = fcmp ult float %arg1, 0.000000e+00
369  br i1 %tmp, label %bb3, label %bb4
370
371bb3:                                              ; preds = %bb
372  call void @llvm.AMDGPU.kill(float %arg)
373  br label %bb4
374
375bb4:                                              ; preds = %bb3, %bb
376  %tmp5 = call <4 x float> @llvm.amdgcn.image.sample.c.1d.v4f32.f32(i32 16, float %arg2, float %arg3, <8 x i32> undef, <4 x i32> undef, i1 0, i32 0, i32 0)
377  %tmp6 = extractelement <4 x float> %tmp5, i32 0
378  %tmp7 = fcmp une float %tmp6, 0.000000e+00
379  br i1 %tmp7, label %bb8, label %bb9
380
381bb8:                                              ; preds = %bb9, %bb4
382  store volatile i32 9, i32 addrspace(1)* undef
383  ret void
384
385bb9:                                              ; preds = %bb4
386  ret void
387}
388
389declare <4 x float> @llvm.amdgcn.image.sample.c.1d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
390declare void @llvm.AMDGPU.kill(float) #0
391
392attributes #0 = { nounwind }
393attributes #1 = { nounwind readonly }
394