; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI %s
; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck --check-prefix=FLAT %s

; uniform_kill: amdgpu_ps function in which llvm.amdgcn.kill is fed by the
; result of llvm.amdgcn.wqm.vote. The expected output below shows the kill
; being applied to a saved copy of exec (s_andn2_b64), followed by an
; s_cbranch_scc0 to an early-exit block that clears exec, issues a null
; export and ends the program. The same sequence is expected for both
; processors named in the run lines; the only difference in the expected
; text is the s_waitcnt that follows the buffer atomic.
define amdgpu_ps float @uniform_kill(float %a, i32 %b, float %c) {
; SI-LABEL: uniform_kill:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    v_cvt_i32_f32_e32 v0, v0
; SI-NEXT:    s_mov_b64 s[0:1], exec
; SI-NEXT:    s_mov_b64 s[2:3], -1
; SI-NEXT:    v_or_b32_e32 v0, v1, v0
; SI-NEXT:    v_and_b32_e32 v0, 1, v0
; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
; SI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
; SI-NEXT:  ; %bb.1: ; %if1
; SI-NEXT:    s_xor_b64 s[2:3], exec, -1
; SI-NEXT:  ; %bb.2: ; %endif1
; SI-NEXT:    s_or_b64 exec, exec, s[4:5]
; SI-NEXT:    s_wqm_b64 s[4:5], s[2:3]
; SI-NEXT:    s_xor_b64 s[4:5], s[4:5], exec
; SI-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[4:5]
; SI-NEXT:    s_cbranch_scc0 BB0_6
; SI-NEXT:  ; %bb.3: ; %endif1
; SI-NEXT:    s_and_b64 exec, exec, s[0:1]
; SI-NEXT:    v_mov_b32_e32 v0, 0
; SI-NEXT:    s_and_saveexec_b64 s[0:1], s[2:3]
; SI-NEXT:    s_cbranch_execz BB0_5
; SI-NEXT:  ; %bb.4: ; %if2
; SI-NEXT:    s_mov_b32 s3, 0
; SI-NEXT:    v_add_f32_e32 v0, 1.0, v2
; SI-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
; SI-NEXT:    v_cvt_i32_f32_e32 v0, v0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0 offset:4 glc
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT:    v_cvt_f32_i32_e32 v0, v0
; SI-NEXT:  BB0_5: ; %endif2
; SI-NEXT:    s_or_b64 exec, exec, s[0:1]
; SI-NEXT:    s_branch BB0_7
; SI-NEXT:  BB0_6:
; SI-NEXT:    s_mov_b64 exec, 0
; SI-NEXT:    exp null off, off, off, off done vm
; SI-NEXT:    s_endpgm
; SI-NEXT:  BB0_7:
;
; FLAT-LABEL: uniform_kill:
; FLAT:       ; %bb.0: ; %entry
; FLAT-NEXT:    v_cvt_i32_f32_e32 v0, v0
; FLAT-NEXT:    s_mov_b64 s[0:1], exec
; FLAT-NEXT:    s_mov_b64 s[2:3], -1
; FLAT-NEXT:    v_or_b32_e32 v0, v1, v0
; FLAT-NEXT:    v_and_b32_e32 v0, 1, v0
; FLAT-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
; FLAT-NEXT:    s_and_saveexec_b64 s[4:5], vcc
; FLAT-NEXT:  ; %bb.1: ; %if1
; FLAT-NEXT:    s_xor_b64 s[2:3], exec, -1
; FLAT-NEXT:  ; %bb.2: ; %endif1
; FLAT-NEXT:    s_or_b64 exec, exec, s[4:5]
; FLAT-NEXT:    s_wqm_b64 s[4:5], s[2:3]
; FLAT-NEXT:    s_xor_b64 s[4:5], s[4:5], exec
; FLAT-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[4:5]
; FLAT-NEXT:    s_cbranch_scc0 BB0_6
; FLAT-NEXT:  ; %bb.3: ; %endif1
; FLAT-NEXT:    s_and_b64 exec, exec, s[0:1]
; FLAT-NEXT:    v_mov_b32_e32 v0, 0
; FLAT-NEXT:    s_and_saveexec_b64 s[0:1], s[2:3]
; FLAT-NEXT:    s_cbranch_execz BB0_5
; FLAT-NEXT:  ; %bb.4: ; %if2
; FLAT-NEXT:    s_mov_b32 s3, 0
; FLAT-NEXT:    v_add_f32_e32 v0, 1.0, v2
; FLAT-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
; FLAT-NEXT:    v_cvt_i32_f32_e32 v0, v0
; FLAT-NEXT:    s_waitcnt lgkmcnt(0)
; FLAT-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0 offset:4 glc
; FLAT-NEXT:    s_waitcnt vmcnt(0)
; FLAT-NEXT:    v_cvt_f32_i32_e32 v0, v0
; FLAT-NEXT:  BB0_5: ; %endif2
; FLAT-NEXT:    s_or_b64 exec, exec, s[0:1]
; FLAT-NEXT:    s_branch BB0_7
; FLAT-NEXT:  BB0_6:
; FLAT-NEXT:    s_mov_b64 exec, 0
; FLAT-NEXT:    exp null off, off, off, off done vm
; FLAT-NEXT:    s_endpgm
; FLAT-NEXT:  BB0_7:
entry:
  ; Branch on bit 0 of (%b | fptosi(%a)): when the bit is clear control goes
  ; straight to %endif1, otherwise through %if1.
  %.1 = fptosi float %a to i32
  %.2 = or i32 %b, %.1
  %.3 = and i32 %.2, 1
  %.not = icmp eq i32 %.3, 0
  br i1 %.not, label %endif1, label %if1

if1:
  ; The condition is the constant false, so control always continues to
  ; %endif1 and %if3 is never entered.
  br i1 false, label %if3, label %endif1

if3:
  br label %endif1

endif1:
  ; %.0 is true only when control arrives directly from %entry.
  %.0 = phi i1 [ false, %if3 ], [ false, %if1 ], [ true, %entry ]
  %.4 = call i1 @llvm.amdgcn.wqm.vote(i1 %.0)
  ; This kill must be uniformly executed
  call void @llvm.amdgcn.kill(i1 %.4)
  ; Computed before the branch; only consumed in %if2.
  %.test0 = fadd nsz arcp float %c, 1.0
  %.test1 = fptosi float %.test0 to i32
  br i1 %.0, label %if2, label %endif2

if2:
  ; The <4 x i32> operand of the atomic is loaded through an undef base
  ; pointer (element index 31); the address is tagged !amdgpu.uniform and
  ; the load !invariant.load.
  %.5 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(6)* undef, i32 31, !amdgpu.uniform !0
  %.6 = load <4 x i32>, <4 x i32> addrspace(6)* %.5, align 16, !invariant.load !0
  %.7 = call i32 @llvm.amdgcn.raw.buffer.atomic.swap.i32(i32 %.test1, <4 x i32> %.6, i32 4, i32 0, i32 0)
  %.8 = sitofp i32 %.7 to float
  br label %endif2

endif2:
  ; Returns the converted atomic result when %if2 ran, otherwise 0.0.
  %.9 = phi float [ %.8, %if2 ], [ 0.0, %endif1 ]
  ret float %.9
}


; Intrinsics referenced by this test. @llvm.amdgcn.wqm.f32 is declared but is
; not called by uniform_kill.
declare i32 @llvm.amdgcn.raw.buffer.atomic.swap.i32(i32, <4 x i32>, i32, i32, i32 immarg) #2
declare i1 @llvm.amdgcn.wqm.vote(i1) #3
declare void @llvm.amdgcn.kill(i1) #4
declare float @llvm.amdgcn.wqm.f32(float) #1

; Attribute groups for the declarations above. Only the wqm.vote group (#3)
; is marked convergent.
attributes #1 = { nounwind readnone speculatable willreturn }
attributes #2 = { nounwind willreturn }
attributes #3 = { convergent nounwind readnone willreturn }
attributes #4 = { nounwind }

; Empty metadata node used as the operand of !amdgpu.uniform and
; !invariant.load in uniform_kill.
!0 = !{}
