1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck %s -enable-var-scope --check-prefix=SI
3; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s -enable-var-scope --check-prefix=VI
4; RUN: llc < %s -march=r600 -mcpu=cypress -verify-machineinstrs | FileCheck %s -enable-var-scope --check-prefix=EG
5; RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s -enable-var-scope --check-prefix=GFX10
6
; Declarations of the count-leading-zeros intrinsic overloads. The second (i1)
; operand is the zero-handling flag; the calls visible in this part of the file
; all pass false.
; Sub-dword scalar overloads.
7declare i7 @llvm.ctlz.i7(i7, i1) nounwind readnone
8declare i8 @llvm.ctlz.i8(i8, i1) nounwind readnone
9declare i16 @llvm.ctlz.i16(i16, i1) nounwind readnone
10
; 32-bit scalar and vector overloads.
11declare i32 @llvm.ctlz.i32(i32, i1) nounwind readnone
12declare <2 x i32> @llvm.ctlz.v2i32(<2 x i32>, i1) nounwind readnone
13declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1) nounwind readnone
14
; 64-bit scalar and vector overloads.
15declare i64 @llvm.ctlz.i64(i64, i1) nounwind readnone
16declare <2 x i64> @llvm.ctlz.v2i64(<2 x i64>, i1) nounwind readnone
17declare <4 x i64> @llvm.ctlz.v4i64(<4 x i64>, i1) nounwind readnone
18
; Work-item id, used by the v_* tests to index the input (and sometimes output) buffer.
19declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
20
; s_ctlz_i32: ctlz of an i32 that is passed directly as a kernel argument; the
; result is stored to %out.
; In the checks below the count is produced by s_flbit_i32_b32 (FFBH_UINT for
; r600) and a compare/select substitutes 32 when the input is 0.
21define amdgpu_kernel void @s_ctlz_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
22; SI-LABEL: s_ctlz_i32:
23; SI:       ; %bb.0:
24; SI-NEXT:    s_load_dword s4, s[0:1], 0xb
25; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
26; SI-NEXT:    s_mov_b32 s3, 0xf000
27; SI-NEXT:    s_waitcnt lgkmcnt(0)
28; SI-NEXT:    s_flbit_i32_b32 s5, s4
29; SI-NEXT:    s_mov_b32 s2, -1
30; SI-NEXT:    v_mov_b32_e32 v0, s5
31; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 0
32; SI-NEXT:    v_cndmask_b32_e32 v0, 32, v0, vcc
33; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
34; SI-NEXT:    s_endpgm
35;
36; VI-LABEL: s_ctlz_i32:
37; VI:       ; %bb.0:
38; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
39; VI-NEXT:    s_load_dword s0, s[0:1], 0x2c
40; VI-NEXT:    s_mov_b32 s7, 0xf000
41; VI-NEXT:    s_mov_b32 s6, -1
42; VI-NEXT:    s_waitcnt lgkmcnt(0)
43; VI-NEXT:    s_flbit_i32_b32 s1, s0
44; VI-NEXT:    s_cmp_lg_u32 s0, 0
45; VI-NEXT:    s_cselect_b32 s0, s1, 32
46; VI-NEXT:    v_mov_b32_e32 v0, s0
47; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
48; VI-NEXT:    s_endpgm
49;
50; EG-LABEL: s_ctlz_i32:
51; EG:       ; %bb.0:
52; EG-NEXT:    ALU 3, @4, KC0[CB0:0-32], KC1[]
53; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
54; EG-NEXT:    CF_END
55; EG-NEXT:    PAD
56; EG-NEXT:    ALU clause starting at 4:
57; EG-NEXT:     FFBH_UINT * T0.W, KC0[2].Z,
58; EG-NEXT:     CNDE_INT T0.X, KC0[2].Z, literal.x, PV.W,
59; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
60; EG-NEXT:    32(4.484155e-44), 2(2.802597e-45)
61;
62; GFX10-LABEL: s_ctlz_i32:
63; GFX10:       ; %bb.0:
64; GFX10-NEXT:    s_clause 0x1
65; GFX10-NEXT:    s_load_dword s4, s[0:1], 0x2c
66; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
67; GFX10-NEXT:    v_mov_b32_e32 v0, 0
68; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
69; GFX10-NEXT:    s_flbit_i32_b32 s0, s4
70; GFX10-NEXT:    s_cmp_lg_u32 s4, 0
71; GFX10-NEXT:    s_cselect_b32 s0, s0, 32
72; GFX10-NEXT:    v_mov_b32_e32 v1, s0
73; GFX10-NEXT:    global_store_dword v0, v1, s[2:3]
74; GFX10-NEXT:    s_endpgm
; The flag operand is false, so the zero-input result has to be materialized
; (the select of 32 in the checks above).
75  %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
76  store i32 %ctlz, i32 addrspace(1)* %out, align 4
77  ret void
78}
79
; v_ctlz_i32: ctlz of an i32 loaded from %valptr at the index given by the
; work-item id; the result is stored to %out (not indexed by the work-item id).
; In the checks below the count is produced by v_ffbh_u32 (FFBH_UINT for r600)
; followed by a compare against 0 and a select of 32.
80define amdgpu_kernel void @v_ctlz_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
81; SI-LABEL: v_ctlz_i32:
82; SI:       ; %bb.0:
83; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
84; SI-NEXT:    s_mov_b32 s3, 0xf000
85; SI-NEXT:    s_mov_b32 s6, 0
86; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
87; SI-NEXT:    v_mov_b32_e32 v1, 0
88; SI-NEXT:    s_mov_b32 s7, s3
89; SI-NEXT:    s_waitcnt lgkmcnt(0)
90; SI-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
91; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
92; SI-NEXT:    s_mov_b32 s2, -1
93; SI-NEXT:    s_waitcnt vmcnt(0)
94; SI-NEXT:    v_ffbh_u32_e32 v1, v0
95; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
96; SI-NEXT:    v_cndmask_b32_e32 v0, 32, v1, vcc
97; SI-NEXT:    s_waitcnt lgkmcnt(0)
98; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
99; SI-NEXT:    s_endpgm
100;
101; VI-LABEL: v_ctlz_i32:
102; VI:       ; %bb.0:
103; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
104; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
105; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
106; VI-NEXT:    s_mov_b32 s7, 0xf000
107; VI-NEXT:    s_mov_b32 s6, -1
108; VI-NEXT:    s_waitcnt lgkmcnt(0)
109; VI-NEXT:    v_mov_b32_e32 v1, s1
110; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
111; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
112; VI-NEXT:    flat_load_dword v0, v[0:1]
113; VI-NEXT:    s_waitcnt vmcnt(0)
114; VI-NEXT:    v_ffbh_u32_e32 v1, v0
115; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
116; VI-NEXT:    v_cndmask_b32_e32 v0, 32, v1, vcc
117; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
118; VI-NEXT:    s_endpgm
119;
120; EG-LABEL: v_ctlz_i32:
121; EG:       ; %bb.0:
122; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
123; EG-NEXT:    TEX 0 @6
124; EG-NEXT:    ALU 3, @11, KC0[CB0:0-32], KC1[]
125; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
126; EG-NEXT:    CF_END
127; EG-NEXT:    PAD
128; EG-NEXT:    Fetch clause starting at 6:
129; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
130; EG-NEXT:    ALU clause starting at 8:
131; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
132; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
133; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
134; EG-NEXT:    ALU clause starting at 11:
135; EG-NEXT:     FFBH_UINT * T0.W, T0.X,
136; EG-NEXT:     CNDE_INT T0.X, T0.X, literal.x, PV.W,
137; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
138; EG-NEXT:    32(4.484155e-44), 2(2.802597e-45)
139;
140; GFX10-LABEL: v_ctlz_i32:
141; GFX10:       ; %bb.0:
142; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
143; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
144; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
145; GFX10-NEXT:    v_mov_b32_e32 v2, 0
146; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
147; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
148; GFX10-NEXT:    s_waitcnt vmcnt(0)
149; GFX10-NEXT:    v_ffbh_u32_e32 v1, v0
150; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
151; GFX10-NEXT:    v_cndmask_b32_e32 v0, 32, v1, vcc_lo
152; GFX10-NEXT:    global_store_dword v2, v0, s[0:1]
153; GFX10-NEXT:    s_endpgm
; The input is addressed per work-item, so the operand of ctlz is a loaded value
; rather than a kernel argument.
154  %tid = call i32 @llvm.amdgcn.workitem.id.x()
155  %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
156  %val = load i32, i32 addrspace(1)* %in.gep, align 4
157  %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
158  store i32 %ctlz, i32 addrspace(1)* %out, align 4
159  ret void
160}
161
; v_ctlz_v2i32: ctlz of a <2 x i32> loaded from %valptr at the index given by the
; work-item id; the vector result is stored to %out.
; The checks below contain one count / compare-with-0 / select-32 sequence per
; element, i.e. two in total.
162define amdgpu_kernel void @v_ctlz_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind {
163; SI-LABEL: v_ctlz_v2i32:
164; SI:       ; %bb.0:
165; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
166; SI-NEXT:    s_mov_b32 s3, 0xf000
167; SI-NEXT:    s_mov_b32 s6, 0
168; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
169; SI-NEXT:    v_mov_b32_e32 v1, 0
170; SI-NEXT:    s_mov_b32 s7, s3
171; SI-NEXT:    s_waitcnt lgkmcnt(0)
172; SI-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
173; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
174; SI-NEXT:    s_mov_b32 s2, -1
175; SI-NEXT:    s_waitcnt vmcnt(0)
176; SI-NEXT:    v_ffbh_u32_e32 v2, v1
177; SI-NEXT:    v_ffbh_u32_e32 v3, v0
178; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
179; SI-NEXT:    v_cndmask_b32_e32 v1, 32, v2, vcc
180; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
181; SI-NEXT:    v_cndmask_b32_e32 v0, 32, v3, vcc
182; SI-NEXT:    s_waitcnt lgkmcnt(0)
183; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
184; SI-NEXT:    s_endpgm
185;
186; VI-LABEL: v_ctlz_v2i32:
187; VI:       ; %bb.0:
188; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
189; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
190; VI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
191; VI-NEXT:    s_mov_b32 s7, 0xf000
192; VI-NEXT:    s_mov_b32 s6, -1
193; VI-NEXT:    s_waitcnt lgkmcnt(0)
194; VI-NEXT:    v_mov_b32_e32 v1, s1
195; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
196; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
197; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
198; VI-NEXT:    s_waitcnt vmcnt(0)
199; VI-NEXT:    v_ffbh_u32_e32 v2, v1
200; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
201; VI-NEXT:    v_cndmask_b32_e32 v1, 32, v2, vcc
202; VI-NEXT:    v_ffbh_u32_e32 v3, v0
203; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
204; VI-NEXT:    v_cndmask_b32_e32 v0, 32, v3, vcc
205; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
206; VI-NEXT:    s_endpgm
207;
208; EG-LABEL: v_ctlz_v2i32:
209; EG:       ; %bb.0:
210; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
211; EG-NEXT:    TEX 0 @6
212; EG-NEXT:    ALU 6, @11, KC0[CB0:0-32], KC1[]
213; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
214; EG-NEXT:    CF_END
215; EG-NEXT:    PAD
216; EG-NEXT:    Fetch clause starting at 6:
217; EG-NEXT:     VTX_READ_64 T0.XY, T0.X, 0, #1
218; EG-NEXT:    ALU clause starting at 8:
219; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
220; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
221; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
222; EG-NEXT:    ALU clause starting at 11:
223; EG-NEXT:     FFBH_UINT * T0.W, T0.Y,
224; EG-NEXT:     CNDE_INT T0.Y, T0.Y, literal.x, PV.W,
225; EG-NEXT:     FFBH_UINT * T0.W, T0.X,
226; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
227; EG-NEXT:     CNDE_INT T0.X, T0.X, literal.x, PV.W,
228; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
229; EG-NEXT:    32(4.484155e-44), 2(2.802597e-45)
230;
231; GFX10-LABEL: v_ctlz_v2i32:
232; GFX10:       ; %bb.0:
233; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
234; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
235; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
236; GFX10-NEXT:    v_mov_b32_e32 v4, 0
237; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
238; GFX10-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3]
239; GFX10-NEXT:    s_waitcnt vmcnt(0)
240; GFX10-NEXT:    v_ffbh_u32_e32 v2, v1
241; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v1
242; GFX10-NEXT:    v_ffbh_u32_e32 v3, v0
243; GFX10-NEXT:    v_cndmask_b32_e32 v1, 32, v2, vcc_lo
244; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
245; GFX10-NEXT:    v_cndmask_b32_e32 v0, 32, v3, vcc_lo
246; GFX10-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
247; GFX10-NEXT:    s_endpgm
; One <2 x i32> element per work-item is read; the store target is %out itself.
248  %tid = call i32 @llvm.amdgcn.workitem.id.x()
249  %in.gep = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %valptr, i32 %tid
250  %val = load <2 x i32>, <2 x i32> addrspace(1)* %in.gep, align 8
251  %ctlz = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %val, i1 false) nounwind readnone
252  store <2 x i32> %ctlz, <2 x i32> addrspace(1)* %out, align 8
253  ret void
254}
255
; v_ctlz_v4i32: ctlz of a <4 x i32> loaded from %valptr at the index given by the
; work-item id; the vector result is stored to %out.
; The checks below contain one count / compare-with-0 / select-32 sequence per
; element, i.e. four in total.
256define amdgpu_kernel void @v_ctlz_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind {
257; SI-LABEL: v_ctlz_v4i32:
258; SI:       ; %bb.0:
259; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
260; SI-NEXT:    s_mov_b32 s3, 0xf000
261; SI-NEXT:    s_mov_b32 s6, 0
262; SI-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
263; SI-NEXT:    v_mov_b32_e32 v1, 0
264; SI-NEXT:    s_mov_b32 s7, s3
265; SI-NEXT:    s_waitcnt lgkmcnt(0)
266; SI-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
267; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
268; SI-NEXT:    s_mov_b32 s2, -1
269; SI-NEXT:    s_waitcnt vmcnt(0)
270; SI-NEXT:    v_ffbh_u32_e32 v4, v3
271; SI-NEXT:    v_ffbh_u32_e32 v5, v2
272; SI-NEXT:    v_ffbh_u32_e32 v6, v1
273; SI-NEXT:    v_ffbh_u32_e32 v7, v0
274; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
275; SI-NEXT:    v_cndmask_b32_e32 v3, 32, v4, vcc
276; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
277; SI-NEXT:    v_cndmask_b32_e32 v2, 32, v5, vcc
278; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
279; SI-NEXT:    v_cndmask_b32_e32 v1, 32, v6, vcc
280; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
281; SI-NEXT:    v_cndmask_b32_e32 v0, 32, v7, vcc
282; SI-NEXT:    s_waitcnt lgkmcnt(0)
283; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
284; SI-NEXT:    s_endpgm
285;
286; VI-LABEL: v_ctlz_v4i32:
287; VI:       ; %bb.0:
288; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
289; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
290; VI-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
291; VI-NEXT:    s_mov_b32 s7, 0xf000
292; VI-NEXT:    s_mov_b32 s6, -1
293; VI-NEXT:    s_waitcnt lgkmcnt(0)
294; VI-NEXT:    v_mov_b32_e32 v1, s1
295; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
296; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
297; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
298; VI-NEXT:    s_waitcnt vmcnt(0)
299; VI-NEXT:    v_ffbh_u32_e32 v4, v3
300; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
301; VI-NEXT:    v_cndmask_b32_e32 v3, 32, v4, vcc
302; VI-NEXT:    v_ffbh_u32_e32 v5, v2
303; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
304; VI-NEXT:    v_cndmask_b32_e32 v2, 32, v5, vcc
305; VI-NEXT:    v_ffbh_u32_e32 v6, v1
306; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
307; VI-NEXT:    v_cndmask_b32_e32 v1, 32, v6, vcc
308; VI-NEXT:    v_ffbh_u32_e32 v7, v0
309; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
310; VI-NEXT:    v_cndmask_b32_e32 v0, 32, v7, vcc
311; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
312; VI-NEXT:    s_endpgm
313;
314; EG-LABEL: v_ctlz_v4i32:
315; EG:       ; %bb.0:
316; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
317; EG-NEXT:    TEX 0 @6
318; EG-NEXT:    ALU 12, @11, KC0[CB0:0-32], KC1[]
319; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
320; EG-NEXT:    CF_END
321; EG-NEXT:    PAD
322; EG-NEXT:    Fetch clause starting at 6:
323; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
324; EG-NEXT:    ALU clause starting at 8:
325; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
326; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
327; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
328; EG-NEXT:    ALU clause starting at 11:
329; EG-NEXT:     FFBH_UINT * T1.W, T0.W,
330; EG-NEXT:     FFBH_UINT T2.W, T0.Z,
331; EG-NEXT:     CNDE_INT * T0.W, T0.W, literal.x, PV.W, BS:VEC_021/SCL_122
332; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
333; EG-NEXT:     CNDE_INT T0.Z, T0.Z, literal.x, PV.W,
334; EG-NEXT:     FFBH_UINT * T1.W, T0.Y,
335; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
336; EG-NEXT:     CNDE_INT T0.Y, T0.Y, literal.x, PV.W,
337; EG-NEXT:     FFBH_UINT * T1.W, T0.X,
338; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
339; EG-NEXT:     CNDE_INT T0.X, T0.X, literal.x, PV.W,
340; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
341; EG-NEXT:    32(4.484155e-44), 2(2.802597e-45)
342;
343; GFX10-LABEL: v_ctlz_v4i32:
344; GFX10:       ; %bb.0:
345; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
346; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
347; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
348; GFX10-NEXT:    v_mov_b32_e32 v4, 0
349; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
350; GFX10-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3]
351; GFX10-NEXT:    s_waitcnt vmcnt(0)
352; GFX10-NEXT:    v_ffbh_u32_e32 v5, v3
353; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v3
354; GFX10-NEXT:    v_ffbh_u32_e32 v6, v2
355; GFX10-NEXT:    v_ffbh_u32_e32 v7, v1
356; GFX10-NEXT:    v_ffbh_u32_e32 v8, v0
357; GFX10-NEXT:    v_cndmask_b32_e32 v3, 32, v5, vcc_lo
358; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v2
359; GFX10-NEXT:    v_cndmask_b32_e32 v2, 32, v6, vcc_lo
360; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v1
361; GFX10-NEXT:    v_cndmask_b32_e32 v1, 32, v7, vcc_lo
362; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
363; GFX10-NEXT:    v_cndmask_b32_e32 v0, 32, v8, vcc_lo
364; GFX10-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
365; GFX10-NEXT:    s_endpgm
; One <4 x i32> element per work-item is read; the store target is %out itself.
366  %tid = call i32 @llvm.amdgcn.workitem.id.x()
367  %in.gep = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %valptr, i32 %tid
368  %val = load <4 x i32>, <4 x i32> addrspace(1)* %in.gep, align 16
369  %ctlz = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %val, i1 false) nounwind readnone
370  store <4 x i32> %ctlz, <4 x i32> addrspace(1)* %out, align 16
371  ret void
372}
373
; v_ctlz_i8: ctlz of an i8 loaded from %valptr (no work-item indexing); the i8
; result is stored to %out.
; In the checks below the byte is loaded zero-extended (ubyte load / VTX_READ_8),
; counted with the 32-bit instruction, and the count is then reduced by 24 in
; total: a single subtract/add of 24 in the SI and r600 checks, and -16 followed
; by -8 in the VI and GFX10 checks.
374define amdgpu_kernel void @v_ctlz_i8(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind {
375; SI-LABEL: v_ctlz_i8:
376; SI:       ; %bb.0:
377; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
378; SI-NEXT:    s_mov_b32 s3, 0xf000
379; SI-NEXT:    s_mov_b32 s2, -1
380; SI-NEXT:    s_mov_b32 s6, s2
381; SI-NEXT:    s_mov_b32 s7, s3
382; SI-NEXT:    s_waitcnt lgkmcnt(0)
383; SI-NEXT:    buffer_load_ubyte v0, off, s[4:7], 0
384; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
385; SI-NEXT:    s_waitcnt vmcnt(0)
386; SI-NEXT:    v_ffbh_u32_e32 v1, v0
387; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
388; SI-NEXT:    v_cndmask_b32_e32 v0, 32, v1, vcc
389; SI-NEXT:    v_subrev_i32_e32 v0, vcc, 24, v0
390; SI-NEXT:    s_waitcnt lgkmcnt(0)
391; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
392; SI-NEXT:    s_endpgm
393;
394; VI-LABEL: v_ctlz_i8:
395; VI:       ; %bb.0:
396; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
397; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
398; VI-NEXT:    s_mov_b32 s7, 0xf000
399; VI-NEXT:    s_mov_b32 s6, -1
400; VI-NEXT:    s_mov_b32 s2, s6
401; VI-NEXT:    s_mov_b32 s3, s7
402; VI-NEXT:    s_waitcnt lgkmcnt(0)
403; VI-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0
404; VI-NEXT:    s_waitcnt vmcnt(0)
405; VI-NEXT:    v_ffbh_u32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
406; VI-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v0
407; VI-NEXT:    v_cndmask_b32_e32 v0, 32, v1, vcc
408; VI-NEXT:    v_add_u32_e32 v0, vcc, -16, v0
409; VI-NEXT:    v_add_u16_e32 v0, -8, v0
410; VI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
411; VI-NEXT:    s_endpgm
412;
413; EG-LABEL: v_ctlz_i8:
414; EG:       ; %bb.0:
415; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
416; EG-NEXT:    TEX 0 @6
417; EG-NEXT:    ALU 15, @9, KC0[CB0:0-32], KC1[]
418; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
419; EG-NEXT:    CF_END
420; EG-NEXT:    PAD
421; EG-NEXT:    Fetch clause starting at 6:
422; EG-NEXT:     VTX_READ_8 T0.X, T0.X, 0, #1
423; EG-NEXT:    ALU clause starting at 8:
424; EG-NEXT:     MOV * T0.X, KC0[2].Z,
425; EG-NEXT:    ALU clause starting at 9:
426; EG-NEXT:     FFBH_UINT * T0.W, T0.X,
427; EG-NEXT:     CNDE_INT T0.W, T0.X, literal.x, PV.W,
428; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.y,
429; EG-NEXT:    32(4.484155e-44), 3(4.203895e-45)
430; EG-NEXT:     ADD_INT * T0.W, PV.W, literal.x,
431; EG-NEXT:    -24(nan), 0(0.000000e+00)
432; EG-NEXT:     AND_INT T0.W, PV.W, literal.x,
433; EG-NEXT:     LSHL * T1.W, T1.W, literal.y,
434; EG-NEXT:    255(3.573311e-43), 3(4.203895e-45)
435; EG-NEXT:     LSHL T0.X, PV.W, PS,
436; EG-NEXT:     LSHL * T0.W, literal.x, PS,
437; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
438; EG-NEXT:     MOV T0.Y, 0.0,
439; EG-NEXT:     MOV * T0.Z, 0.0,
440; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
441; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
442;
443; GFX10-LABEL: v_ctlz_i8:
444; GFX10:       ; %bb.0:
445; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
446; GFX10-NEXT:    v_mov_b32_e32 v0, 0
447; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
448; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
449; GFX10-NEXT:    global_load_ubyte v1, v0, s[2:3]
450; GFX10-NEXT:    s_waitcnt vmcnt(0)
451; GFX10-NEXT:    v_ffbh_u32_sdwa v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
452; GFX10-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v1
453; GFX10-NEXT:    v_cndmask_b32_e32 v1, 32, v2, vcc_lo
454; GFX10-NEXT:    v_add_nc_u32_e32 v1, -16, v1
455; GFX10-NEXT:    v_add_nc_u16 v1, v1, -8
456; GFX10-NEXT:    global_store_byte v0, v1, s[0:1]
457; GFX10-NEXT:    s_endpgm
; Unlike the other v_* tests in view, the load here uses %valptr directly and
; carries no explicit alignment.
458  %val = load i8, i8 addrspace(1)* %valptr
459  %ctlz = call i8 @llvm.ctlz.i8(i8 %val, i1 false) nounwind readnone
460  store i8 %ctlz, i8 addrspace(1)* %out
461  ret void
462}
463
; s_ctlz_i64: ctlz of an i64 passed directly as a kernel argument; the i64 result
; is stored to %out. An unnamed [8 x i32] argument sits between %out and %val, so
; %val is loaded from a later kernarg offset (0x13 / 0x4c in the checks below)
; than in s_ctlz_i64_trunc.
; The checks below count each 32-bit half separately, add 32 to one count, and
; select between the two on whether the other half is zero. The amdgcn checks
; then select 64 when the OR of both halves is zero; the r600 checks instead
; substitute 32 for a zero in the first half before adding 32. The upper dword
; of the stored result is written as 0.
464define amdgpu_kernel void @s_ctlz_i64(i64 addrspace(1)* noalias %out, [8 x i32], i64 %val) nounwind {
465; SI-LABEL: s_ctlz_i64:
466; SI:       ; %bb.0:
467; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x13
468; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
469; SI-NEXT:    s_mov_b32 s3, 0xf000
470; SI-NEXT:    s_mov_b32 s2, -1
471; SI-NEXT:    s_waitcnt lgkmcnt(0)
472; SI-NEXT:    s_flbit_i32_b32 s6, s4
473; SI-NEXT:    s_flbit_i32_b32 s7, s5
474; SI-NEXT:    s_add_i32 s6, s6, 32
475; SI-NEXT:    s_or_b32 s4, s4, s5
476; SI-NEXT:    v_mov_b32_e32 v0, s7
477; SI-NEXT:    v_mov_b32_e32 v1, s6
478; SI-NEXT:    v_cmp_eq_u32_e64 vcc, s5, 0
479; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
480; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 0
481; SI-NEXT:    v_cndmask_b32_e32 v0, 64, v0, vcc
482; SI-NEXT:    v_mov_b32_e32 v1, 0
483; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
484; SI-NEXT:    s_endpgm
485;
486; VI-LABEL: s_ctlz_i64:
487; VI:       ; %bb.0:
488; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
489; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x4c
490; VI-NEXT:    s_mov_b32 s7, 0xf000
491; VI-NEXT:    s_mov_b32 s6, -1
492; VI-NEXT:    v_mov_b32_e32 v1, 0
493; VI-NEXT:    s_waitcnt lgkmcnt(0)
494; VI-NEXT:    s_flbit_i32_b32 s2, s0
495; VI-NEXT:    s_add_i32 s2, s2, 32
496; VI-NEXT:    s_flbit_i32_b32 s3, s1
497; VI-NEXT:    s_cmp_eq_u32 s1, 0
498; VI-NEXT:    s_cselect_b32 s2, s2, s3
499; VI-NEXT:    s_or_b32 s0, s0, s1
500; VI-NEXT:    s_cmp_lg_u32 s0, 0
501; VI-NEXT:    s_cselect_b32 s0, s2, 64
502; VI-NEXT:    v_mov_b32_e32 v0, s0
503; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
504; VI-NEXT:    s_endpgm
505;
506; EG-LABEL: s_ctlz_i64:
507; EG:       ; %bb.0:
508; EG-NEXT:    ALU 9, @4, KC0[CB0:0-32], KC1[]
509; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
510; EG-NEXT:    CF_END
511; EG-NEXT:    PAD
512; EG-NEXT:    ALU clause starting at 4:
513; EG-NEXT:     FFBH_UINT * T0.W, KC0[4].W,
514; EG-NEXT:     CNDE_INT * T0.W, KC0[4].W, literal.x, PV.W,
515; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
516; EG-NEXT:     FFBH_UINT T1.W, KC0[5].X,
517; EG-NEXT:     ADD_INT * T0.W, PV.W, literal.x,
518; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
519; EG-NEXT:     CNDE_INT T0.X, KC0[5].X, PS, PV.W,
520; EG-NEXT:     MOV T0.Y, 0.0,
521; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
522; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
523;
524; GFX10-LABEL: s_ctlz_i64:
525; GFX10:       ; %bb.0:
526; GFX10-NEXT:    s_clause 0x1
527; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x4c
528; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
529; GFX10-NEXT:    v_mov_b32_e32 v1, 0
530; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
531; GFX10-NEXT:    s_flbit_i32_b32 s0, s2
532; GFX10-NEXT:    s_flbit_i32_b32 s1, s3
533; GFX10-NEXT:    s_add_i32 s0, s0, 32
534; GFX10-NEXT:    s_cmp_eq_u32 s3, 0
535; GFX10-NEXT:    s_cselect_b32 s0, s0, s1
536; GFX10-NEXT:    s_or_b32 s1, s2, s3
537; GFX10-NEXT:    s_cmp_lg_u32 s1, 0
538; GFX10-NEXT:    s_cselect_b32 s0, s0, 64
539; GFX10-NEXT:    v_mov_b32_e32 v0, s0
540; GFX10-NEXT:    global_store_dwordx2 v1, v[0:1], s[4:5]
541; GFX10-NEXT:    s_endpgm
; The full i64 result is stored, so the checks above also cover the zeroed
; upper half of the result.
542  %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 false)
543  store i64 %ctlz, i64 addrspace(1)* %out
544  ret void
545}
546
; s_ctlz_i64_trunc: ctlz of an i64 kernel argument whose result is truncated to
; i32 before being stored to %out.
; The checks below show the same per-half count and select sequence as for
; s_ctlz_i64, but end in a single-dword store with no zeroed upper half.
547define amdgpu_kernel void @s_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64 %val) nounwind {
548; SI-LABEL: s_ctlz_i64_trunc:
549; SI:       ; %bb.0:
550; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
551; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
552; SI-NEXT:    s_mov_b32 s3, 0xf000
553; SI-NEXT:    s_mov_b32 s2, -1
554; SI-NEXT:    s_waitcnt lgkmcnt(0)
555; SI-NEXT:    s_flbit_i32_b32 s6, s4
556; SI-NEXT:    s_flbit_i32_b32 s7, s5
557; SI-NEXT:    s_add_i32 s6, s6, 32
558; SI-NEXT:    s_or_b32 s4, s4, s5
559; SI-NEXT:    v_mov_b32_e32 v0, s7
560; SI-NEXT:    v_mov_b32_e32 v1, s6
561; SI-NEXT:    v_cmp_eq_u32_e64 vcc, s5, 0
562; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
563; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 0
564; SI-NEXT:    v_cndmask_b32_e32 v0, 64, v0, vcc
565; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
566; SI-NEXT:    s_endpgm
567;
568; VI-LABEL: s_ctlz_i64_trunc:
569; VI:       ; %bb.0:
570; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
571; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
572; VI-NEXT:    s_mov_b32 s7, 0xf000
573; VI-NEXT:    s_mov_b32 s6, -1
574; VI-NEXT:    s_waitcnt lgkmcnt(0)
575; VI-NEXT:    s_flbit_i32_b32 s2, s0
576; VI-NEXT:    s_add_i32 s2, s2, 32
577; VI-NEXT:    s_flbit_i32_b32 s3, s1
578; VI-NEXT:    s_cmp_eq_u32 s1, 0
579; VI-NEXT:    s_cselect_b32 s2, s2, s3
580; VI-NEXT:    s_or_b32 s0, s0, s1
581; VI-NEXT:    s_cmp_lg_u32 s0, 0
582; VI-NEXT:    s_cselect_b32 s0, s2, 64
583; VI-NEXT:    v_mov_b32_e32 v0, s0
584; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
585; VI-NEXT:    s_endpgm
586;
587; EG-LABEL: s_ctlz_i64_trunc:
588; EG:       ; %bb.0:
589; EG-NEXT:    ALU 8, @4, KC0[CB0:0-32], KC1[]
590; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
591; EG-NEXT:    CF_END
592; EG-NEXT:    PAD
593; EG-NEXT:    ALU clause starting at 4:
594; EG-NEXT:     FFBH_UINT * T0.W, KC0[2].W,
595; EG-NEXT:     CNDE_INT * T0.W, KC0[2].W, literal.x, PV.W,
596; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
597; EG-NEXT:     FFBH_UINT T1.W, KC0[3].X,
598; EG-NEXT:     ADD_INT * T0.W, PV.W, literal.x,
599; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
600; EG-NEXT:     CNDE_INT T0.X, KC0[3].X, PS, PV.W,
601; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
602; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
603;
604; GFX10-LABEL: s_ctlz_i64_trunc:
605; GFX10:       ; %bb.0:
606; GFX10-NEXT:    s_clause 0x1
607; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
608; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
609; GFX10-NEXT:    v_mov_b32_e32 v0, 0
610; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
611; GFX10-NEXT:    s_flbit_i32_b32 s0, s2
612; GFX10-NEXT:    s_flbit_i32_b32 s1, s3
613; GFX10-NEXT:    s_add_i32 s0, s0, 32
614; GFX10-NEXT:    s_cmp_eq_u32 s3, 0
615; GFX10-NEXT:    s_cselect_b32 s0, s0, s1
616; GFX10-NEXT:    s_or_b32 s1, s2, s3
617; GFX10-NEXT:    s_cmp_lg_u32 s1, 0
618; GFX10-NEXT:    s_cselect_b32 s0, s0, 64
619; GFX10-NEXT:    v_mov_b32_e32 v1, s0
620; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
621; GFX10-NEXT:    s_endpgm
; Only the low 32 bits of the i64 count are kept and stored.
622  %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 false)
623  %trunc = trunc i64 %ctlz to i32
624  store i32 %trunc, i32 addrspace(1)* %out
625  ret void
626}
627
; v_ctlz_i64: ctlz of an i64 loaded from %in at the index given by the work-item
; id; the i64 result is stored to %out at the same index.
; The checks below count each 32-bit half with v_ffbh_u32 (FFBH_UINT for r600),
; add 32 to one count, and select between the two on whether the other half is
; zero. The amdgcn checks then select 64 when the OR of both halves is zero; the
; r600 checks instead substitute 32 for a zero in the first half before adding
; 32. The upper dword of the stored result is written as 0.
628define amdgpu_kernel void @v_ctlz_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
629; SI-LABEL: v_ctlz_i64:
630; SI:       ; %bb.0:
631; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
632; SI-NEXT:    s_mov_b32 s7, 0xf000
633; SI-NEXT:    s_mov_b32 s6, 0
634; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
635; SI-NEXT:    v_mov_b32_e32 v1, 0
636; SI-NEXT:    s_waitcnt lgkmcnt(0)
637; SI-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
638; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
639; SI-NEXT:    s_waitcnt vmcnt(0)
640; SI-NEXT:    v_ffbh_u32_e32 v4, v2
641; SI-NEXT:    v_ffbh_u32_e32 v5, v3
642; SI-NEXT:    v_or_b32_e32 v2, v2, v3
643; SI-NEXT:    v_add_i32_e32 v4, vcc, 32, v4
644; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
645; SI-NEXT:    v_cndmask_b32_e32 v3, v5, v4, vcc
646; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
647; SI-NEXT:    v_cndmask_b32_e32 v2, 64, v3, vcc
648; SI-NEXT:    v_mov_b32_e32 v3, v1
649; SI-NEXT:    s_waitcnt lgkmcnt(0)
650; SI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
651; SI-NEXT:    s_endpgm
652;
653; VI-LABEL: v_ctlz_i64:
654; VI:       ; %bb.0:
655; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
656; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
657; VI-NEXT:    v_lshlrev_b32_e32 v3, 3, v0
658; VI-NEXT:    v_mov_b32_e32 v4, 0
659; VI-NEXT:    v_mov_b32_e32 v2, 0
660; VI-NEXT:    s_waitcnt lgkmcnt(0)
661; VI-NEXT:    v_mov_b32_e32 v5, s3
662; VI-NEXT:    v_mov_b32_e32 v1, s1
663; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v3
664; VI-NEXT:    v_addc_u32_e32 v1, vcc, v1, v4, vcc
665; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
666; VI-NEXT:    v_add_u32_e32 v3, vcc, s2, v3
667; VI-NEXT:    v_addc_u32_e32 v4, vcc, v5, v4, vcc
668; VI-NEXT:    s_waitcnt vmcnt(0)
669; VI-NEXT:    v_ffbh_u32_e32 v5, v0
670; VI-NEXT:    v_add_u32_e32 v5, vcc, 32, v5
671; VI-NEXT:    v_ffbh_u32_e32 v6, v1
672; VI-NEXT:    v_or_b32_e32 v0, v0, v1
673; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
674; VI-NEXT:    v_cndmask_b32_e32 v1, v6, v5, vcc
675; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
676; VI-NEXT:    v_cndmask_b32_e32 v1, 64, v1, vcc
677; VI-NEXT:    flat_store_dwordx2 v[3:4], v[1:2]
678; VI-NEXT:    s_endpgm
679;
680; EG-LABEL: v_ctlz_i64:
681; EG:       ; %bb.0:
682; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
683; EG-NEXT:    TEX 0 @6
684; EG-NEXT:    ALU 10, @11, KC0[CB0:0-32], KC1[]
685; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
686; EG-NEXT:    CF_END
687; EG-NEXT:    PAD
688; EG-NEXT:    Fetch clause starting at 6:
689; EG-NEXT:     VTX_READ_64 T0.XY, T0.X, 0, #1
690; EG-NEXT:    ALU clause starting at 8:
691; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
692; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
693; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
694; EG-NEXT:    ALU clause starting at 11:
695; EG-NEXT:     FFBH_UINT * T1.W, T0.X,
696; EG-NEXT:     CNDE_INT * T1.W, T0.X, literal.x, PV.W,
697; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
698; EG-NEXT:     FFBH_UINT T2.W, T0.Y,
699; EG-NEXT:     ADD_INT * T1.W, PV.W, literal.x,
700; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
701; EG-NEXT:     CNDE_INT T0.X, T0.Y, PS, PV.W,
702; EG-NEXT:     MOV T0.Y, 0.0,
703; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, T0.W,
704; EG-NEXT:     LSHR * T1.X, PV.W, literal.x,
705; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
706;
707; GFX10-LABEL: v_ctlz_i64:
708; GFX10:       ; %bb.0:
709; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
710; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
711; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
712; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
713; GFX10-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
714; GFX10-NEXT:    s_waitcnt vmcnt(0)
715; GFX10-NEXT:    v_ffbh_u32_e32 v3, v0
716; GFX10-NEXT:    v_ffbh_u32_e32 v4, v1
717; GFX10-NEXT:    v_or_b32_e32 v0, v0, v1
718; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
719; GFX10-NEXT:    v_add_nc_u32_e32 v3, 32, v3
720; GFX10-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc_lo
721; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
722; GFX10-NEXT:    v_cndmask_b32_e32 v0, 64, v1, vcc_lo
723; GFX10-NEXT:    v_mov_b32_e32 v1, 0
724; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
725; GFX10-NEXT:    s_endpgm
; Both the input and the output pointer are offset by the work-item id here.
726  %tid = call i32 @llvm.amdgcn.workitem.id.x()
727  %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
728  %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
729  %val = load i64, i64 addrspace(1)* %in.gep
730  %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 false)
731  store i64 %ctlz, i64 addrspace(1)* %out.gep
732  ret void
733}
734
; Per-workitem 64-bit ctlz whose result is truncated to i32 before the store.
; Each workitem loads %in[tid], calls llvm.ctlz.i64 with the second operand
; set to false, truncates the count to i32 and stores it to %out[tid].
; In the amdgcn runs the expected code combines two 32-bit v_ffbh_u32 counts
; (32 is added to the low-half count), selects 64 when the OR of both halves
; is zero, and stores a single dword.
735define amdgpu_kernel void @v_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
736; SI-LABEL: v_ctlz_i64_trunc:
737; SI:       ; %bb.0:
738; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
739; SI-NEXT:    s_mov_b32 s7, 0xf000
740; SI-NEXT:    s_mov_b32 s6, 0
741; SI-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
742; SI-NEXT:    v_mov_b32_e32 v2, 0
743; SI-NEXT:    s_waitcnt lgkmcnt(0)
744; SI-NEXT:    buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
745; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
746; SI-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
747; SI-NEXT:    s_waitcnt vmcnt(0)
748; SI-NEXT:    v_ffbh_u32_e32 v0, v3
749; SI-NEXT:    v_ffbh_u32_e32 v5, v4
750; SI-NEXT:    v_or_b32_e32 v3, v3, v4
751; SI-NEXT:    v_add_i32_e32 v0, vcc, 32, v0
752; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
753; SI-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
754; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
755; SI-NEXT:    v_cndmask_b32_e32 v0, 64, v0, vcc
756; SI-NEXT:    s_waitcnt lgkmcnt(0)
757; SI-NEXT:    buffer_store_dword v0, v[1:2], s[4:7], 0 addr64
758; SI-NEXT:    s_endpgm
759;
760; VI-LABEL: v_ctlz_i64_trunc:
761; VI:       ; %bb.0:
762; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
763; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
764; VI-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
765; VI-NEXT:    v_mov_b32_e32 v4, 0
766; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
767; VI-NEXT:    s_waitcnt lgkmcnt(0)
768; VI-NEXT:    v_mov_b32_e32 v5, s3
769; VI-NEXT:    v_mov_b32_e32 v2, s1
770; VI-NEXT:    v_add_u32_e32 v1, vcc, s0, v1
771; VI-NEXT:    v_addc_u32_e32 v2, vcc, v2, v4, vcc
772; VI-NEXT:    flat_load_dwordx2 v[1:2], v[1:2]
773; VI-NEXT:    v_add_u32_e32 v3, vcc, s2, v0
774; VI-NEXT:    v_addc_u32_e32 v4, vcc, v5, v4, vcc
775; VI-NEXT:    s_waitcnt vmcnt(0)
776; VI-NEXT:    v_ffbh_u32_e32 v0, v1
777; VI-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
778; VI-NEXT:    v_ffbh_u32_e32 v5, v2
779; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
780; VI-NEXT:    v_or_b32_e32 v1, v1, v2
781; VI-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
782; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
783; VI-NEXT:    v_cndmask_b32_e32 v0, 64, v0, vcc
784; VI-NEXT:    flat_store_dword v[3:4], v0
785; VI-NEXT:    s_endpgm
786;
787; EG-LABEL: v_ctlz_i64_trunc:
788; EG:       ; %bb.0:
789; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
790; EG-NEXT:    TEX 0 @6
791; EG-NEXT:    ALU 10, @11, KC0[CB0:0-32], KC1[]
792; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
793; EG-NEXT:    CF_END
794; EG-NEXT:    PAD
795; EG-NEXT:    Fetch clause starting at 6:
796; EG-NEXT:     VTX_READ_64 T1.XY, T1.X, 0, #1
797; EG-NEXT:    ALU clause starting at 8:
798; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
799; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
800; EG-NEXT:     ADD_INT * T1.X, KC0[2].Z, PV.W,
801; EG-NEXT:    ALU clause starting at 11:
802; EG-NEXT:     FFBH_UINT * T0.W, T1.X,
803; EG-NEXT:     CNDE_INT * T0.W, T1.X, literal.x, PV.W,
804; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
805; EG-NEXT:     LSHL T0.Z, T0.X, literal.x,
806; EG-NEXT:     FFBH_UINT T1.W, T1.Y,
807; EG-NEXT:     ADD_INT * T0.W, PV.W, literal.y,
808; EG-NEXT:    2(2.802597e-45), 32(4.484155e-44)
809; EG-NEXT:     CNDE_INT T0.X, T1.Y, PS, PV.W,
810; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, PV.Z,
811; EG-NEXT:     LSHR * T1.X, PV.W, literal.x,
812; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
813;
814; GFX10-LABEL: v_ctlz_i64_trunc:
815; GFX10:       ; %bb.0:
816; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
817; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
818; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
819; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
820; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
821; GFX10-NEXT:    global_load_dwordx2 v[1:2], v1, s[2:3]
822; GFX10-NEXT:    s_waitcnt vmcnt(0)
823; GFX10-NEXT:    v_ffbh_u32_e32 v3, v1
824; GFX10-NEXT:    v_ffbh_u32_e32 v4, v2
825; GFX10-NEXT:    v_or_b32_e32 v1, v1, v2
826; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
827; GFX10-NEXT:    v_add_nc_u32_e32 v3, 32, v3
828; GFX10-NEXT:    v_cndmask_b32_e32 v2, v4, v3, vcc_lo
829; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v1
830; GFX10-NEXT:    v_cndmask_b32_e32 v1, 64, v2, vcc_lo
831; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
832; GFX10-NEXT:    s_endpgm
833  %tid = call i32 @llvm.amdgcn.workitem.id.x()
834  %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
835  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
836  %val = load i64, i64 addrspace(1)* %in.gep
837  %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 false)
838  %trunc = trunc i64 %ctlz to i32
839  store i32 %trunc, i32 addrspace(1)* %out.gep
840  ret void
841}
842
; llvm.ctlz.i32 followed by a select that yields -1 when the loaded value is
; zero (icmp eq on the input). Each workitem loads %valptr[tid]; the result is
; stored to %out itself, with no per-workitem offset.
; In the three amdgcn runs the expected code has only v_ffbh_u32 between the
; load and the store (no compare/select); the r600 run still expects CNDE_INT
; selects.
843define amdgpu_kernel void @v_ctlz_i32_sel_eq_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
844; SI-LABEL: v_ctlz_i32_sel_eq_neg1:
845; SI:       ; %bb.0:
846; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
847; SI-NEXT:    s_mov_b32 s3, 0xf000
848; SI-NEXT:    s_mov_b32 s6, 0
849; SI-NEXT:    s_mov_b32 s7, s3
850; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
851; SI-NEXT:    v_mov_b32_e32 v1, 0
852; SI-NEXT:    s_waitcnt lgkmcnt(0)
853; SI-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
854; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
855; SI-NEXT:    s_mov_b32 s2, -1
856; SI-NEXT:    s_waitcnt vmcnt(0)
857; SI-NEXT:    v_ffbh_u32_e32 v0, v0
858; SI-NEXT:    s_waitcnt lgkmcnt(0)
859; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
860; SI-NEXT:    s_endpgm
861;
862; VI-LABEL: v_ctlz_i32_sel_eq_neg1:
863; VI:       ; %bb.0:
864; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
865; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
866; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
867; VI-NEXT:    s_mov_b32 s7, 0xf000
868; VI-NEXT:    s_mov_b32 s6, -1
869; VI-NEXT:    s_waitcnt lgkmcnt(0)
870; VI-NEXT:    v_mov_b32_e32 v1, s1
871; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
872; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
873; VI-NEXT:    flat_load_dword v0, v[0:1]
874; VI-NEXT:    s_waitcnt vmcnt(0)
875; VI-NEXT:    v_ffbh_u32_e32 v0, v0
876; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
877; VI-NEXT:    s_endpgm
878;
879; EG-LABEL: v_ctlz_i32_sel_eq_neg1:
880; EG:       ; %bb.0:
881; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
882; EG-NEXT:    TEX 0 @6
883; EG-NEXT:    ALU 5, @11, KC0[CB0:0-32], KC1[]
884; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
885; EG-NEXT:    CF_END
886; EG-NEXT:    PAD
887; EG-NEXT:    Fetch clause starting at 6:
888; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
889; EG-NEXT:    ALU clause starting at 8:
890; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
891; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
892; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
893; EG-NEXT:    ALU clause starting at 11:
894; EG-NEXT:     FFBH_UINT * T0.W, T0.X,
895; EG-NEXT:     CNDE_INT * T0.W, T0.X, literal.x, PV.W,
896; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
897; EG-NEXT:     CNDE_INT T0.X, T0.X, literal.x, PV.W,
898; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
899; EG-NEXT:    -1(nan), 2(2.802597e-45)
900;
901; GFX10-LABEL: v_ctlz_i32_sel_eq_neg1:
902; GFX10:       ; %bb.0:
903; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
904; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
905; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
906; GFX10-NEXT:    v_mov_b32_e32 v1, 0
907; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
908; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
909; GFX10-NEXT:    s_waitcnt vmcnt(0)
910; GFX10-NEXT:    v_ffbh_u32_e32 v0, v0
911; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
912; GFX10-NEXT:    s_endpgm
913  %tid = call i32 @llvm.amdgcn.workitem.id.x()
914  %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
915  %val = load i32, i32 addrspace(1)* %in.gep
916  %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
917  %cmp = icmp eq i32 %val, 0
918  %sel = select i1 %cmp, i32 -1, i32 %ctlz
919  store i32 %sel, i32 addrspace(1)* %out
920  ret void
921}
922
; Same computation as the icmp-eq form of this test, written with the inverse
; predicate: icmp ne on the input with the select operands swapped, so the
; result is still the ctlz count for a non-zero input and -1 otherwise.
; Each workitem loads %valptr[tid]; the result is stored to %out itself.
; In the three amdgcn runs the expected code has only v_ffbh_u32 between the
; load and the store; the r600 run still expects CNDE_INT selects.
923define amdgpu_kernel void @v_ctlz_i32_sel_ne_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
924; SI-LABEL: v_ctlz_i32_sel_ne_neg1:
925; SI:       ; %bb.0:
926; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
927; SI-NEXT:    s_mov_b32 s3, 0xf000
928; SI-NEXT:    s_mov_b32 s6, 0
929; SI-NEXT:    s_mov_b32 s7, s3
930; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
931; SI-NEXT:    v_mov_b32_e32 v1, 0
932; SI-NEXT:    s_waitcnt lgkmcnt(0)
933; SI-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
934; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
935; SI-NEXT:    s_mov_b32 s2, -1
936; SI-NEXT:    s_waitcnt vmcnt(0)
937; SI-NEXT:    v_ffbh_u32_e32 v0, v0
938; SI-NEXT:    s_waitcnt lgkmcnt(0)
939; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
940; SI-NEXT:    s_endpgm
941;
942; VI-LABEL: v_ctlz_i32_sel_ne_neg1:
943; VI:       ; %bb.0:
944; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
945; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
946; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
947; VI-NEXT:    s_mov_b32 s7, 0xf000
948; VI-NEXT:    s_mov_b32 s6, -1
949; VI-NEXT:    s_waitcnt lgkmcnt(0)
950; VI-NEXT:    v_mov_b32_e32 v1, s1
951; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
952; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
953; VI-NEXT:    flat_load_dword v0, v[0:1]
954; VI-NEXT:    s_waitcnt vmcnt(0)
955; VI-NEXT:    v_ffbh_u32_e32 v0, v0
956; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
957; VI-NEXT:    s_endpgm
958;
959; EG-LABEL: v_ctlz_i32_sel_ne_neg1:
960; EG:       ; %bb.0:
961; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
962; EG-NEXT:    TEX 0 @6
963; EG-NEXT:    ALU 5, @11, KC0[CB0:0-32], KC1[]
964; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
965; EG-NEXT:    CF_END
966; EG-NEXT:    PAD
967; EG-NEXT:    Fetch clause starting at 6:
968; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
969; EG-NEXT:    ALU clause starting at 8:
970; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
971; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
972; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
973; EG-NEXT:    ALU clause starting at 11:
974; EG-NEXT:     FFBH_UINT * T0.W, T0.X,
975; EG-NEXT:     CNDE_INT * T0.W, T0.X, literal.x, PV.W,
976; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
977; EG-NEXT:     CNDE_INT T0.X, T0.X, literal.x, PV.W,
978; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
979; EG-NEXT:    -1(nan), 2(2.802597e-45)
980;
981; GFX10-LABEL: v_ctlz_i32_sel_ne_neg1:
982; GFX10:       ; %bb.0:
983; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
984; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
985; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
986; GFX10-NEXT:    v_mov_b32_e32 v1, 0
987; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
988; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
989; GFX10-NEXT:    s_waitcnt vmcnt(0)
990; GFX10-NEXT:    v_ffbh_u32_e32 v0, v0
991; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
992; GFX10-NEXT:    s_endpgm
993  %tid = call i32 @llvm.amdgcn.workitem.id.x()
994  %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
995  %val = load i32, i32 addrspace(1)* %in.gep
996  %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
997  %cmp = icmp ne i32 %val, 0
998  %sel = select i1 %cmp, i32 %ctlz, i32 -1
999  store i32 %sel, i32 addrspace(1)* %out
1000  ret void
1001}
1002
1003; TODO: Should be able to eliminate select here as well.
; llvm.ctlz.i32 followed by a select keyed on the ctlz *result*: the stored
; value is -1 when the count equals 32 (the bit width) and the count otherwise.
; Each workitem loads %valptr[tid]; the result is stored to %out itself.
; Unlike the variants that compare the input against zero, the expected code
; in every run still contains the zero-input select of 32 plus the
; compare-against-32 and select of -1.
1004define amdgpu_kernel void @v_ctlz_i32_sel_eq_bitwidth(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
1005; SI-LABEL: v_ctlz_i32_sel_eq_bitwidth:
1006; SI:       ; %bb.0:
1007; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
1008; SI-NEXT:    s_mov_b32 s3, 0xf000
1009; SI-NEXT:    s_mov_b32 s6, 0
1010; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1011; SI-NEXT:    v_mov_b32_e32 v1, 0
1012; SI-NEXT:    s_mov_b32 s7, s3
1013; SI-NEXT:    s_waitcnt lgkmcnt(0)
1014; SI-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
1015; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1016; SI-NEXT:    s_mov_b32 s2, -1
1017; SI-NEXT:    s_waitcnt vmcnt(0)
1018; SI-NEXT:    v_ffbh_u32_e32 v1, v0
1019; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
1020; SI-NEXT:    v_cndmask_b32_e32 v0, 32, v1, vcc
1021; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 32, v0
1022; SI-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
1023; SI-NEXT:    s_waitcnt lgkmcnt(0)
1024; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1025; SI-NEXT:    s_endpgm
1026;
1027; VI-LABEL: v_ctlz_i32_sel_eq_bitwidth:
1028; VI:       ; %bb.0:
1029; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
1030; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
1031; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1032; VI-NEXT:    s_mov_b32 s7, 0xf000
1033; VI-NEXT:    s_mov_b32 s6, -1
1034; VI-NEXT:    s_waitcnt lgkmcnt(0)
1035; VI-NEXT:    v_mov_b32_e32 v1, s1
1036; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
1037; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1038; VI-NEXT:    flat_load_dword v0, v[0:1]
1039; VI-NEXT:    s_waitcnt vmcnt(0)
1040; VI-NEXT:    v_ffbh_u32_e32 v1, v0
1041; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
1042; VI-NEXT:    v_cndmask_b32_e32 v0, 32, v1, vcc
1043; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 32, v0
1044; VI-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
1045; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1046; VI-NEXT:    s_endpgm
1047;
1048; EG-LABEL: v_ctlz_i32_sel_eq_bitwidth:
1049; EG:       ; %bb.0:
1050; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
1051; EG-NEXT:    TEX 0 @6
1052; EG-NEXT:    ALU 7, @11, KC0[CB0:0-32], KC1[]
1053; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1054; EG-NEXT:    CF_END
1055; EG-NEXT:    PAD
1056; EG-NEXT:    Fetch clause starting at 6:
1057; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
1058; EG-NEXT:    ALU clause starting at 8:
1059; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
1060; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1061; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
1062; EG-NEXT:    ALU clause starting at 11:
1063; EG-NEXT:     FFBH_UINT * T0.W, T0.X,
1064; EG-NEXT:     CNDE_INT * T0.W, T0.X, literal.x, PV.W,
1065; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
1066; EG-NEXT:     SETE_INT * T1.W, PV.W, literal.x,
1067; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
1068; EG-NEXT:     CNDE_INT T0.X, PV.W, T0.W, literal.x,
1069; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
1070; EG-NEXT:    -1(nan), 2(2.802597e-45)
1071;
1072; GFX10-LABEL: v_ctlz_i32_sel_eq_bitwidth:
1073; GFX10:       ; %bb.0:
1074; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
1075; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1076; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1077; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1078; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
1079; GFX10-NEXT:    s_waitcnt vmcnt(0)
1080; GFX10-NEXT:    v_ffbh_u32_e32 v1, v0
1081; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
1082; GFX10-NEXT:    v_cndmask_b32_e32 v0, 32, v1, vcc_lo
1083; GFX10-NEXT:    v_mov_b32_e32 v1, 0
1084; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 32, v0
1085; GFX10-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc_lo
1086; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
1087; GFX10-NEXT:    s_endpgm
1088  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1089  %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
1090  %val = load i32, i32 addrspace(1)* %in.gep
1091  %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
1092  %cmp = icmp eq i32 %ctlz, 32
1093  %sel = select i1 %cmp, i32 -1, i32 %ctlz
1094  store i32 %sel, i32 addrspace(1)* %out
1095  ret void
1096}
1097
; llvm.ctlz.i32 followed by a select keyed on the ctlz *result*, using the
; inverse predicate: icmp ne against 32 (the bit width) keeps the count, and
; -1 is stored otherwise. Each workitem loads %valptr[tid]; the result is
; stored to %out itself.
; The expected code in every run still contains the zero-input select of 32
; plus the compare-against-32 and select of -1.
1098define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
1099; SI-LABEL: v_ctlz_i32_sel_ne_bitwidth:
1100; SI:       ; %bb.0:
1101; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
1102; SI-NEXT:    s_mov_b32 s3, 0xf000
1103; SI-NEXT:    s_mov_b32 s6, 0
1104; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1105; SI-NEXT:    v_mov_b32_e32 v1, 0
1106; SI-NEXT:    s_mov_b32 s7, s3
1107; SI-NEXT:    s_waitcnt lgkmcnt(0)
1108; SI-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
1109; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1110; SI-NEXT:    s_mov_b32 s2, -1
1111; SI-NEXT:    s_waitcnt vmcnt(0)
1112; SI-NEXT:    v_ffbh_u32_e32 v1, v0
1113; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
1114; SI-NEXT:    v_cndmask_b32_e32 v0, 32, v1, vcc
1115; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 32, v0
1116; SI-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
1117; SI-NEXT:    s_waitcnt lgkmcnt(0)
1118; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1119; SI-NEXT:    s_endpgm
1120;
1121; VI-LABEL: v_ctlz_i32_sel_ne_bitwidth:
1122; VI:       ; %bb.0:
1123; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
1124; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
1125; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1126; VI-NEXT:    s_mov_b32 s7, 0xf000
1127; VI-NEXT:    s_mov_b32 s6, -1
1128; VI-NEXT:    s_waitcnt lgkmcnt(0)
1129; VI-NEXT:    v_mov_b32_e32 v1, s1
1130; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
1131; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1132; VI-NEXT:    flat_load_dword v0, v[0:1]
1133; VI-NEXT:    s_waitcnt vmcnt(0)
1134; VI-NEXT:    v_ffbh_u32_e32 v1, v0
1135; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
1136; VI-NEXT:    v_cndmask_b32_e32 v0, 32, v1, vcc
1137; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 32, v0
1138; VI-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
1139; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1140; VI-NEXT:    s_endpgm
1141;
1142; EG-LABEL: v_ctlz_i32_sel_ne_bitwidth:
1143; EG:       ; %bb.0:
1144; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
1145; EG-NEXT:    TEX 0 @6
1146; EG-NEXT:    ALU 7, @11, KC0[CB0:0-32], KC1[]
1147; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1148; EG-NEXT:    CF_END
1149; EG-NEXT:    PAD
1150; EG-NEXT:    Fetch clause starting at 6:
1151; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
1152; EG-NEXT:    ALU clause starting at 8:
1153; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
1154; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1155; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
1156; EG-NEXT:    ALU clause starting at 11:
1157; EG-NEXT:     FFBH_UINT * T0.W, T0.X,
1158; EG-NEXT:     CNDE_INT * T0.W, T0.X, literal.x, PV.W,
1159; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
1160; EG-NEXT:     SETNE_INT * T1.W, PV.W, literal.x,
1161; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
1162; EG-NEXT:     CNDE_INT T0.X, PV.W, literal.x, T0.W,
1163; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
1164; EG-NEXT:    -1(nan), 2(2.802597e-45)
1165;
1166; GFX10-LABEL: v_ctlz_i32_sel_ne_bitwidth:
1167; GFX10:       ; %bb.0:
1168; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
1169; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1170; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1171; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1172; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
1173; GFX10-NEXT:    s_waitcnt vmcnt(0)
1174; GFX10-NEXT:    v_ffbh_u32_e32 v1, v0
1175; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
1176; GFX10-NEXT:    v_cndmask_b32_e32 v0, 32, v1, vcc_lo
1177; GFX10-NEXT:    v_mov_b32_e32 v1, 0
1178; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 32, v0
1179; GFX10-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc_lo
1180; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
1181; GFX10-NEXT:    s_endpgm
1182  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1183  %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
1184  %val = load i32, i32 addrspace(1)* %in.gep
1185  %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
1186  %cmp = icmp ne i32 %ctlz, 32
1187  %sel = select i1 %cmp, i32 %ctlz, i32 -1
1188  store i32 %sel, i32 addrspace(1)* %out
1189  ret void
1190}
1191
; i8 variant of the "select -1 when the input is zero" pattern: each workitem
; loads the byte at %valptr[tid], calls llvm.ctlz.i8 with the second operand
; set to false, and stores the selected i8 to %out itself.
; In the three amdgcn runs the expected code is an unsigned byte load,
; v_ffbh_u32 and a byte store, with no compare/select in between.
1192 define amdgpu_kernel void @v_ctlz_i8_sel_eq_neg1(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind {
1193; SI-LABEL: v_ctlz_i8_sel_eq_neg1:
1194; SI:       ; %bb.0:
1195; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
1196; SI-NEXT:    s_mov_b32 s3, 0xf000
1197; SI-NEXT:    v_mov_b32_e32 v1, 0
1198; SI-NEXT:    s_mov_b32 s6, 0
1199; SI-NEXT:    s_mov_b32 s7, s3
1200; SI-NEXT:    s_waitcnt lgkmcnt(0)
1201; SI-NEXT:    buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64
1202; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1203; SI-NEXT:    s_mov_b32 s2, -1
1204; SI-NEXT:    s_waitcnt vmcnt(0)
1205; SI-NEXT:    v_ffbh_u32_e32 v0, v0
1206; SI-NEXT:    s_waitcnt lgkmcnt(0)
1207; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
1208; SI-NEXT:    s_endpgm
1209;
1210; VI-LABEL: v_ctlz_i8_sel_eq_neg1:
1211; VI:       ; %bb.0:
1212; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
1213; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
1214; VI-NEXT:    s_mov_b32 s7, 0xf000
1215; VI-NEXT:    s_mov_b32 s6, -1
1216; VI-NEXT:    s_waitcnt lgkmcnt(0)
1217; VI-NEXT:    v_mov_b32_e32 v1, s1
1218; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
1219; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1220; VI-NEXT:    flat_load_ubyte v0, v[0:1]
1221; VI-NEXT:    s_waitcnt vmcnt(0)
1222; VI-NEXT:    v_ffbh_u32_e32 v0, v0
1223; VI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
1224; VI-NEXT:    s_endpgm
1225;
1226; EG-LABEL: v_ctlz_i8_sel_eq_neg1:
1227; EG:       ; %bb.0:
1228; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
1229; EG-NEXT:    TEX 0 @6
1230; EG-NEXT:    ALU 12, @9, KC0[CB0:0-32], KC1[]
1231; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
1232; EG-NEXT:    CF_END
1233; EG-NEXT:    PAD
1234; EG-NEXT:    Fetch clause starting at 6:
1235; EG-NEXT:     VTX_READ_8 T0.X, T0.X, 0, #1
1236; EG-NEXT:    ALU clause starting at 8:
1237; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, T0.X,
1238; EG-NEXT:    ALU clause starting at 9:
1239; EG-NEXT:     FFBH_UINT T0.W, T0.X,
1240; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.x,
1241; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
1242; EG-NEXT:     AND_INT T0.W, PV.W, literal.x,
1243; EG-NEXT:     LSHL * T1.W, PS, literal.y,
1244; EG-NEXT:    255(3.573311e-43), 3(4.203895e-45)
1245; EG-NEXT:     LSHL T0.X, PV.W, PS,
1246; EG-NEXT:     LSHL * T0.W, literal.x, PS,
1247; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
1248; EG-NEXT:     MOV T0.Y, 0.0,
1249; EG-NEXT:     MOV * T0.Z, 0.0,
1250; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1251; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1252;
1253; GFX10-LABEL: v_ctlz_i8_sel_eq_neg1:
1254; GFX10:       ; %bb.0:
1255; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
1256; GFX10-NEXT:    v_mov_b32_e32 v1, 0
1257; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1258; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1259; GFX10-NEXT:    global_load_ubyte v0, v0, s[2:3]
1260; GFX10-NEXT:    s_waitcnt vmcnt(0)
1261; GFX10-NEXT:    v_ffbh_u32_e32 v0, v0
1262; GFX10-NEXT:    global_store_byte v1, v0, s[0:1]
1263; GFX10-NEXT:    s_endpgm
1264  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1265  %valptr.gep = getelementptr i8, i8 addrspace(1)* %valptr, i32 %tid
1266  %val = load i8, i8 addrspace(1)* %valptr.gep
1267  %ctlz = call i8 @llvm.ctlz.i8(i8 %val, i1 false) nounwind readnone
1268  %cmp = icmp eq i8 %val, 0
1269  %sel = select i1 %cmp, i8 -1, i8 %ctlz
1270  store i8 %sel, i8 addrspace(1)* %out
1271  ret void
1272}
1273
; i16 variant of the "select -1 when the input is zero" pattern. Unlike the
; neighbouring tests, the value is loaded directly from %valptr: there is no
; workitem-id call and no getelementptr, so every workitem reads the same
; address. The selected i16 is stored to %out itself.
; Expected code differs between runs: the default amdgcn run has only
; v_ffbh_u32 between the load and the store, while the tonga and gfx1010 runs
; keep a select of 32 for a zero input, an add of -16, and a final select of
; 0xffff.
1274 define amdgpu_kernel void @v_ctlz_i16_sel_eq_neg1(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %valptr) nounwind {
1275; SI-LABEL: v_ctlz_i16_sel_eq_neg1:
1276; SI:       ; %bb.0:
1277; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
1278; SI-NEXT:    s_mov_b32 s3, 0xf000
1279; SI-NEXT:    s_mov_b32 s2, -1
1280; SI-NEXT:    s_mov_b32 s6, s2
1281; SI-NEXT:    s_mov_b32 s7, s3
1282; SI-NEXT:    s_waitcnt lgkmcnt(0)
1283; SI-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
1284; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1285; SI-NEXT:    s_waitcnt vmcnt(0)
1286; SI-NEXT:    v_ffbh_u32_e32 v0, v0
1287; SI-NEXT:    s_waitcnt lgkmcnt(0)
1288; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
1289; SI-NEXT:    s_endpgm
1290;
1291; VI-LABEL: v_ctlz_i16_sel_eq_neg1:
1292; VI:       ; %bb.0:
1293; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
1294; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
1295; VI-NEXT:    s_mov_b32 s7, 0xf000
1296; VI-NEXT:    s_mov_b32 s6, -1
1297; VI-NEXT:    s_mov_b32 s2, s6
1298; VI-NEXT:    s_mov_b32 s3, s7
1299; VI-NEXT:    s_waitcnt lgkmcnt(0)
1300; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], 0
1301; VI-NEXT:    s_waitcnt vmcnt(0)
1302; VI-NEXT:    v_ffbh_u32_e32 v1, v0
1303; VI-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v0
1304; VI-NEXT:    v_cndmask_b32_e64 v0, 32, v1, s[0:1]
1305; VI-NEXT:    v_add_u32_e32 v0, vcc, -16, v0
1306; VI-NEXT:    v_mov_b32_e32 v1, 0xffff
1307; VI-NEXT:    v_cndmask_b32_e64 v0, v1, v0, s[0:1]
1308; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
1309; VI-NEXT:    s_endpgm
1310;
1311; EG-LABEL: v_ctlz_i16_sel_eq_neg1:
1312; EG:       ; %bb.0:
1313; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
1314; EG-NEXT:    TEX 0 @6
1315; EG-NEXT:    ALU 12, @9, KC0[CB0:0-32], KC1[]
1316; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
1317; EG-NEXT:    CF_END
1318; EG-NEXT:    PAD
1319; EG-NEXT:    Fetch clause starting at 6:
1320; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 0, #1
1321; EG-NEXT:    ALU clause starting at 8:
1322; EG-NEXT:     MOV * T0.X, KC0[2].Z,
1323; EG-NEXT:    ALU clause starting at 9:
1324; EG-NEXT:     FFBH_UINT T0.W, T0.X,
1325; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.x,
1326; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
1327; EG-NEXT:     AND_INT T0.W, PV.W, literal.x,
1328; EG-NEXT:     LSHL * T1.W, PS, literal.y,
1329; EG-NEXT:    65535(9.183409e-41), 3(4.203895e-45)
1330; EG-NEXT:     LSHL T0.X, PV.W, PS,
1331; EG-NEXT:     LSHL * T0.W, literal.x, PS,
1332; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
1333; EG-NEXT:     MOV T0.Y, 0.0,
1334; EG-NEXT:     MOV * T0.Z, 0.0,
1335; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1336; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1337;
1338; GFX10-LABEL: v_ctlz_i16_sel_eq_neg1:
1339; GFX10:       ; %bb.0:
1340; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
1341; GFX10-NEXT:    v_mov_b32_e32 v0, 0
1342; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1343; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1344; GFX10-NEXT:    global_load_ushort v1, v0, s[2:3]
1345; GFX10-NEXT:    s_waitcnt vmcnt(0)
1346; GFX10-NEXT:    v_ffbh_u32_e32 v2, v1
1347; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v1
1348; GFX10-NEXT:    v_cndmask_b32_e32 v1, 32, v2, vcc_lo
1349; GFX10-NEXT:    v_add_nc_u32_e32 v1, -16, v1
1350; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0xffff, v1, vcc_lo
1351; GFX10-NEXT:    global_store_short v0, v1, s[0:1]
1352; GFX10-NEXT:    s_endpgm
1353  %val = load i16, i16 addrspace(1)* %valptr
1354  %ctlz = call i16 @llvm.ctlz.i16(i16 %val, i1 false) nounwind readnone
1355  %cmp = icmp eq i16 %val, 0
1356  %sel = select i1 %cmp, i16 -1, i16 %ctlz
1357  store i16 %sel, i16 addrspace(1)* %out
1358  ret void
1359}
1360
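1361; FIXME: Need to handle non-uniform case (load without gep). NOTE(review): the function below loads through a gep; v_ctlz_i16_sel_eq_neg1 above is the one that loads without a gep - confirm which test this refers to.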
; i7 (non-power-of-two width) variant of the "select -1 when the input is
; zero" pattern: each workitem loads %valptr[tid] as an i7, calls llvm.ctlz.i7
; with the second operand set to false, and stores the selected i7 to %out
; itself.
; In the three amdgcn runs the expected code is an unsigned byte load,
; v_ffbh_u32, an AND with 0x7f, and a byte store, with no compare/select.
1362define amdgpu_kernel void @v_ctlz_i7_sel_eq_neg1(i7 addrspace(1)* noalias %out, i7 addrspace(1)* noalias %valptr) nounwind {
1363; SI-LABEL: v_ctlz_i7_sel_eq_neg1:
1364; SI:       ; %bb.0:
1365; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
1366; SI-NEXT:    s_mov_b32 s3, 0xf000
1367; SI-NEXT:    v_mov_b32_e32 v1, 0
1368; SI-NEXT:    s_mov_b32 s6, 0
1369; SI-NEXT:    s_mov_b32 s7, s3
1370; SI-NEXT:    s_waitcnt lgkmcnt(0)
1371; SI-NEXT:    buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64
1372; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1373; SI-NEXT:    s_mov_b32 s2, -1
1374; SI-NEXT:    s_waitcnt vmcnt(0)
1375; SI-NEXT:    v_ffbh_u32_e32 v0, v0
1376; SI-NEXT:    v_and_b32_e32 v0, 0x7f, v0
1377; SI-NEXT:    s_waitcnt lgkmcnt(0)
1378; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
1379; SI-NEXT:    s_endpgm
1380;
1381; VI-LABEL: v_ctlz_i7_sel_eq_neg1:
1382; VI:       ; %bb.0:
1383; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
1384; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
1385; VI-NEXT:    s_mov_b32 s7, 0xf000
1386; VI-NEXT:    s_mov_b32 s6, -1
1387; VI-NEXT:    s_waitcnt lgkmcnt(0)
1388; VI-NEXT:    v_mov_b32_e32 v1, s1
1389; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
1390; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1391; VI-NEXT:    flat_load_ubyte v0, v[0:1]
1392; VI-NEXT:    s_waitcnt vmcnt(0)
1393; VI-NEXT:    v_ffbh_u32_e32 v0, v0
1394; VI-NEXT:    v_and_b32_e32 v0, 0x7f, v0
1395; VI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
1396; VI-NEXT:    s_endpgm
1397;
1398; EG-LABEL: v_ctlz_i7_sel_eq_neg1:
1399; EG:       ; %bb.0:
1400; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
1401; EG-NEXT:    TEX 0 @6
1402; EG-NEXT:    ALU 12, @9, KC0[CB0:0-32], KC1[]
1403; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
1404; EG-NEXT:    CF_END
1405; EG-NEXT:    PAD
1406; EG-NEXT:    Fetch clause starting at 6:
1407; EG-NEXT:     VTX_READ_8 T0.X, T0.X, 0, #1
1408; EG-NEXT:    ALU clause starting at 8:
1409; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, T0.X,
1410; EG-NEXT:    ALU clause starting at 9:
1411; EG-NEXT:     FFBH_UINT T0.W, T0.X,
1412; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.x,
1413; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
1414; EG-NEXT:     AND_INT T0.W, PV.W, literal.x,
1415; EG-NEXT:     LSHL * T1.W, PS, literal.y,
1416; EG-NEXT:    127(1.779649e-43), 3(4.203895e-45)
1417; EG-NEXT:     LSHL T0.X, PV.W, PS,
1418; EG-NEXT:     LSHL * T0.W, literal.x, PS,
1419; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
1420; EG-NEXT:     MOV T0.Y, 0.0,
1421; EG-NEXT:     MOV * T0.Z, 0.0,
1422; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1423; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1424;
1425; GFX10-LABEL: v_ctlz_i7_sel_eq_neg1:
1426; GFX10:       ; %bb.0:
1427; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
1428; GFX10-NEXT:    v_mov_b32_e32 v1, 0
1429; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1430; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1431; GFX10-NEXT:    global_load_ubyte v0, v0, s[2:3]
1432; GFX10-NEXT:    s_waitcnt vmcnt(0)
1433; GFX10-NEXT:    v_ffbh_u32_e32 v0, v0
1434; GFX10-NEXT:    v_and_b32_e32 v0, 0x7f, v0
1435; GFX10-NEXT:    global_store_byte v1, v0, s[0:1]
1436; GFX10-NEXT:    s_endpgm
1437  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1438  %valptr.gep = getelementptr i7, i7 addrspace(1)* %valptr, i32 %tid
1439  %val = load i7, i7 addrspace(1)* %valptr.gep
1440  %ctlz = call i7 @llvm.ctlz.i7(i7 %val, i1 false) nounwind readnone
1441  %cmp = icmp eq i7 %val, 0
1442  %sel = select i1 %cmp, i7 -1, i7 %ctlz
1443  store i7 %sel, i7 addrspace(1)* %out
1444  ret void
1445}
1446