1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck %s -enable-var-scope --check-prefix=SI
3; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s -enable-var-scope --check-prefix=VI
4; RUN: llc < %s -march=r600 -mcpu=cypress -verify-machineinstrs | FileCheck %s -enable-var-scope --check-prefix=EG
5; RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s -enable-var-scope --check-prefix=GFX10
6; RUN: llc < %s -global-isel -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s -enable-var-scope --check-prefix=GFX10-GISEL
7
8declare i7 @llvm.cttz.i7(i7, i1) nounwind readnone
9declare i8 @llvm.cttz.i8(i8, i1) nounwind readnone
10declare i16 @llvm.cttz.i16(i16, i1) nounwind readnone
11
12declare i32 @llvm.cttz.i32(i32, i1) nounwind readnone
13declare <2 x i32> @llvm.cttz.v2i32(<2 x i32>, i1) nounwind readnone
14declare <4 x i32> @llvm.cttz.v4i32(<4 x i32>, i1) nounwind readnone
15
16declare i64 @llvm.cttz.i64(i64, i1) nounwind readnone
17declare <2 x i64> @llvm.cttz.v2i64(<2 x i64>, i1) nounwind readnone
18declare <4 x i64> @llvm.cttz.v4i64(<4 x i64>, i1) nounwind readnone
19
20declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
21
; s_cttz_i32: i32 count-trailing-zeros of a value passed directly as a kernel
; argument. The IR calls llvm.cttz.i32 on %val with the second operand set to
; false and stores the i32 result through %out.
; Because that flag is false, a zero input must still give a defined result;
; the expected output shows this as a clamp of the find-first-bit result
; against 32 (s_min_u32 with 32 on the amdgcn runs, a CNDE_INT choosing
; between the literal 32 and the FFBL_INT result on the r600 run).
; The assertions below are autogenerated: regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
22define amdgpu_kernel void @s_cttz_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
23; SI-LABEL: s_cttz_i32:
24; SI:       ; %bb.0:
25; SI-NEXT:    s_load_dword s2, s[0:1], 0xb
26; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
27; SI-NEXT:    s_mov_b32 s3, 0xf000
28; SI-NEXT:    s_waitcnt lgkmcnt(0)
29; SI-NEXT:    s_ff1_i32_b32 s2, s2
30; SI-NEXT:    s_min_u32 s4, s2, 32
31; SI-NEXT:    s_mov_b32 s2, -1
32; SI-NEXT:    v_mov_b32_e32 v0, s4
33; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
34; SI-NEXT:    s_endpgm
35;
36; VI-LABEL: s_cttz_i32:
37; VI:       ; %bb.0:
38; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
39; VI-NEXT:    s_load_dword s0, s[0:1], 0x2c
40; VI-NEXT:    s_mov_b32 s7, 0xf000
41; VI-NEXT:    s_mov_b32 s6, -1
42; VI-NEXT:    s_waitcnt lgkmcnt(0)
43; VI-NEXT:    s_ff1_i32_b32 s0, s0
44; VI-NEXT:    s_min_u32 s0, s0, 32
45; VI-NEXT:    v_mov_b32_e32 v0, s0
46; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
47; VI-NEXT:    s_endpgm
48;
49; EG-LABEL: s_cttz_i32:
50; EG:       ; %bb.0:
51; EG-NEXT:    ALU 3, @4, KC0[CB0:0-32], KC1[]
52; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
53; EG-NEXT:    CF_END
54; EG-NEXT:    PAD
55; EG-NEXT:    ALU clause starting at 4:
56; EG-NEXT:     FFBL_INT * T0.W, KC0[2].Z,
57; EG-NEXT:     CNDE_INT T0.X, KC0[2].Z, literal.x, PV.W,
58; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
59; EG-NEXT:    32(4.484155e-44), 2(2.802597e-45)
60;
61; GFX10-LABEL: s_cttz_i32:
62; GFX10:       ; %bb.0:
63; GFX10-NEXT:    s_clause 0x1
64; GFX10-NEXT:    s_load_dword s4, s[0:1], 0x2c
65; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
66; GFX10-NEXT:    v_mov_b32_e32 v0, 0
67; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
68; GFX10-NEXT:    s_ff1_i32_b32 s0, s4
69; GFX10-NEXT:    s_min_u32 s0, s0, 32
70; GFX10-NEXT:    v_mov_b32_e32 v1, s0
71; GFX10-NEXT:    global_store_dword v0, v1, s[2:3]
72; GFX10-NEXT:    s_endpgm
73;
74; GFX10-GISEL-LABEL: s_cttz_i32:
75; GFX10-GISEL:       ; %bb.0:
76; GFX10-GISEL-NEXT:    s_clause 0x1
77; GFX10-GISEL-NEXT:    s_load_dword s4, s[0:1], 0x2c
78; GFX10-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
79; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
80; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
81; GFX10-GISEL-NEXT:    s_ff1_i32_b32 s0, s4
82; GFX10-GISEL-NEXT:    s_min_u32 s0, s0, 32
83; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s0
84; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[2:3]
85; GFX10-GISEL-NEXT:    s_endpgm
86  %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 false) nounwind readnone
87  store i32 %cttz, i32 addrspace(1)* %out, align 4
88  ret void
89}
90
; v_cttz_i32: i32 count-trailing-zeros of a value loaded from memory. Each
; work-item loads %valptr[workitem.id.x], calls llvm.cttz.i32 with the second
; operand false, and stores the result through %out (the store address is not
; indexed by the work-item id).
; The amdgcn runs are expected to use the vector form v_ffbl_b32 followed by
; v_min_u32 with 32; the r600 run uses FFBL_INT plus a CNDE_INT against the
; literal 32.
; The assertions below are autogenerated: regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
91define amdgpu_kernel void @v_cttz_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
92; SI-LABEL: v_cttz_i32:
93; SI:       ; %bb.0:
94; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
95; SI-NEXT:    s_mov_b32 s3, 0xf000
96; SI-NEXT:    s_mov_b32 s6, 0
97; SI-NEXT:    s_mov_b32 s7, s3
98; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
99; SI-NEXT:    v_mov_b32_e32 v1, 0
100; SI-NEXT:    s_waitcnt lgkmcnt(0)
101; SI-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
102; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
103; SI-NEXT:    s_mov_b32 s2, -1
104; SI-NEXT:    s_waitcnt vmcnt(0)
105; SI-NEXT:    v_ffbl_b32_e32 v0, v0
106; SI-NEXT:    v_min_u32_e32 v0, 32, v0
107; SI-NEXT:    s_waitcnt lgkmcnt(0)
108; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
109; SI-NEXT:    s_endpgm
110;
111; VI-LABEL: v_cttz_i32:
112; VI:       ; %bb.0:
113; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
114; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
115; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
116; VI-NEXT:    s_mov_b32 s7, 0xf000
117; VI-NEXT:    s_mov_b32 s6, -1
118; VI-NEXT:    s_waitcnt lgkmcnt(0)
119; VI-NEXT:    v_mov_b32_e32 v1, s1
120; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
121; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
122; VI-NEXT:    flat_load_dword v0, v[0:1]
123; VI-NEXT:    s_waitcnt vmcnt(0)
124; VI-NEXT:    v_ffbl_b32_e32 v0, v0
125; VI-NEXT:    v_min_u32_e32 v0, 32, v0
126; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
127; VI-NEXT:    s_endpgm
128;
129; EG-LABEL: v_cttz_i32:
130; EG:       ; %bb.0:
131; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
132; EG-NEXT:    TEX 0 @6
133; EG-NEXT:    ALU 3, @11, KC0[CB0:0-32], KC1[]
134; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
135; EG-NEXT:    CF_END
136; EG-NEXT:    PAD
137; EG-NEXT:    Fetch clause starting at 6:
138; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
139; EG-NEXT:    ALU clause starting at 8:
140; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
141; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
142; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
143; EG-NEXT:    ALU clause starting at 11:
144; EG-NEXT:     FFBL_INT * T0.W, T0.X,
145; EG-NEXT:     CNDE_INT T0.X, T0.X, literal.x, PV.W,
146; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
147; EG-NEXT:    32(4.484155e-44), 2(2.802597e-45)
148;
149; GFX10-LABEL: v_cttz_i32:
150; GFX10:       ; %bb.0:
151; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
152; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
153; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
154; GFX10-NEXT:    v_mov_b32_e32 v1, 0
155; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
156; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
157; GFX10-NEXT:    s_waitcnt vmcnt(0)
158; GFX10-NEXT:    v_ffbl_b32_e32 v0, v0
159; GFX10-NEXT:    v_min_u32_e32 v0, 32, v0
160; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
161; GFX10-NEXT:    s_endpgm
162;
163; GFX10-GISEL-LABEL: v_cttz_i32:
164; GFX10-GISEL:       ; %bb.0:
165; GFX10-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
166; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
167; GFX10-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
168; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
169; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
170; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
171; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
172; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v0, v0
173; GFX10-GISEL-NEXT:    v_min_u32_e32 v0, 32, v0
174; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
175; GFX10-GISEL-NEXT:    s_endpgm
176  %tid = call i32 @llvm.amdgcn.workitem.id.x()
177  %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
178  %val = load i32, i32 addrspace(1)* %in.gep, align 4
179  %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 false) nounwind readnone
180  store i32 %cttz, i32 addrspace(1)* %out, align 4
181  ret void
182}
183
; v_cttz_v2i32: <2 x i32> count-trailing-zeros of a vector loaded from memory.
; Each work-item loads the <2 x i32> at %valptr[workitem.id.x], calls
; llvm.cttz.v2i32 with the second operand false, and stores the vector result
; through %out.
; The expected output handles the two lanes independently: one v_ffbl_b32 and
; one v_min_u32 with 32 per element on the amdgcn runs, and one
; FFBL_INT/CNDE_INT pair per element on the r600 run.
; The assertions below are autogenerated: regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
184define amdgpu_kernel void @v_cttz_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind {
185; SI-LABEL: v_cttz_v2i32:
186; SI:       ; %bb.0:
187; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
188; SI-NEXT:    s_mov_b32 s3, 0xf000
189; SI-NEXT:    s_mov_b32 s6, 0
190; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
191; SI-NEXT:    v_mov_b32_e32 v1, 0
192; SI-NEXT:    s_mov_b32 s7, s3
193; SI-NEXT:    s_waitcnt lgkmcnt(0)
194; SI-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
195; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
196; SI-NEXT:    s_mov_b32 s2, -1
197; SI-NEXT:    s_waitcnt vmcnt(0)
198; SI-NEXT:    v_ffbl_b32_e32 v1, v1
199; SI-NEXT:    v_ffbl_b32_e32 v0, v0
200; SI-NEXT:    v_min_u32_e32 v1, 32, v1
201; SI-NEXT:    v_min_u32_e32 v0, 32, v0
202; SI-NEXT:    s_waitcnt lgkmcnt(0)
203; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
204; SI-NEXT:    s_endpgm
205;
206; VI-LABEL: v_cttz_v2i32:
207; VI:       ; %bb.0:
208; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
209; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
210; VI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
211; VI-NEXT:    s_mov_b32 s7, 0xf000
212; VI-NEXT:    s_mov_b32 s6, -1
213; VI-NEXT:    s_waitcnt lgkmcnt(0)
214; VI-NEXT:    v_mov_b32_e32 v1, s1
215; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
216; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
217; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
218; VI-NEXT:    s_waitcnt vmcnt(0)
219; VI-NEXT:    v_ffbl_b32_e32 v1, v1
220; VI-NEXT:    v_ffbl_b32_e32 v0, v0
221; VI-NEXT:    v_min_u32_e32 v1, 32, v1
222; VI-NEXT:    v_min_u32_e32 v0, 32, v0
223; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
224; VI-NEXT:    s_endpgm
225;
226; EG-LABEL: v_cttz_v2i32:
227; EG:       ; %bb.0:
228; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
229; EG-NEXT:    TEX 0 @6
230; EG-NEXT:    ALU 6, @11, KC0[CB0:0-32], KC1[]
231; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
232; EG-NEXT:    CF_END
233; EG-NEXT:    PAD
234; EG-NEXT:    Fetch clause starting at 6:
235; EG-NEXT:     VTX_READ_64 T0.XY, T0.X, 0, #1
236; EG-NEXT:    ALU clause starting at 8:
237; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
238; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
239; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
240; EG-NEXT:    ALU clause starting at 11:
241; EG-NEXT:     FFBL_INT * T0.W, T0.Y,
242; EG-NEXT:     CNDE_INT T0.Y, T0.Y, literal.x, PV.W,
243; EG-NEXT:     FFBL_INT * T0.W, T0.X,
244; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
245; EG-NEXT:     CNDE_INT T0.X, T0.X, literal.x, PV.W,
246; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
247; EG-NEXT:    32(4.484155e-44), 2(2.802597e-45)
248;
249; GFX10-LABEL: v_cttz_v2i32:
250; GFX10:       ; %bb.0:
251; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
252; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
253; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
254; GFX10-NEXT:    v_mov_b32_e32 v2, 0
255; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
256; GFX10-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3]
257; GFX10-NEXT:    s_waitcnt vmcnt(0)
258; GFX10-NEXT:    v_ffbl_b32_e32 v1, v1
259; GFX10-NEXT:    v_ffbl_b32_e32 v0, v0
260; GFX10-NEXT:    v_min_u32_e32 v1, 32, v1
261; GFX10-NEXT:    v_min_u32_e32 v0, 32, v0
262; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
263; GFX10-NEXT:    s_endpgm
264;
265; GFX10-GISEL-LABEL: v_cttz_v2i32:
266; GFX10-GISEL:       ; %bb.0:
267; GFX10-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
268; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
269; GFX10-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
270; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, 0
271; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
272; GFX10-GISEL-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3]
273; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
274; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v0, v0
275; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v1, v1
276; GFX10-GISEL-NEXT:    v_min_u32_e32 v0, 32, v0
277; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, 32, v1
278; GFX10-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
279; GFX10-GISEL-NEXT:    s_endpgm
280  %tid = call i32 @llvm.amdgcn.workitem.id.x()
281  %in.gep = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %valptr, i32 %tid
282  %val = load <2 x i32>, <2 x i32> addrspace(1)* %in.gep, align 8
283  %cttz = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %val, i1 false) nounwind readnone
284  store <2 x i32> %cttz, <2 x i32> addrspace(1)* %out, align 8
285  ret void
286}
287
; v_cttz_v4i32: <4 x i32> count-trailing-zeros of a vector loaded from memory.
; Each work-item loads the <4 x i32> at %valptr[workitem.id.x], calls
; llvm.cttz.v4i32 with the second operand false, and stores the vector result
; through %out.
; The expected output handles the four lanes independently: four v_ffbl_b32
; and four v_min_u32 with 32 on the amdgcn runs, and four FFBL_INT/CNDE_INT
; pairs on the r600 run.
; The assertions below are autogenerated: regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
288define amdgpu_kernel void @v_cttz_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind {
289; SI-LABEL: v_cttz_v4i32:
290; SI:       ; %bb.0:
291; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
292; SI-NEXT:    s_mov_b32 s3, 0xf000
293; SI-NEXT:    s_mov_b32 s6, 0
294; SI-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
295; SI-NEXT:    v_mov_b32_e32 v1, 0
296; SI-NEXT:    s_mov_b32 s7, s3
297; SI-NEXT:    s_waitcnt lgkmcnt(0)
298; SI-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
299; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
300; SI-NEXT:    s_mov_b32 s2, -1
301; SI-NEXT:    s_waitcnt vmcnt(0)
302; SI-NEXT:    v_ffbl_b32_e32 v3, v3
303; SI-NEXT:    v_ffbl_b32_e32 v2, v2
304; SI-NEXT:    v_ffbl_b32_e32 v1, v1
305; SI-NEXT:    v_ffbl_b32_e32 v0, v0
306; SI-NEXT:    v_min_u32_e32 v3, 32, v3
307; SI-NEXT:    v_min_u32_e32 v2, 32, v2
308; SI-NEXT:    v_min_u32_e32 v1, 32, v1
309; SI-NEXT:    v_min_u32_e32 v0, 32, v0
310; SI-NEXT:    s_waitcnt lgkmcnt(0)
311; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
312; SI-NEXT:    s_endpgm
313;
314; VI-LABEL: v_cttz_v4i32:
315; VI:       ; %bb.0:
316; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
317; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
318; VI-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
319; VI-NEXT:    s_mov_b32 s7, 0xf000
320; VI-NEXT:    s_mov_b32 s6, -1
321; VI-NEXT:    s_waitcnt lgkmcnt(0)
322; VI-NEXT:    v_mov_b32_e32 v1, s1
323; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
324; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
325; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
326; VI-NEXT:    s_waitcnt vmcnt(0)
327; VI-NEXT:    v_ffbl_b32_e32 v3, v3
328; VI-NEXT:    v_ffbl_b32_e32 v2, v2
329; VI-NEXT:    v_ffbl_b32_e32 v1, v1
330; VI-NEXT:    v_ffbl_b32_e32 v0, v0
331; VI-NEXT:    v_min_u32_e32 v3, 32, v3
332; VI-NEXT:    v_min_u32_e32 v2, 32, v2
333; VI-NEXT:    v_min_u32_e32 v1, 32, v1
334; VI-NEXT:    v_min_u32_e32 v0, 32, v0
335; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
336; VI-NEXT:    s_endpgm
337;
338; EG-LABEL: v_cttz_v4i32:
339; EG:       ; %bb.0:
340; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
341; EG-NEXT:    TEX 0 @6
342; EG-NEXT:    ALU 12, @11, KC0[CB0:0-32], KC1[]
343; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
344; EG-NEXT:    CF_END
345; EG-NEXT:    PAD
346; EG-NEXT:    Fetch clause starting at 6:
347; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
348; EG-NEXT:    ALU clause starting at 8:
349; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
350; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
351; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
352; EG-NEXT:    ALU clause starting at 11:
353; EG-NEXT:     FFBL_INT * T1.W, T0.W,
354; EG-NEXT:     FFBL_INT T2.W, T0.Z,
355; EG-NEXT:     CNDE_INT * T0.W, T0.W, literal.x, PV.W, BS:VEC_021/SCL_122
356; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
357; EG-NEXT:     CNDE_INT T0.Z, T0.Z, literal.x, PV.W,
358; EG-NEXT:     FFBL_INT * T1.W, T0.Y,
359; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
360; EG-NEXT:     CNDE_INT T0.Y, T0.Y, literal.x, PV.W,
361; EG-NEXT:     FFBL_INT * T1.W, T0.X,
362; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
363; EG-NEXT:     CNDE_INT T0.X, T0.X, literal.x, PV.W,
364; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
365; EG-NEXT:    32(4.484155e-44), 2(2.802597e-45)
366;
367; GFX10-LABEL: v_cttz_v4i32:
368; GFX10:       ; %bb.0:
369; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
370; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
371; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
372; GFX10-NEXT:    v_mov_b32_e32 v4, 0
373; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
374; GFX10-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3]
375; GFX10-NEXT:    s_waitcnt vmcnt(0)
376; GFX10-NEXT:    v_ffbl_b32_e32 v3, v3
377; GFX10-NEXT:    v_ffbl_b32_e32 v2, v2
378; GFX10-NEXT:    v_ffbl_b32_e32 v1, v1
379; GFX10-NEXT:    v_ffbl_b32_e32 v0, v0
380; GFX10-NEXT:    v_min_u32_e32 v3, 32, v3
381; GFX10-NEXT:    v_min_u32_e32 v2, 32, v2
382; GFX10-NEXT:    v_min_u32_e32 v1, 32, v1
383; GFX10-NEXT:    v_min_u32_e32 v0, 32, v0
384; GFX10-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
385; GFX10-NEXT:    s_endpgm
386;
387; GFX10-GISEL-LABEL: v_cttz_v4i32:
388; GFX10-GISEL:       ; %bb.0:
389; GFX10-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
390; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
391; GFX10-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
392; GFX10-GISEL-NEXT:    v_mov_b32_e32 v4, 0
393; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
394; GFX10-GISEL-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3]
395; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
396; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v0, v0
397; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v1, v1
398; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v2, v2
399; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v3, v3
400; GFX10-GISEL-NEXT:    v_min_u32_e32 v0, 32, v0
401; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, 32, v1
402; GFX10-GISEL-NEXT:    v_min_u32_e32 v2, 32, v2
403; GFX10-GISEL-NEXT:    v_min_u32_e32 v3, 32, v3
404; GFX10-GISEL-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
405; GFX10-GISEL-NEXT:    s_endpgm
406  %tid = call i32 @llvm.amdgcn.workitem.id.x()
407  %in.gep = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %valptr, i32 %tid
408  %val = load <4 x i32>, <4 x i32> addrspace(1)* %in.gep, align 16
409  %cttz = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %val, i1 false) nounwind readnone
410  store <4 x i32> %cttz, <4 x i32> addrspace(1)* %out, align 16
411  ret void
412}
413
; v_cttz_i8: i8 count-trailing-zeros of a byte loaded from memory. The IR loads
; one i8 directly from %valptr (no work-item indexing), calls llvm.cttz.i8 with
; the second operand false, and stores the i8 result through %out.
; Unlike the i32 tests there is no min/select in the expected output: every
; run ORs the zero-extended byte with 0x100 (bit 8) before the 32-bit
; find-first-bit instruction, so an all-zero byte should come out as 8.
; The r600 run additionally expects a masked byte store (MEM_RAT MSKOR) built
; from the low two address bits.
; The assertions below are autogenerated: regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
414define amdgpu_kernel void @v_cttz_i8(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind {
415; SI-LABEL: v_cttz_i8:
416; SI:       ; %bb.0:
417; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
418; SI-NEXT:    s_mov_b32 s3, 0xf000
419; SI-NEXT:    s_mov_b32 s2, -1
420; SI-NEXT:    s_mov_b32 s6, s2
421; SI-NEXT:    s_mov_b32 s7, s3
422; SI-NEXT:    s_waitcnt lgkmcnt(0)
423; SI-NEXT:    buffer_load_ubyte v0, off, s[4:7], 0
424; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
425; SI-NEXT:    s_waitcnt vmcnt(0)
426; SI-NEXT:    v_or_b32_e32 v0, 0x100, v0
427; SI-NEXT:    v_ffbl_b32_e32 v0, v0
428; SI-NEXT:    s_waitcnt lgkmcnt(0)
429; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
430; SI-NEXT:    s_endpgm
431;
432; VI-LABEL: v_cttz_i8:
433; VI:       ; %bb.0:
434; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
435; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
436; VI-NEXT:    s_mov_b32 s7, 0xf000
437; VI-NEXT:    s_mov_b32 s6, -1
438; VI-NEXT:    s_mov_b32 s2, s6
439; VI-NEXT:    s_mov_b32 s3, s7
440; VI-NEXT:    s_waitcnt lgkmcnt(0)
441; VI-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0
442; VI-NEXT:    s_waitcnt vmcnt(0)
443; VI-NEXT:    v_or_b32_e32 v0, 0x100, v0
444; VI-NEXT:    v_ffbl_b32_e32 v0, v0
445; VI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
446; VI-NEXT:    s_endpgm
447;
448; EG-LABEL: v_cttz_i8:
449; EG:       ; %bb.0:
450; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
451; EG-NEXT:    TEX 0 @6
452; EG-NEXT:    ALU 14, @9, KC0[CB0:0-32], KC1[]
453; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
454; EG-NEXT:    CF_END
455; EG-NEXT:    PAD
456; EG-NEXT:    Fetch clause starting at 6:
457; EG-NEXT:     VTX_READ_8 T0.X, T0.X, 0, #1
458; EG-NEXT:    ALU clause starting at 8:
459; EG-NEXT:     MOV * T0.X, KC0[2].Z,
460; EG-NEXT:    ALU clause starting at 9:
461; EG-NEXT:     OR_INT * T0.W, T0.X, literal.x,
462; EG-NEXT:    256(3.587324e-43), 0(0.000000e+00)
463; EG-NEXT:     FFBL_INT T0.W, PV.W,
464; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.x,
465; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
466; EG-NEXT:     AND_INT T0.W, PV.W, literal.x,
467; EG-NEXT:     LSHL * T1.W, PS, literal.y,
468; EG-NEXT:    255(3.573311e-43), 3(4.203895e-45)
469; EG-NEXT:     LSHL T0.X, PV.W, PS,
470; EG-NEXT:     LSHL * T0.W, literal.x, PS,
471; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
472; EG-NEXT:     MOV T0.Y, 0.0,
473; EG-NEXT:     MOV * T0.Z, 0.0,
474; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
475; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
476;
477; GFX10-LABEL: v_cttz_i8:
478; GFX10:       ; %bb.0:
479; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
480; GFX10-NEXT:    v_mov_b32_e32 v0, 0
481; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
482; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
483; GFX10-NEXT:    global_load_ubyte v1, v0, s[2:3]
484; GFX10-NEXT:    s_waitcnt vmcnt(0)
485; GFX10-NEXT:    v_or_b32_e32 v1, 0x100, v1
486; GFX10-NEXT:    v_ffbl_b32_e32 v1, v1
487; GFX10-NEXT:    global_store_byte v0, v1, s[0:1]
488; GFX10-NEXT:    s_endpgm
489;
490; GFX10-GISEL-LABEL: v_cttz_i8:
491; GFX10-GISEL:       ; %bb.0:
492; GFX10-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
493; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, 0
494; GFX10-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
495; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
496; GFX10-GISEL-NEXT:    global_load_ubyte v1, v0, s[2:3]
497; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
498; GFX10-GISEL-NEXT:    v_or_b32_e32 v1, 0x100, v1
499; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v1, v1
500; GFX10-GISEL-NEXT:    global_store_byte v0, v1, s[0:1]
501; GFX10-GISEL-NEXT:    s_endpgm
502  %val = load i8, i8 addrspace(1)* %valptr
503  %cttz = call i8 @llvm.cttz.i8(i8 %val, i1 false) nounwind readnone
504  store i8 %cttz, i8 addrspace(1)* %out
505  ret void
506}
507
; s_cttz_i64: i64 count-trailing-zeros of a value passed directly as a kernel
; argument. An unnamed [8 x i32] argument sits between %out and %val, which
; moves %val to a later kernarg offset (0x13 / 0x4c in the loads below). The IR
; calls llvm.cttz.i64 with the second operand false and stores the i64 result
; through %out.
; Expected lowering on the SelectionDAG amdgcn runs: the value is split into
; two 32-bit halves, each gets s_ff1_i32_b32, 32 is added to the high-half
; result without wrapping (s_min_u32 with 0xffffffdf then s_add_i32 on the
; default-cpu run, a clamped add on tonga and gfx1010), and v_min3_u32 with 64
; picks the final count; the upper dword of the stored result is a constant 0.
; The GlobalISel run instead expects a single s_ff1_i32_b64 followed by
; s_min_u32 with 64.
; The assertions below are autogenerated: regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
508define amdgpu_kernel void @s_cttz_i64(i64 addrspace(1)* noalias %out, [8 x i32], i64 %val) nounwind {
509; SI-LABEL: s_cttz_i64:
510; SI:       ; %bb.0:
511; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x13
512; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
513; SI-NEXT:    s_mov_b32 s3, 0xf000
514; SI-NEXT:    s_mov_b32 s2, -1
515; SI-NEXT:    s_waitcnt lgkmcnt(0)
516; SI-NEXT:    s_ff1_i32_b32 s5, s5
517; SI-NEXT:    s_min_u32 s5, s5, 0xffffffdf
518; SI-NEXT:    s_add_i32 s5, s5, 32
519; SI-NEXT:    s_ff1_i32_b32 s4, s4
520; SI-NEXT:    v_mov_b32_e32 v0, s5
521; SI-NEXT:    v_min3_u32 v0, s4, v0, 64
522; SI-NEXT:    v_mov_b32_e32 v1, 0
523; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
524; SI-NEXT:    s_endpgm
525;
526; VI-LABEL: s_cttz_i64:
527; VI:       ; %bb.0:
528; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
529; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x4c
530; VI-NEXT:    s_mov_b32 s7, 0xf000
531; VI-NEXT:    s_mov_b32 s6, -1
532; VI-NEXT:    v_mov_b32_e32 v1, 0
533; VI-NEXT:    s_waitcnt lgkmcnt(0)
534; VI-NEXT:    s_ff1_i32_b32 s1, s1
535; VI-NEXT:    v_add_u32_e64 v0, s[2:3], s1, 32 clamp
536; VI-NEXT:    s_ff1_i32_b32 s0, s0
537; VI-NEXT:    v_min3_u32 v0, s0, v0, 64
538; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
539; VI-NEXT:    s_endpgm
540;
541; EG-LABEL: s_cttz_i64:
542; EG:       ; %bb.0:
543; EG-NEXT:    ALU 9, @4, KC0[CB0:0-32], KC1[]
544; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
545; EG-NEXT:    CF_END
546; EG-NEXT:    PAD
547; EG-NEXT:    ALU clause starting at 4:
548; EG-NEXT:     FFBL_INT * T0.W, KC0[5].X,
549; EG-NEXT:     CNDE_INT * T0.W, KC0[5].X, literal.x, PV.W,
550; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
551; EG-NEXT:     FFBL_INT T1.W, KC0[4].W,
552; EG-NEXT:     ADD_INT * T0.W, PV.W, literal.x,
553; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
554; EG-NEXT:     CNDE_INT T0.X, KC0[4].W, PS, PV.W,
555; EG-NEXT:     MOV T0.Y, 0.0,
556; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
557; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
558;
559; GFX10-LABEL: s_cttz_i64:
560; GFX10:       ; %bb.0:
561; GFX10-NEXT:    s_clause 0x1
562; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x4c
563; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
564; GFX10-NEXT:    v_mov_b32_e32 v1, 0
565; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
566; GFX10-NEXT:    s_ff1_i32_b32 s0, s3
567; GFX10-NEXT:    v_add_nc_u32_e64 v0, s0, 32 clamp
568; GFX10-NEXT:    s_ff1_i32_b32 s0, s2
569; GFX10-NEXT:    v_min3_u32 v0, s0, v0, 64
570; GFX10-NEXT:    global_store_dwordx2 v1, v[0:1], s[4:5]
571; GFX10-NEXT:    s_endpgm
572;
573; GFX10-GISEL-LABEL: s_cttz_i64:
574; GFX10-GISEL:       ; %bb.0:
575; GFX10-GISEL-NEXT:    s_clause 0x1
576; GFX10-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x4c
577; GFX10-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
578; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, 0
579; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
580; GFX10-GISEL-NEXT:    s_ff1_i32_b64 s0, s[2:3]
581; GFX10-GISEL-NEXT:    s_min_u32 s0, s0, 64
582; GFX10-GISEL-NEXT:    s_bfe_u64 s[0:1], s[0:1], 0x200000
583; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s0
584; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s1
585; GFX10-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
586; GFX10-GISEL-NEXT:    s_endpgm
587  %cttz = call i64 @llvm.cttz.i64(i64 %val, i1 false)
588  store i64 %cttz, i64 addrspace(1)* %out
589  ret void
590}
591
; s_cttz_i64_trunc: same kernel-argument i64 cttz as s_cttz_i64 (second operand
; false), but the i64 result is truncated to i32 before being stored through
; %out, and there is no padding argument in front of %val.
; Compared with the untruncated test, the expected output keeps the
; split-halves / min-with-64 sequence but only stores a single dword, so the
; constant-zero upper half is never materialised. The GlobalISel run likewise
; drops the s_bfe_u64 and second v_mov that the untruncated test expects.
; The assertions below are autogenerated: regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
592define amdgpu_kernel void @s_cttz_i64_trunc(i32 addrspace(1)* noalias %out, i64 %val) nounwind {
593; SI-LABEL: s_cttz_i64_trunc:
594; SI:       ; %bb.0:
595; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
596; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
597; SI-NEXT:    s_mov_b32 s3, 0xf000
598; SI-NEXT:    s_mov_b32 s2, -1
599; SI-NEXT:    s_waitcnt lgkmcnt(0)
600; SI-NEXT:    s_ff1_i32_b32 s5, s5
601; SI-NEXT:    s_min_u32 s5, s5, 0xffffffdf
602; SI-NEXT:    s_add_i32 s5, s5, 32
603; SI-NEXT:    s_ff1_i32_b32 s4, s4
604; SI-NEXT:    v_mov_b32_e32 v0, s5
605; SI-NEXT:    v_min3_u32 v0, s4, v0, 64
606; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
607; SI-NEXT:    s_endpgm
608;
609; VI-LABEL: s_cttz_i64_trunc:
610; VI:       ; %bb.0:
611; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
612; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
613; VI-NEXT:    s_mov_b32 s7, 0xf000
614; VI-NEXT:    s_mov_b32 s6, -1
615; VI-NEXT:    s_waitcnt lgkmcnt(0)
616; VI-NEXT:    s_ff1_i32_b32 s1, s1
617; VI-NEXT:    v_add_u32_e64 v0, s[2:3], s1, 32 clamp
618; VI-NEXT:    s_ff1_i32_b32 s0, s0
619; VI-NEXT:    v_min3_u32 v0, s0, v0, 64
620; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
621; VI-NEXT:    s_endpgm
622;
623; EG-LABEL: s_cttz_i64_trunc:
624; EG:       ; %bb.0:
625; EG-NEXT:    ALU 8, @4, KC0[CB0:0-32], KC1[]
626; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
627; EG-NEXT:    CF_END
628; EG-NEXT:    PAD
629; EG-NEXT:    ALU clause starting at 4:
630; EG-NEXT:     FFBL_INT * T0.W, KC0[3].X,
631; EG-NEXT:     CNDE_INT * T0.W, KC0[3].X, literal.x, PV.W,
632; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
633; EG-NEXT:     FFBL_INT T1.W, KC0[2].W,
634; EG-NEXT:     ADD_INT * T0.W, PV.W, literal.x,
635; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
636; EG-NEXT:     CNDE_INT T0.X, KC0[2].W, PS, PV.W,
637; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
638; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
639;
640; GFX10-LABEL: s_cttz_i64_trunc:
641; GFX10:       ; %bb.0:
642; GFX10-NEXT:    s_clause 0x1
643; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
644; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
645; GFX10-NEXT:    v_mov_b32_e32 v1, 0
646; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
647; GFX10-NEXT:    s_ff1_i32_b32 s0, s3
648; GFX10-NEXT:    v_add_nc_u32_e64 v0, s0, 32 clamp
649; GFX10-NEXT:    s_ff1_i32_b32 s0, s2
650; GFX10-NEXT:    v_min3_u32 v0, s0, v0, 64
651; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
652; GFX10-NEXT:    s_endpgm
653;
654; GFX10-GISEL-LABEL: s_cttz_i64_trunc:
655; GFX10-GISEL:       ; %bb.0:
656; GFX10-GISEL-NEXT:    s_clause 0x1
657; GFX10-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
658; GFX10-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
659; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
660; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
661; GFX10-GISEL-NEXT:    s_ff1_i32_b64 s0, s[2:3]
662; GFX10-GISEL-NEXT:    s_min_u32 s0, s0, 64
663; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s0
664; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
665; GFX10-GISEL-NEXT:    s_endpgm
666  %cttz = call i64 @llvm.cttz.i64(i64 %val, i1 false)
667  %trunc = trunc i64 %cttz to i32
668  store i32 %trunc, i32 addrspace(1)* %out
669  ret void
670}
671
; v_cttz_i64: i64 count-trailing-zeros of a value loaded from memory. Each
; work-item loads %in[workitem.id.x], calls llvm.cttz.i64 with the second
; operand false, and stores the i64 result to %out[workitem.id.x] (both the
; load and the store are indexed by the work-item id).
; Expected lowering on the amdgcn runs: v_ffbl_b32 on each 32-bit half, 32
; added to the high-half result without wrapping (v_min_u32 with 0xffffffdf
; then v_add_i32 on the default-cpu run, a clamped add elsewhere), then a
; minimum against the low-half result and 64. The SelectionDAG runs expect a
; single v_min3_u32; the GlobalISel run expects two separate v_min_u32. The
; upper dword of the stored result is a constant 0.
; The assertions below are autogenerated: regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
672define amdgpu_kernel void @v_cttz_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
673; SI-LABEL: v_cttz_i64:
674; SI:       ; %bb.0:
675; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
676; SI-NEXT:    s_mov_b32 s7, 0xf000
677; SI-NEXT:    s_mov_b32 s6, 0
678; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
679; SI-NEXT:    v_mov_b32_e32 v1, 0
680; SI-NEXT:    s_waitcnt lgkmcnt(0)
681; SI-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
682; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
683; SI-NEXT:    s_waitcnt vmcnt(0)
684; SI-NEXT:    v_ffbl_b32_e32 v3, v3
685; SI-NEXT:    v_min_u32_e32 v3, 0xffffffdf, v3
686; SI-NEXT:    v_add_i32_e32 v3, vcc, 32, v3
687; SI-NEXT:    v_ffbl_b32_e32 v2, v2
688; SI-NEXT:    v_min3_u32 v2, v2, v3, 64
689; SI-NEXT:    v_mov_b32_e32 v3, v1
690; SI-NEXT:    s_waitcnt lgkmcnt(0)
691; SI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
692; SI-NEXT:    s_endpgm
693;
694; VI-LABEL: v_cttz_i64:
695; VI:       ; %bb.0:
696; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
697; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
698; VI-NEXT:    v_lshlrev_b32_e32 v3, 3, v0
699; VI-NEXT:    v_mov_b32_e32 v2, 0
700; VI-NEXT:    s_waitcnt lgkmcnt(0)
701; VI-NEXT:    v_mov_b32_e32 v4, s3
702; VI-NEXT:    v_mov_b32_e32 v1, s1
703; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v3
704; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
705; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
706; VI-NEXT:    v_add_u32_e32 v3, vcc, s2, v3
707; VI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
708; VI-NEXT:    s_waitcnt vmcnt(0)
709; VI-NEXT:    v_ffbl_b32_e32 v1, v1
710; VI-NEXT:    v_add_u32_e64 v1, s[0:1], v1, 32 clamp
711; VI-NEXT:    v_ffbl_b32_e32 v0, v0
712; VI-NEXT:    v_min3_u32 v1, v0, v1, 64
713; VI-NEXT:    flat_store_dwordx2 v[3:4], v[1:2]
714; VI-NEXT:    s_endpgm
715;
716; EG-LABEL: v_cttz_i64:
717; EG:       ; %bb.0:
718; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
719; EG-NEXT:    TEX 0 @6
720; EG-NEXT:    ALU 10, @11, KC0[CB0:0-32], KC1[]
721; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
722; EG-NEXT:    CF_END
723; EG-NEXT:    PAD
724; EG-NEXT:    Fetch clause starting at 6:
725; EG-NEXT:     VTX_READ_64 T0.XY, T0.X, 0, #1
726; EG-NEXT:    ALU clause starting at 8:
727; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
728; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
729; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
730; EG-NEXT:    ALU clause starting at 11:
731; EG-NEXT:     FFBL_INT * T1.W, T0.Y,
732; EG-NEXT:     CNDE_INT * T1.W, T0.Y, literal.x, PV.W,
733; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
734; EG-NEXT:     FFBL_INT T2.W, T0.X,
735; EG-NEXT:     ADD_INT * T1.W, PV.W, literal.x,
736; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
737; EG-NEXT:     CNDE_INT T0.X, T0.X, PS, PV.W,
738; EG-NEXT:     MOV T0.Y, 0.0,
739; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, T0.W,
740; EG-NEXT:     LSHR * T1.X, PV.W, literal.x,
741; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
742;
743; GFX10-LABEL: v_cttz_i64:
744; GFX10:       ; %bb.0:
745; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
746; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
747; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
748; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
749; GFX10-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
750; GFX10-NEXT:    s_waitcnt vmcnt(0)
751; GFX10-NEXT:    v_ffbl_b32_e32 v1, v1
752; GFX10-NEXT:    v_ffbl_b32_e32 v0, v0
753; GFX10-NEXT:    v_add_nc_u32_e64 v1, v1, 32 clamp
754; GFX10-NEXT:    v_min3_u32 v0, v0, v1, 64
755; GFX10-NEXT:    v_mov_b32_e32 v1, 0
756; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
757; GFX10-NEXT:    s_endpgm
758;
759; GFX10-GISEL-LABEL: v_cttz_i64:
760; GFX10-GISEL:       ; %bb.0:
761; GFX10-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
762; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
763; GFX10-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
764; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
765; GFX10-GISEL-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
766; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
767; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v1, v1
768; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v0, v0
769; GFX10-GISEL-NEXT:    v_add_nc_u32_e64 v1, v1, 32 clamp
770; GFX10-GISEL-NEXT:    v_min_u32_e32 v0, v0, v1
771; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
772; GFX10-GISEL-NEXT:    v_min_u32_e32 v0, 64, v0
773; GFX10-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
774; GFX10-GISEL-NEXT:    s_endpgm
775  %tid = call i32 @llvm.amdgcn.workitem.id.x()
776  %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
777  %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
778  %val = load i64, i64 addrspace(1)* %in.gep
779  %cttz = call i64 @llvm.cttz.i64(i64 %val, i1 false)
780  store i64 %cttz, i64 addrspace(1)* %out.gep
781  ret void
782}
783
; Per-workitem i64 cttz whose result is truncated to i32 before being stored.
; Each workitem loads an i64 from %in[tid], counts trailing zeros with the
; zero-input case defined (second intrinsic operand is i1 false), truncates the
; count to i32 and stores it to %out[tid].
; In the expected SI, VI and GFX10 output the 64-bit count is built from two
; 32-bit v_ffbl_b32 results: the high-half count gets 32 added with saturation
; and is then combined with the low-half count by a min against 64. Only a
; single dword is stored.
784define amdgpu_kernel void @v_cttz_i64_trunc(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
785; SI-LABEL: v_cttz_i64_trunc:
786; SI:       ; %bb.0:
787; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
788; SI-NEXT:    s_mov_b32 s7, 0xf000
789; SI-NEXT:    s_mov_b32 s6, 0
790; SI-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
791; SI-NEXT:    v_mov_b32_e32 v2, 0
792; SI-NEXT:    s_waitcnt lgkmcnt(0)
793; SI-NEXT:    buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
794; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
795; SI-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
796; SI-NEXT:    s_waitcnt vmcnt(0)
797; SI-NEXT:    v_ffbl_b32_e32 v0, v4
798; SI-NEXT:    v_min_u32_e32 v0, 0xffffffdf, v0
799; SI-NEXT:    v_add_i32_e32 v0, vcc, 32, v0
800; SI-NEXT:    v_ffbl_b32_e32 v3, v3
801; SI-NEXT:    v_min3_u32 v0, v3, v0, 64
802; SI-NEXT:    s_waitcnt lgkmcnt(0)
803; SI-NEXT:    buffer_store_dword v0, v[1:2], s[4:7], 0 addr64
804; SI-NEXT:    s_endpgm
805;
806; VI-LABEL: v_cttz_i64_trunc:
807; VI:       ; %bb.0:
808; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
809; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
810; VI-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
811; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
812; VI-NEXT:    s_waitcnt lgkmcnt(0)
813; VI-NEXT:    v_mov_b32_e32 v4, s3
814; VI-NEXT:    v_mov_b32_e32 v2, s1
815; VI-NEXT:    v_add_u32_e32 v1, vcc, s0, v1
816; VI-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
817; VI-NEXT:    flat_load_dwordx2 v[1:2], v[1:2]
818; VI-NEXT:    v_add_u32_e32 v3, vcc, s2, v0
819; VI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
820; VI-NEXT:    s_waitcnt vmcnt(0)
821; VI-NEXT:    v_ffbl_b32_e32 v0, v2
822; VI-NEXT:    v_add_u32_e64 v0, s[0:1], v0, 32 clamp
823; VI-NEXT:    v_ffbl_b32_e32 v1, v1
824; VI-NEXT:    v_min3_u32 v0, v1, v0, 64
825; VI-NEXT:    flat_store_dword v[3:4], v0
826; VI-NEXT:    s_endpgm
827;
828; EG-LABEL: v_cttz_i64_trunc:
829; EG:       ; %bb.0:
830; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
831; EG-NEXT:    TEX 0 @6
832; EG-NEXT:    ALU 10, @11, KC0[CB0:0-32], KC1[]
833; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
834; EG-NEXT:    CF_END
835; EG-NEXT:    PAD
836; EG-NEXT:    Fetch clause starting at 6:
837; EG-NEXT:     VTX_READ_64 T1.XY, T1.X, 0, #1
838; EG-NEXT:    ALU clause starting at 8:
839; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
840; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
841; EG-NEXT:     ADD_INT * T1.X, KC0[2].Z, PV.W,
842; EG-NEXT:    ALU clause starting at 11:
843; EG-NEXT:     FFBL_INT * T0.W, T1.Y,
844; EG-NEXT:     CNDE_INT * T0.W, T1.Y, literal.x, PV.W,
845; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
846; EG-NEXT:     LSHL T0.Z, T0.X, literal.x,
847; EG-NEXT:     FFBL_INT T1.W, T1.X, BS:VEC_120/SCL_212
848; EG-NEXT:     ADD_INT * T0.W, PV.W, literal.y,
849; EG-NEXT:    2(2.802597e-45), 32(4.484155e-44)
850; EG-NEXT:     CNDE_INT T0.X, T1.X, PS, PV.W,
851; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, PV.Z,
852; EG-NEXT:     LSHR * T1.X, PV.W, literal.x,
853; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
854;
855; GFX10-LABEL: v_cttz_i64_trunc:
856; GFX10:       ; %bb.0:
857; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
858; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
859; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
860; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
861; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
862; GFX10-NEXT:    global_load_dwordx2 v[1:2], v1, s[2:3]
863; GFX10-NEXT:    s_waitcnt vmcnt(0)
864; GFX10-NEXT:    v_ffbl_b32_e32 v2, v2
865; GFX10-NEXT:    v_ffbl_b32_e32 v1, v1
866; GFX10-NEXT:    v_add_nc_u32_e64 v2, v2, 32 clamp
867; GFX10-NEXT:    v_min3_u32 v1, v1, v2, 64
868; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
869; GFX10-NEXT:    s_endpgm
870;
871; GFX10-GISEL-LABEL: v_cttz_i64_trunc:
872; GFX10-GISEL:       ; %bb.0:
873; GFX10-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
874; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
875; GFX10-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
876; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
877; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
878; GFX10-GISEL-NEXT:    global_load_dwordx2 v[1:2], v1, s[2:3]
879; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
880; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v2, v2
881; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v1, v1
882; GFX10-GISEL-NEXT:    v_add_nc_u32_e64 v2, v2, 32 clamp
883; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, v1, v2
884; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, 64, v1
885; GFX10-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
886; GFX10-GISEL-NEXT:    s_endpgm
887  %tid = call i32 @llvm.amdgcn.workitem.id.x()
888  %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
889  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
890  %val = load i64, i64 addrspace(1)* %in.gep
  ; i1 false: a zero input is well defined and yields the bit width (64).
891  %cttz = call i64 @llvm.cttz.i64(i64 %val, i1 false)
  ; Only the low 32 bits of the count are stored.
892  %trunc = trunc i64 %cttz to i32
893  store i32 %trunc, i32 addrspace(1)* %out.gep
894  ret void
895}
896
; i32 cttz combined with a select that maps a zero input to -1:
;   %sel = (%val == 0) ? -1 : cttz(%val)
; %val is loaded per workitem from %valptr[tid]; the result is stored to %out
; itself, with no per-workitem offset.
; The SI, VI and GFX10 checks expect the compare and select to fold away,
; leaving a bare v_ffbl_b32 (presumably because the instruction already yields
; -1 for a zero input). The GFX10-GISEL checks still contain v_cmp_eq_u32 and
; v_cndmask_b32, and the EG checks keep a CNDE_INT against the -1 literal.
897define amdgpu_kernel void @v_cttz_i32_sel_eq_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
898; SI-LABEL: v_cttz_i32_sel_eq_neg1:
899; SI:       ; %bb.0:
900; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
901; SI-NEXT:    s_mov_b32 s3, 0xf000
902; SI-NEXT:    s_mov_b32 s6, 0
903; SI-NEXT:    s_mov_b32 s7, s3
904; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
905; SI-NEXT:    v_mov_b32_e32 v1, 0
906; SI-NEXT:    s_waitcnt lgkmcnt(0)
907; SI-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
908; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
909; SI-NEXT:    s_mov_b32 s2, -1
910; SI-NEXT:    s_waitcnt vmcnt(0)
911; SI-NEXT:    v_ffbl_b32_e32 v0, v0
912; SI-NEXT:    s_waitcnt lgkmcnt(0)
913; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
914; SI-NEXT:    s_endpgm
915;
916; VI-LABEL: v_cttz_i32_sel_eq_neg1:
917; VI:       ; %bb.0:
918; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
919; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
920; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
921; VI-NEXT:    s_mov_b32 s7, 0xf000
922; VI-NEXT:    s_mov_b32 s6, -1
923; VI-NEXT:    s_waitcnt lgkmcnt(0)
924; VI-NEXT:    v_mov_b32_e32 v1, s1
925; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
926; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
927; VI-NEXT:    flat_load_dword v0, v[0:1]
928; VI-NEXT:    s_waitcnt vmcnt(0)
929; VI-NEXT:    v_ffbl_b32_e32 v0, v0
930; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
931; VI-NEXT:    s_endpgm
932;
933; EG-LABEL: v_cttz_i32_sel_eq_neg1:
934; EG:       ; %bb.0:
935; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
936; EG-NEXT:    TEX 0 @6
937; EG-NEXT:    ALU 5, @11, KC0[CB0:0-32], KC1[]
938; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
939; EG-NEXT:    CF_END
940; EG-NEXT:    PAD
941; EG-NEXT:    Fetch clause starting at 6:
942; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
943; EG-NEXT:    ALU clause starting at 8:
944; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
945; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
946; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
947; EG-NEXT:    ALU clause starting at 11:
948; EG-NEXT:     FFBL_INT * T0.W, T0.X,
949; EG-NEXT:     CNDE_INT * T0.W, T0.X, literal.x, PV.W,
950; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
951; EG-NEXT:     CNDE_INT T0.X, T0.X, literal.x, PV.W,
952; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
953; EG-NEXT:    -1(nan), 2(2.802597e-45)
954;
955; GFX10-LABEL: v_cttz_i32_sel_eq_neg1:
956; GFX10:       ; %bb.0:
957; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
958; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
959; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
960; GFX10-NEXT:    v_mov_b32_e32 v1, 0
961; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
962; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
963; GFX10-NEXT:    s_waitcnt vmcnt(0)
964; GFX10-NEXT:    v_ffbl_b32_e32 v0, v0
965; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
966; GFX10-NEXT:    s_endpgm
967;
968; GFX10-GISEL-LABEL: v_cttz_i32_sel_eq_neg1:
969; GFX10-GISEL:       ; %bb.0:
970; GFX10-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
971; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
972; GFX10-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
973; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
974; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
975; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
976; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v1, v0
977; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
978; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, 32, v1
979; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, -1, vcc_lo
980; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
981; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
982; GFX10-GISEL-NEXT:    s_endpgm
983  %tid = call i32 @llvm.amdgcn.workitem.id.x()
984  %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
985  %val = load i32, i32 addrspace(1)* %in.gep
  ; i1 false: cttz(0) is defined (32); the select below overrides it with -1.
986  %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 false) nounwind readnone
987  %cmp = icmp eq i32 %val, 0
988  %sel = select i1 %cmp, i32 -1, i32 %cttz
989  store i32 %sel, i32 addrspace(1)* %out
990  ret void
991}
992
; Same zero-to-minus-one pattern as the "eq" form, written with the inverted
; predicate and swapped select operands:
;   %sel = (%val != 0) ? cttz(%val) : -1
; %val is loaded per workitem from %valptr[tid]; the result is stored to %out
; itself, with no per-workitem offset.
; The SI, VI and GFX10 checks expect only a v_ffbl_b32 between the load and the
; store. The GFX10-GISEL checks keep v_cmp_ne_u32 and v_cndmask_b32, and the EG
; checks keep a CNDE_INT against the -1 literal.
993define amdgpu_kernel void @v_cttz_i32_sel_ne_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
994; SI-LABEL: v_cttz_i32_sel_ne_neg1:
995; SI:       ; %bb.0:
996; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
997; SI-NEXT:    s_mov_b32 s3, 0xf000
998; SI-NEXT:    s_mov_b32 s6, 0
999; SI-NEXT:    s_mov_b32 s7, s3
1000; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1001; SI-NEXT:    v_mov_b32_e32 v1, 0
1002; SI-NEXT:    s_waitcnt lgkmcnt(0)
1003; SI-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
1004; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1005; SI-NEXT:    s_mov_b32 s2, -1
1006; SI-NEXT:    s_waitcnt vmcnt(0)
1007; SI-NEXT:    v_ffbl_b32_e32 v0, v0
1008; SI-NEXT:    s_waitcnt lgkmcnt(0)
1009; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1010; SI-NEXT:    s_endpgm
1011;
1012; VI-LABEL: v_cttz_i32_sel_ne_neg1:
1013; VI:       ; %bb.0:
1014; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
1015; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
1016; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1017; VI-NEXT:    s_mov_b32 s7, 0xf000
1018; VI-NEXT:    s_mov_b32 s6, -1
1019; VI-NEXT:    s_waitcnt lgkmcnt(0)
1020; VI-NEXT:    v_mov_b32_e32 v1, s1
1021; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
1022; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1023; VI-NEXT:    flat_load_dword v0, v[0:1]
1024; VI-NEXT:    s_waitcnt vmcnt(0)
1025; VI-NEXT:    v_ffbl_b32_e32 v0, v0
1026; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1027; VI-NEXT:    s_endpgm
1028;
1029; EG-LABEL: v_cttz_i32_sel_ne_neg1:
1030; EG:       ; %bb.0:
1031; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
1032; EG-NEXT:    TEX 0 @6
1033; EG-NEXT:    ALU 5, @11, KC0[CB0:0-32], KC1[]
1034; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1035; EG-NEXT:    CF_END
1036; EG-NEXT:    PAD
1037; EG-NEXT:    Fetch clause starting at 6:
1038; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
1039; EG-NEXT:    ALU clause starting at 8:
1040; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
1041; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1042; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
1043; EG-NEXT:    ALU clause starting at 11:
1044; EG-NEXT:     FFBL_INT * T0.W, T0.X,
1045; EG-NEXT:     CNDE_INT * T0.W, T0.X, literal.x, PV.W,
1046; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
1047; EG-NEXT:     CNDE_INT T0.X, T0.X, literal.x, PV.W,
1048; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
1049; EG-NEXT:    -1(nan), 2(2.802597e-45)
1050;
1051; GFX10-LABEL: v_cttz_i32_sel_ne_neg1:
1052; GFX10:       ; %bb.0:
1053; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
1054; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1055; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1056; GFX10-NEXT:    v_mov_b32_e32 v1, 0
1057; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1058; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
1059; GFX10-NEXT:    s_waitcnt vmcnt(0)
1060; GFX10-NEXT:    v_ffbl_b32_e32 v0, v0
1061; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
1062; GFX10-NEXT:    s_endpgm
1063;
1064; GFX10-GISEL-LABEL: v_cttz_i32_sel_ne_neg1:
1065; GFX10-GISEL:       ; %bb.0:
1066; GFX10-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
1067; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1068; GFX10-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1069; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
1070; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
1071; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
1072; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v1, v0
1073; GFX10-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
1074; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, 32, v1
1075; GFX10-GISEL-NEXT:    v_cndmask_b32_e32 v0, -1, v1, vcc_lo
1076; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
1077; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
1078; GFX10-GISEL-NEXT:    s_endpgm
1079  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1080  %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
1081  %val = load i32, i32 addrspace(1)* %in.gep
  ; i1 false: cttz(0) is defined (32); the select below overrides it with -1.
1082  %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 false) nounwind readnone
1083  %cmp = icmp ne i32 %val, 0
1084  %sel = select i1 %cmp, i32 %cttz, i32 -1
1085  store i32 %sel, i32 addrspace(1)* %out
1086  ret void
1087}
1088
1089; TODO: Should be able to eliminate select here as well.
; Variant where the select tests the cttz RESULT against the bit width rather
; than testing the input against zero:
;   %sel = (cttz(%val) == 32) ? -1 : cttz(%val)
; %val is loaded per workitem from %valptr[tid]; the result is stored to %out
; itself, with no per-workitem offset.
; The checks for every GCN run line still expect the full sequence
; v_ffbl_b32, v_min_u32 with 32, a compare against 32 and v_cndmask_b32, i.e.
; the select is not folded into the find-first-bit instruction here.
1090define amdgpu_kernel void @v_cttz_i32_sel_eq_bitwidth(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
1091; SI-LABEL: v_cttz_i32_sel_eq_bitwidth:
1092; SI:       ; %bb.0:
1093; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
1094; SI-NEXT:    s_mov_b32 s3, 0xf000
1095; SI-NEXT:    s_mov_b32 s6, 0
1096; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1097; SI-NEXT:    v_mov_b32_e32 v1, 0
1098; SI-NEXT:    s_mov_b32 s7, s3
1099; SI-NEXT:    s_waitcnt lgkmcnt(0)
1100; SI-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
1101; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1102; SI-NEXT:    s_mov_b32 s2, -1
1103; SI-NEXT:    s_waitcnt vmcnt(0)
1104; SI-NEXT:    v_ffbl_b32_e32 v0, v0
1105; SI-NEXT:    v_min_u32_e32 v0, 32, v0
1106; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 32, v0
1107; SI-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
1108; SI-NEXT:    s_waitcnt lgkmcnt(0)
1109; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1110; SI-NEXT:    s_endpgm
1111;
1112; VI-LABEL: v_cttz_i32_sel_eq_bitwidth:
1113; VI:       ; %bb.0:
1114; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
1115; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
1116; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1117; VI-NEXT:    s_mov_b32 s7, 0xf000
1118; VI-NEXT:    s_mov_b32 s6, -1
1119; VI-NEXT:    s_waitcnt lgkmcnt(0)
1120; VI-NEXT:    v_mov_b32_e32 v1, s1
1121; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
1122; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1123; VI-NEXT:    flat_load_dword v0, v[0:1]
1124; VI-NEXT:    s_waitcnt vmcnt(0)
1125; VI-NEXT:    v_ffbl_b32_e32 v0, v0
1126; VI-NEXT:    v_min_u32_e32 v0, 32, v0
1127; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 32, v0
1128; VI-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
1129; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1130; VI-NEXT:    s_endpgm
1131;
1132; EG-LABEL: v_cttz_i32_sel_eq_bitwidth:
1133; EG:       ; %bb.0:
1134; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
1135; EG-NEXT:    TEX 0 @6
1136; EG-NEXT:    ALU 7, @11, KC0[CB0:0-32], KC1[]
1137; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1138; EG-NEXT:    CF_END
1139; EG-NEXT:    PAD
1140; EG-NEXT:    Fetch clause starting at 6:
1141; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
1142; EG-NEXT:    ALU clause starting at 8:
1143; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
1144; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1145; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
1146; EG-NEXT:    ALU clause starting at 11:
1147; EG-NEXT:     FFBL_INT * T0.W, T0.X,
1148; EG-NEXT:     CNDE_INT * T0.W, T0.X, literal.x, PV.W,
1149; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
1150; EG-NEXT:     SETE_INT * T1.W, PV.W, literal.x,
1151; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
1152; EG-NEXT:     CNDE_INT T0.X, PV.W, T0.W, literal.x,
1153; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
1154; EG-NEXT:    -1(nan), 2(2.802597e-45)
1155;
1156; GFX10-LABEL: v_cttz_i32_sel_eq_bitwidth:
1157; GFX10:       ; %bb.0:
1158; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
1159; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1160; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1161; GFX10-NEXT:    v_mov_b32_e32 v1, 0
1162; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1163; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
1164; GFX10-NEXT:    s_waitcnt vmcnt(0)
1165; GFX10-NEXT:    v_ffbl_b32_e32 v0, v0
1166; GFX10-NEXT:    v_min_u32_e32 v0, 32, v0
1167; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 32, v0
1168; GFX10-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc_lo
1169; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
1170; GFX10-NEXT:    s_endpgm
1171;
1172; GFX10-GISEL-LABEL: v_cttz_i32_sel_eq_bitwidth:
1173; GFX10-GISEL:       ; %bb.0:
1174; GFX10-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
1175; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1176; GFX10-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1177; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
1178; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
1179; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
1180; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
1181; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v0, v0
1182; GFX10-GISEL-NEXT:    v_min_u32_e32 v0, 32, v0
1183; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 32, v0
1184; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, -1, vcc_lo
1185; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
1186; GFX10-GISEL-NEXT:    s_endpgm
1187  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1188  %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
1189  %val = load i32, i32 addrspace(1)* %in.gep
  ; i1 false: cttz(0) is defined and returns 32, which the compare below tests.
1190  %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 false) nounwind readnone
1191  %cmp = icmp eq i32 %cttz, 32
1192  %sel = select i1 %cmp, i32 -1, i32 %cttz
1193  store i32 %sel, i32 addrspace(1)* %out
1194  ret void
1195}
1196
; Inverted-predicate form of the bit-width select pattern:
;   %sel = (cttz(%val) != 32) ? cttz(%val) : -1
; %val is loaded per workitem from %valptr[tid]; the result is stored to %out
; itself, with no per-workitem offset.
; As with the "eq" form, the checks for every GCN run line expect the select to
; survive: v_ffbl_b32, v_min_u32 with 32, v_cmp_ne_u32 against 32 and
; v_cndmask_b32.
1197define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
1198; SI-LABEL: v_cttz_i32_sel_ne_bitwidth:
1199; SI:       ; %bb.0:
1200; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
1201; SI-NEXT:    s_mov_b32 s3, 0xf000
1202; SI-NEXT:    s_mov_b32 s6, 0
1203; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1204; SI-NEXT:    v_mov_b32_e32 v1, 0
1205; SI-NEXT:    s_mov_b32 s7, s3
1206; SI-NEXT:    s_waitcnt lgkmcnt(0)
1207; SI-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
1208; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1209; SI-NEXT:    s_mov_b32 s2, -1
1210; SI-NEXT:    s_waitcnt vmcnt(0)
1211; SI-NEXT:    v_ffbl_b32_e32 v0, v0
1212; SI-NEXT:    v_min_u32_e32 v0, 32, v0
1213; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 32, v0
1214; SI-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
1215; SI-NEXT:    s_waitcnt lgkmcnt(0)
1216; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1217; SI-NEXT:    s_endpgm
1218;
1219; VI-LABEL: v_cttz_i32_sel_ne_bitwidth:
1220; VI:       ; %bb.0:
1221; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
1222; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
1223; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1224; VI-NEXT:    s_mov_b32 s7, 0xf000
1225; VI-NEXT:    s_mov_b32 s6, -1
1226; VI-NEXT:    s_waitcnt lgkmcnt(0)
1227; VI-NEXT:    v_mov_b32_e32 v1, s1
1228; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
1229; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1230; VI-NEXT:    flat_load_dword v0, v[0:1]
1231; VI-NEXT:    s_waitcnt vmcnt(0)
1232; VI-NEXT:    v_ffbl_b32_e32 v0, v0
1233; VI-NEXT:    v_min_u32_e32 v0, 32, v0
1234; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 32, v0
1235; VI-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
1236; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1237; VI-NEXT:    s_endpgm
1238;
1239; EG-LABEL: v_cttz_i32_sel_ne_bitwidth:
1240; EG:       ; %bb.0:
1241; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
1242; EG-NEXT:    TEX 0 @6
1243; EG-NEXT:    ALU 7, @11, KC0[CB0:0-32], KC1[]
1244; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1245; EG-NEXT:    CF_END
1246; EG-NEXT:    PAD
1247; EG-NEXT:    Fetch clause starting at 6:
1248; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
1249; EG-NEXT:    ALU clause starting at 8:
1250; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
1251; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1252; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
1253; EG-NEXT:    ALU clause starting at 11:
1254; EG-NEXT:     FFBL_INT * T0.W, T0.X,
1255; EG-NEXT:     CNDE_INT * T0.W, T0.X, literal.x, PV.W,
1256; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
1257; EG-NEXT:     SETNE_INT * T1.W, PV.W, literal.x,
1258; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
1259; EG-NEXT:     CNDE_INT T0.X, PV.W, literal.x, T0.W,
1260; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
1261; EG-NEXT:    -1(nan), 2(2.802597e-45)
1262;
1263; GFX10-LABEL: v_cttz_i32_sel_ne_bitwidth:
1264; GFX10:       ; %bb.0:
1265; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
1266; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1267; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1268; GFX10-NEXT:    v_mov_b32_e32 v1, 0
1269; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1270; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
1271; GFX10-NEXT:    s_waitcnt vmcnt(0)
1272; GFX10-NEXT:    v_ffbl_b32_e32 v0, v0
1273; GFX10-NEXT:    v_min_u32_e32 v0, 32, v0
1274; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 32, v0
1275; GFX10-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc_lo
1276; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
1277; GFX10-NEXT:    s_endpgm
1278;
1279; GFX10-GISEL-LABEL: v_cttz_i32_sel_ne_bitwidth:
1280; GFX10-GISEL:       ; %bb.0:
1281; GFX10-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
1282; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1283; GFX10-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1284; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
1285; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
1286; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
1287; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
1288; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v0, v0
1289; GFX10-GISEL-NEXT:    v_min_u32_e32 v0, 32, v0
1290; GFX10-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 32, v0
1291; GFX10-GISEL-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc_lo
1292; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
1293; GFX10-GISEL-NEXT:    s_endpgm
1294  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1295  %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
1296  %val = load i32, i32 addrspace(1)* %in.gep
  ; i1 false: cttz(0) is defined and returns 32, which the compare below tests.
1297  %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 false) nounwind readnone
1298  %cmp = icmp ne i32 %cttz, 32
1299  %sel = select i1 %cmp, i32 %cttz, i32 -1
1300  store i32 %sel, i32 addrspace(1)* %out
1301  ret void
1302}
1303
; i8 version of the zero-to-minus-one select pattern:
;   %sel = (%val == 0) ? -1 : cttz.i8(%val)
; %val is an i8 loaded per workitem from %valptr[tid]; the i8 result is stored
; to %out itself, with no per-workitem offset.
; The SI, VI and GFX10 checks expect a bare v_ffbl_b32 on the value produced by
; the unsigned byte load, followed by a byte store. The GFX10-GISEL checks
; instead OR in 0x100 before v_ffbl_b32 and keep a compare plus v_cndmask_b32.
1304 define amdgpu_kernel void @v_cttz_i8_sel_eq_neg1(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind {
1305; SI-LABEL: v_cttz_i8_sel_eq_neg1:
1306; SI:       ; %bb.0:
1307; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
1308; SI-NEXT:    s_mov_b32 s3, 0xf000
1309; SI-NEXT:    v_mov_b32_e32 v1, 0
1310; SI-NEXT:    s_mov_b32 s6, 0
1311; SI-NEXT:    s_mov_b32 s7, s3
1312; SI-NEXT:    s_waitcnt lgkmcnt(0)
1313; SI-NEXT:    buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64
1314; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1315; SI-NEXT:    s_mov_b32 s2, -1
1316; SI-NEXT:    s_waitcnt vmcnt(0)
1317; SI-NEXT:    v_ffbl_b32_e32 v0, v0
1318; SI-NEXT:    s_waitcnt lgkmcnt(0)
1319; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
1320; SI-NEXT:    s_endpgm
1321;
1322; VI-LABEL: v_cttz_i8_sel_eq_neg1:
1323; VI:       ; %bb.0:
1324; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
1325; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
1326; VI-NEXT:    s_mov_b32 s7, 0xf000
1327; VI-NEXT:    s_mov_b32 s6, -1
1328; VI-NEXT:    s_waitcnt lgkmcnt(0)
1329; VI-NEXT:    v_mov_b32_e32 v1, s1
1330; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
1331; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1332; VI-NEXT:    flat_load_ubyte v0, v[0:1]
1333; VI-NEXT:    s_waitcnt vmcnt(0)
1334; VI-NEXT:    v_ffbl_b32_e32 v0, v0
1335; VI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
1336; VI-NEXT:    s_endpgm
1337;
1338; EG-LABEL: v_cttz_i8_sel_eq_neg1:
1339; EG:       ; %bb.0:
1340; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
1341; EG-NEXT:    TEX 0 @6
1342; EG-NEXT:    ALU 12, @9, KC0[CB0:0-32], KC1[]
1343; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
1344; EG-NEXT:    CF_END
1345; EG-NEXT:    PAD
1346; EG-NEXT:    Fetch clause starting at 6:
1347; EG-NEXT:     VTX_READ_8 T0.X, T0.X, 0, #1
1348; EG-NEXT:    ALU clause starting at 8:
1349; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, T0.X,
1350; EG-NEXT:    ALU clause starting at 9:
1351; EG-NEXT:     FFBL_INT T0.W, T0.X,
1352; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.x,
1353; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
1354; EG-NEXT:     AND_INT T0.W, PV.W, literal.x,
1355; EG-NEXT:     LSHL * T1.W, PS, literal.y,
1356; EG-NEXT:    255(3.573311e-43), 3(4.203895e-45)
1357; EG-NEXT:     LSHL T0.X, PV.W, PS,
1358; EG-NEXT:     LSHL * T0.W, literal.x, PS,
1359; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
1360; EG-NEXT:     MOV T0.Y, 0.0,
1361; EG-NEXT:     MOV * T0.Z, 0.0,
1362; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1363; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1364;
1365; GFX10-LABEL: v_cttz_i8_sel_eq_neg1:
1366; GFX10:       ; %bb.0:
1367; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
1368; GFX10-NEXT:    v_mov_b32_e32 v1, 0
1369; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1370; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1371; GFX10-NEXT:    global_load_ubyte v0, v0, s[2:3]
1372; GFX10-NEXT:    s_waitcnt vmcnt(0)
1373; GFX10-NEXT:    v_ffbl_b32_e32 v0, v0
1374; GFX10-NEXT:    global_store_byte v1, v0, s[0:1]
1375; GFX10-NEXT:    s_endpgm
1376;
1377; GFX10-GISEL-LABEL: v_cttz_i8_sel_eq_neg1:
1378; GFX10-GISEL:       ; %bb.0:
1379; GFX10-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
1380; GFX10-GISEL-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
1381; GFX10-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1382; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
1383; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s2
1384; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s3
1385; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v1, v0
1386; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v2, v3, vcc_lo
1387; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, 0
1388; GFX10-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
1389; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
1390; GFX10-GISEL-NEXT:    v_or_b32_e32 v1, 0x100, v0
1391; GFX10-GISEL-NEXT:    v_cmp_eq_u32_sdwa s2, v0, v2 src0_sel:BYTE_0 src1_sel:DWORD
1392; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v1, v1
1393; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, -1, s2
1394; GFX10-GISEL-NEXT:    global_store_byte v2, v0, s[0:1]
1395; GFX10-GISEL-NEXT:    s_endpgm
1396  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1397  %valptr.gep = getelementptr i8, i8 addrspace(1)* %valptr, i32 %tid
1398  %val = load i8, i8 addrspace(1)* %valptr.gep
  ; i1 false: cttz.i8(0) is defined (8); the select below overrides it with -1.
1399  %cttz = call i8 @llvm.cttz.i8(i8 %val, i1 false) nounwind readnone
1400  %cmp = icmp eq i8 %val, 0
1401  %sel = select i1 %cmp, i8 -1, i8 %cttz
1402  store i8 %sel, i8 addrspace(1)* %out
1403  ret void
1404}
1405
; i16 version of the zero-to-minus-one select pattern:
;   %sel = (%val == 0) ? -1 : cttz.i16(%val)
; This function loads %val directly from %valptr: there is no workitem-id GEP,
; so every workitem reads the same address. The i16 result is stored to %out.
; The SI checks expect a bare v_ffbl_b32 between the ushort load and the short
; store. The VI and GFX10 checks keep the longer sequence: OR with 0x10000,
; v_ffbl_b32, v_min_u32 with 32, a 16-bit compare against zero and
; v_cndmask_b32 selecting 0xffff. The GFX10-GISEL checks likewise keep a
; compare and v_cndmask_b32.
1406 define amdgpu_kernel void @v_cttz_i16_sel_eq_neg1(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %valptr) nounwind {
1407; SI-LABEL: v_cttz_i16_sel_eq_neg1:
1408; SI:       ; %bb.0:
1409; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
1410; SI-NEXT:    s_mov_b32 s3, 0xf000
1411; SI-NEXT:    s_mov_b32 s2, -1
1412; SI-NEXT:    s_mov_b32 s6, s2
1413; SI-NEXT:    s_mov_b32 s7, s3
1414; SI-NEXT:    s_waitcnt lgkmcnt(0)
1415; SI-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
1416; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1417; SI-NEXT:    s_waitcnt vmcnt(0)
1418; SI-NEXT:    v_ffbl_b32_e32 v0, v0
1419; SI-NEXT:    s_waitcnt lgkmcnt(0)
1420; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
1421; SI-NEXT:    s_endpgm
1422;
1423; VI-LABEL: v_cttz_i16_sel_eq_neg1:
1424; VI:       ; %bb.0:
1425; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
1426; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
1427; VI-NEXT:    s_mov_b32 s7, 0xf000
1428; VI-NEXT:    s_mov_b32 s6, -1
1429; VI-NEXT:    s_mov_b32 s2, s6
1430; VI-NEXT:    s_mov_b32 s3, s7
1431; VI-NEXT:    s_waitcnt lgkmcnt(0)
1432; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], 0
1433; VI-NEXT:    v_mov_b32_e32 v1, 0xffff
1434; VI-NEXT:    s_waitcnt vmcnt(0)
1435; VI-NEXT:    v_or_b32_e32 v2, 0x10000, v0
1436; VI-NEXT:    v_ffbl_b32_e32 v2, v2
1437; VI-NEXT:    v_min_u32_e32 v2, 32, v2
1438; VI-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v0
1439; VI-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
1440; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
1441; VI-NEXT:    s_endpgm
1442;
1443; EG-LABEL: v_cttz_i16_sel_eq_neg1:
1444; EG:       ; %bb.0:
1445; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
1446; EG-NEXT:    TEX 0 @6
1447; EG-NEXT:    ALU 12, @9, KC0[CB0:0-32], KC1[]
1448; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
1449; EG-NEXT:    CF_END
1450; EG-NEXT:    PAD
1451; EG-NEXT:    Fetch clause starting at 6:
1452; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 0, #1
1453; EG-NEXT:    ALU clause starting at 8:
1454; EG-NEXT:     MOV * T0.X, KC0[2].Z,
1455; EG-NEXT:    ALU clause starting at 9:
1456; EG-NEXT:     FFBL_INT T0.W, T0.X,
1457; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.x,
1458; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
1459; EG-NEXT:     AND_INT T0.W, PV.W, literal.x,
1460; EG-NEXT:     LSHL * T1.W, PS, literal.y,
1461; EG-NEXT:    65535(9.183409e-41), 3(4.203895e-45)
1462; EG-NEXT:     LSHL T0.X, PV.W, PS,
1463; EG-NEXT:     LSHL * T0.W, literal.x, PS,
1464; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
1465; EG-NEXT:     MOV T0.Y, 0.0,
1466; EG-NEXT:     MOV * T0.Z, 0.0,
1467; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1468; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1469;
1470; GFX10-LABEL: v_cttz_i16_sel_eq_neg1:
1471; GFX10:       ; %bb.0:
1472; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
1473; GFX10-NEXT:    v_mov_b32_e32 v0, 0
1474; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1475; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1476; GFX10-NEXT:    global_load_ushort v1, v0, s[2:3]
1477; GFX10-NEXT:    s_waitcnt vmcnt(0)
1478; GFX10-NEXT:    v_or_b32_e32 v2, 0x10000, v1
1479; GFX10-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v1
1480; GFX10-NEXT:    v_ffbl_b32_e32 v2, v2
1481; GFX10-NEXT:    v_min_u32_e32 v2, 32, v2
1482; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0xffff, v2, vcc_lo
1483; GFX10-NEXT:    global_store_short v0, v1, s[0:1]
1484; GFX10-NEXT:    s_endpgm
1485;
1486; GFX10-GISEL-LABEL: v_cttz_i16_sel_eq_neg1:
1487; GFX10-GISEL:       ; %bb.0:
1488; GFX10-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
1489; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, 0
1490; GFX10-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1491; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
1492; GFX10-GISEL-NEXT:    global_load_ushort v1, v0, s[2:3]
1493; GFX10-GISEL-NEXT:    s_waitcnt_depctr 0xffe3
1494; GFX10-GISEL-NEXT:    s_mov_b32 s2, 0xffff
1495; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
1496; GFX10-GISEL-NEXT:    v_or_b32_e32 v2, 0x10000, v1
1497; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
1498; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v2, v2
1499; GFX10-GISEL-NEXT:    v_and_b32_e32 v2, s2, v2
1500; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, v2, s2, vcc_lo
1501; GFX10-GISEL-NEXT:    global_store_short v0, v1, s[0:1]
1502; GFX10-GISEL-NEXT:    s_endpgm
  ; No workitem-id GEP here: the load address is %valptr itself.
1503  %val = load i16, i16 addrspace(1)* %valptr
  ; i1 false: cttz.i16(0) is defined (16); the select below overrides it with -1.
1504  %cttz = call i16 @llvm.cttz.i16(i16 %val, i1 false) nounwind readnone
1505  %cmp = icmp eq i16 %val, 0
1506  %sel = select i1 %cmp, i16 -1, i16 %cttz
1507  store i16 %sel, i16 addrspace(1)* %out
1508  ret void
1509}
1510
1511; FIXME: Need to handle non-uniform case for function below (load without gep).
; Kernel under test: loads one i7 element of %valptr selected by the workitem
; id, runs llvm.cttz.i7 on it with the i1 flag operand set to false, and writes
; -1 instead of the cttz result when the loaded value is zero. The result is
; stored through %out.
; The assertion lines inside the function are autogenerated (see the note on
; the first line of this file); regenerate them instead of editing by hand.
; NOTE(review): the FIXME above mentions "load without gep", but the load in
; this function goes through a gep on the workitem id - confirm whether that
; FIXME still applies to this function.
1512define amdgpu_kernel void @v_cttz_i7_sel_eq_neg1(i7 addrspace(1)* noalias %out, i7 addrspace(1)* noalias %valptr) nounwind {
1513; SI-LABEL: v_cttz_i7_sel_eq_neg1:
1514; SI:       ; %bb.0:
1515; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
1516; SI-NEXT:    s_mov_b32 s3, 0xf000
1517; SI-NEXT:    v_mov_b32_e32 v1, 0
1518; SI-NEXT:    s_mov_b32 s6, 0
1519; SI-NEXT:    s_mov_b32 s7, s3
1520; SI-NEXT:    s_waitcnt lgkmcnt(0)
1521; SI-NEXT:    buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64
1522; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1523; SI-NEXT:    s_mov_b32 s2, -1
1524; SI-NEXT:    s_waitcnt vmcnt(0)
1525; SI-NEXT:    v_ffbl_b32_e32 v0, v0
1526; SI-NEXT:    v_and_b32_e32 v0, 0x7f, v0
1527; SI-NEXT:    s_waitcnt lgkmcnt(0)
1528; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
1529; SI-NEXT:    s_endpgm
1530;
1531; VI-LABEL: v_cttz_i7_sel_eq_neg1:
1532; VI:       ; %bb.0:
1533; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
1534; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
1535; VI-NEXT:    s_mov_b32 s7, 0xf000
1536; VI-NEXT:    s_mov_b32 s6, -1
1537; VI-NEXT:    s_waitcnt lgkmcnt(0)
1538; VI-NEXT:    v_mov_b32_e32 v1, s1
1539; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
1540; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1541; VI-NEXT:    flat_load_ubyte v0, v[0:1]
1542; VI-NEXT:    s_waitcnt vmcnt(0)
1543; VI-NEXT:    v_ffbl_b32_e32 v0, v0
1544; VI-NEXT:    v_and_b32_e32 v0, 0x7f, v0
1545; VI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
1546; VI-NEXT:    s_endpgm
1547;
1548; EG-LABEL: v_cttz_i7_sel_eq_neg1:
1549; EG:       ; %bb.0:
1550; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
1551; EG-NEXT:    TEX 0 @6
1552; EG-NEXT:    ALU 12, @9, KC0[CB0:0-32], KC1[]
1553; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
1554; EG-NEXT:    CF_END
1555; EG-NEXT:    PAD
1556; EG-NEXT:    Fetch clause starting at 6:
1557; EG-NEXT:     VTX_READ_8 T0.X, T0.X, 0, #1
1558; EG-NEXT:    ALU clause starting at 8:
1559; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, T0.X,
1560; EG-NEXT:    ALU clause starting at 9:
1561; EG-NEXT:     FFBL_INT T0.W, T0.X,
1562; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.x,
1563; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
1564; EG-NEXT:     AND_INT T0.W, PV.W, literal.x,
1565; EG-NEXT:     LSHL * T1.W, PS, literal.y,
1566; EG-NEXT:    127(1.779649e-43), 3(4.203895e-45)
1567; EG-NEXT:     LSHL T0.X, PV.W, PS,
1568; EG-NEXT:     LSHL * T0.W, literal.x, PS,
1569; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
1570; EG-NEXT:     MOV T0.Y, 0.0,
1571; EG-NEXT:     MOV * T0.Z, 0.0,
1572; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1573; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1574;
1575; GFX10-LABEL: v_cttz_i7_sel_eq_neg1:
1576; GFX10:       ; %bb.0:
1577; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
1578; GFX10-NEXT:    v_mov_b32_e32 v1, 0
1579; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1580; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1581; GFX10-NEXT:    global_load_ubyte v0, v0, s[2:3]
1582; GFX10-NEXT:    s_waitcnt vmcnt(0)
1583; GFX10-NEXT:    v_ffbl_b32_e32 v0, v0
1584; GFX10-NEXT:    v_and_b32_e32 v0, 0x7f, v0
1585; GFX10-NEXT:    global_store_byte v1, v0, s[0:1]
1586; GFX10-NEXT:    s_endpgm
1587;
1588; GFX10-GISEL-LABEL: v_cttz_i7_sel_eq_neg1:
1589; GFX10-GISEL:       ; %bb.0:
1590; GFX10-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
1591; GFX10-GISEL-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
1592; GFX10-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1593; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
1594; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s2
1595; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s3
1596; GFX10-GISEL-NEXT:    s_movk_i32 s2, 0x7f
1597; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v1, v0
1598; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v2, v3, vcc_lo
1599; GFX10-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
1600; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
1601; GFX10-GISEL-NEXT:    v_or_b32_e32 v1, 0x80, v0
1602; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, s2, v0
1603; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v1, v1
1604; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
1605; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, 0x7f, vcc_lo
1606; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
1607; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, s2, v0
1608; GFX10-GISEL-NEXT:    global_store_byte v1, v0, s[0:1]
1609; GFX10-GISEL-NEXT:    s_endpgm
  ; Index the input pointer by the workitem id so each workitem loads its own
  ; i7 element.
1610  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1611  %valptr.gep = getelementptr i7, i7 addrspace(1)* %valptr, i32 %tid
1612  %val = load i7, i7 addrspace(1)* %valptr.gep
  ; The i1 flag is false; per the LLVM LangRef this requests a defined result
  ; for a zero input rather than an undefined one.
1613  %cttz = call i7 @llvm.cttz.i7(i7 %val, i1 false) nounwind readnone
  ; Substitute -1 (all seven bits set) for the cttz result when the input is 0.
1614  %cmp = icmp eq i7 %val, 0
1615  %sel = select i1 %cmp, i7 -1, i7 %cttz
  ; The store goes to %out directly; unlike the load it is not indexed by %tid.
1616  store i7 %sel, i7 addrspace(1)* %out
1617  ret void
1618}
1619