; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck %s -enable-var-scope --check-prefix=SI
; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s -enable-var-scope --check-prefix=VI
; RUN: llc < %s -march=r600 -mcpu=cypress -verify-machineinstrs | FileCheck %s -enable-var-scope --check-prefix=EG
; RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s -enable-var-scope --check-prefix=GFX10
; RUN: llc < %s -global-isel -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s -enable-var-scope --check-prefix=GFX10-GISEL

; Codegen tests for the llvm.cttz.* intrinsics. Every call visible in this part
; of the file passes "i1 false" as the second operand. The assertion comments
; inside each kernel are generated by the script named on the first line; do not
; edit them by hand, regenerate them instead.
;
; Kernels whose name starts with "s_" take the cttz operand directly as a kernel
; argument; kernels whose name starts with "v_" load it from global memory.

declare i7 @llvm.cttz.i7(i7, i1) nounwind readnone
declare i8 @llvm.cttz.i8(i8, i1) nounwind readnone
declare i16 @llvm.cttz.i16(i16, i1) nounwind readnone

declare i32 @llvm.cttz.i32(i32, i1) nounwind readnone
declare <2 x i32> @llvm.cttz.v2i32(<2 x i32>, i1) nounwind readnone
declare <4 x i32> @llvm.cttz.v4i32(<4 x i32>, i1) nounwind readnone

declare i64 @llvm.cttz.i64(i64, i1) nounwind readnone
declare <2 x i64> @llvm.cttz.v2i64(<2 x i64>, i1) nounwind readnone
declare <4 x i64> @llvm.cttz.v4i64(<4 x i64>, i1) nounwind readnone

declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone

; i32 cttz of the kernel argument %val; the result is stored to %out.
define amdgpu_kernel void @s_cttz_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
; SI-LABEL: s_cttz_i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s2, s[0:1], 0xb
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_ff1_i32_b32 s2, s2
; SI-NEXT:    s_min_u32 s4, s2, 32
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_cttz_i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT:    s_load_dword s0, s[0:1], 0x2c
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_ff1_i32_b32 s0, s0
; VI-NEXT:    s_min_u32 s0, s0, 32
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: s_cttz_i32:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 3, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    ALU clause starting at 4:
; EG-NEXT:    FFBL_INT * T0.W, KC0[2].Z,
; EG-NEXT:    CNDE_INT T0.X, KC0[2].Z, literal.x, PV.W,
; EG-NEXT:    LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT:    32(4.484155e-44), 2(2.802597e-45)
;
; GFX10-LABEL: s_cttz_i32:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dword s4, s[0:1], 0x2c
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_ff1_i32_b32 s0, s4
; GFX10-NEXT:    s_min_u32 s0, s0, 32
; GFX10-NEXT:    v_mov_b32_e32 v1, s0
; GFX10-NEXT:    global_store_dword v0, v1, s[2:3]
; GFX10-NEXT:    s_endpgm
;
; GFX10-GISEL-LABEL: s_cttz_i32:
; GFX10-GISEL:       ; %bb.0:
; GFX10-GISEL-NEXT:    s_clause 0x1
; GFX10-GISEL-NEXT:    s_load_dword s4, s[0:1], 0x2c
; GFX10-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-GISEL-NEXT:    s_ff1_i32_b32 s0, s4
; GFX10-GISEL-NEXT:    s_min_u32 s0, s0, 32
; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[2:3]
; GFX10-GISEL-NEXT:    s_endpgm
  %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 false) nounwind readnone
  store i32 %cttz, i32 addrspace(1)* %out, align 4
  ret void
}

; i32 cttz of a value loaded from %valptr at the index given by the x
; workitem id; the result is stored to %out (not indexed).
define amdgpu_kernel void @v_cttz_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
; SI-LABEL: v_cttz_i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    s_mov_b32 s7, s3
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ffbl_b32_e32 v0, v0
; SI-NEXT:    v_min_u32_e32 v0, 32, v0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_cttz_i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_ffbl_b32_e32 v0, v0
; VI-NEXT:    v_min_u32_e32 v0, 32, v0
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: v_cttz_i32:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @6
; EG-NEXT:    ALU 3, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:    VTX_READ_32 T0.X, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:    LSHL * T0.W, T0.X, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT:    ADD_INT * T0.X, KC0[2].Z, PV.W,
; EG-NEXT:    ALU clause starting at 11:
; EG-NEXT:    FFBL_INT * T0.W, T0.X,
; EG-NEXT:    CNDE_INT T0.X, T0.X, literal.x, PV.W,
; EG-NEXT:    LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT:    32(4.484155e-44), 2(2.802597e-45)
;
; GFX10-LABEL: v_cttz_i32:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_ffbl_b32_e32 v0, v0
; GFX10-NEXT:    v_min_u32_e32 v0, 32, v0
; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX10-GISEL-LABEL: v_cttz_i32:
; GFX10-GISEL:       ; %bb.0:
; GFX10-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v0, v0
; GFX10-GISEL-NEXT:    v_min_u32_e32 v0, 32, v0
; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
; GFX10-GISEL-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
  %val = load i32, i32 addrspace(1)* %in.gep, align 4
  %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 false) nounwind readnone
  store i32 %cttz, i32 addrspace(1)* %out, align 4
  ret void
}

; <2 x i32> cttz of a vector loaded from %valptr at the x workitem id index;
; the result vector is stored to %out.
define amdgpu_kernel void @v_cttz_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind {
; SI-LABEL: v_cttz_v2i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_mov_b32 s7, s3
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ffbl_b32_e32 v1, v1
; SI-NEXT:    v_ffbl_b32_e32 v0, v0
; SI-NEXT:    v_min_u32_e32 v1, 32, v1
; SI-NEXT:    v_min_u32_e32 v0, 32, v0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_cttz_v2i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_ffbl_b32_e32 v1, v1
; VI-NEXT:    v_ffbl_b32_e32 v0, v0
; VI-NEXT:    v_min_u32_e32 v1, 32, v1
; VI-NEXT:    v_min_u32_e32 v0, 32, v0
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: v_cttz_v2i32:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @6
; EG-NEXT:    ALU 6, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:    VTX_READ_64 T0.XY, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:    LSHL * T0.W, T0.X, literal.x,
; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT:    ADD_INT * T0.X, KC0[2].Z, PV.W,
; EG-NEXT:    ALU clause starting at 11:
; EG-NEXT:    FFBL_INT * T0.W, T0.Y,
; EG-NEXT:    CNDE_INT T0.Y, T0.Y, literal.x, PV.W,
; EG-NEXT:    FFBL_INT * T0.W, T0.X,
; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT:    CNDE_INT T0.X, T0.X, literal.x, PV.W,
; EG-NEXT:    LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT:    32(4.484155e-44), 2(2.802597e-45)
;
; GFX10-LABEL: v_cttz_v2i32:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3]
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_ffbl_b32_e32 v1, v1
; GFX10-NEXT:    v_ffbl_b32_e32 v0, v0
; GFX10-NEXT:    v_min_u32_e32 v1, 32, v1
; GFX10-NEXT:    v_min_u32_e32 v0, 32, v0
; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX10-GISEL-LABEL: v_cttz_v2i32:
; GFX10-GISEL:       ; %bb.0:
; GFX10-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; GFX10-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-GISEL-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3]
; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v0, v0
; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v1, v1
; GFX10-GISEL-NEXT:    v_min_u32_e32 v0, 32, v0
; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, 32, v1
; GFX10-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX10-GISEL-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %valptr, i32 %tid
  %val = load <2 x i32>, <2 x i32> addrspace(1)* %in.gep, align 8
  %cttz = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %val, i1 false) nounwind readnone
  store <2 x i32> %cttz, <2 x i32> addrspace(1)* %out, align 8
  ret void
}

; <4 x i32> cttz of a vector loaded from %valptr at the x workitem id index;
; the result vector is stored to %out.
define amdgpu_kernel void @v_cttz_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind {
; SI-LABEL: v_cttz_v4i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_mov_b32 s7, s3
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ffbl_b32_e32 v3, v3
; SI-NEXT:    v_ffbl_b32_e32 v2, v2
; SI-NEXT:    v_ffbl_b32_e32 v1, v1
; SI-NEXT:    v_ffbl_b32_e32 v0, v0
; SI-NEXT:    v_min_u32_e32 v3, 32, v3
; SI-NEXT:    v_min_u32_e32 v2, 32, v2
; SI-NEXT:    v_min_u32_e32 v1, 32, v1
; SI-NEXT:    v_min_u32_e32 v0, 32, v0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_cttz_v4i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_ffbl_b32_e32 v3, v3
; VI-NEXT:    v_ffbl_b32_e32 v2, v2
; VI-NEXT:    v_ffbl_b32_e32 v1, v1
; VI-NEXT:    v_ffbl_b32_e32 v0, v0
; VI-NEXT:    v_min_u32_e32 v3, 32, v3
; VI-NEXT:    v_min_u32_e32 v2, 32, v2
; VI-NEXT:    v_min_u32_e32 v1, 32, v1
; VI-NEXT:    v_min_u32_e32 v0, 32, v0
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: v_cttz_v4i32:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @6
; EG-NEXT:    ALU 12, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:    VTX_READ_128 T0.XYZW, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:    LSHL * T0.W, T0.X, literal.x,
; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
; EG-NEXT:    ADD_INT * T0.X, KC0[2].Z, PV.W,
; EG-NEXT:    ALU clause starting at 11:
; EG-NEXT:    FFBL_INT * T1.W, T0.W,
; EG-NEXT:    FFBL_INT T2.W, T0.Z,
; EG-NEXT:    CNDE_INT * T0.W, T0.W, literal.x, PV.W, BS:VEC_021/SCL_122
; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT:    CNDE_INT T0.Z, T0.Z, literal.x, PV.W,
; EG-NEXT:    FFBL_INT * T1.W, T0.Y,
; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT:    CNDE_INT T0.Y, T0.Y, literal.x, PV.W,
; EG-NEXT:    FFBL_INT * T1.W, T0.X,
; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT:    CNDE_INT T0.X, T0.X, literal.x, PV.W,
; EG-NEXT:    LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT:    32(4.484155e-44), 2(2.802597e-45)
;
; GFX10-LABEL: v_cttz_v4i32:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v4, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3]
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_ffbl_b32_e32 v3, v3
; GFX10-NEXT:    v_ffbl_b32_e32 v2, v2
; GFX10-NEXT:    v_ffbl_b32_e32 v1, v1
; GFX10-NEXT:    v_ffbl_b32_e32 v0, v0
; GFX10-NEXT:    v_min_u32_e32 v3, 32, v3
; GFX10-NEXT:    v_min_u32_e32 v2, 32, v2
; GFX10-NEXT:    v_min_u32_e32 v1, 32, v1
; GFX10-NEXT:    v_min_u32_e32 v0, 32, v0
; GFX10-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX10-GISEL-LABEL: v_cttz_v4i32:
; GFX10-GISEL:       ; %bb.0:
; GFX10-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
; GFX10-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-GISEL-NEXT:    v_mov_b32_e32 v4, 0
; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-GISEL-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3]
; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v0, v0
; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v1, v1
; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v2, v2
; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v3, v3
; GFX10-GISEL-NEXT:    v_min_u32_e32 v0, 32, v0
; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, 32, v1
; GFX10-GISEL-NEXT:    v_min_u32_e32 v2, 32, v2
; GFX10-GISEL-NEXT:    v_min_u32_e32 v3, 32, v3
; GFX10-GISEL-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX10-GISEL-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %valptr, i32 %tid
  %val = load <4 x i32>, <4 x i32> addrspace(1)* %in.gep, align 16
  %cttz = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %val, i1 false) nounwind readnone
  store <4 x i32> %cttz, <4 x i32> addrspace(1)* %out, align 16
  ret void
}

; i8 cttz of a byte loaded directly from %valptr (no workitem id indexing);
; the i8 result is stored to %out. The expected code ORs the loaded byte with
; 0x100 before the 32-bit find-first-bit instruction instead of using a min
; against the bit width.
define amdgpu_kernel void @v_cttz_i8(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind {
; SI-LABEL: v_cttz_i8:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_mov_b32 s6, s2
; SI-NEXT:    s_mov_b32 s7, s3
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_ubyte v0, off, s[4:7], 0
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_or_b32_e32 v0, 0x100, v0
; SI-NEXT:    v_ffbl_b32_e32 v0, v0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_cttz_i8:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s2, s6
; VI-NEXT:    s_mov_b32 s3, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_or_b32_e32 v0, 0x100, v0
; VI-NEXT:    v_ffbl_b32_e32 v0, v0
; VI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: v_cttz_i8:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @6
; EG-NEXT:    ALU 14, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:    VTX_READ_8 T0.X, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:    MOV * T0.X, KC0[2].Z,
; EG-NEXT:    ALU clause starting at 9:
; EG-NEXT:    OR_INT * T0.W, T0.X, literal.x,
; EG-NEXT:    256(3.587324e-43), 0(0.000000e+00)
; EG-NEXT:    FFBL_INT T0.W, PV.W,
; EG-NEXT:    AND_INT * T1.W, KC0[2].Y, literal.x,
; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT:    AND_INT T0.W, PV.W, literal.x,
; EG-NEXT:    LSHL * T1.W, PS, literal.y,
; EG-NEXT:    255(3.573311e-43), 3(4.203895e-45)
; EG-NEXT:    LSHL T0.X, PV.W, PS,
; EG-NEXT:    LSHL * T0.W, literal.x, PS,
; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
; EG-NEXT:    MOV T0.Y, 0.0,
; EG-NEXT:    MOV * T0.Z, 0.0,
; EG-NEXT:    LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
;
; GFX10-LABEL: v_cttz_i8:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_ubyte v1, v0, s[2:3]
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_or_b32_e32 v1, 0x100, v1
; GFX10-NEXT:    v_ffbl_b32_e32 v1, v1
; GFX10-NEXT:    global_store_byte v0, v1, s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX10-GISEL-LABEL: v_cttz_i8:
; GFX10-GISEL:       ; %bb.0:
; GFX10-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-GISEL-NEXT:    global_load_ubyte v1, v0, s[2:3]
; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-GISEL-NEXT:    v_or_b32_e32 v1, 0x100, v1
; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v1, v1
; GFX10-GISEL-NEXT:    global_store_byte v0, v1, s[0:1]
; GFX10-GISEL-NEXT:    s_endpgm
  %val = load i8, i8 addrspace(1)* %valptr
  %cttz = call i8 @llvm.cttz.i8(i8 %val, i1 false) nounwind readnone
  store i8 %cttz, i8 addrspace(1)* %out
  ret void
}

; i64 cttz of the kernel argument %val; the i64 result is stored to %out.
; An unnamed [8 x i32] argument sits between %out and %val in the signature.
define amdgpu_kernel void @s_cttz_i64(i64 addrspace(1)* noalias %out, [8 x i32], i64 %val) nounwind {
; SI-LABEL: s_cttz_i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x13
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_ff1_i32_b32 s5, s5
; SI-NEXT:    s_min_u32 s5, s5, 0xffffffdf
; SI-NEXT:    s_add_i32 s5, s5, 32
; SI-NEXT:    s_ff1_i32_b32 s4, s4
; SI-NEXT:    v_mov_b32_e32 v0, s5
; SI-NEXT:    v_min3_u32 v0, s4, v0, 64
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_cttz_i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x4c
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    v_mov_b32_e32 v1, 0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_ff1_i32_b32 s1, s1
; VI-NEXT:    v_add_u32_e64 v0, s[2:3], s1, 32 clamp
; VI-NEXT:    s_ff1_i32_b32 s0, s0
; VI-NEXT:    v_min3_u32 v0, s0, v0, 64
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: s_cttz_i64:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 9, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    ALU clause starting at 4:
; EG-NEXT:    FFBL_INT * T0.W, KC0[5].X,
; EG-NEXT:    CNDE_INT * T0.W, KC0[5].X, literal.x, PV.W,
; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT:    FFBL_INT T1.W, KC0[4].W,
; EG-NEXT:    ADD_INT * T0.W, PV.W, literal.x,
; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT:    CNDE_INT T0.X, KC0[4].W, PS, PV.W,
; EG-NEXT:    MOV T0.Y, 0.0,
; EG-NEXT:    LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
;
; GFX10-LABEL: s_cttz_i64:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x4c
; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_ff1_i32_b32 s0, s3
; GFX10-NEXT:    v_add_nc_u32_e64 v0, s0, 32 clamp
; GFX10-NEXT:    s_ff1_i32_b32 s0, s2
; GFX10-NEXT:    v_min3_u32 v0, s0, v0, 64
; GFX10-NEXT:    global_store_dwordx2 v1, v[0:1], s[4:5]
; GFX10-NEXT:    s_endpgm
;
; GFX10-GISEL-LABEL: s_cttz_i64:
; GFX10-GISEL:       ; %bb.0:
; GFX10-GISEL-NEXT:    s_clause 0x1
; GFX10-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x4c
; GFX10-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-GISEL-NEXT:    s_ff1_i32_b64 s0, s[2:3]
; GFX10-GISEL-NEXT:    s_min_u32 s0, s0, 64
; GFX10-GISEL-NEXT:    s_bfe_u64 s[0:1], s[0:1], 0x200000
; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX10-GISEL-NEXT:    s_endpgm
  %cttz = call i64 @llvm.cttz.i64(i64 %val, i1 false)
  store i64 %cttz, i64 addrspace(1)* %out
  ret void
}

; i64 cttz of the kernel argument %val, truncated to i32 before being stored
; to %out.
define amdgpu_kernel void @s_cttz_i64_trunc(i32 addrspace(1)* noalias %out, i64 %val) nounwind {
; SI-LABEL: s_cttz_i64_trunc:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_ff1_i32_b32 s5, s5
; SI-NEXT:    s_min_u32 s5, s5, 0xffffffdf
; SI-NEXT:    s_add_i32 s5, s5, 32
; SI-NEXT:    s_ff1_i32_b32 s4, s4
; SI-NEXT:    v_mov_b32_e32 v0, s5
; SI-NEXT:    v_min3_u32 v0, s4, v0, 64
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_cttz_i64_trunc:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_ff1_i32_b32 s1, s1
; VI-NEXT:    v_add_u32_e64 v0, s[2:3], s1, 32 clamp
; VI-NEXT:    s_ff1_i32_b32 s0, s0
; VI-NEXT:    v_min3_u32 v0, s0, v0, 64
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: s_cttz_i64_trunc:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 8, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    ALU clause starting at 4:
; EG-NEXT:    FFBL_INT * T0.W, KC0[3].X,
; EG-NEXT:    CNDE_INT * T0.W, KC0[3].X, literal.x, PV.W,
; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT:    FFBL_INT T1.W, KC0[2].W,
; EG-NEXT:    ADD_INT * T0.W, PV.W, literal.x,
; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT:    CNDE_INT T0.X, KC0[2].W, PS, PV.W,
; EG-NEXT:    LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
;
; GFX10-LABEL: s_cttz_i64_trunc:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_ff1_i32_b32 s0, s3
; GFX10-NEXT:    v_add_nc_u32_e64 v0, s0, 32 clamp
; GFX10-NEXT:    s_ff1_i32_b32 s0, s2
; GFX10-NEXT:    v_min3_u32 v0, s0, v0, 64
; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
; GFX10-NEXT:    s_endpgm
;
; GFX10-GISEL-LABEL: s_cttz_i64_trunc:
; GFX10-GISEL:       ; %bb.0:
; GFX10-GISEL-NEXT:    s_clause 0x1
; GFX10-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-GISEL-NEXT:    s_ff1_i32_b64 s0, s[2:3]
; GFX10-GISEL-NEXT:    s_min_u32 s0, s0, 64
; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
; GFX10-GISEL-NEXT:    s_endpgm
  %cttz = call i64 @llvm.cttz.i64(i64 %val, i1 false)
  %trunc = trunc i64 %cttz to i32
  store i32 %trunc, i32 addrspace(1)* %out
  ret void
}

; i64 cttz of a value loaded from %in at the x workitem id index; the i64
; result is stored to %out at the same index.
define amdgpu_kernel void @v_cttz_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
; SI-LABEL: v_cttz_i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ffbl_b32_e32 v3, v3
; SI-NEXT:    v_min_u32_e32 v3, 0xffffffdf, v3
; SI-NEXT:    v_add_i32_e32 v3, vcc, 32, v3
; SI-NEXT:    v_ffbl_b32_e32 v2, v2
; SI-NEXT:    v_min3_u32 v2, v2, v3, 64
; SI-NEXT:    v_mov_b32_e32 v3, v1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_cttz_i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT:    v_lshlrev_b32_e32 v3, 3, v0
; VI-NEXT:    v_mov_b32_e32 v2, 0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v4, s3
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v3
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT:    v_add_u32_e32 v3, vcc, s2, v3
; VI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_ffbl_b32_e32 v1, v1
; VI-NEXT:    v_add_u32_e64 v1, s[0:1], v1, 32 clamp
; VI-NEXT:    v_ffbl_b32_e32 v0, v0
; VI-NEXT:    v_min3_u32 v1, v0, v1, 64
; VI-NEXT:    flat_store_dwordx2 v[3:4], v[1:2]
; VI-NEXT:    s_endpgm
;
; EG-LABEL: v_cttz_i64:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @6
; EG-NEXT:    ALU 10, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:    VTX_READ_64 T0.XY, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:    LSHL * T0.W, T0.X, literal.x,
; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT:    ADD_INT * T0.X, KC0[2].Z, PV.W,
; EG-NEXT:    ALU clause starting at 11:
; EG-NEXT:    FFBL_INT * T1.W, T0.Y,
; EG-NEXT:    CNDE_INT * T1.W, T0.Y, literal.x, PV.W,
; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT:    FFBL_INT T2.W, T0.X,
; EG-NEXT:    ADD_INT * T1.W, PV.W, literal.x,
; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT:    CNDE_INT T0.X, T0.X, PS, PV.W,
; EG-NEXT:    MOV T0.Y, 0.0,
; EG-NEXT:    ADD_INT * T0.W, KC0[2].Y, T0.W,
; EG-NEXT:    LSHR * T1.X, PV.W, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
;
; GFX10-LABEL: v_cttz_i64:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_ffbl_b32_e32 v1, v1
; GFX10-NEXT:    v_ffbl_b32_e32 v0, v0
; GFX10-NEXT:    v_add_nc_u32_e64 v1, v1, 32 clamp
; GFX10-NEXT:    v_min3_u32 v0, v0, v1, 64
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX10-GISEL-LABEL: v_cttz_i64:
; GFX10-GISEL:       ; %bb.0:
; GFX10-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
; GFX10-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-GISEL-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v1, v1
; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v0, v0
; GFX10-GISEL-NEXT:    v_add_nc_u32_e64 v1, v1, 32 clamp
; GFX10-GISEL-NEXT:    v_min_u32_e32 v0, v0, v1
; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-GISEL-NEXT:    v_min_u32_e32 v0, 64, v0
; GFX10-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX10-GISEL-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
  %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
  %val = load i64, i64 addrspace(1)* %in.gep
  %cttz = call i64 @llvm.cttz.i64(i64 %val, i1 false)
  store i64 %cttz, i64 addrspace(1)* %out.gep
  ret void
}

; i64 cttz of a value loaded from %in at the x workitem id index; the result
; is truncated to i32 and stored to %out at the same index.
define amdgpu_kernel void @v_cttz_i64_trunc(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
; SI-LABEL: v_cttz_i64_trunc:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
; SI-NEXT:    v_mov_b32_e32 v2, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ffbl_b32_e32 v0, v4
; SI-NEXT:    v_min_u32_e32 v0, 0xffffffdf, v0
; SI-NEXT:    v_add_i32_e32 v0, vcc, 32, v0
; SI-NEXT:    v_ffbl_b32_e32 v3, v3
; SI-NEXT:    v_min3_u32 v0, v3, v0, 64
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_store_dword v0, v[1:2], s[4:7], 0 addr64
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_cttz_i64_trunc:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v4, s3
; VI-NEXT:    v_mov_b32_e32 v2, s1
; VI-NEXT:    v_add_u32_e32 v1, vcc, s0, v1
; VI-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
; VI-NEXT:    flat_load_dwordx2 v[1:2], v[1:2]
; VI-NEXT:    v_add_u32_e32 v3, vcc, s2, v0
; VI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_ffbl_b32_e32 v0, v2
; VI-NEXT:    v_add_u32_e64 v0, s[0:1], v0, 32 clamp
; VI-NEXT:    v_ffbl_b32_e32 v1, v1
; VI-NEXT:    v_min3_u32 v0, v1, v0, 64
; VI-NEXT:    flat_store_dword v[3:4], v0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: v_cttz_i64_trunc:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @6
; EG-NEXT:    ALU 10, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:    VTX_READ_64 T1.XY, T1.X, 0, #1
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:    LSHL * T0.W, T0.X, literal.x,
; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT:    ADD_INT * T1.X, KC0[2].Z, PV.W,
; EG-NEXT:    ALU clause starting at 11:
; EG-NEXT:    FFBL_INT * T0.W, T1.Y,
; EG-NEXT:    CNDE_INT * T0.W, T1.Y, literal.x, PV.W,
; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT:    LSHL T0.Z, T0.X, literal.x,
; EG-NEXT:    FFBL_INT T1.W, T1.X, BS:VEC_120/SCL_212
; EG-NEXT:    ADD_INT * T0.W, PV.W, literal.y,
; EG-NEXT:    2(2.802597e-45), 32(4.484155e-44)
; EG-NEXT:    CNDE_INT T0.X, T1.X, PS, PV.W,
; EG-NEXT:    ADD_INT * T0.W, KC0[2].Y, PV.Z,
; EG-NEXT:    LSHR * T1.X, PV.W, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
;
; GFX10-LABEL: v_cttz_i64_trunc:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dwordx2 v[1:2], v1, s[2:3]
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_ffbl_b32_e32 v2, v2
; GFX10-NEXT:    v_ffbl_b32_e32 v1, v1
; GFX10-NEXT:    v_add_nc_u32_e64 v2, v2, 32 clamp
; GFX10-NEXT:    v_min3_u32 v1, v1, v2, 64
; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX10-GISEL-LABEL: v_cttz_i64_trunc:
; GFX10-GISEL:       ; %bb.0:
; GFX10-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
; GFX10-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-GISEL-NEXT:    global_load_dwordx2 v[1:2], v1, s[2:3]
; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v2, v2
; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v1, v1
; GFX10-GISEL-NEXT:    v_add_nc_u32_e64 v2, v2, 32 clamp
; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, v1, v2
; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, 64, v1
; GFX10-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX10-GISEL-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
  %val = load i64, i64 addrspace(1)* %in.gep
  %cttz = call i64 @llvm.cttz.i64(i64 %val, i1 false)
  %trunc = trunc i64 %cttz to i32
  store i32 %trunc, i32 addrspace(1)* %out.gep
  ret void
}

define amdgpu_kernel void @v_cttz_i32_sel_eq_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
; SI-LABEL: v_cttz_i32_sel_eq_neg1:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    s_mov_b32 s7, s3
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ffbl_b32_e32 v0, v0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_cttz_i32_sel_eq_neg1:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_ffbl_b32_e32 v0, v0
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: v_cttz_i32_sel_eq_neg1:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @6
; EG-NEXT:    ALU 5, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:    VTX_READ_32 T0.X, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:    LSHL * T0.W, T0.X, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT:    ADD_INT * T0.X, KC0[2].Z, PV.W,
; EG-NEXT:    ALU clause starting at 11:
; EG-NEXT:    FFBL_INT * T0.W, T0.X,
; EG-NEXT:    CNDE_INT * T0.W, T0.X, literal.x, PV.W,
; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT:    CNDE_INT T0.X, T0.X, literal.x, PV.W,
; EG-NEXT:    LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT:    -1(nan), 2(2.802597e-45)
;
;
GFX10-LABEL: v_cttz_i32_sel_eq_neg1: 956; GFX10: ; %bb.0: 957; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 958; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 959; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 960; GFX10-NEXT: v_mov_b32_e32 v1, 0 961; GFX10-NEXT: s_waitcnt lgkmcnt(0) 962; GFX10-NEXT: global_load_dword v0, v0, s[2:3] 963; GFX10-NEXT: s_waitcnt vmcnt(0) 964; GFX10-NEXT: v_ffbl_b32_e32 v0, v0 965; GFX10-NEXT: global_store_dword v1, v0, s[0:1] 966; GFX10-NEXT: s_endpgm 967; 968; GFX10-GISEL-LABEL: v_cttz_i32_sel_eq_neg1: 969; GFX10-GISEL: ; %bb.0: 970; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 971; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 972; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 973; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) 974; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[2:3] 975; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) 976; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v1, v0 977; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 978; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1 979; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, -1, vcc_lo 980; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 981; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] 982; GFX10-GISEL-NEXT: s_endpgm 983 %tid = call i32 @llvm.amdgcn.workitem.id.x() 984 %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid 985 %val = load i32, i32 addrspace(1)* %in.gep 986 %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 false) nounwind readnone 987 %cmp = icmp eq i32 %val, 0 988 %sel = select i1 %cmp, i32 -1, i32 %cttz 989 store i32 %sel, i32 addrspace(1)* %out 990 ret void 991} 992 993define amdgpu_kernel void @v_cttz_i32_sel_ne_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { 994; SI-LABEL: v_cttz_i32_sel_ne_neg1: 995; SI: ; %bb.0: 996; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb 997; SI-NEXT: s_mov_b32 s3, 0xf000 998; SI-NEXT: s_mov_b32 s6, 0 999; SI-NEXT: s_mov_b32 s7, s3 1000; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1001; SI-NEXT: 
v_mov_b32_e32 v1, 0 1002; SI-NEXT: s_waitcnt lgkmcnt(0) 1003; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 1004; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1005; SI-NEXT: s_mov_b32 s2, -1 1006; SI-NEXT: s_waitcnt vmcnt(0) 1007; SI-NEXT: v_ffbl_b32_e32 v0, v0 1008; SI-NEXT: s_waitcnt lgkmcnt(0) 1009; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1010; SI-NEXT: s_endpgm 1011; 1012; VI-LABEL: v_cttz_i32_sel_ne_neg1: 1013; VI: ; %bb.0: 1014; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 1015; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c 1016; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1017; VI-NEXT: s_mov_b32 s7, 0xf000 1018; VI-NEXT: s_mov_b32 s6, -1 1019; VI-NEXT: s_waitcnt lgkmcnt(0) 1020; VI-NEXT: v_mov_b32_e32 v1, s1 1021; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 1022; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1023; VI-NEXT: flat_load_dword v0, v[0:1] 1024; VI-NEXT: s_waitcnt vmcnt(0) 1025; VI-NEXT: v_ffbl_b32_e32 v0, v0 1026; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 1027; VI-NEXT: s_endpgm 1028; 1029; EG-LABEL: v_cttz_i32_sel_ne_neg1: 1030; EG: ; %bb.0: 1031; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] 1032; EG-NEXT: TEX 0 @6 1033; EG-NEXT: ALU 5, @11, KC0[CB0:0-32], KC1[] 1034; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 1035; EG-NEXT: CF_END 1036; EG-NEXT: PAD 1037; EG-NEXT: Fetch clause starting at 6: 1038; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 1039; EG-NEXT: ALU clause starting at 8: 1040; EG-NEXT: LSHL * T0.W, T0.X, literal.x, 1041; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1042; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, 1043; EG-NEXT: ALU clause starting at 11: 1044; EG-NEXT: FFBL_INT * T0.W, T0.X, 1045; EG-NEXT: CNDE_INT * T0.W, T0.X, literal.x, PV.W, 1046; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) 1047; EG-NEXT: CNDE_INT T0.X, T0.X, literal.x, PV.W, 1048; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, 1049; EG-NEXT: -1(nan), 2(2.802597e-45) 1050; 1051; GFX10-LABEL: v_cttz_i32_sel_ne_neg1: 1052; GFX10: ; %bb.0: 1053; GFX10-NEXT: s_load_dwordx2 
s[2:3], s[0:1], 0x2c 1054; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1055; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1056; GFX10-NEXT: v_mov_b32_e32 v1, 0 1057; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1058; GFX10-NEXT: global_load_dword v0, v0, s[2:3] 1059; GFX10-NEXT: s_waitcnt vmcnt(0) 1060; GFX10-NEXT: v_ffbl_b32_e32 v0, v0 1061; GFX10-NEXT: global_store_dword v1, v0, s[0:1] 1062; GFX10-NEXT: s_endpgm 1063; 1064; GFX10-GISEL-LABEL: v_cttz_i32_sel_ne_neg1: 1065; GFX10-GISEL: ; %bb.0: 1066; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 1067; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1068; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1069; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) 1070; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[2:3] 1071; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) 1072; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v1, v0 1073; GFX10-GISEL-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 1074; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1 1075; GFX10-GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v1, vcc_lo 1076; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 1077; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] 1078; GFX10-GISEL-NEXT: s_endpgm 1079 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1080 %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid 1081 %val = load i32, i32 addrspace(1)* %in.gep 1082 %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 false) nounwind readnone 1083 %cmp = icmp ne i32 %val, 0 1084 %sel = select i1 %cmp, i32 %cttz, i32 -1 1085 store i32 %sel, i32 addrspace(1)* %out 1086 ret void 1087} 1088 1089; TODO: Should be able to eliminate select here as well. 
; Stores select(cttz(%val) == 32, -1, cttz(%val)) to %out; here the compare is
; on the cttz result against the bit width rather than on the input against 0.
; Every check prefix below still contains an explicit compare and conditional
; move after the ffbl (v_cmp/v_cndmask, or SETE_INT/CNDE_INT on EG).
define amdgpu_kernel void @v_cttz_i32_sel_eq_bitwidth(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
; SI-LABEL: v_cttz_i32_sel_eq_bitwidth:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_mov_b32 s7, s3
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_ffbl_b32_e32 v0, v0
; SI-NEXT: v_min_u32_e32 v0, 32, v0
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 32, v0
; SI-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_cttz_i32_sel_eq_bitwidth:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_ffbl_b32_e32 v0, v0
; VI-NEXT: v_min_u32_e32 v0, 32, v0
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 32, v0
; VI-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: v_cttz_i32_sel_eq_bitwidth:
; EG: ; %bb.0:
; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 7, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
; EG-NEXT: ALU clause starting at 11:
; EG-NEXT: FFBL_INT * T0.W, T0.X,
; EG-NEXT: CNDE_INT * T0.W, T0.X, literal.x, PV.W,
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT: SETE_INT * T1.W, PV.W, literal.x,
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT: CNDE_INT T0.X, PV.W, T0.W, literal.x,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT: -1(nan), 2(2.802597e-45)
;
; GFX10-LABEL: v_cttz_i32_sel_eq_bitwidth:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v0, v0, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_ffbl_b32_e32 v0, v0
; GFX10-NEXT: v_min_u32_e32 v0, 32, v0
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 32, v0
; GFX10-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo
; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX10-GISEL-LABEL: v_cttz_i32_sel_eq_bitwidth:
; GFX10-GISEL: ; %bb.0:
; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[2:3]
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v0, v0
; GFX10-GISEL-NEXT: v_min_u32_e32 v0, 32, v0
; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 32, v0
; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc_lo
; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-GISEL-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
  %val = load i32, i32 addrspace(1)* %in.gep
  %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 false) nounwind readnone
  %cmp = icmp eq i32 %cttz, 32
  %sel = select i1 %cmp, i32 -1, i32 %cttz
  store i32 %sel, i32 addrspace(1)* %out
  ret void
}

; Inverted form of the bit-width compare: stores
; select(cttz(%val) != 32, cttz(%val), -1) to %out.
; As with the eq form, every check prefix below keeps an explicit compare and
; conditional move after the ffbl (SETNE_INT/CNDE_INT on EG).
define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
; SI-LABEL: v_cttz_i32_sel_ne_bitwidth:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_mov_b32 s7, s3
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_ffbl_b32_e32 v0, v0
; SI-NEXT: v_min_u32_e32 v0, 32, v0
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 32, v0
; SI-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_cttz_i32_sel_ne_bitwidth:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_ffbl_b32_e32 v0, v0
; VI-NEXT: v_min_u32_e32 v0, 32, v0
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 32, v0
; VI-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: v_cttz_i32_sel_ne_bitwidth:
; EG: ; %bb.0:
; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 7, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
; EG-NEXT: ALU clause starting at 11:
; EG-NEXT: FFBL_INT * T0.W, T0.X,
; EG-NEXT: CNDE_INT * T0.W, T0.X, literal.x, PV.W,
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT: SETNE_INT * T1.W, PV.W, literal.x,
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT: CNDE_INT T0.X, PV.W, literal.x, T0.W,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT: -1(nan), 2(2.802597e-45)
;
; GFX10-LABEL: v_cttz_i32_sel_ne_bitwidth:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v0, v0, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_ffbl_b32_e32 v0, v0
; GFX10-NEXT: v_min_u32_e32 v0, 32, v0
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 32, v0
; GFX10-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo
; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX10-GISEL-LABEL: v_cttz_i32_sel_ne_bitwidth:
; GFX10-GISEL: ; %bb.0:
; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[2:3]
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v0, v0
; GFX10-GISEL-NEXT: v_min_u32_e32 v0, 32, v0
; GFX10-GISEL-NEXT: v_cmp_ne_u32_e32 vcc_lo, 32, v0
; GFX10-GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo
; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-GISEL-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
  %val = load i32, i32 addrspace(1)* %in.gep
  %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 false) nounwind readnone
  %cmp = icmp ne i32 %cttz, 32
  %sel = select i1 %cmp, i32 %cttz, i32 -1
  store i32 %sel, i32 addrspace(1)* %out
  ret void
}

; i8 version of the "input == 0 ? -1 : cttz" select. %val is loaded from
; %valptr at byte index workitem.id.x and the i8 result is stored to %out.
; The SI, VI and GFX10 checks show only v_ffbl_b32 between the ubyte load and
; the byte store; the GFX10-GISEL checks keep v_or_b32 0x100, an SDWA compare
; and v_cndmask_b32.
 define amdgpu_kernel void @v_cttz_i8_sel_eq_neg1(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind {
; SI-LABEL: v_cttz_i8_sel_eq_neg1:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: s_mov_b32 s7, s3
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_ffbl_b32_e32 v0, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_cttz_i8_sel_eq_neg1:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_ubyte v0, v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_ffbl_b32_e32 v0, v0
; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: v_cttz_i8_sel_eq_neg1:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 12, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_8 T0.X, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, T0.X,
; EG-NEXT: ALU clause starting at 9:
; EG-NEXT: FFBL_INT T0.W, T0.X,
; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
; EG-NEXT: LSHL * T1.W, PS, literal.y,
; EG-NEXT: 255(3.573311e-43), 3(4.203895e-45)
; EG-NEXT: LSHL T0.X, PV.W, PS,
; EG-NEXT: LSHL * T0.W, literal.x, PS,
; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
; EG-NEXT: MOV T0.Y, 0.0,
; EG-NEXT: MOV * T0.Z, 0.0,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; GFX10-LABEL: v_cttz_i8_sel_eq_neg1:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_ffbl_b32_e32 v0, v0
; GFX10-NEXT: global_store_byte v1, v0, s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX10-GISEL-LABEL: v_cttz_i8_sel_eq_neg1:
; GFX10-GISEL: ; %bb.0:
; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v0
; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s2
; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, s3
; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v1, v0
; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v2, v3, vcc_lo
; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0
; GFX10-GISEL-NEXT: global_load_ubyte v0, v[0:1], off
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX10-GISEL-NEXT: v_or_b32_e32 v1, 0x100, v0
; GFX10-GISEL-NEXT: v_cmp_eq_u32_sdwa s2, v0, v2 src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v1, v1
; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, -1, s2
; GFX10-GISEL-NEXT: global_store_byte v2, v0, s[0:1]
; GFX10-GISEL-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %valptr.gep = getelementptr i8, i8 addrspace(1)* %valptr, i32 %tid
  %val = load i8, i8 addrspace(1)* %valptr.gep
  %cttz = call i8 @llvm.cttz.i8(i8 %val, i1 false) nounwind readnone
  %cmp = icmp eq i8 %val, 0
  %sel = select i1 %cmp, i8 -1, i8 %cttz
  store i8 %sel, i8 addrspace(1)* %out
  ret void
}

; i16 version of the "input == 0 ? -1 : cttz" select. Unlike the neighbouring
; sel tests, %val is loaded straight from %valptr with no workitem-id GEP.
; Only the SI checks reduce to a bare v_ffbl_b32; the VI and GFX10 checks keep
; v_or_b32 0x10000, v_min_u32, v_cmp_ne_u16 and v_cndmask_b32, and the
; GFX10-GISEL checks keep v_or_b32, v_cmp_eq_u32, v_and_b32 and v_cndmask_b32.
 define amdgpu_kernel void @v_cttz_i16_sel_eq_neg1(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %valptr) nounwind {
; SI-LABEL: v_cttz_i16_sel_eq_neg1:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_mov_b32 s6, s2
; SI-NEXT: s_mov_b32 s7, s3
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_load_ushort v0, off, s[4:7], 0
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_ffbl_b32_e32 v0, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_cttz_i16_sel_eq_neg1:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_mov_b32 s2, s6
; VI-NEXT: s_mov_b32 s3, s7
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: buffer_load_ushort v0, off, s[0:3], 0
; VI-NEXT: v_mov_b32_e32 v1, 0xffff
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_e32 v2, 0x10000, v0
; VI-NEXT: v_ffbl_b32_e32 v2, v2
; VI-NEXT: v_min_u32_e32 v2, 32, v2
; VI-NEXT: v_cmp_ne_u16_e32 vcc, 0, v0
; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: v_cttz_i16_sel_eq_neg1:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 12, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_16 T0.X, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T0.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
; EG-NEXT: FFBL_INT T0.W, T0.X,
; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
; EG-NEXT: LSHL * T1.W, PS, literal.y,
; EG-NEXT: 65535(9.183409e-41), 3(4.203895e-45)
; EG-NEXT: LSHL T0.X, PV.W, PS,
; EG-NEXT: LSHL * T0.W, literal.x, PS,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: MOV T0.Y, 0.0,
; EG-NEXT: MOV * T0.Z, 0.0,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; GFX10-LABEL: v_cttz_i16_sel_eq_neg1:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_ushort v1, v0, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_or_b32_e32 v2, 0x10000, v1
; GFX10-NEXT: v_cmp_ne_u16_e32 vcc_lo, 0, v1
; GFX10-NEXT: v_ffbl_b32_e32 v2, v2
; GFX10-NEXT: v_min_u32_e32 v2, 32, v2
; GFX10-NEXT: v_cndmask_b32_e32 v1, 0xffff, v2, vcc_lo
; GFX10-NEXT: global_store_short v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX10-GISEL-LABEL: v_cttz_i16_sel_eq_neg1:
; GFX10-GISEL: ; %bb.0:
; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-GISEL-NEXT: global_load_ushort v1, v0, s[2:3]
; GFX10-GISEL-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-GISEL-NEXT: s_mov_b32 s2, 0xffff
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX10-GISEL-NEXT: v_or_b32_e32 v2, 0x10000, v1
; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v2, v2
; GFX10-GISEL-NEXT: v_and_b32_e32 v2, s2, v2
; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, v2, s2, vcc_lo
; GFX10-GISEL-NEXT: global_store_short v0, v1, s[0:1]
; GFX10-GISEL-NEXT: s_endpgm
  %val = load i16, i16 addrspace(1)* %valptr
  %cttz = call i16 @llvm.cttz.i16(i16 %val, i1 false) nounwind readnone
  %cmp = icmp eq i16 %val, 0
  %sel = select i1 %cmp, i16 -1, i16 %cttz
  store i16 %sel, i16 addrspace(1)* %out
  ret void
}

; FIXME: Need to handle non-uniform case for function below (load without gep).
; i7 (non-power-of-two width) version of the "input == 0 ? -1 : cttz" select.
; %val is loaded from %valptr at index workitem.id.x and the i7 result is
; stored to %out.
; The SI, VI and GFX10 checks show v_ffbl_b32 followed by a v_and_b32 with
; 0x7f; the GFX10-GISEL checks keep v_or_b32 0x80, v_cmp_eq_u32,
; v_cndmask_b32 and two v_and_b32 with 0x7f.
; NOTE(review): the FIXME immediately above mentions a "load without gep", but
; this function does index %valptr through a GEP; the GEP-less load is in
; v_cttz_i16_sel_eq_neg1 - confirm which test the FIXME is meant for.
define amdgpu_kernel void @v_cttz_i7_sel_eq_neg1(i7 addrspace(1)* noalias %out, i7 addrspace(1)* noalias %valptr) nounwind {
; SI-LABEL: v_cttz_i7_sel_eq_neg1:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: s_mov_b32 s7, s3
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_ffbl_b32_e32 v0, v0
; SI-NEXT: v_and_b32_e32 v0, 0x7f, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_cttz_i7_sel_eq_neg1:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_ubyte v0, v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_ffbl_b32_e32 v0, v0
; VI-NEXT: v_and_b32_e32 v0, 0x7f, v0
; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: v_cttz_i7_sel_eq_neg1:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 12, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_8 T0.X, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, T0.X,
; EG-NEXT: ALU clause starting at 9:
; EG-NEXT: FFBL_INT T0.W, T0.X,
; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
; EG-NEXT: LSHL * T1.W, PS, literal.y,
; EG-NEXT: 127(1.779649e-43), 3(4.203895e-45)
; EG-NEXT: LSHL T0.X, PV.W, PS,
; EG-NEXT: LSHL * T0.W, literal.x, PS,
; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
; EG-NEXT: MOV T0.Y, 0.0,
; EG-NEXT: MOV * T0.Z, 0.0,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; GFX10-LABEL: v_cttz_i7_sel_eq_neg1:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_ffbl_b32_e32 v0, v0
; GFX10-NEXT: v_and_b32_e32 v0, 0x7f, v0
; GFX10-NEXT: global_store_byte v1, v0, s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX10-GISEL-LABEL: v_cttz_i7_sel_eq_neg1:
; GFX10-GISEL: ; %bb.0:
; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v0
; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s2
; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, s3
; GFX10-GISEL-NEXT: s_movk_i32 s2, 0x7f
; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v1, v0
; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v2, v3, vcc_lo
; GFX10-GISEL-NEXT: global_load_ubyte v0, v[0:1], off
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX10-GISEL-NEXT: v_or_b32_e32 v1, 0x80, v0
; GFX10-GISEL-NEXT: v_and_b32_e32 v0, s2, v0
; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v1, v1
; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, 0x7f, vcc_lo
; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX10-GISEL-NEXT: v_and_b32_e32 v0, s2, v0
; GFX10-GISEL-NEXT: global_store_byte v1, v0, s[0:1]
; GFX10-GISEL-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %valptr.gep = getelementptr i7, i7 addrspace(1)* %valptr, i32 %tid
  %val = load i7, i7 addrspace(1)* %valptr.gep
  %cttz = call i7 @llvm.cttz.i7(i7 %val, i1 false) nounwind readnone
  %cmp = icmp eq i7 %val, 0
  %sel = select i1 %cmp, i7 -1, i7 %cttz
  store i7 %sel, i7 addrspace(1)* %out
  ret void
}
