; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1032 %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1064 %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-early-ifcvt=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1032 %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-early-ifcvt=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1064 %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1032,GFX10DEFWAVE %s

; Code generation checks for gfx1010 in wave32 and wave64 modes.
; The wave32 expectations use 32-bit lane-mask operands (vcc_lo, exec_lo,
; single SGPRs); the wave64 expectations use the 64-bit forms (vcc, exec,
; SGPR pairs). The fifth invocation above passes no wavefront-size attribute
; and is checked against the wave32 expectations.

; GCN-LABEL: {{^}}test_vopc_i32:
; GFX1032: v_cmp_lt_i32_e32 vcc_lo, 0, v{{[0-9]+}}
; GFX1032: v_cndmask_b32_e64 v{{[0-9]+}}, 2, 1, vcc_lo
; GFX1064: v_cmp_lt_i32_e32 vcc, 0, v{{[0-9]+}}
; GFX1064: v_cndmask_b32_e64 v{{[0-9]+}}, 2, 1, vcc{{$}}
define amdgpu_kernel void @test_vopc_i32(i32 addrspace(1)* %arg) {
  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %lid
  %load = load i32, i32 addrspace(1)* %gep, align 4
  %cmp = icmp sgt i32 %load, 0
  %sel = select i1 %cmp, i32 1, i32 2
  store i32 %sel, i32 addrspace(1)* %gep, align 4
  ret void
}

; GCN-LABEL: {{^}}test_vopc_f32:
; GFX1032: v_cmp_nge_f32_e32 vcc_lo, 0, v{{[0-9]+}}
; GFX1032: v_cndmask_b32_e64 v{{[0-9]+}}, 2.0, 1.0, vcc_lo
; GFX1064: v_cmp_nge_f32_e32 vcc, 0, v{{[0-9]+}}
; GFX1064: v_cndmask_b32_e64 v{{[0-9]+}}, 2.0, 1.0, vcc{{$}}
define amdgpu_kernel void @test_vopc_f32(float addrspace(1)* %arg) {
  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %lid
  %load = load float, float addrspace(1)* %gep, align 4
  %cmp = fcmp ugt float %load, 0.0
  %sel = select i1 %cmp, float 1.0, float 2.0
  store float %sel, float addrspace(1)* %gep, align 4
  ret void
}

; GCN-LABEL: {{^}}test_vopc_vcmpx:
; GFX1032: v_cmpx_le_f32_e32 0, v{{[0-9]+}}
; GFX1064: v_cmpx_le_f32_e32 0, v{{[0-9]+}}
define amdgpu_ps void @test_vopc_vcmpx(float %x) {
  %cmp = fcmp oge float %x, 0.0
  call void @llvm.amdgcn.kill(i1 %cmp)
  ret void
}

; GCN-LABEL: {{^}}test_vopc_2xf16:
; GFX1032: v_cmp_le_f16_sdwa [[SC:s[0-9]+]], {{[vs][0-9]+}}, v{{[0-9]+}} src0_sel:WORD_1 src1_sel:DWORD
; GFX1032: v_cndmask_b32_e64 v{{[0-9]+}}, 0x3c003c00, v{{[0-9]+}}, [[SC]]
; GFX1064: v_cmp_le_f16_sdwa [[SC:s\[[0-9:]+\]]], {{[vs][0-9]+}}, v{{[0-9]+}} src0_sel:WORD_1 src1_sel:DWORD
; GFX1064: v_cndmask_b32_e64 v{{[0-9]+}}, 0x3c003c00, v{{[0-9]+}}, [[SC]]
define amdgpu_kernel void @test_vopc_2xf16(<2 x half> addrspace(1)* %arg) {
  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %arg, i32 %lid
  %load = load <2 x half>, <2 x half> addrspace(1)* %gep, align 4
  %elt = extractelement <2 x half> %load, i32 1
  %cmp = fcmp ugt half %elt, 0.0
  %sel = select i1 %cmp, <2 x half> <half 1.0, half 1.0>, <2 x half> %load
  store <2 x half> %sel, <2 x half> addrspace(1)* %gep, align 4
  ret void
}

; GCN-LABEL: {{^}}test_vopc_class:
; GFX1032: v_cmp_class_f32_e64 [[C:vcc_lo|s[0-9:]+]], s{{[0-9]+}}, 0x204
; GFX1032: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, [[C]]
; GFX1064: v_cmp_class_f32_e64 [[C:vcc|s\[[0-9:]+\]]], s{{[0-9]+}}, 0x204
; GFX1064: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, [[C]]{{$}}
define amdgpu_kernel void @test_vopc_class(i32 addrspace(1)* %out, float %x) #0 {
  %fabs = tail call float @llvm.fabs.f32(float %x)
  %cmp = fcmp oeq float %fabs, 0x7FF0000000000000
  %ext = zext i1 %cmp to i32
  store i32 %ext, i32 addrspace(1)* %out, align 4
  ret void
}

; GCN-LABEL: {{^}}test_vcmp_vcnd_f16:
; GFX1032: v_cmp_neq_f16_e64 [[C:vcc_lo|s\[[0-9:]+\]]], 0x7c00, s{{[0-9]+}}
; GFX1032: v_cndmask_b32_e32 v{{[0-9]+}}, 0x3c00, v{{[0-9]+}}, [[C]]

; GFX1064: v_cmp_neq_f16_e64 [[C:vcc|s\[[0-9:]+\]]], 0x7c00, s{{[0-9]+}}
; GFX1064: v_cndmask_b32_e32 v{{[0-9]+}}, 0x3c00, v{{[0-9]+}}, [[C]]{{$}}
define amdgpu_kernel void @test_vcmp_vcnd_f16(half addrspace(1)* %out, half %x) #0 {
  %cmp = fcmp oeq half %x, 0x7FF0000000000000
  %sel = select i1 %cmp, half 1.0, half %x
  store half %sel, half addrspace(1)* %out, align 2
  ret void
}

; GCN-LABEL: {{^}}test_vop3_cmp_f32_sop_and:
; GFX1032: v_cmp_nge_f32_e32 vcc_lo, 0, v{{[0-9]+}}
; GFX1032: v_cmp_nle_f32_e64 [[C2:s[0-9]+]], 1.0, v{{[0-9]+}}
; GFX1032: s_and_b32 [[AND:s[0-9]+]], vcc_lo, [[C2]]
; GFX1032: v_cndmask_b32_e64 v{{[0-9]+}}, 2.0, 1.0, [[AND]]
; GFX1064: v_cmp_nge_f32_e32 vcc, 0, v{{[0-9]+}}
; GFX1064: v_cmp_nle_f32_e64 [[C2:s\[[0-9:]+\]]], 1.0, v{{[0-9]+}}
; GFX1064: s_and_b64 [[AND:s\[[0-9:]+\]]], vcc, [[C2]]
; GFX1064: v_cndmask_b32_e64 v{{[0-9]+}}, 2.0, 1.0, [[AND]]
define amdgpu_kernel void @test_vop3_cmp_f32_sop_and(float addrspace(1)* %arg) {
  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %lid
  %load = load float, float addrspace(1)* %gep, align 4
  %cmp = fcmp ugt float %load, 0.0
  %cmp2 = fcmp ult float %load, 1.0
  %and = and i1 %cmp, %cmp2
  %sel = select i1 %and, float 1.0, float 2.0
  store float %sel, float addrspace(1)* %gep, align 4
  ret void
}

; GCN-LABEL: {{^}}test_vop3_cmp_i32_sop_xor:
; GFX1032: v_cmp_lt_i32_e32 vcc_lo, 0, v{{[0-9]+}}
; GFX1032: v_cmp_gt_i32_e64 [[C2:s[0-9]+]], 1, v{{[0-9]+}}
; GFX1032: s_xor_b32 [[AND:s[0-9]+]], vcc_lo, [[C2]]
; GFX1032: v_cndmask_b32_e64 v{{[0-9]+}}, 2, 1, [[AND]]
; GFX1064: v_cmp_lt_i32_e32 vcc, 0, v{{[0-9]+}}
; GFX1064: v_cmp_gt_i32_e64 [[C2:s\[[0-9:]+\]]], 1, v{{[0-9]+}}
; GFX1064: s_xor_b64 [[AND:s\[[0-9:]+\]]], vcc, [[C2]]
; GFX1064: v_cndmask_b32_e64 v{{[0-9]+}}, 2, 1, [[AND]]
define amdgpu_kernel void @test_vop3_cmp_i32_sop_xor(i32 addrspace(1)* %arg) {
  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %lid
  %load = load i32, i32 addrspace(1)* %gep, align 4
  %cmp = icmp sgt i32 %load, 0
  %cmp2 = icmp slt i32 %load, 1
  %xor = xor i1 %cmp, %cmp2
  %sel = select i1 %xor, i32 1, i32 2
  store i32 %sel, i32 addrspace(1)* %gep, align 4
  ret void
}

; GCN-LABEL: {{^}}test_vop3_cmp_u32_sop_or:
; GFX1032: v_cmp_lt_u32_e32 vcc_lo, 3, v{{[0-9]+}}
; GFX1032: v_cmp_gt_u32_e64 [[C2:s[0-9]+]], 2, v{{[0-9]+}}
; GFX1032: s_or_b32 [[AND:s[0-9]+]], vcc_lo, [[C2]]
; GFX1032: v_cndmask_b32_e64 v{{[0-9]+}}, 2, 1, [[AND]]
; GFX1064: v_cmp_lt_u32_e32 vcc, 3, v{{[0-9]+}}
; GFX1064: v_cmp_gt_u32_e64 [[C2:s\[[0-9:]+\]]], 2, v{{[0-9]+}}
; GFX1064: s_or_b64 [[AND:s\[[0-9:]+\]]], vcc, [[C2]]
; GFX1064: v_cndmask_b32_e64 v{{[0-9]+}}, 2, 1, [[AND]]
define amdgpu_kernel void @test_vop3_cmp_u32_sop_or(i32 addrspace(1)* %arg) {
  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %lid
  %load = load i32, i32 addrspace(1)* %gep, align 4
  %cmp = icmp ugt i32 %load, 3
  %cmp2 = icmp ult i32 %load, 2
  %or = or i1 %cmp, %cmp2
  %sel = select i1 %or, i32 1, i32 2
  store i32 %sel, i32 addrspace(1)* %gep, align 4
  ret void
}

; GCN-LABEL: {{^}}test_mask_if:
; GFX1032: s_and_saveexec_b32 s{{[0-9]+}}, vcc_lo
; GFX1064: s_and_saveexec_b64 s[{{[0-9:]+}}], vcc{{$}}
; GCN: ; mask branch
define amdgpu_kernel void @test_mask_if(i32 addrspace(1)* %arg) #0 {
  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %cmp = icmp ugt i32 %lid, 10
  br i1 %cmp, label %if, label %endif

if:
  store i32 0, i32 addrspace(1)* %arg, align 4
  br label %endif

endif:
  ret void
}

; GCN-LABEL: {{^}}test_loop_with_if:
; GFX1032: s_or_b32 s{{[0-9]+}}, vcc_lo, s{{[0-9]+}}
; GFX1032: s_andn2_b32 exec_lo, exec_lo, s{{[0-9]+}}
; GFX1064: s_or_b64 s[{{[0-9:]+}}], vcc, s[{{[0-9:]+}}]
; GFX1064: s_andn2_b64 exec, exec, s[{{[0-9:]+}}]
; GCN: s_cbranch_execz
; GCN: BB{{.*}}:
; GFX1032: s_and_saveexec_b32 s{{[0-9]+}}, vcc_lo
; GFX1064: s_and_saveexec_b64 s[{{[0-9:]+}}], vcc{{$}}
; GCN: s_cbranch_execz
; GCN: BB{{.*}}:
; GCN: BB{{.*}}:
; GFX1032: s_xor_b32 s{{[0-9]+}}, exec_lo, s{{[0-9]+}}
; GFX1064: s_xor_b64 s[{{[0-9:]+}}], exec, s[{{[0-9:]+}}]
; GCN: ; mask branch BB
; GCN: BB{{.*}}:
; GCN: BB{{.*}}:
; GFX1032: s_or_b32 exec_lo, exec_lo, s{{[0-9]+}}
; GFX1032: s_and_saveexec_b32 s{{[0-9]+}}, s{{[0-9]+}}
; GFX1064: s_or_b64 exec, exec, s[{{[0-9:]+}}]
; GFX1064: s_and_saveexec_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}}
; GCN: ; mask branch BB
; GCN: BB{{.*}}:
; GCN: BB{{.*}}:
; GCN: s_endpgm
define amdgpu_kernel void @test_loop_with_if(i32 addrspace(1)* %arg) #0 {
bb:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  br label %bb2

bb1:
  ret void

bb2:
  %tmp3 = phi i32 [ 0, %bb ], [ %tmp15, %bb13 ]
  %tmp4 = icmp slt i32 %tmp3, %tmp
  br i1 %tmp4, label %bb5, label %bb11

bb5:
  %tmp6 = sext i32 %tmp3 to i64
  %tmp7 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp6
  %tmp8 = load i32, i32 addrspace(1)* %tmp7, align 4
  %tmp9 = icmp sgt i32 %tmp8, 10
  br i1 %tmp9, label %bb10, label %bb11

bb10:
  store i32 %tmp, i32 addrspace(1)* %tmp7, align 4
  br label %bb13

bb11:
  %tmp12 = sdiv i32 %tmp3, 2
  br label %bb13

bb13:
  %tmp14 = phi i32 [ %tmp3, %bb10 ], [ %tmp12, %bb11 ]
  %tmp15 = add nsw i32 %tmp14, 1
  %tmp16 = icmp slt i32 %tmp14, 255
  br i1 %tmp16, label %bb2, label %bb1
}

; GCN-LABEL: {{^}}test_loop_with_if_else_break:
; GFX1032: s_and_saveexec_b32 s{{[0-9]+}}, vcc_lo
; GFX1064: s_and_saveexec_b64 s[{{[0-9:]+}}], vcc{{$}}
; GCN: ; mask branch
; GCN: s_cbranch_execz
; GCN: BB{{.*}}:
; GCN: BB{{.*}}:
; GFX1032: s_andn2_b32 s{{[0-9]+}}, s{{[0-9]+}}, exec_lo
; GFX1064: s_andn2_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], exec
; GFX1032: s_or_b32 s{{[0-9]+}}, vcc_lo, s{{[0-9]+}}
; GFX1032: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
; GFX1064: s_or_b64 s[{{[0-9:]+}}], vcc, s[{{[0-9:]+}}]
; GFX1064: s_or_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}]
; GCN: s_cbranch_execz
; GCN: BB{{.*}}:
define amdgpu_kernel void @test_loop_with_if_else_break(i32 addrspace(1)* %arg) #0 {
bb:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = icmp eq i32 %tmp, 0
  br i1 %tmp1, label %.loopexit, label %.preheader

.preheader:
  br label %bb2

bb2:
  %tmp3 = phi i32 [ %tmp9, %bb8 ], [ 0, %.preheader ]
  %tmp4 = zext i32 %tmp3 to i64
  %tmp5 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp4
  %tmp6 = load i32, i32 addrspace(1)* %tmp5, align 4
  %tmp7 = icmp sgt i32 %tmp6, 10
  br i1 %tmp7, label %bb8, label %.loopexit

bb8:
  store i32 %tmp, i32 addrspace(1)* %tmp5, align 4
  %tmp9 = add nuw nsw i32 %tmp3, 1
  %tmp10 = icmp ult i32 %tmp9, 256
  %tmp11 = icmp ult i32 %tmp9, %tmp
  %tmp12 = and i1 %tmp10, %tmp11
  br i1 %tmp12, label %bb2, label %.loopexit

.loopexit:
  ret void
}

; GCN-LABEL: {{^}}test_addc_vop2b:
; GFX1032: v_add_co_u32_e64 v{{[0-9]+}}, vcc_lo, v{{[0-9]+}}, s{{[0-9]+}}
; GFX1032: v_add_co_ci_u32_e32 v{{[0-9]+}}, vcc_lo, s{{[0-9]+}}, v{{[0-9]+}}, vcc_lo
; GFX1064: v_add_co_u32_e64 v{{[0-9]+}}, vcc, v{{[0-9]+}}, s{{[0-9]+}}
; GFX1064: v_add_co_ci_u32_e32 v{{[0-9]+}}, vcc, s{{[0-9]+}}, v{{[0-9]+}}, vcc{{$}}
define amdgpu_kernel void @test_addc_vop2b(i64 addrspace(1)* %arg, i64 %arg1) #0 {
bb:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp3 = getelementptr inbounds i64, i64 addrspace(1)* %arg, i32 %tmp
  %tmp4 = load i64, i64 addrspace(1)* %tmp3, align 8
  %tmp5 = add nsw i64 %tmp4, %arg1
  store i64 %tmp5, i64 addrspace(1)* %tmp3, align 8
  ret void
}

; GCN-LABEL: {{^}}test_subbrev_vop2b:
; GFX1032: v_sub_co_u32_e64 v{{[0-9]+}}, [[A0:s[0-9]+|vcc_lo]], v{{[0-9]+}}, s{{[0-9]+}}{{$}}
; GFX1032: v_subrev_co_ci_u32_e32 v{{[0-9]+}}, vcc_lo, {{[vs][0-9]+}}, {{[vs][0-9]+}}, [[A0]]{{$}}
; GFX1064: v_sub_co_u32_e64 v{{[0-9]+}}, [[A0:s\[[0-9:]+\]|vcc]], v{{[0-9]+}}, s{{[0-9]+}}{{$}}
; GFX1064: v_subrev_co_ci_u32_e32 v{{[0-9]+}}, vcc, {{[vs][0-9]+}}, {{[vs][0-9]+}}, [[A0]]{{$}}
define amdgpu_kernel void @test_subbrev_vop2b(i64 addrspace(1)* %arg, i64 %arg1) #0 {
bb:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp3 = getelementptr inbounds i64, i64 addrspace(1)* %arg, i32 %tmp
  %tmp4 = load i64, i64 addrspace(1)* %tmp3, align 8
  %tmp5 = sub nsw i64 %tmp4, %arg1
  store i64 %tmp5, i64 addrspace(1)* %tmp3, align 8
  ret void
}

; GCN-LABEL: {{^}}test_subb_vop2b:
; GFX1032: v_sub_co_u32_e64 v{{[0-9]+}}, [[A0:s[0-9]+|vcc_lo]], s{{[0-9]+}}, v{{[0-9]+}}{{$}}
; GFX1032: v_sub_co_ci_u32_e32 v{{[0-9]+}}, vcc_lo, {{[vs][0-9]+}}, v{{[0-9]+}}, [[A0]]{{$}}
; GFX1064: v_sub_co_u32_e64 v{{[0-9]+}}, [[A0:s\[[0-9:]+\]|vcc]], s{{[0-9]+}}, v{{[0-9]+}}{{$}}
; GFX1064: v_sub_co_ci_u32_e32 v{{[0-9]+}}, vcc, {{[vs][0-9]+}}, v{{[0-9]+}}, [[A0]]{{$}}
define amdgpu_kernel void @test_subb_vop2b(i64 addrspace(1)* %arg, i64 %arg1) #0 {
bb:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp3 = getelementptr inbounds i64, i64 addrspace(1)* %arg, i32 %tmp
  %tmp4 = load i64, i64 addrspace(1)* %tmp3, align 8
  %tmp5 = sub nsw i64 %arg1, %tmp4
  store i64 %tmp5, i64 addrspace(1)* %tmp3, align 8
  ret void
}

; GCN-LABEL: {{^}}test_udiv64:
; GFX1032: v_add_co_u32_e64 v{{[0-9]+}}, [[SDST:s[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}
; GFX1032: v_add_co_ci_u32_e32 v{{[0-9]+}}, vcc_lo, 0, v{{[0-9]+}}, vcc_lo
; GFX1032: v_add_co_ci_u32_e64 v{{[0-9]+}}, vcc_lo, v{{[0-9]+}}, v{{[0-9]+}}, [[SDST]]
; GFX1032: v_add_co_u32_e64 v{{[0-9]+}}, vcc_lo, v{{[0-9]+}}, v{{[0-9]+}}
; GFX1032: v_add_co_u32_e64 v{{[0-9]+}}, vcc_lo, v{{[0-9]+}}, v{{[0-9]+}}
; GFX1032: v_add_co_u32_e64 v{{[0-9]+}}, vcc_lo, v{{[0-9]+}}, v{{[0-9]+}}
; GFX1032: v_add_co_ci_u32_e32 v{{[0-9]+}}, vcc_lo, 0, v{{[0-9]+}}, vcc_lo
; GFX1032: v_sub_co_u32_e64 v{{[0-9]+}}, vcc_lo, s{{[0-9]+}}, v{{[0-9]+}}
; GFX1032: v_sub_co_ci_u32_e64 v{{[0-9]+}}, s{{[0-9]+}}, {{[vs][0-9]+}}, v{{[0-9]+}}, vcc_lo
; GFX1032: v_subrev_co_ci_u32_e32 v{{[0-9]+}}, vcc_lo, {{[vs][0-9]+}}, v{{[0-9]+}}, vcc_lo
; GFX1064: v_add_co_u32_e64 v{{[0-9]+}}, [[SDST:s\[[0-9:]+\]]], v{{[0-9]+}}, v{{[0-9]+}}
; GFX1064: v_add_co_ci_u32_e32 v{{[0-9]+}}, vcc, 0, v{{[0-9]+}}, vcc{{$}}
; GFX1064: v_add_co_ci_u32_e64 v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}}, [[SDST]]
; GFX1064: v_add_co_u32_e64 v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}}
; GFX1064: v_add_co_u32_e64 v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}}
; GFX1064: v_add_co_u32_e64 v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}}
; GFX1064: v_add_co_ci_u32_e32 v{{[0-9]+}}, vcc, 0, v{{[0-9]+}}, vcc{{$}}
; GFX1064: v_sub_co_u32_e64 v{{[0-9]+}}, vcc, s{{[0-9]+}}, v{{[0-9]+}}
; GFX1064: v_sub_co_ci_u32_e64 v{{[0-9]+}}, s[{{[0-9:]+}}], {{[vs][0-9]+}}, v{{[0-9]+}}, vcc{{$}}
; GFX1064: v_subrev_co_ci_u32_e32 v{{[0-9]+}}, vcc, {{[vs][0-9]+}}, v{{[0-9]+}}, vcc{{$}}
define amdgpu_kernel void @test_udiv64(i64 addrspace(1)* %arg) #0 {
bb:
  %tmp = getelementptr inbounds i64, i64 addrspace(1)* %arg, i64 1
  %tmp1 = load i64, i64 addrspace(1)* %tmp, align 8
  %tmp2 = load i64, i64 addrspace(1)* %arg, align 8
  %tmp3 = udiv i64 %tmp1, %tmp2
  %tmp4 = getelementptr inbounds i64, i64 addrspace(1)* %arg, i64 2
  store i64 %tmp3, i64 addrspace(1)* %tmp4, align 8
  ret void
}

; GCN-LABEL: {{^}}test_div_scale_f32:
; GFX1032: v_div_scale_f32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; GFX1064: v_div_scale_f32 v{{[0-9]+}}, s[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
define amdgpu_kernel void @test_div_scale_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1

  %a = load volatile float, float addrspace(1)* %gep.0, align 4
  %b = load volatile float, float addrspace(1)* %gep.1, align 4

  %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 false) nounwind readnone
  %result0 = extractvalue { float, i1 } %result, 0
  store float %result0, float addrspace(1)* %out, align 4
  ret void
}

; GCN-LABEL: {{^}}test_div_scale_f64:
; GFX1032: v_div_scale_f64 v[{{[0-9:]+}}], s{{[0-9]+}}, v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}]
; GFX1064: v_div_scale_f64 v[{{[0-9:]+}}], s[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}]
define amdgpu_kernel void @test_div_scale_f64(double addrspace(1)* %out, double addrspace(1)* %aptr, double addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1

  %a = load volatile double, double addrspace(1)* %gep.0, align 8
  %b = load volatile double, double addrspace(1)* %gep.1, align 8

  %result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double %a, double %b, i1 true) nounwind readnone
  %result0 = extractvalue { double, i1 } %result, 0
  store double %result0, double addrspace(1)* %out, align 8
  ret void
}

; GCN-LABEL: {{^}}test_mad_i64_i32:
; GFX1032: v_mad_i64_i32 v[{{[0-9:]+}}], s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9:]+}}]
; GFX1064: v_mad_i64_i32 v[{{[0-9:]+}}], s[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9:]+}}]
define i64 @test_mad_i64_i32(i32 %arg0, i32 %arg1, i64 %arg2) #0 {
  %sext0 = sext i32 %arg0 to i64
  %sext1 = sext i32 %arg1 to i64
  %mul = mul i64 %sext0, %sext1
  %mad = add i64 %mul, %arg2
  ret i64 %mad
}

; GCN-LABEL: {{^}}test_mad_u64_u32:
; GFX1032: v_mad_u64_u32 v[{{[0-9:]+}}], s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9:]+}}]
; GFX1064: v_mad_u64_u32 v[{{[0-9:]+}}], s[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9:]+}}]
define i64 @test_mad_u64_u32(i32 %arg0, i32 %arg1, i64 %arg2) #0 {
  %sext0 = zext i32 %arg0 to i64
  %sext1 = zext i32 %arg1 to i64
  %mul = mul i64 %sext0, %sext1
  %mad = add i64 %mul, %arg2
  ret i64 %mad
}

; GCN-LABEL: {{^}}test_div_fmas_f32:
; GFX1032: v_cmp_eq_u32_e64 vcc_lo,
; GFX1064: v_cmp_eq_u32_e64 vcc,
; GCN: v_div_fmas_f32 v{{[0-9]+}}, {{[vs][0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
define amdgpu_kernel void @test_div_fmas_f32(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind {
  %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %d) nounwind readnone
  store float %result, float addrspace(1)* %out, align 4
  ret void
}

; GCN-LABEL: {{^}}test_div_fmas_f64:
; GFX1032: v_cmp_eq_u32_e64 vcc_lo,
; GFX1064: v_cmp_eq_u32_e64 vcc,
; GCN-DAG: v_div_fmas_f64 v[{{[0-9:]+}}], {{[vs]}}[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}]
define amdgpu_kernel void @test_div_fmas_f64(double addrspace(1)* %out, double %a, double %b, double %c, i1 %d) nounwind {
  %result = call double @llvm.amdgcn.div.fmas.f64(double %a, double %b, double %c, i1 %d) nounwind readnone
  store double %result, double addrspace(1)* %out, align 8
  ret void
}

; GCN-LABEL: {{^}}test_div_fmas_f32_i1_phi_vcc:
; GFX1032: s_mov_b32 [[VCC:vcc_lo]], 0{{$}}
; GFX1064: s_mov_b64 [[VCC:vcc]], 0{{$}}
; GFX1032: s_and_saveexec_b32 [[SAVE:s[0-9]+]], s{{[0-9]+}}{{$}}
; GFX1064: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], s[{{[0-9:]+}}]{{$}}

; GCN: load_dword [[LOAD:v[0-9]+]]
; GCN: v_cmp_ne_u32_e32 [[VCC]], 0, [[LOAD]]

; GCN: BB{{[0-9_]+}}:
; GFX1032: s_or_b32 exec_lo, exec_lo, [[SAVE]]
; GFX1064: s_or_b64 exec, exec, [[SAVE]]
; GCN: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(float addrspace(1)* %out, float addrspace(1)* %in, i32 addrspace(1)* %dummy) #0 {
entry:
  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 2
  %gep.a = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.b = getelementptr float, float addrspace(1)* %gep.a, i32 1
  %gep.c = getelementptr float, float addrspace(1)* %gep.a, i32 2

  %a = load float, float addrspace(1)* %gep.a
  %b = load float, float addrspace(1)* %gep.b
  %c = load float, float addrspace(1)* %gep.c

  %cmp0 = icmp eq i32 %tid, 0
  br i1 %cmp0, label %bb, label %exit

bb:
  %val = load volatile i32, i32 addrspace(1)* %dummy
  %cmp1 = icmp ne i32 %val, 0
  br label %exit

exit:
  %cond = phi i1 [false, %entry], [%cmp1, %bb]
  %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %cond) nounwind readnone
  store float %result, float addrspace(1)* %gep.out, align 4
  ret void
}

; NOTE(review): the two wave-specific v_div_scale_f32 checks below used a
; misspelled prefix (GFC instead of GFX), so FileCheck silently ignored them.
; The prefix is corrected here; confirm the operand patterns against actual
; llc output, since they were never exercised before.
; GCN-LABEL: {{^}}fdiv_f32:
; GFX1032: v_div_scale_f32 v{{[0-9]+}}, vcc_lo, s{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}
; GFX1064: v_div_scale_f32 v{{[0-9]+}}, vcc, s{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}
; GCN: v_rcp_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}
; GCN-NOT: vcc
; GCN: v_div_fmas_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
define amdgpu_kernel void @fdiv_f32(float addrspace(1)* %out, float %a, float %b) #0 {
entry:
  %fdiv = fdiv float %a, %b
  store float %fdiv, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}test_br_cc_f16:
; GFX1032: v_cmp_nlt_f16_e32 vcc_lo,
; GFX1032-NEXT: s_and_b32 vcc_lo, exec_lo, vcc_lo
; GFX1064: v_cmp_nlt_f16_e32 vcc,
; GFX1064-NEXT: s_and_b64 vcc, exec, vcc{{$}}
; GCN-NEXT: s_cbranch_vccnz
define amdgpu_kernel void @test_br_cc_f16(
    half addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b) {
entry:
  %a.val = load half, half addrspace(1)* %a
  %b.val = load half, half addrspace(1)* %b
  %fcmp = fcmp olt half %a.val, %b.val
  br i1 %fcmp, label %one, label %two

one:
  store half %a.val, half addrspace(1)* %r
  ret void

two:
  store half %b.val, half addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}test_brcc_i1:
; GCN: s_cmp_eq_u32 s{{[0-9]+}}, 0
; GCN-NEXT: s_cbranch_scc1
define amdgpu_kernel void @test_brcc_i1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i1 %val) #0 {
  %cmp0 = icmp ne i1 %val, 0
  br i1 %cmp0, label %store, label %end

store:
  store i32 222, i32 addrspace(1)* %out
  ret void

end:
  ret void
}

; GCN-LABEL: {{^}}test_preserve_condition_undef_flag:
; GFX1032: v_cmp_nlt_f32_e64 s{{[0-9]+}}, s{{[0-9]+}}, 1.0
; GFX1032: v_cmp_ngt_f32_e64 s{{[0-9]+}}, s{{[0-9]+}}, 0
; GFX1032: v_cmp_nlt_f32_e64 s{{[0-9]+}}, s{{[0-9]+}}, 1.0
; GFX1032: s_or_b32 [[OR1:s[0-9]+]], s{{[0-9]+}}, s{{[0-9]+}}
; GFX1032: s_or_b32 [[OR2:s[0-9]+]], [[OR1]], s{{[0-9]+}}
; GFX1032: s_and_b32 vcc_lo, exec_lo, [[OR2]]
; GFX1064: v_cmp_nlt_f32_e64 s[{{[0-9:]+}}], s{{[0-9]+}}, 1.0
; GFX1064: v_cmp_ngt_f32_e64 s[{{[0-9:]+}}], s{{[0-9]+}}, 0
; GFX1064: v_cmp_nlt_f32_e64 s[{{[0-9:]+}}], s{{[0-9]+}}, 1.0
; GFX1064: s_or_b64 [[OR1:s\[[0-9:]+\]]], s[{{[0-9:]+}}], s[{{[0-9:]+}}]
; GFX1064: s_or_b64 [[OR2:s\[[0-9:]+\]]], [[OR1]], s[{{[0-9:]+}}]
; GFX1064: s_and_b64 vcc, exec, [[OR2]]
; GCN: s_cbranch_vccnz
define amdgpu_kernel void @test_preserve_condition_undef_flag(float %arg, i32 %arg1, float %arg2) #0 {
bb0:
  %tmp = icmp sgt i32 %arg1, 4
  %undef = call i1 @llvm.amdgcn.class.f32(float undef, i32 undef)
  %tmp4 = select i1 %undef, float %arg, float 1.000000e+00
  %tmp5 = fcmp ogt float %arg2, 0.000000e+00
  %tmp6 = fcmp olt float %arg2, 1.000000e+00
  %tmp7 = fcmp olt float %arg, %tmp4
  %tmp8 = and i1 %tmp5, %tmp6
  %tmp9 = and i1 %tmp8, %tmp7
  br i1 %tmp9, label %bb1, label %bb2

bb1:
  store volatile i32 0, i32 addrspace(1)* undef
  br label %bb2

bb2:
  ret void
}

; GCN-LABEL: {{^}}test_invert_true_phi_cond_break_loop:
; GFX1032: s_xor_b32 s{{[0-9]+}}, s{{[0-9]+}}, -1
; GFX1032: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
; GFX1064: s_xor_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], -1
; GFX1064: s_or_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}]
define amdgpu_kernel void @test_invert_true_phi_cond_break_loop(i32 %arg) #0 {
bb:
  %id = call i32 @llvm.amdgcn.workitem.id.x()
  %tmp = sub i32 %id, %arg
  br label %bb1

bb1:                                              ; preds = %Flow, %bb
  %lsr.iv = phi i32 [ undef, %bb ], [ %tmp2, %Flow ]
  %lsr.iv.next = add i32 %lsr.iv, 1
  %cmp0 = icmp slt i32 %lsr.iv.next, 0
  br i1 %cmp0, label %bb4, label %Flow

bb4:                                              ; preds = %bb1
  %load = load volatile i32, i32 addrspace(1)* undef, align 4
  %cmp1 = icmp sge i32 %tmp, %load
  br label %Flow

Flow:                                             ; preds = %bb4, %bb1
  %tmp2 = phi i32 [ %lsr.iv.next, %bb4 ], [ undef, %bb1 ]
  %tmp3 = phi i1 [ %cmp1, %bb4 ], [ true, %bb1 ]
  br i1 %tmp3, label %bb1, label %bb9

bb9:                                              ; preds = %Flow
  store volatile i32 7, i32 addrspace(3)* undef
  ret void
}

; GCN-LABEL: {{^}}test_movrels_extract_neg_offset_vgpr:
; GFX1032: v_cmp_eq_u32_e32 vcc_lo, 1, v{{[0-9]+}}
; GFX1032: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, vcc_lo
; GFX1032: v_cmp_ne_u32_e32 vcc_lo, 2, v{{[0-9]+}}
; GFX1032: v_cndmask_b32_e32 v{{[0-9]+}}, 2, v{{[0-9]+}}, vcc_lo
; GFX1032: v_cmp_ne_u32_e32 vcc_lo, 3, v{{[0-9]+}}
; GFX1032: v_cndmask_b32_e32 v{{[0-9]+}}, 3, v{{[0-9]+}}, vcc_lo
; GFX1064: v_cmp_eq_u32_e32 vcc, 1, v{{[0-9]+}}
; GFX1064: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, vcc
; GFX1064: v_cmp_ne_u32_e32 vcc, 2, v{{[0-9]+}}
; GFX1064: v_cndmask_b32_e32 v{{[0-9]+}}, 2, v{{[0-9]+}}, vcc
; GFX1064: v_cmp_ne_u32_e32 vcc, 3, v{{[0-9]+}}
; GFX1064: v_cndmask_b32_e32 v{{[0-9]+}}, 3, v{{[0-9]+}}, vcc
define amdgpu_kernel void @test_movrels_extract_neg_offset_vgpr(i32 addrspace(1)* %out) #0 {
entry:
  %id = call i32 @llvm.amdgcn.workitem.id.x() #1
  %index = add i32 %id, -512
  %value = extractelement <4 x i32> <i32 0, i32 1, i32 2, i32 3>, i32 %index
  store i32 %value, i32 addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}test_set_inactive:
; GFX1032: s_not_b32 exec_lo, exec_lo
; GFX1032: v_mov_b32_e32 {{v[0-9]+}}, 42
; GFX1032: s_not_b32 exec_lo, exec_lo
; GFX1064: s_not_b64 exec, exec{{$}}
; GFX1064: v_mov_b32_e32 {{v[0-9]+}}, 42
; GFX1064: s_not_b64 exec, exec{{$}}
define amdgpu_kernel void @test_set_inactive(i32 addrspace(1)* %out, i32 %in) #0 {
  %tmp = call i32 @llvm.amdgcn.set.inactive.i32(i32 %in, i32 42)
  store i32 %tmp, i32 addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}test_set_inactive_64:
; GFX1032: s_not_b32 exec_lo, exec_lo
; GFX1032: v_mov_b32_e32 {{v[0-9]+}}, 0
; GFX1032: v_mov_b32_e32 {{v[0-9]+}}, 0
; GFX1032: s_not_b32 exec_lo, exec_lo
; GFX1064: s_not_b64 exec, exec{{$}}
; GFX1064: v_mov_b32_e32 {{v[0-9]+}}, 0
; GFX1064: v_mov_b32_e32 {{v[0-9]+}}, 0
; GFX1064: s_not_b64 exec, exec{{$}}
define amdgpu_kernel void @test_set_inactive_64(i64 addrspace(1)* %out, i64 %in) #0 {
  %tmp = call i64 @llvm.amdgcn.set.inactive.i64(i64 %in, i64 0)
  store i64 %tmp, i64 addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}test_kill_i1_terminator_float:
; GFX1032: s_mov_b32 exec_lo, 0
; GFX1064: s_mov_b64 exec, 0
define amdgpu_ps void @test_kill_i1_terminator_float() #0 {
  call void @llvm.amdgcn.kill(i1 false)
  ret void
}

; GCN-LABEL: {{^}}test_kill_i1_terminator_i1:
; GFX1032: s_or_b32 [[OR:s[0-9]+]],
; GFX1032: s_and_b32 exec_lo, exec_lo, [[OR]]
; GFX1064: s_or_b64 [[OR:s\[[0-9:]+\]]],
; GFX1064: s_and_b64 exec, exec, [[OR]]
define amdgpu_gs void @test_kill_i1_terminator_i1(i32 %a, i32 %b, i32 %c, i32 %d) #0 {
  %c1 = icmp slt i32 %a, %b
  %c2 = icmp slt i32 %c, %d
  %x = or i1 %c1, %c2
  call void @llvm.amdgcn.kill(i1 %x)
  ret void
}

; GCN-LABEL: {{^}}test_loop_vcc:
; GFX1032: v_cmp_lt_f32_e32 vcc_lo,
; GFX1064: v_cmp_lt_f32_e32 vcc,
; GCN: s_cbranch_vccnz
define amdgpu_ps <4 x float> @test_loop_vcc(<4 x float> %in) #0 {
entry:
  br label %loop

loop:
  %ctr.iv = phi float [ 0.0, %entry ], [ %ctr.next, %body ]
  %c.iv = phi <4 x float> [ %in, %entry ], [ %c.next, %body ]
  %cc = fcmp ogt float %ctr.iv, 7.0
  br i1 %cc, label %break, label %body

body:
  %c.iv0 = extractelement <4 x float> %c.iv, i32 0
  %c.next = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.iv0, <8 x i32> undef, <4 x i32> undef, i1 0, i32 0, i32 0)
  %ctr.next = fadd float %ctr.iv, 2.0
  br label %loop

break:
  ret <4 x float> %c.iv
}

; GCN-LABEL: {{^}}test_wwm1:
; GFX1032: s_or_saveexec_b32 [[SAVE:s[0-9]+]], -1
; GFX1032: s_mov_b32 exec_lo, [[SAVE]]
; GFX1064: s_or_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], -1
; GFX1064: s_mov_b64 exec, [[SAVE]]
define amdgpu_ps float @test_wwm1(i32 inreg %idx0, i32 inreg %idx1, float %src0, float %src1) {
main_body:
  %out = fadd float %src0, %src1
  %out.0 = call float @llvm.amdgcn.wwm.f32(float %out)
  ret float %out.0
}

; GCN-LABEL: {{^}}test_wwm2:
; GFX1032: v_cmp_gt_u32_e32 vcc_lo, 32, v{{[0-9]+}}
; GFX1032: s_and_saveexec_b32 [[SAVE1:s[0-9]+]], vcc_lo
; GFX1032: s_or_saveexec_b32 [[SAVE2:s[0-9]+]], -1
; GFX1032: s_mov_b32 exec_lo, [[SAVE2]]
; GFX1032: s_or_b32 exec_lo, exec_lo, [[SAVE1]]
; GFX1064: v_cmp_gt_u32_e32 vcc, 32, v{{[0-9]+}}
; GFX1064: s_and_saveexec_b64 [[SAVE1:s\[[0-9:]+\]]], vcc{{$}}
; GFX1064: s_or_saveexec_b64 [[SAVE2:s\[[0-9:]+\]]], -1
; GFX1064: s_mov_b64 exec, [[SAVE2]]
; GFX1064: s_or_b64 exec, exec, [[SAVE1]]
define amdgpu_ps float @test_wwm2(i32 inreg %idx) {
main_body:
  ; use mbcnt to make sure the branch is divergent
  %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
  %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
  %cc = icmp uge i32 %hi, 32
  br i1 %cc, label %endif, label %if

if:
  %src = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx, i32 0, i1 0, i1 0)
  %out = fadd float %src, %src
  %out.0 = call float @llvm.amdgcn.wwm.f32(float %out)
  %out.1 = fadd float %src, %out.0
  br label %endif

endif:
  %out.2 = phi float [ %out.1, %if ], [ 0.0, %main_body ]
  ret float %out.2
}

; GCN-LABEL: {{^}}test_wqm1:
; GFX1032: s_mov_b32 [[ORIG:s[0-9]+]], exec_lo
; GFX1032: s_wqm_b32 exec_lo, exec_lo
; GFX1032: s_and_b32 exec_lo, exec_lo, [[ORIG]]
; GFX1064: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec{{$}}
; GFX1064: s_wqm_b64 exec, exec{{$}}
; GFX1064: s_and_b64 exec, exec, [[ORIG]]
define amdgpu_ps <4 x float> @test_wqm1(i32 inreg, i32 inreg, i32 inreg, i32 inreg %m0, <8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, <2 x float> %pos) #0 {
main_body:
  %inst23 = extractelement <2 x float> %pos, i32 0
  %inst24 = extractelement <2 x float> %pos, i32 1
  %inst25 = tail call float @llvm.amdgcn.interp.p1(float %inst23, i32 0, i32 0, i32 %m0)
  %inst26 = tail call float @llvm.amdgcn.interp.p2(float %inst25, float %inst24, i32 0, i32 0, i32 %m0)
  %inst28 = tail call float @llvm.amdgcn.interp.p1(float %inst23, i32 1, i32 0, i32 %m0)
  %inst29 = tail call float @llvm.amdgcn.interp.p2(float %inst28, float %inst24, i32 1, i32 0, i32 %m0)
  %tex = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %inst26, float %inst29, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0)
  ret <4 x float> %tex
}

; The wave32 s_and_b32 pattern below previously had the '+' quantifier inside
; the character class ([0-9+]), which matched a single character only.
; GCN-LABEL: {{^}}test_wqm2:
; GFX1032: s_wqm_b32 exec_lo, exec_lo
; GFX1032: s_and_b32 exec_lo, exec_lo, s{{[0-9]+}}
; GFX1064: s_wqm_b64 exec, exec{{$}}
; GFX1064: s_and_b64 exec, exec, s[{{[0-9:]+}}]
define amdgpu_ps float @test_wqm2(i32 inreg %idx0, i32 inreg %idx1) #0 {
main_body:
  %src0 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i1 0, i1 0)
  %src1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i1 0, i1 0)
  %out = fadd float %src0, %src1
  %out.0 = bitcast float %out to i32
  %out.1 = call i32 @llvm.amdgcn.wqm.i32(i32 %out.0)
  %out.2 = bitcast i32 %out.1 to float
  ret float %out.2
}

; GCN-LABEL: {{^}}test_intr_fcmp_i64:
; GFX1032-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], 0{{$}}
; GFX1032-DAG: v_cmp_eq_f32_e64 s[[C_LO:[0-9]+]], {{s[0-9]+}}, |{{[vs][0-9]+}}|
; GFX1032-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[C_LO]]
; GFX1064: v_cmp_eq_f32_e64 s{{\[}}[[C_LO:[0-9]+]]:[[C_HI:[0-9]+]]], {{s[0-9]+}}, |{{[vs][0-9]+}}|
; GFX1064-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[C_LO]]
; GFX1064-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[C_HI]]
; GCN: store_dwordx2 v[{{[0-9:]+}}], v{{\[}}[[V_LO]]:[[V_HI]]],
define amdgpu_kernel void @test_intr_fcmp_i64(i64 addrspace(1)* %out, float %src, float %a) {
  %temp = call float @llvm.fabs.f32(float %a)
  %result = call i64 @llvm.amdgcn.fcmp.i64.f32(float %src, float %temp, i32 1)
  store i64 %result, i64 addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}test_intr_icmp_i64:
; GFX1032-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], 0{{$}}
; GFX1032-DAG: v_cmp_eq_u32_e64 [[C_LO:vcc_lo|s[0-9]+]], 0x64, {{s[0-9]+}}
; GFX1032-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], [[C_LO]]
; GFX1064: v_cmp_eq_u32_e64 s{{\[}}[[C_LO:[0-9]+]]:[[C_HI:[0-9]+]]], 0x64, {{s[0-9]+}}
; GFX1064-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[C_LO]]
; GFX1064-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[C_HI]]
; GCN: store_dwordx2 v[{{[0-9:]+}}], v{{\[}}[[V_LO]]:[[V_HI]]],
define amdgpu_kernel void @test_intr_icmp_i64(i64 addrspace(1)* %out, i32 %src) {
  %result = call i64 @llvm.amdgcn.icmp.i64.i32(i32 %src, i32 100, i32 32)
  store i64 %result, i64 addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}test_intr_fcmp_i32:
; GFX1032-DAG: v_cmp_eq_f32_e64 s[[C_LO:[0-9]+]], {{s[0-9]+}}, |{{[vs][0-9]+}}|
; GFX1032-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[C_LO]]
; GFX1064: v_cmp_eq_f32_e64 s{{\[}}[[C_LO:[0-9]+]]:[[C_HI:[0-9]+]]], {{s[0-9]+}}, |{{[vs][0-9]+}}|
; GFX1064-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[C_LO]]
; GCN: store_dword v[{{[0-9:]+}}], v[[V_LO]],
define amdgpu_kernel void @test_intr_fcmp_i32(i32 addrspace(1)* %out, float %src, float %a) {
  %temp = call float @llvm.fabs.f32(float %a)
  %result = call i32 @llvm.amdgcn.fcmp.i32.f32(float %src, float %temp, i32 1)
  store i32 %result, i32 addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}test_intr_icmp_i32:
; GFX1032-DAG: v_cmp_eq_u32_e64 s[[C_LO:[0-9]+]], 0x64, {{s[0-9]+}}
; GFX1032-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[C_LO]]{{$}}
; GFX1064: v_cmp_eq_u32_e64 s{{\[}}[[C_LO:[0-9]+]]:{{[0-9]+}}], 0x64, {{s[0-9]+}}
; GFX1064-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[C_LO]]{{$}}
; GCN: store_dword v[{{[0-9:]+}}], v[[V_LO]],
define amdgpu_kernel void @test_intr_icmp_i32(i32 addrspace(1)* %out, i32 %src) {
  %result = call i32 @llvm.amdgcn.icmp.i32.i32(i32 %src, i32 100, i32 32)
  store i32 %result, i32 addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}test_wqm_vote:
; GFX1032: v_cmp_neq_f32_e32
vcc_lo, 0 813; GFX1032: s_wqm_b32 [[WQM:s[0-9]+]], vcc_lo 814; GFX1032: s_and_b32 exec_lo, exec_lo, [[WQM]] 815; GFX1064: v_cmp_neq_f32_e32 vcc, 0 816; GFX1064: s_wqm_b64 [[WQM:s\[[0-9:]+\]]], vcc{{$}} 817; GFX1064: s_and_b64 exec, exec, [[WQM]] 818define amdgpu_ps void @test_wqm_vote(float %a) { 819 %c1 = fcmp une float %a, 0.0 820 %c2 = call i1 @llvm.amdgcn.wqm.vote(i1 %c1) 821 call void @llvm.amdgcn.kill(i1 %c2) 822 ret void 823} 824 825; GCN-LABEL: {{^}}test_branch_true: 826; GFX1032: s_and_b32 vcc_lo, exec_lo, -1 827; GFX1064: s_and_b64 vcc, exec, -1 828define amdgpu_kernel void @test_branch_true() #2 { 829entry: 830 br i1 true, label %for.end, label %for.body.lr.ph 831 832for.body.lr.ph: ; preds = %entry 833 br label %for.body 834 835for.body: ; preds = %for.body, %for.body.lr.ph 836 br i1 undef, label %for.end, label %for.body 837 838for.end: ; preds = %for.body, %entry 839 ret void 840} 841 842; GCN-LABEL: {{^}}test_ps_live: 843; GFX1032: s_mov_b32 [[C:s[0-9]+]], exec_lo 844; GFX1064: s_mov_b64 [[C:s\[[0-9:]+\]]], exec{{$}} 845; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, [[C]] 846define amdgpu_ps float @test_ps_live() #0 { 847 %live = call i1 @llvm.amdgcn.ps.live() 848 %live.32 = zext i1 %live to i32 849 %r = bitcast i32 %live.32 to float 850 ret float %r 851} 852 853; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle64: 854; GFX1032: v_cmp_neq_f64_e64 [[C:s[0-9]+]], s[{{[0-9:]+}}], 1.0 855; GFX1032: s_and_b32 vcc_lo, exec_lo, [[C]] 856; GFX1064: v_cmp_neq_f64_e64 [[C:s\[[0-9:]+\]]], s[{{[0-9:]+}}], 1.0 857; GFX1064: s_and_b64 vcc, exec, [[C]] 858define amdgpu_kernel void @test_vccnz_ifcvt_triangle64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { 859entry: 860 %v = load double, double addrspace(1)* %in 861 %cc = fcmp oeq double %v, 1.000000e+00 862 br i1 %cc, label %if, label %endif 863 864if: 865 %u = fadd double %v, %v 866 br label %endif 867 868endif: 869 %r = phi double [ %v, %entry ], [ %u, %if ] 870 store double %r, double addrspace(1)* %out 871 ret 
void 872} 873 874; GCN-LABEL: {{^}}test_init_exec: 875; GFX1032: s_mov_b32 exec_lo, 0x12345 876; GFX1064: s_mov_b64 exec, 0x12345 877; GCN: v_add_f32_e32 v0, 878define amdgpu_ps float @test_init_exec(float %a, float %b) { 879main_body: 880 %s = fadd float %a, %b 881 call void @llvm.amdgcn.init.exec(i64 74565) 882 ret float %s 883} 884 885; GCN-LABEL: {{^}}test_init_exec_from_input: 886; GCN: s_bfe_u32 s0, s3, 0x70008 887; GFX1032: s_bfm_b32 exec_lo, s0, 0 888; GFX1032: s_cmp_eq_u32 s0, 32 889; GFX1032: s_cmov_b32 exec_lo, -1 890; GFX1064: s_bfm_b64 exec, s0, 0 891; GFX1064: s_cmp_eq_u32 s0, 64 892; GFX1064: s_cmov_b64 exec, -1 893; GCN: v_add_f32_e32 v0, 894define amdgpu_ps float @test_init_exec_from_input(i32 inreg, i32 inreg, i32 inreg, i32 inreg %count, float %a, float %b) { 895main_body: 896 %s = fadd float %a, %b 897 call void @llvm.amdgcn.init.exec.from.input(i32 %count, i32 8) 898 ret float %s 899} 900 901; GCN-LABEL: {{^}}test_vgprblocks_w32_attr: 902; Test that the wave size can be overridden in function attributes and that the block size is correct as a result 903; GFX10DEFWAVE: ; VGPRBlocks: 1 904define amdgpu_gs float @test_vgprblocks_w32_attr(float %a, float %b, float %c, float %d, float %e, 905 float %f, float %g, float %h, float %i, float %j, float %k, float %l) #3 { 906main_body: 907 %s = fadd float %a, %b 908 %s.1 = fadd float %s, %c 909 %s.2 = fadd float %s.1, %d 910 %s.3 = fadd float %s.2, %e 911 %s.4 = fadd float %s.3, %f 912 %s.5 = fadd float %s.4, %g 913 %s.6 = fadd float %s.5, %h 914 %s.7 = fadd float %s.6, %i 915 %s.8 = fadd float %s.7, %j 916 %s.9 = fadd float %s.8, %k 917 %s.10 = fadd float %s.9, %l 918 ret float %s.10 919} 920 921; GCN-LABEL: {{^}}test_vgprblocks_w64_attr: 922; Test that the wave size can be overridden in function attributes and that the block size is correct as a result 923; GFX10DEFWAVE: ; VGPRBlocks: 2 924define amdgpu_gs float @test_vgprblocks_w64_attr(float %a, float %b, float %c, float %d, float %e, 925 float %f, 
float %g, float %h, float %i, float %j, float %k, float %l) #4 { 926main_body: 927 %s = fadd float %a, %b 928 %s.1 = fadd float %s, %c 929 %s.2 = fadd float %s.1, %d 930 %s.3 = fadd float %s.2, %e 931 %s.4 = fadd float %s.3, %f 932 %s.5 = fadd float %s.4, %g 933 %s.6 = fadd float %s.5, %h 934 %s.7 = fadd float %s.6, %i 935 %s.8 = fadd float %s.7, %j 936 %s.9 = fadd float %s.8, %k 937 %s.10 = fadd float %s.9, %l 938 ret float %s.10 939} 940 941; GCN-LABEL: {{^}}icmp64: 942; GFX1032: v_cmp_eq_u32_e32 vcc_lo, 0, v 943; GFX1064: v_cmp_eq_u32_e32 vcc, 0, v 944define amdgpu_kernel void @icmp64(i32 %n, i32 %s) { 945entry: 946 %id = tail call i32 @llvm.amdgcn.workitem.id.x() 947 %mul4 = mul nsw i32 %s, %n 948 %cmp = icmp slt i32 0, %mul4 949 br label %if.end 950 951if.end: ; preds = %entry 952 %rem = urem i32 %id, %s 953 %icmp = tail call i64 @llvm.amdgcn.icmp.i64.i32(i32 %rem, i32 0, i32 32) 954 %shr = lshr i64 %icmp, 1 955 %notmask = shl nsw i64 -1, 0 956 %and = and i64 %notmask, %shr 957 %or = or i64 %and, -9223372036854775808 958 %cttz = tail call i64 @llvm.cttz.i64(i64 %or, i1 true) 959 %cast = trunc i64 %cttz to i32 960 %cmp3 = icmp ugt i32 10, %cast 961 %cmp6 = icmp ne i32 %rem, 0 962 %brmerge = or i1 %cmp6, %cmp3 963 br i1 %brmerge, label %if.end2, label %if.then 964 965if.then: ; preds = %if.end 966 unreachable 967 968if.end2: ; preds = %if.end 969 ret void 970} 971 972; GCN-LABEL: {{^}}fcmp64: 973; GFX1032: v_cmp_eq_f32_e32 vcc_lo, 0, v 974; GFX1064: v_cmp_eq_f32_e32 vcc, 0, v 975define amdgpu_kernel void @fcmp64(float %n, float %s) { 976entry: 977 %id = tail call i32 @llvm.amdgcn.workitem.id.x() 978 %id.f = uitofp i32 %id to float 979 %mul4 = fmul float %s, %n 980 %cmp = fcmp ult float 0.0, %mul4 981 br label %if.end 982 983if.end: ; preds = %entry 984 %rem.f = frem float %id.f, %s 985 %fcmp = tail call i64 @llvm.amdgcn.fcmp.i64.f32(float %rem.f, float 0.0, i32 1) 986 %shr = lshr i64 %fcmp, 1 987 %notmask = shl nsw i64 -1, 0 988 %and = and i64 %notmask, %shr 989 
%or = or i64 %and, -9223372036854775808 990 %cttz = tail call i64 @llvm.cttz.i64(i64 %or, i1 true) 991 %cast = trunc i64 %cttz to i32 992 %cmp3 = icmp ugt i32 10, %cast 993 %cmp6 = fcmp one float %rem.f, 0.0 994 %brmerge = or i1 %cmp6, %cmp3 995 br i1 %brmerge, label %if.end2, label %if.then 996 997if.then: ; preds = %if.end 998 unreachable 999 1000if.end2: ; preds = %if.end 1001 ret void 1002} 1003 1004; GCN-LABEL: {{^}}icmp32: 1005; GFX1032: v_cmp_eq_u32_e32 vcc_lo, 0, v 1006; GFX1064: v_cmp_eq_u32_e32 vcc, 0, v 1007define amdgpu_kernel void @icmp32(i32 %n, i32 %s) { 1008entry: 1009 %id = tail call i32 @llvm.amdgcn.workitem.id.x() 1010 %mul4 = mul nsw i32 %s, %n 1011 %cmp = icmp slt i32 0, %mul4 1012 br label %if.end 1013 1014if.end: ; preds = %entry 1015 %rem = urem i32 %id, %s 1016 %icmp = tail call i32 @llvm.amdgcn.icmp.i32.i32(i32 %rem, i32 0, i32 32) 1017 %shr = lshr i32 %icmp, 1 1018 %notmask = shl nsw i32 -1, 0 1019 %and = and i32 %notmask, %shr 1020 %or = or i32 %and, 2147483648 1021 %cttz = tail call i32 @llvm.cttz.i32(i32 %or, i1 true) 1022 %cmp3 = icmp ugt i32 10, %cttz 1023 %cmp6 = icmp ne i32 %rem, 0 1024 %brmerge = or i1 %cmp6, %cmp3 1025 br i1 %brmerge, label %if.end2, label %if.then 1026 1027if.then: ; preds = %if.end 1028 unreachable 1029 1030if.end2: ; preds = %if.end 1031 ret void 1032} 1033 1034; GCN-LABEL: {{^}}fcmp32: 1035; GFX1032: v_cmp_eq_f32_e32 vcc_lo, 0, v 1036; GFX1064: v_cmp_eq_f32_e32 vcc, 0, v 1037define amdgpu_kernel void @fcmp32(float %n, float %s) { 1038entry: 1039 %id = tail call i32 @llvm.amdgcn.workitem.id.x() 1040 %id.f = uitofp i32 %id to float 1041 %mul4 = fmul float %s, %n 1042 %cmp = fcmp ult float 0.0, %mul4 1043 br label %if.end 1044 1045if.end: ; preds = %entry 1046 %rem.f = frem float %id.f, %s 1047 %fcmp = tail call i32 @llvm.amdgcn.fcmp.i32.f32(float %rem.f, float 0.0, i32 1) 1048 %shr = lshr i32 %fcmp, 1 1049 %notmask = shl nsw i32 -1, 0 1050 %and = and i32 %notmask, %shr 1051 %or = or i32 %and, 2147483648 1052 
%cttz = tail call i32 @llvm.cttz.i32(i32 %or, i1 true) 1053 %cmp3 = icmp ugt i32 10, %cttz 1054 %cmp6 = fcmp one float %rem.f, 0.0 1055 %brmerge = or i1 %cmp6, %cmp3 1056 br i1 %brmerge, label %if.end2, label %if.then 1057 1058if.then: ; preds = %if.end 1059 unreachable 1060 1061if.end2: ; preds = %if.end 1062 ret void 1063} 1064 1065declare void @external_void_func_void() #1 1066 1067; Test save/restore of VGPR needed for SGPR spilling. 1068 1069; GCN-LABEL: {{^}}callee_no_stack_with_call: 1070; GCN: s_waitcnt 1071; GCN-NEXT: s_waitcnt_vscnt 1072 1073; GFX1064-NEXT: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}} 1074; GFX1032-NEXT: s_or_saveexec_b32 [[COPY_EXEC0:s[0-9]]], -1{{$}} 1075; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill 1076; GCN-NEXT: v_nop 1077; GFX1064-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] 1078; GFX1032-NEXT: s_mov_b32 exec_lo, [[COPY_EXEC0]] 1079 1080; GCN-NEXT: v_writelane_b32 v32, s34, 2 1081; GCN: s_mov_b32 s34, s32 1082; GFX1064: s_add_u32 s32, s32, 0x400 1083; GFX1032: s_add_u32 s32, s32, 0x200 1084 1085 1086; GCN-DAG: v_writelane_b32 v32, s30, 0 1087; GCN-DAG: v_writelane_b32 v32, s31, 1 1088; GCN: s_swappc_b64 1089; GCN-DAG: v_readlane_b32 s4, v32, 0 1090; GCN-DAG: v_readlane_b32 s5, v32, 1 1091 1092 1093; GFX1064: s_sub_u32 s32, s32, 0x400 1094; GFX1032: s_sub_u32 s32, s32, 0x200 1095; GCN: v_readlane_b32 s34, v32, 2 1096; GFX1064: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} 1097; GFX1032: s_or_saveexec_b32 [[COPY_EXEC1:s[0-9]]], -1{{$}} 1098; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload 1099; GCN-NEXT: v_nop 1100; GFX1064-NEXT: s_mov_b64 exec, [[COPY_EXEC1]] 1101; GFX1032-NEXT: s_mov_b32 exec_lo, [[COPY_EXEC1]] 1102; GCN-NEXT: s_waitcnt vmcnt(0) 1103; GCN-NEXT: s_setpc_b64 1104define void @callee_no_stack_with_call() #1 { 1105 call void @external_void_func_void() 1106 ret void 1107} 1108 1109 1110declare i32 @llvm.amdgcn.workitem.id.x() 1111declare float 
@llvm.fabs.f32(float) 1112declare { float, i1 } @llvm.amdgcn.div.scale.f32(float, float, i1) 1113declare { double, i1 } @llvm.amdgcn.div.scale.f64(double, double, i1) 1114declare float @llvm.amdgcn.div.fmas.f32(float, float, float, i1) 1115declare double @llvm.amdgcn.div.fmas.f64(double, double, double, i1) 1116declare i1 @llvm.amdgcn.class.f32(float, i32) 1117declare i32 @llvm.amdgcn.set.inactive.i32(i32, i32) 1118declare i64 @llvm.amdgcn.set.inactive.i64(i64, i64) 1119declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) 1120declare <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) 1121declare float @llvm.amdgcn.wwm.f32(float) 1122declare i32 @llvm.amdgcn.wqm.i32(i32) 1123declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) 1124declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) 1125declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1) 1126declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) 1127declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) 1128declare i64 @llvm.amdgcn.fcmp.i64.f32(float, float, i32) 1129declare i64 @llvm.amdgcn.icmp.i64.i32(i32, i32, i32) 1130declare i32 @llvm.amdgcn.fcmp.i32.f32(float, float, i32) 1131declare i32 @llvm.amdgcn.icmp.i32.i32(i32, i32, i32) 1132declare void @llvm.amdgcn.kill(i1) 1133declare i1 @llvm.amdgcn.wqm.vote(i1) 1134declare i1 @llvm.amdgcn.ps.live() 1135declare void @llvm.amdgcn.init.exec(i64) 1136declare void @llvm.amdgcn.init.exec.from.input(i32, i32) 1137declare i64 @llvm.cttz.i64(i64, i1) 1138declare i32 @llvm.cttz.i32(i32, i1) 1139 1140attributes #0 = { nounwind readnone speculatable } 1141attributes #1 = { nounwind } 1142attributes #2 = { nounwind readnone optnone noinline } 1143attributes #3 = { "target-features"="+wavefrontsize32" } 1144attributes #4 = { "target-features"="+wavefrontsize64" } 1145