; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK %s

; Check that WQM isn't triggered by image load/store intrinsics.
;
;CHECK-LABEL: {{^}}test1:
;CHECK-NOT: s_wqm
define amdgpu_ps <4 x float> @test1(<8 x i32> inreg %rsrc, i32 %c) {
main_body:
  %tex = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 15, i32 %c, <8 x i32> %rsrc, i32 0, i32 0)
  call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> %tex, i32 15, i32 %c, <8 x i32> %rsrc, i32 0, i32 0)
  ret <4 x float> %tex
}

; Check that WQM is triggered by code calculating inputs to image samples and is disabled as soon as possible
;
;CHECK-LABEL: {{^}}test2:
;CHECK-NEXT: ; %main_body
;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
;CHECK-NEXT: s_wqm_b64 exec, exec
;CHECK: interp
;CHECK: s_and_b64 exec, exec, [[ORIG]]
;CHECK-NOT: interp
;CHECK: image_sample
;CHECK-NOT: exec
;CHECK: .size test2
define amdgpu_ps <4 x float> @test2(i32 inreg, i32 inreg, i32 inreg, i32 inreg %m0, <8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, <2 x float> %pos) #6 {
main_body:
  %inst23 = extractelement <2 x float> %pos, i32 0
  %inst24 = extractelement <2 x float> %pos, i32 1
  %inst25 = tail call float @llvm.amdgcn.interp.p1(float %inst23, i32 0, i32 0, i32 %m0)
  %inst26 = tail call float @llvm.amdgcn.interp.p2(float %inst25, float %inst24, i32 0, i32 0, i32 %m0)
  %inst28 = tail call float @llvm.amdgcn.interp.p1(float %inst23, i32 1, i32 0, i32 %m0)
  %inst29 = tail call float @llvm.amdgcn.interp.p2(float %inst28, float %inst24, i32 1, i32 0, i32 %m0)
  %tex = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %inst26, float %inst29, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
  ret <4 x float> %tex
}

; ... but disabled for stores (and, in this simple case, not re-enabled) ...
;
;CHECK-LABEL: {{^}}test3:
;CHECK-NEXT: ; %main_body
;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
;CHECK-NEXT: s_wqm_b64 exec, exec
;CHECK: s_and_b64 exec, exec, [[ORIG]]
;CHECK: image_sample
;CHECK: store
;CHECK-NOT: exec
;CHECK: .size test3
define amdgpu_ps <4 x float> @test3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float %c) {
main_body:
  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
  %tex.1 = bitcast <4 x float> %tex to <4 x i32>
  %tex.2 = extractelement <4 x i32> %tex.1, i32 0

  call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %tex, <4 x i32> undef, i32 %tex.2, i32 0, i32 0, i32 0)

  ret <4 x float> %tex
}
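
; Note: throughout this file, entering WQM is matched as "s_mov_b64 <saved>, exec"
; (save the live mask) followed by "s_wqm_b64 exec, exec", and the return to
; exact mode as "s_and_b64 exec, exec, <saved>". WQM enables the helper lanes of
; each pixel quad so that derivatives for image sampling are well-defined; those
; helper lanes must be masked off again around externally visible side effects
; such as buffer stores and exports.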

; ... and disabled for export.
;
;CHECK-LABEL: {{^}}test3x:
;CHECK-NEXT: ; %main_body
;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
;CHECK-NEXT: s_wqm_b64 exec, exec
;CHECK: s_and_b64 exec, exec, [[ORIG]]
;CHECK: image_sample
;CHECK: exp
;CHECK-NOT: exec
;CHECK: .size test3x
define amdgpu_ps void @test3x(i32 inreg, i32 inreg, i32 inreg, i32 inreg %m0, <8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, <2 x float> %pos) #6 {
main_body:
  %inst23 = extractelement <2 x float> %pos, i32 0
  %inst24 = extractelement <2 x float> %pos, i32 1
  %inst25 = tail call float @llvm.amdgcn.interp.p1(float %inst23, i32 0, i32 0, i32 %m0)
  %inst26 = tail call float @llvm.amdgcn.interp.p2(float %inst25, float %inst24, i32 0, i32 0, i32 %m0)
  %inst28 = tail call float @llvm.amdgcn.interp.p1(float %inst23, i32 1, i32 0, i32 %m0)
  %inst29 = tail call float @llvm.amdgcn.interp.p2(float %inst28, float %inst24, i32 1, i32 0, i32 %m0)
  %tex = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %inst26, float %inst29, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
  %tex.0 = extractelement <4 x float> %tex, i32 0
  %tex.1 = extractelement <4 x float> %tex, i32 1
  %tex.2 = extractelement <4 x float> %tex, i32 2
  %tex.3 = extractelement <4 x float> %tex, i32 3
  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tex.0, float %tex.1, float %tex.2, float %tex.3, i1 true, i1 true)
  ret void
}

; Check that WQM is re-enabled when required.
;
;CHECK-LABEL: {{^}}test4:
;CHECK-NEXT: ; %main_body
;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
;CHECK-NEXT: s_wqm_b64 exec, exec
;CHECK: v_mul_lo_u32 [[MUL:v[0-9]+]], v0, v1
;CHECK: image_sample
;CHECK: s_and_b64 exec, exec, [[ORIG]]
;CHECK: image_sample
;CHECK: store
define amdgpu_ps <4 x float> @test4(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %c, i32 %d, float %data) {
main_body:
  %c.1 = mul i32 %c, %d

  call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> undef, <4 x i32> undef, i32 %c.1, i32 0, i32 0, i32 0)
  %c.1.bc = bitcast i32 %c.1 to float
  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.1.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
  %tex0 = extractelement <4 x float> %tex, i32 0
  %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
  ret <4 x float> %dtex
}

; Check that WQM is triggered by the wqm intrinsic.
;
;CHECK-LABEL: {{^}}test5:
;CHECK: s_wqm_b64 exec, exec
;CHECK: buffer_load_dword
;CHECK: buffer_load_dword
;CHECK: v_add_f32_e32
; WQM was inserting an unnecessary v_mov to self after the v_add. Make sure this
; does not happen - the v_add should write the return reg directly.
;CHECK-NOT: v_mov_b32_e32
define amdgpu_ps float @test5(i32 inreg %idx0, i32 inreg %idx1) {
main_body:
  %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
  %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
  %out = fadd float %src0, %src1
  %out.0 = call float @llvm.amdgcn.wqm.f32(float %out)
  ret float %out.0
}
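
; Note: llvm.amdgcn.wqm returns its argument unchanged; its only effect is to
; mark the computation that feeds it as needing to execute in whole quad mode.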

; Check that the wqm intrinsic works correctly for integers.
;
;CHECK-LABEL: {{^}}test6:
;CHECK: s_wqm_b64 exec, exec
;CHECK: buffer_load_dword
;CHECK: buffer_load_dword
;CHECK: v_add_f32_e32
define amdgpu_ps float @test6(i32 inreg %idx0, i32 inreg %idx1) {
main_body:
  %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
  %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
  %out = fadd float %src0, %src1
  %out.0 = bitcast float %out to i32
  %out.1 = call i32 @llvm.amdgcn.wqm.i32(i32 %out.0)
  %out.2 = bitcast i32 %out.1 to float
  ret float %out.2
}

; Check that WWM is triggered by the wwm intrinsic.
;
;CHECK-LABEL: {{^}}test_wwm1:
;CHECK: s_or_saveexec_b64 s{{\[[0-9]+:[0-9]+\]}}, -1
;CHECK: buffer_load_dword
;CHECK: buffer_load_dword
;CHECK: v_add_f32_e32
define amdgpu_ps float @test_wwm1(i32 inreg %idx0, i32 inreg %idx1) {
main_body:
  %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
  %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
  %out = fadd float %src0, %src1
  %out.0 = call float @llvm.amdgcn.wwm.f32(float %out)
  ret float %out.0
}

; Same as above, but with an integer type.
;
;CHECK-LABEL: {{^}}test_wwm2:
;CHECK: s_or_saveexec_b64 s{{\[[0-9]+:[0-9]+\]}}, -1
;CHECK: buffer_load_dword
;CHECK: buffer_load_dword
;CHECK: v_add_{{[iu]}}32_e32
define amdgpu_ps float @test_wwm2(i32 inreg %idx0, i32 inreg %idx1) {
main_body:
  %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
  %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
  %src0.0 = bitcast float %src0 to i32
  %src1.0 = bitcast float %src1 to i32
  %out = add i32 %src0.0, %src1.0
  %out.0 = call i32 @llvm.amdgcn.wwm.i32(i32 %out)
  %out.1 = bitcast i32 %out.0 to float
  ret float %out.1
}

; Check that we don't leave WWM on for computations that don't require WWM,
; since that will lead to clobbering things that aren't supposed to be clobbered
; in cases like this.
;
;CHECK-LABEL: {{^}}test_wwm3:
;CHECK: s_or_saveexec_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], -1
;CHECK: buffer_load_dword
;CHECK: v_add_f32_e32
;CHECK: s_mov_b64 exec, [[ORIG]]
;CHECK: v_add_f32_e32
define amdgpu_ps float @test_wwm3(i32 inreg %idx) {
main_body:
  ; use mbcnt to make sure the branch is divergent
  %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
  %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
  %cc = icmp uge i32 %hi, 32
  br i1 %cc, label %endif, label %if

if:
  %src = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0)
  %out = fadd float %src, %src
  %out.0 = call float @llvm.amdgcn.wwm.f32(float %out)
  %out.1 = fadd float %src, %out.0
  br label %endif

endif:
  %out.2 = phi float [ %out.1, %if ], [ 0.0, %main_body ]
  ret float %out.2
}
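
; Note: WWM entry is matched as "s_or_saveexec_b64 <saved>, -1", which copies
; exec to <saved> and then enables every lane of the wave; "s_mov_b64 exec,
; <saved>" restores the previous mask and leaves WWM.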

; Check that WWM writes aren't coalesced with non-WWM writes, since the WWM
; write could clobber disabled channels in the non-WWM one.
;
;CHECK-LABEL: {{^}}test_wwm4:
;CHECK: s_or_saveexec_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], -1
;CHECK: buffer_load_dword
;CHECK: v_add_f32_e32
;CHECK: s_mov_b64 exec, [[ORIG]]
;CHECK-NEXT: v_mov_b32_e32
define amdgpu_ps float @test_wwm4(i32 inreg %idx) {
main_body:
  ; use mbcnt to make sure the branch is divergent
  %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
  %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
  %cc = icmp uge i32 %hi, 32
  br i1 %cc, label %endif, label %if

if:
  %src = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0)
  %out = fadd float %src, %src
  %out.0 = call float @llvm.amdgcn.wwm.f32(float %out)
  br label %endif

endif:
  %out.1 = phi float [ %out.0, %if ], [ 0.0, %main_body ]
  ret float %out.1
}

; Make sure the transition from Exact to WWM then WQM works properly.
;
;CHECK-LABEL: {{^}}test_wwm5:
;CHECK: buffer_load_dword
;CHECK: buffer_store_dword
;CHECK: s_or_saveexec_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], -1
;CHECK: buffer_load_dword
;CHECK: v_add_f32_e32
;CHECK: s_mov_b64 exec, [[ORIG]]
;CHECK: s_wqm_b64 exec, exec
define amdgpu_ps float @test_wwm5(i32 inreg %idx0, i32 inreg %idx1) {
main_body:
  %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
  call void @llvm.amdgcn.struct.buffer.store.f32(float %src0, <4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
  %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
  %temp = fadd float %src1, %src1
  %temp.0 = call float @llvm.amdgcn.wwm.f32(float %temp)
  %out = fadd float %temp.0, %temp.0
  %out.0 = call float @llvm.amdgcn.wqm.f32(float %out)
  ret float %out.0
}

; Check that WWM is turned on correctly across basic block boundaries.
; if..then..endif version
;
;CHECK-LABEL: {{^}}test_wwm6_then:
;CHECK: s_or_saveexec_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], -1
;SI-CHECK: buffer_load_dword
;VI-CHECK: flat_load_dword
;CHECK: s_mov_b64 exec, [[ORIG]]
;CHECK: %if
;CHECK: s_or_saveexec_b64 [[ORIG2:s\[[0-9]+:[0-9]+\]]], -1
;SI-CHECK: buffer_load_dword
;VI-CHECK: flat_load_dword
;CHECK: v_add_f32_e32
;CHECK: s_mov_b64 exec, [[ORIG2]]
define amdgpu_ps float @test_wwm6_then() {
main_body:
  %src0 = load volatile float, float addrspace(1)* undef
  ; use mbcnt to make sure the branch is divergent
  %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
  %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
  %cc = icmp uge i32 %hi, 32
  br i1 %cc, label %endif, label %if

if:
  %src1 = load volatile float, float addrspace(1)* undef
  %out = fadd float %src0, %src1
  %out.0 = call float @llvm.amdgcn.wwm.f32(float %out)
  br label %endif

endif:
  %out.1 = phi float [ %out.0, %if ], [ 0.0, %main_body ]
  ret float %out.1
}
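
; Note: the mbcnt.lo/mbcnt.hi(-1, ...) idiom used by these tests computes the
; wave-relative lane index, so any value derived from it (branch conditions,
; loop counters) is divergent by construction.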

; Check that WWM is turned on correctly across basic block boundaries.
; loop version
;
;CHECK-LABEL: {{^}}test_wwm6_loop:
;CHECK: s_or_saveexec_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], -1
;SI-CHECK: buffer_load_dword
;VI-CHECK: flat_load_dword
;CHECK: s_mov_b64 exec, [[ORIG]]
;CHECK: %loop
;CHECK: s_or_saveexec_b64 [[ORIG2:s\[[0-9]+:[0-9]+\]]], -1
;SI-CHECK: buffer_load_dword
;VI-CHECK: flat_load_dword
;CHECK: s_mov_b64 exec, [[ORIG2]]
define amdgpu_ps float @test_wwm6_loop() {
main_body:
  %src0 = load volatile float, float addrspace(1)* undef
  ; use mbcnt to make sure the branch is divergent
  %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
  %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
  br label %loop

loop:
  %counter = phi i32 [ %lo, %main_body ], [ %counter.1, %loop ]
  %src1 = load volatile float, float addrspace(1)* undef
  %out = fadd float %src0, %src1
  %out.0 = call float @llvm.amdgcn.wwm.f32(float %out)
  %counter.1 = sub i32 %counter, 1
  %cc = icmp ne i32 %counter.1, 0
  br i1 %cc, label %loop, label %endloop

endloop:
  ret float %out.0
}

; Check that @llvm.amdgcn.set.inactive disables WWM.
;
;CHECK-LABEL: {{^}}test_set_inactive1:
;CHECK: buffer_load_dword
;CHECK: s_not_b64 exec, exec
;CHECK: v_mov_b32_e32
;CHECK: s_not_b64 exec, exec
;CHECK: s_or_saveexec_b64 s{{\[[0-9]+:[0-9]+\]}}, -1
;CHECK: v_add_{{[iu]}}32_e32
define amdgpu_ps void @test_set_inactive1(i32 inreg %idx) {
main_body:
  %src = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0)
  %src.0 = bitcast float %src to i32
  %src.1 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %src.0, i32 0)
  %out = add i32 %src.1, %src.1
  %out.0 = call i32 @llvm.amdgcn.wwm.i32(i32 %out)
  %out.1 = bitcast i32 %out.0 to float
  call void @llvm.amdgcn.struct.buffer.store.f32(float %out.1, <4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0)
  ret void
}

; Check that enabling WQM anywhere enables WQM for the set.inactive source.
;
;CHECK-LABEL: {{^}}test_set_inactive2:
;CHECK: s_wqm_b64 exec, exec
;CHECK: buffer_load_dword
;CHECK: buffer_load_dword
define amdgpu_ps void @test_set_inactive2(i32 inreg %idx0, i32 inreg %idx1) {
main_body:
  %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
  %src1.0 = bitcast float %src1 to i32
  %src1.1 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %src1.0, i32 undef)
  %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
  %src0.0 = bitcast float %src0 to i32
  %src0.1 = call i32 @llvm.amdgcn.wqm.i32(i32 %src0.0)
  %out = add i32 %src0.1, %src1.1
  %out.0 = bitcast i32 %out to float
  call void @llvm.amdgcn.struct.buffer.store.f32(float %out.0, <4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
  ret void
}
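
; Note: llvm.amdgcn.set.inactive is lowered to the "s_not_b64 exec, exec;
; v_mov_b32; s_not_b64 exec, exec" sequence matched in test_set_inactive1,
; which writes the requested default value into the currently inactive lanes.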

; Check a case of one branch of an if-else requiring WQM, the other requiring
; exact.
;
; Note: In this particular case, the save-and-restore could be avoided if the
; analysis understood that the two branches of the if-else are mutually
; exclusive.
;
;CHECK-LABEL: {{^}}test_control_flow_0:
;CHECK-NEXT: ; %main_body
;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
;CHECK-NEXT: s_wqm_b64 exec, exec
;CHECK: %ELSE
;CHECK: s_and_saveexec_b64 [[SAVED:s\[[0-9]+:[0-9]+\]]], [[ORIG]]
;CHECK: store
;CHECK: s_mov_b64 exec, [[SAVED]]
;CHECK: %IF
;CHECK: image_sample
;CHECK: image_sample
define amdgpu_ps float @test_control_flow_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) {
main_body:
  %cmp = icmp eq i32 %z, 0
  br i1 %cmp, label %IF, label %ELSE

IF:
  %c.bc = bitcast i32 %c to float
  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
  %tex0 = extractelement <4 x float> %tex, i32 0
  %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
  %data.if = extractelement <4 x float> %dtex, i32 0
  br label %END

ELSE:
  call void @llvm.amdgcn.struct.buffer.store.f32(float %data, <4 x i32> undef, i32 %c, i32 0, i32 0, i32 0)
  br label %END

END:
  %r = phi float [ %data.if, %IF ], [ %data, %ELSE ]
  ret float %r
}

; Reverse branch order compared to the previous test.
;
;CHECK-LABEL: {{^}}test_control_flow_1:
;CHECK-NEXT: ; %main_body
;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
;CHECK-NEXT: s_wqm_b64 exec, exec
;CHECK: %IF
;CHECK: image_sample
;CHECK: image_sample
;CHECK: %Flow
;CHECK-NEXT: s_or_saveexec_b64 [[SAVED:s\[[0-9]+:[0-9]+\]]],
;CHECK-NEXT: s_and_b64 exec, exec, [[ORIG]]
;CHECK-NEXT: s_and_b64 [[SAVED]], exec, [[SAVED]]
;CHECK-NEXT: s_xor_b64 exec, exec, [[SAVED]]
;CHECK-NEXT: s_cbranch_execz [[END_BB:BB[0-9]+_[0-9]+]]
;CHECK-NEXT: ; %bb.{{[0-9]+}}: ; %ELSE
;CHECK: store_dword
;CHECK: [[END_BB]]: ; %END
;CHECK: s_or_b64 exec, exec,
;CHECK: v_mov_b32_e32 v0
;CHECK: ; return
define amdgpu_ps float @test_control_flow_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) {
main_body:
  %cmp = icmp eq i32 %z, 0
  br i1 %cmp, label %ELSE, label %IF

IF:
  %c.bc = bitcast i32 %c to float
  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
  %tex0 = extractelement <4 x float> %tex, i32 0
  %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
  %data.if = extractelement <4 x float> %dtex, i32 0
  br label %END

ELSE:
  call void @llvm.amdgcn.struct.buffer.store.f32(float %data, <4 x i32> undef, i32 %c, i32 0, i32 0, i32 0)
  br label %END

END:
  %r = phi float [ %data.if, %IF ], [ %data, %ELSE ]
  ret float %r
}
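
; Note: the %Flow block matched in test_control_flow_1 is created by the
; control-flow structurizer; its s_or_saveexec/s_xor sequence switches exec
; over to the lanes that still have to execute %ELSE.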

; Check that branch conditions are properly marked as needing WQM...
;
;CHECK-LABEL: {{^}}test_control_flow_2:
;CHECK-NEXT: ; %main_body
;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
;CHECK-NEXT: s_wqm_b64 exec, exec
;CHECK: s_and_b64 exec, exec, [[ORIG]]
;CHECK: store
;CHECK: s_wqm_b64 exec, exec
;CHECK: load
;CHECK: s_and_b64 exec, exec, [[ORIG]]
;CHECK: store
;CHECK: s_wqm_b64 exec, exec
;CHECK: v_cmp
define amdgpu_ps <4 x float> @test_control_flow_2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, <3 x i32> %idx, <2 x float> %data, i32 %coord) {
main_body:
  %idx.1 = extractelement <3 x i32> %idx, i32 0
  %data.1 = extractelement <2 x float> %data, i32 0
  call void @llvm.amdgcn.struct.buffer.store.f32(float %data.1, <4 x i32> undef, i32 %idx.1, i32 0, i32 0, i32 0)

  ; The load that determines the branch (and should therefore be WQM) is
  ; surrounded by stores that require disabled WQM.
  %idx.2 = extractelement <3 x i32> %idx, i32 1
  %z = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx.2, i32 0, i32 0, i32 0)

  %idx.3 = extractelement <3 x i32> %idx, i32 2
  %data.3 = extractelement <2 x float> %data, i32 1
  call void @llvm.amdgcn.struct.buffer.store.f32(float %data.3, <4 x i32> undef, i32 %idx.3, i32 0, i32 0, i32 0)

  %cc = fcmp ogt float %z, 0.0
  br i1 %cc, label %IF, label %ELSE

IF:
  %coord.IF = mul i32 %coord, 3
  br label %END

ELSE:
  %coord.ELSE = mul i32 %coord, 4
  br label %END

END:
  %coord.END = phi i32 [ %coord.IF, %IF ], [ %coord.ELSE, %ELSE ]
  %coord.END.bc = bitcast i32 %coord.END to float
  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord.END.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
  ret <4 x float> %tex
}

; ... but only if they really do need it.
;
;CHECK-LABEL: {{^}}test_control_flow_3:
;CHECK-NEXT: ; %main_body
;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
;CHECK-NEXT: s_wqm_b64 exec, exec
;CHECK: image_sample
;CHECK: s_and_b64 exec, exec, [[ORIG]]
;CHECK: image_sample
;CHECK-DAG: v_cmp
;CHECK-DAG: store
define amdgpu_ps float @test_control_flow_3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %coord) {
main_body:
  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
  %tex0 = extractelement <4 x float> %tex, i32 0
  %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
  %dtex.1 = extractelement <4 x float> %dtex, i32 0
  call void @llvm.amdgcn.struct.buffer.store.f32(float %dtex.1, <4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0)

  %cc = fcmp ogt float %dtex.1, 0.0
  br i1 %cc, label %IF, label %ELSE

IF:
  %tex.IF = fmul float %dtex.1, 3.0
  br label %END

ELSE:
  %tex.ELSE = fmul float %dtex.1, 4.0
  br label %END

END:
  %tex.END = phi float [ %tex.IF, %IF ], [ %tex.ELSE, %ELSE ]
  ret float %tex.END
}

; Another test that failed at some point because of terminator handling.
;
;CHECK-LABEL: {{^}}test_control_flow_4:
;CHECK-NEXT: ; %main_body
;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
;CHECK-NEXT: s_wqm_b64 exec, exec
;CHECK: %IF
;CHECK: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], [[ORIG]]
;CHECK: load
;CHECK: store
;CHECK: s_mov_b64 exec, [[SAVE]]
;CHECK: %END
;CHECK: image_sample
;CHECK: image_sample
define amdgpu_ps <4 x float> @test_control_flow_4(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float %coord, i32 %y, float %z) {
main_body:
  %cond = icmp eq i32 %y, 0
  br i1 %cond, label %IF, label %END

IF:
  %data = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> undef, i32 0, i32 0, i32 0)
  call void @llvm.amdgcn.struct.buffer.store.f32(float %data, <4 x i32> undef, i32 1, i32 0, i32 0, i32 0)
  br label %END

END:
  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
  %tex0 = extractelement <4 x float> %tex, i32 0
  %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
  ret <4 x float> %dtex
}

; Kill is performed in WQM mode so that uniform kill behaves correctly ...
;
;CHECK-LABEL: {{^}}test_kill_0:
;CHECK-NEXT: ; %main_body
;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
;CHECK-NEXT: s_wqm_b64 exec, exec
;CHECK: s_and_b64 exec, exec, [[ORIG]]
;CHECK: image_sample
;CHECK: buffer_store_dword
;CHECK: s_wqm_b64 exec, exec
;CHECK: v_cmpx_
;CHECK: image_sample
;CHECK: s_and_b64 exec, exec, [[ORIG]]
;CHECK: image_sample
;CHECK: buffer_store_dword
define amdgpu_ps <4 x float> @test_kill_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <2 x i32> %idx, <2 x float> %data, float %coord, float %coord2, float %z) {
main_body:
  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
  %idx.0 = extractelement <2 x i32> %idx, i32 0
  %data.0 = extractelement <2 x float> %data, i32 0
  call void @llvm.amdgcn.struct.buffer.store.f32(float %data.0, <4 x i32> undef, i32 %idx.0, i32 0, i32 0, i32 0)

  %z.cmp = fcmp olt float %z, 0.0
  call void @llvm.amdgcn.kill(i1 %z.cmp)

  %idx.1 = extractelement <2 x i32> %idx, i32 1
  %data.1 = extractelement <2 x float> %data, i32 1
  call void @llvm.amdgcn.struct.buffer.store.f32(float %data.1, <4 x i32> undef, i32 %idx.1, i32 0, i32 0, i32 0)
  %tex2 = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord2, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
  %tex2.0 = extractelement <4 x float> %tex2, i32 0
  %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex2.0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
  %out = fadd <4 x float> %tex, %dtex

  ret <4 x float> %out
}
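
; Note: the kill is matched as a v_cmpx_* instruction, which performs the
; comparison and writes the result directly to exec, disabling the killed lanes.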

; ... but only if WQM is necessary.
;
; CHECK-LABEL: {{^}}test_kill_1:
; CHECK-NEXT: ; %main_body
; CHECK: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
; CHECK: s_wqm_b64 exec, exec
; CHECK: image_sample
; CHECK: s_and_b64 exec, exec, [[ORIG]]
; CHECK: image_sample
; CHECK: buffer_store_dword
; CHECK-NOT: wqm
; CHECK: v_cmpx_
define amdgpu_ps <4 x float> @test_kill_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, float %coord, float %coord2, float %z) {
main_body:
  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
  %tex0 = extractelement <4 x float> %tex, i32 0
  %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0

  call void @llvm.amdgcn.raw.buffer.store.f32(float %data, <4 x i32> undef, i32 0, i32 0, i32 0)

  %z.cmp = fcmp olt float %z, 0.0
  call void @llvm.amdgcn.kill(i1 %z.cmp)

  ret <4 x float> %dtex
}

; Check prolog shaders.
;
; CHECK-LABEL: {{^}}test_prolog_1:
; CHECK: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
; CHECK: s_wqm_b64 exec, exec
; CHECK: v_add_f32_e32 v0,
; CHECK: s_and_b64 exec, exec, [[ORIG]]
define amdgpu_ps float @test_prolog_1(float %a, float %b) #5 {
main_body:
  %s = fadd float %a, %b
  ret float %s
}

; CHECK-LABEL: {{^}}test_loop_vcc:
; CHECK-NEXT: ; %entry
; CHECK-NEXT: s_mov_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], exec
; CHECK: s_wqm_b64 exec, exec
; CHECK: s_and_b64 exec, exec, [[LIVE]]
; CHECK: image_store
; CHECK: s_wqm_b64 exec, exec
; CHECK-DAG: v_mov_b32_e32 [[CTR:v[0-9]+]], 0
; CHECK-DAG: s_mov_b32 [[SEVEN:s[0-9]+]], 0x40e00000

; CHECK: [[LOOPHDR:BB[0-9]+_[0-9]+]]: ; %body
; CHECK: v_add_f32_e32 [[CTR]], 2.0, [[CTR]]
; CHECK: [[LOOP:BB[0-9]+_[0-9]+]]: ; %loop
; CHECK: v_cmp_lt_f32_e32 vcc, [[SEVEN]], [[CTR]]
; CHECK: s_cbranch_vccz [[LOOPHDR]]

; CHECK: ; %break
; CHECK: ; return
define amdgpu_ps <4 x float> @test_loop_vcc(<4 x float> %in) nounwind {
entry:
  call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> %in, i32 15, i32 undef, <8 x i32> undef, i32 0, i32 0)
  br label %loop

loop:
  %ctr.iv = phi float [ 0.0, %entry ], [ %ctr.next, %body ]
  %c.iv = phi <4 x float> [ %in, %entry ], [ %c.next, %body ]
  %cc = fcmp ogt float %ctr.iv, 7.0
  br i1 %cc, label %break, label %body

body:
  %c.iv0 = extractelement <4 x float> %c.iv, i32 0
  %c.next = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.iv0, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0
  %ctr.next = fadd float %ctr.iv, 2.0
  br label %loop

break:
  ret <4 x float> %c.iv
}
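
; Note: the volatile stores to the alloca in the next test go to per-lane
; scratch memory, which is not visible to other invocations, so they are
; allowed to execute with helper lanes enabled.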

; Only intrinsic stores need exact execution -- other stores do not have
; externally visible effects and may require WQM for correctness.
;
; CHECK-LABEL: {{^}}test_alloca:
; CHECK: s_mov_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], exec
; CHECK: s_wqm_b64 exec, exec

; CHECK: s_and_b64 exec, exec, [[LIVE]]
; CHECK: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0
; CHECK: s_wqm_b64 exec, exec
; CHECK: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:4{{$}}
; CHECK: s_and_b64 exec, exec, [[LIVE]]
; CHECK: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 idxen
; CHECK: s_wqm_b64 exec, exec
; CHECK: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen

; CHECK: s_and_b64 exec, exec, [[LIVE]]
; CHECK: image_sample
; CHECK: buffer_store_dwordx4
define amdgpu_ps void @test_alloca(float %data, i32 %a, i32 %idx) nounwind {
entry:
  %array = alloca [32 x i32], align 4, addrspace(5)

  call void @llvm.amdgcn.raw.buffer.store.f32(float %data, <4 x i32> undef, i32 0, i32 0, i32 0)

  %s.gep = getelementptr [32 x i32], [32 x i32] addrspace(5)* %array, i32 0, i32 0
  store volatile i32 %a, i32 addrspace(5)* %s.gep, align 4

  call void @llvm.amdgcn.struct.buffer.store.f32(float %data, <4 x i32> undef, i32 1, i32 0, i32 0, i32 0)

  %c.gep = getelementptr [32 x i32], [32 x i32] addrspace(5)* %array, i32 0, i32 %idx
  %c = load i32, i32 addrspace(5)* %c.gep, align 4
  %c.bc = bitcast i32 %c to float
  %t = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0
  call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %t, <4 x i32> undef, i32 0, i32 0, i32 0)

  ret void
}

; Must return to exact at the end of a non-void returning shader,
; otherwise the EXEC mask exported by the epilog will be wrong. This is true
; even if the shader has no kills, because a kill could have happened in a
; previous shader fragment.
;
; CHECK-LABEL: {{^}}test_nonvoid_return:
; CHECK: s_mov_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], exec
; CHECK: s_wqm_b64 exec, exec
;
; CHECK: s_and_b64 exec, exec, [[LIVE]]
; CHECK-NOT: exec
define amdgpu_ps <4 x float> @test_nonvoid_return() nounwind {
  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float undef, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0
  %tex0 = extractelement <4 x float> %tex, i32 0
  %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0
  ret <4 x float> %dtex
}

; CHECK-LABEL: {{^}}test_nonvoid_return_unreachable:
; CHECK: s_mov_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], exec
; CHECK: s_wqm_b64 exec, exec
;
; CHECK: s_and_b64 exec, exec, [[LIVE]]
; CHECK-NOT: exec
define amdgpu_ps <4 x float> @test_nonvoid_return_unreachable(i32 inreg %c) nounwind {
entry:
  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float undef, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0
  %tex0 = extractelement <4 x float> %tex, i32 0
  %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0
  %cc = icmp sgt i32 %c, 0
  br i1 %cc, label %if, label %else

if:
  store volatile <4 x float> %dtex, <4 x float> addrspace(1)* undef
  unreachable

else:
  ret <4 x float> %dtex
}
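
; Note: s_wqm_b64 is a scalar ALU instruction that also writes SCC, so the
; exec-mask manipulation must not be inserted between s_cmp and the
; s_cbranch_scc that consumes its result.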

; Test awareness that s_wqm_b64 clobbers SCC.
;
; CHECK-LABEL: {{^}}test_scc:
; CHECK: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
; CHECK: s_wqm_b64 exec, exec
; CHECK: s_cmp_
; CHECK-NEXT: s_cbranch_scc
; CHECK: ; %else
; CHECK: image_sample
; CHECK: ; %if
; CHECK: image_sample
; CHECK: ; %end
; CHECK: s_and_b64 exec, exec, [[ORIG]]
define amdgpu_ps <4 x float> @test_scc(i32 inreg %sel, i32 %idx) #1 {
main_body:
  %cc = icmp sgt i32 %sel, 0
  br i1 %cc, label %if, label %else

if:
  %r.if = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float 0.0, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0
  br label %end

else:
  %r.else = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float 0.0, float bitcast (i32 1 to float), <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0
  br label %end

end:
  %r = phi <4 x float> [ %r.if, %if ], [ %r.else, %else ]
  call void @llvm.amdgcn.struct.buffer.store.f32(float 1.0, <4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0)
  ret <4 x float> %r
}

; Check a case of a block being entirely WQM except for a bit of WWM.
; There was a bug where the pass forgot to enter and leave WWM.
;
;CHECK-LABEL: {{^}}test_wwm_within_wqm:
;CHECK: %IF
;CHECK: s_or_saveexec_b64 {{.*}}, -1
;CHECK: ds_swizzle
;
define amdgpu_ps float @test_wwm_within_wqm(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) {
main_body:
  %c.bc = bitcast i32 %c to float
  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
  %tex0 = extractelement <4 x float> %tex, i32 0
  %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
  %cmp = icmp eq i32 %z, 0
  br i1 %cmp, label %IF, label %ENDIF

IF:
  %dataf = extractelement <4 x float> %dtex, i32 0
  %data1 = fptosi float %dataf to i32
  %data2 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %data1, i32 0)
  %data3 = call i32 @llvm.amdgcn.ds.swizzle(i32 %data2, i32 2079)
  %data4 = call i32 @llvm.amdgcn.wwm.i32(i32 %data3)
  %data4f = sitofp i32 %data4 to float
  br label %ENDIF

ENDIF:
  %r = phi float [ %data4f, %IF ], [ 0.0, %main_body ]
  ret float %r
}

declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #1
declare void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float>, i32, i32, <8 x i32>, i32, i32) #1

declare void @llvm.amdgcn.struct.buffer.store.f32(float, <4 x i32>, i32, i32, i32, i32 immarg) #2
declare void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32 immarg) #2
declare void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32 immarg) #2
declare void @llvm.amdgcn.raw.buffer.store.f32(float, <4 x i32>, i32, i32, i32 immarg) #2
declare float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32>, i32, i32, i32) #3
declare float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32>, i32, i32, i32, i32) #3

declare <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32, i32, <8 x i32>, i32, i32) #3
declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #3
declare <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #3
declare void @llvm.amdgcn.kill(i1) #1
declare float @llvm.amdgcn.wqm.f32(float) #3
declare i32 @llvm.amdgcn.wqm.i32(i32) #3
declare float @llvm.amdgcn.wwm.f32(float) #3
declare i32 @llvm.amdgcn.wwm.i32(i32) #3
declare i32 @llvm.amdgcn.set.inactive.i32(i32, i32) #4
declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #3
declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) #3
declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #3
declare void @llvm.amdgcn.exp.compr.v2f16(i32, i32, <2 x half>, <2 x half>, i1, i1) #1
declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #2
declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #2
declare i32 @llvm.amdgcn.ds.swizzle(i32, i32)

attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }
attributes #2 = { nounwind readonly }
attributes #3 = { nounwind readnone }
attributes #4 = { nounwind readnone convergent }
attributes #5 = { "amdgpu-ps-wqm-outputs" }
attributes #6 = { nounwind "InitialPSInputAddr"="2" }
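
; Note (assumptions about the attributes above): "amdgpu-ps-wqm-outputs" (#5)
; requests that pixel-shader output computations run in WQM, which is what
; test_prolog_1 exercises; "InitialPSInputAddr"="2" (#6) enables the
; interpolation input state needed by the interp-based tests.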