; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=CHECK %s

; A lone softwqm intrinsic must not switch the shader into WQM.
; Two buffer loads are added together and only the sum goes through softwqm;
; the checks require that no s_wqm_b64 appears ahead of the loads.
;
;CHECK-LABEL: {{^}}test1:
;CHECK-NOT: s_wqm_b64 exec, exec
;CHECK: buffer_load_dword
;CHECK: buffer_load_dword
;CHECK: v_add_f32_e32
define amdgpu_ps float @test1(i32 inreg %idx0, i32 inreg %idx1) {
main_body:
  %lhs = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
  %rhs = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
  %sum = fadd float %lhs, %rhs
  ; softwqm is the only WQM-related marker in this function.
  %sum.soft = call float @llvm.amdgcn.softwqm.f32(float %sum)
  ret float %sum.soft
}

; Integer flavour of the previous test: the float sum is bitcast to i32 so that
; the i32 overload of softwqm is used, then bitcast back for the return value.
; The expected instruction sequence is the same as for the float case.
;
;CHECK-LABEL: {{^}}test2:
;CHECK-NOT: s_wqm_b64 exec, exec
;CHECK: buffer_load_dword
;CHECK: buffer_load_dword
;CHECK: v_add_f32_e32
define amdgpu_ps float @test2(i32 inreg %idx0, i32 inreg %idx1) {
main_body:
  %lhs = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
  %rhs = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
  %sum = fadd float %lhs, %rhs
  %sum.bits = bitcast float %sum to i32
  %soft.bits = call i32 @llvm.amdgcn.softwqm.i32(i32 %sum.bits)
  %soft = bitcast i32 %soft.bits to float
  ret float %soft
}

; Make sure the transition from WQM to Exact to softwqm does not trigger WQM.
;
; Absence of s_wqm_b64 is verified in two regions: ahead of the loads, and
; between the buffer store and the add that follows it (the add whose result
; is passed to softwqm).
;
;CHECK-LABEL: {{^}}test_softwqm1:
;CHECK-NOT: s_wqm_b64 exec, exec
;CHECK: buffer_load_dword
;CHECK: buffer_load_dword
;CHECK: buffer_store_dword
;CHECK-NOT: s_wqm_b64 exec, exec
;CHECK: v_add_f32_e32
define amdgpu_ps float @test_softwqm1(i32 inreg %idx0, i32 inreg %idx1) {
main_body:
  %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
  %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
  %temp = fadd float %src0, %src1
  ; The store consumes %temp directly; no wqm intrinsic is involved here.
  call void @llvm.amdgcn.struct.buffer.store.f32(float %temp, <4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
  ; Only the value computed after the store is marked with softwqm.
  %out = fadd float %temp, %temp
  %out.0 = call float @llvm.amdgcn.softwqm.f32(float %out)
  ret float %out.0
}

; Make sure the transition from WQM to Exact to softwqm does trigger WQM.
;
; Here the stored value goes through the wqm intrinsic. The checks expect exec
; to be copied, s_wqm_b64 to be applied before the loads, both adds to be
; emitted, and exec to be and-ed with the saved copy before the store.
;
;CHECK-LABEL: {{^}}test_softwqm2:
;CHECK: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
;CHECK: s_wqm_b64 exec, exec
;CHECK: buffer_load_dword
;CHECK: buffer_load_dword
;CHECK: v_add_f32_e32
;CHECK: v_add_f32_e32
;CHECK: s_and_b64 exec, exec, [[ORIG]]
;CHECK: buffer_store_dword
define amdgpu_ps float @test_softwqm2(i32 inreg %idx0, i32 inreg %idx1) {
main_body:
  %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
  %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
  %temp = fadd float %src0, %src1
  ; Explicit wqm marker on the value that is subsequently stored.
  %temp.0 = call float @llvm.amdgcn.wqm.f32(float %temp)
  call void @llvm.amdgcn.struct.buffer.store.f32(float %temp.0, <4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
  ; The softwqm-marked value is derived from %temp, not from %temp.0.
  %out = fadd float %temp, %temp
  %out.0 = call float @llvm.amdgcn.softwqm.f32(float %out)
  ret float %out.0
}

; NOTE: llvm.amdgcn.wwm is deprecated, use llvm.amdgcn.strict.wwm instead.
; Make sure the transition from Exact to STRICT_WWM then softwqm does not trigger WQM.
;
; Expected shape according to the checks: a first s_or_saveexec_b64 ... -1
; region around the first load, exec restored before the store, a second
; s_or_saveexec_b64 ... -1 region containing the second load and the add,
; exec restored again, and no s_wqm_b64 afterwards.
;
;CHECK-LABEL: {{^}}test_wwm1:
;CHECK: s_or_saveexec_b64 [[ORIG0:s\[[0-9]+:[0-9]+\]]], -1
;CHECK: buffer_load_dword
;CHECK: s_mov_b64 exec, [[ORIG0]]
;CHECK: buffer_store_dword
;CHECK: s_or_saveexec_b64 [[ORIG1:s\[[0-9]+:[0-9]+\]]], -1
;CHECK: buffer_load_dword
;CHECK: v_add_f32_e32
;CHECK: s_mov_b64 exec, [[ORIG1]]
;CHECK-NOT: s_wqm_b64
define amdgpu_ps float @test_wwm1(i32 inreg %idx0, i32 inreg %idx1) {
main_body:
  %first = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
  ; The first loaded value is written straight back out before anything else.
  call void @llvm.amdgcn.struct.buffer.store.f32(float %first, <4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
  %second = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
  %sum = fadd float %first, %second
  ; Older intrinsic name; the expectations match those of the strict.wwm test.
  %sum.wwm = call float @llvm.amdgcn.wwm.f32(float %sum)
  %doubled = fadd float %sum.wwm, %sum.wwm
  %doubled.soft = call float @llvm.amdgcn.softwqm.f32(float %doubled)
  ret float %doubled.soft
}

; Make sure the transition from Exact to STRICT_WWM then softwqm does not trigger WQM.
;
; The checks expect two s_or_saveexec_b64 ... -1 regions, each closed by
; restoring exec from its saved copy, with the buffer store in between and no
; s_wqm_b64 after the second region.
;
;CHECK-LABEL: {{^}}test_strict_wwm1:
;CHECK: s_or_saveexec_b64 [[ORIG0:s\[[0-9]+:[0-9]+\]]], -1
;CHECK: buffer_load_dword
;CHECK: s_mov_b64 exec, [[ORIG0]]
;CHECK: buffer_store_dword
;CHECK: s_or_saveexec_b64 [[ORIG1:s\[[0-9]+:[0-9]+\]]], -1
;CHECK: buffer_load_dword
;CHECK: v_add_f32_e32
;CHECK: s_mov_b64 exec, [[ORIG1]]
;CHECK-NOT: s_wqm_b64
define amdgpu_ps float @test_strict_wwm1(i32 inreg %idx0, i32 inreg %idx1) {
main_body:
  %a = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
  ; Store of the first load, placed ahead of the second load.
  call void @llvm.amdgcn.struct.buffer.store.f32(float %a, <4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
  %b = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
  %ab = fadd float %a, %b
  ; strict.wwm marks the sum; softwqm is applied to a value derived from it.
  %ab.strict = call float @llvm.amdgcn.strict.wwm.f32(float %ab)
  %twice = fadd float %ab.strict, %ab.strict
  %twice.soft = call float @llvm.amdgcn.softwqm.f32(float %twice)
  ret float %twice.soft
}


; Check that softwqm on one case of branch does not trigger WQM for shader.
;
; Per the checks: no s_wqm_b64 may appear after the main_body label, the ELSE
; block (containing the store) is matched first, and the IF block (containing
; the two buffer loads) is matched after it.
;
;CHECK-LABEL: {{^}}test_control_flow_0:
;CHECK-NEXT: ; %main_body
;CHECK-NOT: s_wqm_b64 exec, exec
;CHECK: %ELSE
;CHECK: store
;CHECK: %IF
;CHECK: buffer_load
;CHECK: buffer_load
define amdgpu_ps float @test_control_flow_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 inreg %idx0, i32 inreg %idx1, i32 %c, i32 %z, float %data) {
main_body:
  ; Branch on whether %z is zero.
  %z.is.zero = icmp eq i32 %z, 0
  br i1 %z.is.zero, label %IF, label %ELSE

IF:
  ; Load two values, add them, and mark the sum with softwqm.
  %lhs = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
  %rhs = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
  %sum = fadd float %lhs, %rhs
  %sum.soft = call float @llvm.amdgcn.softwqm.f32(float %sum)
  br label %END

ELSE:
  ; Store the incoming %data argument at index %c.
  call void @llvm.amdgcn.struct.buffer.store.f32(float %data, <4 x i32> undef, i32 %c, i32 0, i32 0, i32 0)
  br label %END

END:
  ; Result is the softwqm value from IF, or the unmodified %data from ELSE.
  %result = phi float [ %sum.soft, %IF ], [ %data, %ELSE ]
  ret float %result
}

; Check that softwqm on one case of branch is treated as WQM in WQM shader.
;
; Per the checks: exec is copied and s_wqm_b64 applied immediately after the
; main_body label; in ELSE the store is wrapped in an s_and_saveexec_b64 against
; the saved exec and exec is restored afterwards; in IF no s_and_saveexec_b64 or
; s_and_b64 exec may precede the two buffer loads.
;
;CHECK-LABEL: {{^}}test_control_flow_1:
;CHECK-NEXT: ; %main_body
;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
;CHECK-NEXT: s_wqm_b64 exec, exec
;CHECK: %ELSE
;CHECK: s_and_saveexec_b64 [[SAVED:s\[[0-9]+:[0-9]+\]]], [[ORIG]]
;CHECK: store
;CHECK: s_mov_b64 exec, [[SAVED]]
;CHECK: %IF
;CHECK-NOT: s_and_saveexec_b64
;CHECK-NOT: s_and_b64 exec
;CHECK: buffer_load
;CHECK: buffer_load
define amdgpu_ps float @test_control_flow_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 inreg %idx0, i32 inreg %idx1, i32 %c, i32 %z, float %data) {
main_body:
  ; Two chained image samples: the second uses component 0 of the first as its
  ; coordinate. NOTE(review): presumably these are what make this a WQM shader.
  %coord = bitcast i32 %c to float
  %sample.a = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
  %sample.a.x = extractelement <4 x float> %sample.a, i32 0
  %sample.b = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %sample.a.x, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
  %sample.b.x = extractelement <4 x float> %sample.b, i32 0

  %z.is.zero = icmp eq i32 %z, 0
  br i1 %z.is.zero, label %IF, label %ELSE

IF:
  ; Load two values, add them, and mark the sum with softwqm.
  %lhs = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
  %rhs = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
  %sum = fadd float %lhs, %rhs
  %sum.soft = call float @llvm.amdgcn.softwqm.f32(float %sum)
  br label %END

ELSE:
  ; Store the result of the second sample at index %c.
  call void @llvm.amdgcn.struct.buffer.store.f32(float %sample.b.x, <4 x i32> undef, i32 %c, i32 0, i32 0, i32 0)
  br label %END

END:
  ; The ELSE incoming value is the %data argument, not the sampled value.
  %result = phi float [ %sum.soft, %IF ], [ %data, %ELSE ]
  ret float %result
}

; Intrinsic declarations shared by the tests in this file.
declare void @llvm.amdgcn.struct.buffer.store.f32(float, <4 x i32>, i32, i32, i32, i32 immarg) #2
declare void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32 immarg) #2
declare float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32>, i32, i32, i32, i32 immarg) #3
declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #3
declare <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #3
declare void @llvm.amdgcn.kill(i1) #1
declare float @llvm.amdgcn.wqm.f32(float) #3
declare float @llvm.amdgcn.softwqm.f32(float) #3
declare i32 @llvm.amdgcn.softwqm.i32(i32) #3
declare float @llvm.amdgcn.strict.wwm.f32(float) #3
declare float @llvm.amdgcn.wwm.f32(float) #3

; NOTE(review): the image sample call sites reference group #0, which has no
; definition in the visible part of this file; group #2 (readonly) is attached
; to the buffer store declarations. Confirm whether either is intentional.
attributes #1 = { nounwind }
attributes #2 = { nounwind readonly }
attributes #3 = { nounwind readnone }