; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=CHECK %s

; Check that WQM is not triggered by the softwqm intrinsic alone.
;
; The shader performs two buffer loads and an fadd, then wraps the sum in
; softwqm. Nothing else in the function asks for WQM, so no s_wqm_b64 may
; appear between the function label and the first load.
;
;CHECK-LABEL: {{^}}test1:
;CHECK-NOT: s_wqm_b64 exec, exec
;CHECK: buffer_load_dword
;CHECK: buffer_load_dword
;CHECK: v_add_f32_e32
define amdgpu_ps float @test1(i32 inreg %idx0, i32 inreg %idx1) {
main_body:
  ; Two independent loads, indexed by the two inreg arguments.
  %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
  %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
  %out = fadd float %src0, %src1
  ; softwqm is the only WQM-related operation in this shader.
  %out.0 = call float @llvm.amdgcn.softwqm.f32(float %out)
  ret float %out.0
}

; Check that the softwqm intrinsic works correctly for integers.
;
; Same shape as test1, except that the sum is bitcast to i32, passed through
; the i32 overload of softwqm, and bitcast back to float for the return.
; The expected output is the same: no s_wqm_b64 ahead of the loads.
;
;CHECK-LABEL: {{^}}test2:
;CHECK-NOT: s_wqm_b64 exec, exec
;CHECK: buffer_load_dword
;CHECK: buffer_load_dword
;CHECK: v_add_f32_e32
define amdgpu_ps float @test2(i32 inreg %idx0, i32 inreg %idx1) {
main_body:
  %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
  %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
  %out = fadd float %src0, %src1
  ; Round-trip through i32 so that the integer overload of softwqm is used.
  %out.0 = bitcast float %out to i32
  %out.1 = call i32 @llvm.amdgcn.softwqm.i32(i32 %out.0)
  %out.2 = bitcast i32 %out.1 to float
  ret float %out.2
}

; Make sure that a buffer store followed by softwqm does not trigger WQM.
;
; The function contains no wqm intrinsic and no image sample; it only loads,
; stores the intermediate sum, and then applies softwqm to a value derived
; from that sum. s_wqm_b64 must therefore be absent both before the first
; load and between the store and the final add.
;
;CHECK-LABEL: {{^}}test_softwqm1:
;CHECK-NOT: s_wqm_b64 exec, exec
;CHECK: buffer_load_dword
;CHECK: buffer_load_dword
;CHECK: buffer_store_dword
;CHECK-NOT: s_wqm_b64 exec, exec
;CHECK: v_add_f32_e32
define amdgpu_ps float @test_softwqm1(i32 inreg %idx0, i32 inreg %idx1) {
main_body:
  %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
  %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
  %temp = fadd float %src0, %src1
  ; The store consumes %temp directly, before any softwqm is applied.
  call void @llvm.amdgcn.struct.buffer.store.f32(float %temp, <4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
  %out = fadd float %temp, %temp
  %out.0 = call float @llvm.amdgcn.softwqm.f32(float %out)
  ret float %out.0
}

; Make sure the transition from WQM to Exact to softwqm does trigger WQM.
;
; Unlike test_softwqm1, the stored value goes through the (hard) wqm
; intrinsic first. The expected code saves exec, switches to WQM before the
; loads, performs both adds, and only then ANDs exec with the saved mask
; ahead of the store.
;
;CHECK-LABEL: {{^}}test_softwqm2:
;CHECK: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
;CHECK: s_wqm_b64 exec, exec
;CHECK: buffer_load_dword
;CHECK: buffer_load_dword
;CHECK: v_add_f32_e32
;CHECK: v_add_f32_e32
;CHECK: s_and_b64 exec, exec, [[ORIG]]
;CHECK: buffer_store_dword
define amdgpu_ps float @test_softwqm2(i32 inreg %idx0, i32 inreg %idx1) {
main_body:
  %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
  %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
  %temp = fadd float %src0, %src1
  ; The wqm intrinsic is what distinguishes this test from test_softwqm1.
  %temp.0 = call float @llvm.amdgcn.wqm.f32(float %temp)
  call void @llvm.amdgcn.struct.buffer.store.f32(float %temp.0, <4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
  ; Note: the second add uses %temp, not the wqm-wrapped %temp.0.
  %out = fadd float %temp, %temp
  %out.0 = call float @llvm.amdgcn.softwqm.f32(float %out)
  ret float %out.0
}

; NOTE: llvm.amdgcn.wwm is deprecated, use llvm.amdgcn.strict.wwm instead.
; Make sure the transition from Exact to STRICT_WWM then softwqm does not trigger WQM.
;
; This variant exercises the deprecated llvm.amdgcn.wwm spelling; the expected
; output is identical to test_strict_wwm1. Two separate regions are expected
; in which exec is saved and overridden with s_or_saveexec_b64 ..., -1 and
; later restored: one around the first load, and one around the second load
; and the add. The store sits between them, and no s_wqm_b64 may follow.
;
;CHECK-LABEL: {{^}}test_wwm1:
;CHECK: s_or_saveexec_b64 [[ORIG0:s\[[0-9]+:[0-9]+\]]], -1
;CHECK: buffer_load_dword
;CHECK: s_mov_b64 exec, [[ORIG0]]
;CHECK: buffer_store_dword
;CHECK: s_or_saveexec_b64 [[ORIG1:s\[[0-9]+:[0-9]+\]]], -1
;CHECK: buffer_load_dword
;CHECK: v_add_f32_e32
;CHECK: s_mov_b64 exec, [[ORIG1]]
;CHECK-NOT: s_wqm_b64
define amdgpu_ps float @test_wwm1(i32 inreg %idx0, i32 inreg %idx1) {
main_body:
  %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
  ; The store of %src0 separates the two loads that feed the wwm value.
  call void @llvm.amdgcn.struct.buffer.store.f32(float %src0, <4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
  %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
  %temp = fadd float %src0, %src1
  %temp.0 = call float @llvm.amdgcn.wwm.f32(float %temp)
  %out = fadd float %temp.0, %temp.0
  %out.0 = call float @llvm.amdgcn.softwqm.f32(float %out)
  ret float %out.0
}

; Make sure the transition from Exact to STRICT_WWM then softwqm does not trigger WQM.
;
; The value passed to strict.wwm depends on both loads, and a store of the
; first load sits between them. Two separate regions are expected in which
; exec is saved and overridden with s_or_saveexec_b64 ..., -1 and later
; restored: one around the first load, and one around the second load and the
; add. No s_wqm_b64 may follow the final restore.
;
;CHECK-LABEL: {{^}}test_strict_wwm1:
;CHECK: s_or_saveexec_b64 [[ORIG0:s\[[0-9]+:[0-9]+\]]], -1
;CHECK: buffer_load_dword
;CHECK: s_mov_b64 exec, [[ORIG0]]
;CHECK: buffer_store_dword
;CHECK: s_or_saveexec_b64 [[ORIG1:s\[[0-9]+:[0-9]+\]]], -1
;CHECK: buffer_load_dword
;CHECK: v_add_f32_e32
;CHECK: s_mov_b64 exec, [[ORIG1]]
;CHECK-NOT: s_wqm_b64
define amdgpu_ps float @test_strict_wwm1(i32 inreg %idx0, i32 inreg %idx1) {
main_body:
  %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
  ; The store of %src0 separates the two loads that feed the strict.wwm value.
  call void @llvm.amdgcn.struct.buffer.store.f32(float %src0, <4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
  %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
  %temp = fadd float %src0, %src1
  %temp.0 = call float @llvm.amdgcn.strict.wwm.f32(float %temp)
  %out = fadd float %temp.0, %temp.0
  %out.0 = call float @llvm.amdgcn.softwqm.f32(float %out)
  ret float %out.0
}


; Check that softwqm on one case of branch does not trigger WQM for shader.
;
; The IF side loads two values and wraps their sum in softwqm; the ELSE side
; only stores %data. Nothing else in the function requests WQM, so no
; s_wqm_b64 is expected between the entry block and the ELSE block.
; %rsrc, %sampler are accepted but not used by this function.
;
;CHECK-LABEL: {{^}}test_control_flow_0:
;CHECK-NEXT: ; %main_body
;CHECK-NOT: s_wqm_b64 exec, exec
;CHECK: %ELSE
;CHECK: store
;CHECK: %IF
;CHECK: buffer_load
;CHECK: buffer_load
define amdgpu_ps float @test_control_flow_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 inreg %idx0, i32 inreg %idx1, i32 %c, i32 %z, float %data) {
main_body:
  ; Branch on a non-inreg argument.
  %cmp = icmp eq i32 %z, 0
  br i1 %cmp, label %IF, label %ELSE

IF:
  %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
  %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
  %out = fadd float %src0, %src1
  %data.if = call float @llvm.amdgcn.softwqm.f32(float %out)
  br label %END

ELSE:
  call void @llvm.amdgcn.struct.buffer.store.f32(float %data, <4 x i32> undef, i32 %c, i32 0, i32 0, i32 0)
  br label %END

END:
  ; The result is the softwqm value from IF, or the untouched input from ELSE.
  %r = phi float [ %data.if, %IF ], [ %data, %ELSE ]
  ret float %r
}

; Check that softwqm on one case of branch is treated as WQM in WQM shader.
;
; The entry block issues two chained image samples (the second is addressed
; by a component of the first) before branching; presumably these are what
; put the shader into WQM. The expected code saves exec and executes
; s_wqm_b64 immediately at entry. In the ELSE block the store is bracketed by
; an s_and_saveexec_b64 against the saved mask and a restore. In the IF block
; the loads feeding softwqm must not be preceded by any such narrowing of
; exec.
;
; NOTE(review): the sample calls reference attribute group #0, which is not
; defined in the part of the file visible here - confirm whether that is
; intentional.
;
;CHECK-LABEL: {{^}}test_control_flow_1:
;CHECK-NEXT: ; %main_body
;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
;CHECK-NEXT: s_wqm_b64 exec, exec
;CHECK: %ELSE
;CHECK: s_and_saveexec_b64 [[SAVED:s\[[0-9]+:[0-9]+\]]], [[ORIG]]
;CHECK: store
;CHECK: s_mov_b64 exec, [[SAVED]]
;CHECK: %IF
;CHECK-NOT: s_and_saveexec_b64
;CHECK-NOT: s_and_b64 exec
;CHECK: buffer_load
;CHECK: buffer_load
define amdgpu_ps float @test_control_flow_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 inreg %idx0, i32 inreg %idx1, i32 %c, i32 %z, float %data) {
main_body:
  ; First sample uses %c reinterpreted as a float coordinate; the second
  ; sample uses component 0 of the first sample as its coordinate.
  %c.bc = bitcast i32 %c to float
  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
  %tex0 = extractelement <4 x float> %tex, i32 0
  %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
  %data.sample = extractelement <4 x float> %dtex, i32 0

  %cmp = icmp eq i32 %z, 0
  br i1 %cmp, label %IF, label %ELSE

IF:
  %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
  %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
  %out = fadd float %src0, %src1
  %data.if = call float @llvm.amdgcn.softwqm.f32(float %out)
  br label %END

ELSE:
  ; The sampled value is only consumed by this store.
  call void @llvm.amdgcn.struct.buffer.store.f32(float %data.sample, <4 x i32> undef, i32 %c, i32 0, i32 0, i32 0)
  br label %END

END:
  ; The ELSE edge forwards the original %data argument, not %data.sample.
  %r = phi float [ %data.if, %IF ], [ %data, %ELSE ]
  ret float %r
}

; Buffer and image memory intrinsics.
declare void @llvm.amdgcn.struct.buffer.store.f32(float, <4 x i32>, i32, i32, i32, i32 immarg) #2
declare void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32 immarg) #2
declare float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32>, i32, i32, i32, i32 immarg) #3
declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #3
declare <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #3
declare void @llvm.amdgcn.kill(i1) #1
; Execution-mode marker intrinsics (wqm, softwqm, strict.wwm and the
; deprecated wwm spelling).
declare float @llvm.amdgcn.wqm.f32(float) #3
declare float @llvm.amdgcn.softwqm.f32(float) #3
declare i32 @llvm.amdgcn.softwqm.i32(i32) #3
declare float @llvm.amdgcn.strict.wwm.f32(float) #3
declare float @llvm.amdgcn.wwm.f32(float) #3

; NOTE(review): no definition of attribute group #0 is visible here, although
; call sites in this file reference it - confirm whether one is needed.
attributes #1 = { nounwind }
attributes #2 = { nounwind readonly }
attributes #3 = { nounwind readnone }
