; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK %s

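; These tests exercise the SIWholeQuadMode pass for pixel shaders (amdgpu_ps).
; Image sampling needs its inputs computed in whole quad mode (WQM), where all
; four lanes of a pixel quad run so that implicit derivatives are correct,
; while intrinsic stores and exports must run with the exact exec mask. The
; CHECK lines below verify where the exec-mask transitions (s_wqm_b64,
; s_and_b64 with the saved mask, s_or_saveexec_b64 for whole wave mode) are
; inserted.
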
; Check that WQM isn't triggered by image load/store intrinsics.
;
;CHECK-LABEL: {{^}}test1:
;CHECK-NOT: s_wqm
define amdgpu_ps <4 x float> @test1(<8 x i32> inreg %rsrc, i32 %c) {
main_body:
  %tex = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 15, i32 %c, <8 x i32> %rsrc, i32 0, i32 0)
  call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> %tex, i32 15, i32 %c, <8 x i32> %rsrc, i32 0, i32 0)
  ret <4 x float> %tex
}

; Check that WQM is triggered by code calculating inputs to image samples and is disabled as soon as possible.
;
;CHECK-LABEL: {{^}}test2:
;CHECK-NEXT: ; %main_body
;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
;CHECK-NEXT: s_wqm_b64 exec, exec
;CHECK: interp
;CHECK: s_and_b64 exec, exec, [[ORIG]]
;CHECK-NOT: interp
;CHECK: image_sample
;CHECK-NOT: exec
;CHECK: .size test2
define amdgpu_ps <4 x float> @test2(i32 inreg, i32 inreg, i32 inreg, i32 inreg %m0, <8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, <2 x float> %pos) #6 {
main_body:
  %inst23 = extractelement <2 x float> %pos, i32 0
  %inst24 = extractelement <2 x float> %pos, i32 1
  %inst25 = tail call float @llvm.amdgcn.interp.p1(float %inst23, i32 0, i32 0, i32 %m0)
  %inst26 = tail call float @llvm.amdgcn.interp.p2(float %inst25, float %inst24, i32 0, i32 0, i32 %m0)
  %inst28 = tail call float @llvm.amdgcn.interp.p1(float %inst23, i32 1, i32 0, i32 %m0)
  %inst29 = tail call float @llvm.amdgcn.interp.p2(float %inst28, float %inst24, i32 1, i32 0, i32 %m0)
  %tex = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %inst26, float %inst29, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
  ret <4 x float> %tex
}
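
; Note on the pattern checked above: s_wqm_b64 exec, exec enables, for every
; quad of four lanes, all four lanes if any lane in the quad was live, which
; is what implicit-derivative computations need; s_and_b64 exec, exec, [[ORIG]]
; then drops the helper lanes again, returning to the exact mask saved in ORIG.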

; ... but disabled for stores (and, in this simple case, not re-enabled) ...
;
;CHECK-LABEL: {{^}}test3:
;CHECK-NEXT: ; %main_body
;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
;CHECK-NEXT: s_wqm_b64 exec, exec
;CHECK: s_and_b64 exec, exec, [[ORIG]]
;CHECK: image_sample
;CHECK: store
;CHECK-NOT: exec
;CHECK: .size test3
define amdgpu_ps <4 x float> @test3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float %c) {
main_body:
  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
  %tex.1 = bitcast <4 x float> %tex to <4 x i32>
  %tex.2 = extractelement <4 x i32> %tex.1, i32 0

  call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %tex, <4 x i32> undef, i32 %tex.2, i32 0, i32 0, i32 0)

  ret <4 x float> %tex
}

; ... and disabled for export.
;
;CHECK-LABEL: {{^}}test3x:
;CHECK-NEXT: ; %main_body
;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
;CHECK-NEXT: s_wqm_b64 exec, exec
;CHECK: s_and_b64 exec, exec, [[ORIG]]
;CHECK: image_sample
;CHECK: exp
;CHECK-NOT: exec
;CHECK: .size test3x
define amdgpu_ps void @test3x(i32 inreg, i32 inreg, i32 inreg, i32 inreg %m0, <8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, <2 x float> %pos) #6 {
main_body:
  %inst23 = extractelement <2 x float> %pos, i32 0
  %inst24 = extractelement <2 x float> %pos, i32 1
  %inst25 = tail call float @llvm.amdgcn.interp.p1(float %inst23, i32 0, i32 0, i32 %m0)
  %inst26 = tail call float @llvm.amdgcn.interp.p2(float %inst25, float %inst24, i32 0, i32 0, i32 %m0)
  %inst28 = tail call float @llvm.amdgcn.interp.p1(float %inst23, i32 1, i32 0, i32 %m0)
  %inst29 = tail call float @llvm.amdgcn.interp.p2(float %inst28, float %inst24, i32 1, i32 0, i32 %m0)
  %tex = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %inst26, float %inst29, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
  %tex.0 = extractelement <4 x float> %tex, i32 0
  %tex.1 = extractelement <4 x float> %tex, i32 1
  %tex.2 = extractelement <4 x float> %tex, i32 2
  %tex.3 = extractelement <4 x float> %tex, i32 3
  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tex.0, float %tex.1, float %tex.2, float %tex.3, i1 true, i1 true)
  ret void
}

; Check that WQM is re-enabled when required.
;
;CHECK-LABEL: {{^}}test4:
;CHECK-NEXT: ; %main_body
;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
;CHECK-NEXT: s_wqm_b64 exec, exec
;CHECK: v_mul_lo_u32 [[MUL:v[0-9]+]], v0, v1
;CHECK: image_sample
;CHECK: s_and_b64 exec, exec, [[ORIG]]
;CHECK: image_sample
;CHECK: store
define amdgpu_ps <4 x float> @test4(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %c, i32 %d, float %data) {
main_body:
  %c.1 = mul i32 %c, %d

  call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> undef, <4 x i32> undef, i32 %c.1, i32 0, i32 0, i32 0)
  %c.1.bc = bitcast i32 %c.1 to float
  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.1.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
  %tex0 = extractelement <4 x float> %tex, i32 0
  %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
  ret <4 x float> %dtex
}

; Check that WQM is triggered by the wqm intrinsic.
;
;CHECK-LABEL: {{^}}test5:
;CHECK: s_wqm_b64 exec, exec
;CHECK: buffer_load_dword
;CHECK: buffer_load_dword
;CHECK: v_add_f32_e32
; WQM was inserting an unnecessary v_mov to self after the v_add. Make sure this
; does not happen - the v_add should write the return reg directly.
;CHECK-NOT: v_mov_b32_e32
define amdgpu_ps float @test5(i32 inreg %idx0, i32 inreg %idx1) {
main_body:
  %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
  %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
  %out = fadd float %src0, %src1
  %out.0 = call float @llvm.amdgcn.wqm.f32(float %out)
  ret float %out.0
}
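
; Note: llvm.amdgcn.wqm is effectively a copy whose source is required to be
; computed in whole quad mode, so the buffer loads feeding the fadd above must
; execute with WQM enabled even though nothing here samples an image.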

; Check that the wqm intrinsic works correctly for integers.
;
;CHECK-LABEL: {{^}}test6:
;CHECK: s_wqm_b64 exec, exec
;CHECK: buffer_load_dword
;CHECK: buffer_load_dword
;CHECK: v_add_f32_e32
define amdgpu_ps float @test6(i32 inreg %idx0, i32 inreg %idx1) {
main_body:
  %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
  %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
  %out = fadd float %src0, %src1
  %out.0 = bitcast float %out to i32
  %out.1 = call i32 @llvm.amdgcn.wqm.i32(i32 %out.0)
  %out.2 = bitcast i32 %out.1 to float
  ret float %out.2
}

; Check that WWM is triggered by the wwm intrinsic.
;
;CHECK-LABEL: {{^}}test_wwm1:
;CHECK: s_or_saveexec_b64 s{{\[[0-9]+:[0-9]+\]}}, -1
;CHECK: buffer_load_dword
;CHECK: buffer_load_dword
;CHECK: v_add_f32_e32
define amdgpu_ps float @test_wwm1(i32 inreg %idx0, i32 inreg %idx1) {
main_body:
  %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
  %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
  %out = fadd float %src0, %src1
  %out.0 = call float @llvm.amdgcn.wwm.f32(float %out)
  ret float %out.0
}
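
; Note: whole wave mode (WWM) enables every lane in the wave, not just whole
; quads. The pass enters it with s_or_saveexec_b64 <save>, -1, which copies the
; old exec mask into <save> and sets exec to all ones, and leaves it by
; restoring the saved mask with s_mov_b64 exec, <save>.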

; Same as above, but with an integer type.
;
;CHECK-LABEL: {{^}}test_wwm2:
;CHECK: s_or_saveexec_b64 s{{\[[0-9]+:[0-9]+\]}}, -1
;CHECK: buffer_load_dword
;CHECK: buffer_load_dword
;CHECK: v_add_{{[iu]}}32_e32
define amdgpu_ps float @test_wwm2(i32 inreg %idx0, i32 inreg %idx1) {
main_body:
  %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
  %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
  %src0.0 = bitcast float %src0 to i32
  %src1.0 = bitcast float %src1 to i32
  %out = add i32 %src0.0, %src1.0
  %out.0 = call i32 @llvm.amdgcn.wwm.i32(i32 %out)
  %out.1 = bitcast i32 %out.0 to float
  ret float %out.1
}

; Check that we don't leave WWM on for computations that don't require WWM,
; since that will lead to clobbering things that aren't supposed to be
; clobbered in cases like this.
;
;CHECK-LABEL: {{^}}test_wwm3:
;CHECK: s_or_saveexec_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], -1
;CHECK: buffer_load_dword
;CHECK: v_add_f32_e32
;CHECK: s_mov_b64 exec, [[ORIG]]
;CHECK: v_add_f32_e32
define amdgpu_ps float @test_wwm3(i32 inreg %idx) {
main_body:
  ; use mbcnt to make sure the branch is divergent
  %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
  %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
  %cc = icmp uge i32 %hi, 32
  br i1 %cc, label %endif, label %if

if:
  %src = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0)
  %out = fadd float %src, %src
  %out.0 = call float @llvm.amdgcn.wwm.f32(float %out)
  %out.1 = fadd float %src, %out.0
  br label %endif

endif:
  %out.2 = phi float [ %out.1, %if ], [ 0.0, %main_body ]
  ret float %out.2
}

; Check that WWM writes aren't coalesced with non-WWM writes, since the WWM
; write could clobber disabled channels in the non-WWM one.
;
;CHECK-LABEL: {{^}}test_wwm4:
;CHECK: s_or_saveexec_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], -1
;CHECK: buffer_load_dword
;CHECK: v_add_f32_e32
;CHECK: s_mov_b64 exec, [[ORIG]]
;CHECK-NEXT: v_mov_b32_e32
define amdgpu_ps float @test_wwm4(i32 inreg %idx) {
main_body:
  ; use mbcnt to make sure the branch is divergent
  %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
  %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
  %cc = icmp uge i32 %hi, 32
  br i1 %cc, label %endif, label %if

if:
  %src = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0)
  %out = fadd float %src, %src
  %out.0 = call float @llvm.amdgcn.wwm.f32(float %out)
  br label %endif

endif:
  %out.1 = phi float [ %out.0, %if ], [ 0.0, %main_body ]
  ret float %out.1
}

; Make sure the transition from Exact to WWM then WQM works properly.
;
;CHECK-LABEL: {{^}}test_wwm5:
;CHECK: buffer_load_dword
;CHECK: buffer_store_dword
;CHECK: s_or_saveexec_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], -1
;CHECK: buffer_load_dword
;CHECK: v_add_f32_e32
;CHECK: s_mov_b64 exec, [[ORIG]]
;CHECK: s_wqm_b64 exec, exec
define amdgpu_ps float @test_wwm5(i32 inreg %idx0, i32 inreg %idx1) {
main_body:
  %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
  call void @llvm.amdgcn.struct.buffer.store.f32(float %src0, <4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
  %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
  %temp = fadd float %src1, %src1
  %temp.0 = call float @llvm.amdgcn.wwm.f32(float %temp)
  %out = fadd float %temp.0, %temp.0
  %out.0 = call float @llvm.amdgcn.wqm.f32(float %out)
  ret float %out.0
}

; Check that WWM is turned on correctly across basic block boundaries.
; if..then..endif version
;
;CHECK-LABEL: {{^}}test_wwm6_then:
;CHECK: s_or_saveexec_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], -1
;SI-CHECK: buffer_load_dword
;VI-CHECK: flat_load_dword
;CHECK: s_mov_b64 exec, [[ORIG]]
;CHECK: %if
;CHECK: s_or_saveexec_b64 [[ORIG2:s\[[0-9]+:[0-9]+\]]], -1
;SI-CHECK: buffer_load_dword
;VI-CHECK: flat_load_dword
;CHECK: v_add_f32_e32
;CHECK: s_mov_b64 exec, [[ORIG2]]
define amdgpu_ps float @test_wwm6_then() {
main_body:
  %src0 = load volatile float, float addrspace(1)* undef
  ; use mbcnt to make sure the branch is divergent
  %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
  %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
  %cc = icmp uge i32 %hi, 32
  br i1 %cc, label %endif, label %if

if:
  %src1 = load volatile float, float addrspace(1)* undef
  %out = fadd float %src0, %src1
  %out.0 = call float @llvm.amdgcn.wwm.f32(float %out)
  br label %endif

endif:
  %out.1 = phi float [ %out.0, %if ], [ 0.0, %main_body ]
  ret float %out.1
}

; Check that WWM is turned on correctly across basic block boundaries.
; loop version
;
;CHECK-LABEL: {{^}}test_wwm6_loop:
;CHECK: s_or_saveexec_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], -1
;SI-CHECK: buffer_load_dword
;VI-CHECK: flat_load_dword
;CHECK: s_mov_b64 exec, [[ORIG]]
;CHECK: %loop
;CHECK: s_or_saveexec_b64 [[ORIG2:s\[[0-9]+:[0-9]+\]]], -1
;SI-CHECK: buffer_load_dword
;VI-CHECK: flat_load_dword
;CHECK: s_mov_b64 exec, [[ORIG2]]
define amdgpu_ps float @test_wwm6_loop() {
main_body:
  %src0 = load volatile float, float addrspace(1)* undef
  ; use mbcnt to make sure the branch is divergent
  %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
  %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
  br label %loop

loop:
  %counter = phi i32 [ %lo, %main_body ], [ %counter.1, %loop ]
  %src1 = load volatile float, float addrspace(1)* undef
  %out = fadd float %src0, %src1
  %out.0 = call float @llvm.amdgcn.wwm.f32(float %out)
  %counter.1 = sub i32 %counter, 1
  %cc = icmp ne i32 %counter.1, 0
  br i1 %cc, label %loop, label %endloop

endloop:
  ret float %out.0
}

; Check that @llvm.amdgcn.set.inactive disables WWM.
;
;CHECK-LABEL: {{^}}test_set_inactive1:
;CHECK: buffer_load_dword
;CHECK: s_not_b64 exec, exec
;CHECK: v_mov_b32_e32
;CHECK: s_not_b64 exec, exec
;CHECK: s_or_saveexec_b64 s{{\[[0-9]+:[0-9]+\]}}, -1
;CHECK: v_add_{{[iu]}}32_e32
define amdgpu_ps void @test_set_inactive1(i32 inreg %idx) {
main_body:
  %src = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0)
  %src.0 = bitcast float %src to i32
  %src.1 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %src.0, i32 0)
  %out = add i32 %src.1, %src.1
  %out.0 = call i32 @llvm.amdgcn.wwm.i32(i32 %out)
  %out.1 = bitcast i32 %out.0 to float
  call void @llvm.amdgcn.struct.buffer.store.f32(float %out.1, <4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0)
  ret void
}
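
; Note on set.inactive: the s_not_b64 exec, exec / v_mov_b32 / s_not_b64 exec,
; exec sequence checked above inverts exec so that only the previously inactive
; lanes are live, writes the requested value (0 here) into them, and then
; restores exec, giving the following WWM computation a defined value in every
; lane.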

; Check that enabling WQM anywhere enables WQM for the set.inactive source.
;
;CHECK-LABEL: {{^}}test_set_inactive2:
;CHECK: s_wqm_b64 exec, exec
;CHECK: buffer_load_dword
;CHECK: buffer_load_dword
define amdgpu_ps void @test_set_inactive2(i32 inreg %idx0, i32 inreg %idx1) {
main_body:
  %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
  %src1.0 = bitcast float %src1 to i32
  %src1.1 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %src1.0, i32 undef)
  %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
  %src0.0 = bitcast float %src0 to i32
  %src0.1 = call i32 @llvm.amdgcn.wqm.i32(i32 %src0.0)
  %out = add i32 %src0.1, %src1.1
  %out.0 = bitcast i32 %out to float
  call void @llvm.amdgcn.struct.buffer.store.f32(float %out.0, <4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
  ret void
}

; Check a case of one branch of an if-else requiring WQM, the other requiring
; exact.
;
; Note: In this particular case, the save-and-restore could be avoided if the
; analysis understood that the two branches of the if-else are mutually
; exclusive.
;
;CHECK-LABEL: {{^}}test_control_flow_0:
;CHECK-NEXT: ; %main_body
;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
;CHECK-NEXT: s_wqm_b64 exec, exec
;CHECK: %ELSE
;CHECK: s_and_saveexec_b64 [[SAVED:s\[[0-9]+:[0-9]+\]]], [[ORIG]]
;CHECK: store
;CHECK: s_mov_b64 exec, [[SAVED]]
;CHECK: %IF
;CHECK: image_sample
;CHECK: image_sample
define amdgpu_ps float @test_control_flow_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) {
main_body:
  %cmp = icmp eq i32 %z, 0
  br i1 %cmp, label %IF, label %ELSE

IF:
  %c.bc = bitcast i32 %c to float
  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
  %tex0 = extractelement <4 x float> %tex, i32 0
  %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
  %data.if = extractelement <4 x float> %dtex, i32 0
  br label %END

ELSE:
  call void @llvm.amdgcn.struct.buffer.store.f32(float %data, <4 x i32> undef, i32 %c, i32 0, i32 0, i32 0)
  br label %END

END:
  %r = phi float [ %data.if, %IF ], [ %data, %ELSE ]
  ret float %r
}

; Reverse branch order compared to the previous test.
;
;CHECK-LABEL: {{^}}test_control_flow_1:
;CHECK-NEXT: ; %main_body
;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
;CHECK-NEXT: s_wqm_b64 exec, exec
;CHECK: %IF
;CHECK: image_sample
;CHECK: image_sample
;CHECK: %Flow
;CHECK-NEXT: s_or_saveexec_b64 [[SAVED:s\[[0-9]+:[0-9]+\]]],
;CHECK-NEXT: s_and_b64 exec, exec, [[ORIG]]
;CHECK-NEXT: s_and_b64 [[SAVED]], exec, [[SAVED]]
;CHECK-NEXT: s_xor_b64 exec, exec, [[SAVED]]
;CHECK-NEXT: s_cbranch_execz [[END_BB:BB[0-9]+_[0-9]+]]
;CHECK-NEXT: ; %bb.{{[0-9]+}}: ; %ELSE
;CHECK: store_dword
;CHECK: [[END_BB]]: ; %END
;CHECK: s_or_b64 exec, exec,
;CHECK: v_mov_b32_e32 v0
;CHECK: ; return
define amdgpu_ps float @test_control_flow_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) {
main_body:
  %cmp = icmp eq i32 %z, 0
  br i1 %cmp, label %ELSE, label %IF

IF:
  %c.bc = bitcast i32 %c to float
  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
  %tex0 = extractelement <4 x float> %tex, i32 0
  %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
  %data.if = extractelement <4 x float> %dtex, i32 0
  br label %END

ELSE:
  call void @llvm.amdgcn.struct.buffer.store.f32(float %data, <4 x i32> undef, i32 %c, i32 0, i32 0, i32 0)
  br label %END

END:
  %r = phi float [ %data.if, %IF ], [ %data, %ELSE ]
  ret float %r
}

; Check that branch conditions are properly marked as needing WQM...
;
;CHECK-LABEL: {{^}}test_control_flow_2:
;CHECK-NEXT: ; %main_body
;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
;CHECK-NEXT: s_wqm_b64 exec, exec
;CHECK: s_and_b64 exec, exec, [[ORIG]]
;CHECK: store
;CHECK: s_wqm_b64 exec, exec
;CHECK: load
;CHECK: s_and_b64 exec, exec, [[ORIG]]
;CHECK: store
;CHECK: s_wqm_b64 exec, exec
;CHECK: v_cmp
define amdgpu_ps <4 x float> @test_control_flow_2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, <3 x i32> %idx, <2 x float> %data, i32 %coord) {
main_body:
  %idx.1 = extractelement <3 x i32> %idx, i32 0
  %data.1 = extractelement <2 x float> %data, i32 0
  call void @llvm.amdgcn.struct.buffer.store.f32(float %data.1, <4 x i32> undef, i32 %idx.1, i32 0, i32 0, i32 0)

  ; The load that determines the branch (and should therefore be WQM) is
  ; surrounded by stores that require disabled WQM.
  %idx.2 = extractelement <3 x i32> %idx, i32 1
  %z = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx.2, i32 0, i32 0, i32 0)

  %idx.3 = extractelement <3 x i32> %idx, i32 2
  %data.3 = extractelement <2 x float> %data, i32 1
  call void @llvm.amdgcn.struct.buffer.store.f32(float %data.3, <4 x i32> undef, i32 %idx.3, i32 0, i32 0, i32 0)

  %cc = fcmp ogt float %z, 0.0
  br i1 %cc, label %IF, label %ELSE

IF:
  %coord.IF = mul i32 %coord, 3
  br label %END

ELSE:
  %coord.ELSE = mul i32 %coord, 4
  br label %END

END:
  %coord.END = phi i32 [ %coord.IF, %IF ], [ %coord.ELSE, %ELSE ]
  %coord.END.bc = bitcast i32 %coord.END to float
  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord.END.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
  ret <4 x float> %tex
}

; ... but only if they really do need it.
;
;CHECK-LABEL: {{^}}test_control_flow_3:
;CHECK-NEXT: ; %main_body
;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
;CHECK-NEXT: s_wqm_b64 exec, exec
;CHECK: image_sample
;CHECK: s_and_b64 exec, exec, [[ORIG]]
;CHECK: image_sample
;CHECK-DAG: v_cmp
;CHECK-DAG: store
define amdgpu_ps float @test_control_flow_3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %coord) {
main_body:
  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
  %tex0 = extractelement <4 x float> %tex, i32 0
  %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
  %dtex.1 = extractelement <4 x float> %dtex, i32 0
  call void @llvm.amdgcn.struct.buffer.store.f32(float %dtex.1, <4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0)

  %cc = fcmp ogt float %dtex.1, 0.0
  br i1 %cc, label %IF, label %ELSE

IF:
  %tex.IF = fmul float %dtex.1, 3.0
  br label %END

ELSE:
  %tex.ELSE = fmul float %dtex.1, 4.0
  br label %END

END:
  %tex.END = phi float [ %tex.IF, %IF ], [ %tex.ELSE, %ELSE ]
  ret float %tex.END
}

; Another test that failed at some point because of terminator handling.
;
;CHECK-LABEL: {{^}}test_control_flow_4:
;CHECK-NEXT: ; %main_body
;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
;CHECK-NEXT: s_wqm_b64 exec, exec
;CHECK: %IF
;CHECK: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], [[ORIG]]
;CHECK: load
;CHECK: store
;CHECK: s_mov_b64 exec, [[SAVE]]
;CHECK: %END
;CHECK: image_sample
;CHECK: image_sample
define amdgpu_ps <4 x float> @test_control_flow_4(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float %coord, i32 %y, float %z) {
main_body:
  %cond = icmp eq i32 %y, 0
  br i1 %cond, label %IF, label %END

IF:
  %data = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> undef, i32 0, i32 0, i32 0)
  call void @llvm.amdgcn.struct.buffer.store.f32(float %data, <4 x i32> undef, i32 1, i32 0, i32 0, i32 0)
  br label %END

END:
  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
  %tex0 = extractelement <4 x float> %tex, i32 0
  %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
  ret <4 x float> %dtex
}

; Kill is performed in WQM mode so that uniform kill behaves correctly ...
;
;CHECK-LABEL: {{^}}test_kill_0:
;CHECK-NEXT: ; %main_body
;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
;CHECK-NEXT: s_wqm_b64 exec, exec
;CHECK: s_and_b64 exec, exec, [[ORIG]]
;CHECK: image_sample
;CHECK: buffer_store_dword
;CHECK: s_wqm_b64 exec, exec
;CHECK: v_cmpx_
;CHECK: image_sample
;CHECK: s_and_b64 exec, exec, [[ORIG]]
;CHECK: image_sample
;CHECK: buffer_store_dword
define amdgpu_ps <4 x float> @test_kill_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <2 x i32> %idx, <2 x float> %data, float %coord, float %coord2, float %z) {
main_body:
  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
  %idx.0 = extractelement <2 x i32> %idx, i32 0
  %data.0 = extractelement <2 x float> %data, i32 0
  call void @llvm.amdgcn.struct.buffer.store.f32(float %data.0, <4 x i32> undef, i32 %idx.0, i32 0, i32 0, i32 0)

  %z.cmp = fcmp olt float %z, 0.0
  call void @llvm.amdgcn.kill(i1 %z.cmp)

  %idx.1 = extractelement <2 x i32> %idx, i32 1
  %data.1 = extractelement <2 x float> %data, i32 1
  call void @llvm.amdgcn.struct.buffer.store.f32(float %data.1, <4 x i32> undef, i32 %idx.1, i32 0, i32 0, i32 0)
  %tex2 = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord2, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
  %tex2.0 = extractelement <4 x float> %tex2, i32 0
  %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex2.0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
  %out = fadd <4 x float> %tex, %dtex

  ret <4 x float> %out
}
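
; Note: the kill is implemented with a v_cmpx_* instruction, which writes the
; result of the comparison directly into exec, updating the live-lane mask in
; a single instruction.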

; ... but only if WQM is necessary.
;
; CHECK-LABEL: {{^}}test_kill_1:
; CHECK-NEXT: ; %main_body
; CHECK: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
; CHECK: s_wqm_b64 exec, exec
; CHECK: image_sample
; CHECK: s_and_b64 exec, exec, [[ORIG]]
; CHECK: image_sample
; CHECK: buffer_store_dword
; CHECK-NOT: wqm
; CHECK: v_cmpx_
define amdgpu_ps <4 x float> @test_kill_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, float %coord, float %coord2, float %z) {
main_body:
  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
  %tex0 = extractelement <4 x float> %tex, i32 0
  %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0

  call void @llvm.amdgcn.raw.buffer.store.f32(float %data, <4 x i32> undef, i32 0, i32 0, i32 0)

  %z.cmp = fcmp olt float %z, 0.0
  call void @llvm.amdgcn.kill(i1 %z.cmp)

  ret <4 x float> %dtex
}

; Check prolog shaders.
;
; CHECK-LABEL: {{^}}test_prolog_1:
; CHECK: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
; CHECK: s_wqm_b64 exec, exec
; CHECK: v_add_f32_e32 v0,
; CHECK: s_and_b64 exec, exec, [[ORIG]]
define amdgpu_ps float @test_prolog_1(float %a, float %b) #5 {
main_body:
  %s = fadd float %a, %b
  ret float %s
}

; CHECK-LABEL: {{^}}test_loop_vcc:
; CHECK-NEXT: ; %entry
; CHECK-NEXT: s_mov_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], exec
; CHECK: s_wqm_b64 exec, exec
; CHECK: s_and_b64 exec, exec, [[LIVE]]
; CHECK: image_store
; CHECK: s_wqm_b64 exec, exec
; CHECK-DAG: v_mov_b32_e32 [[CTR:v[0-9]+]], 0
; CHECK-DAG: s_mov_b32 [[SEVEN:s[0-9]+]], 0x40e00000

; CHECK: [[LOOPHDR:BB[0-9]+_[0-9]+]]: ; %body
; CHECK: v_add_f32_e32 [[CTR]], 2.0, [[CTR]]
; CHECK: [[LOOP:BB[0-9]+_[0-9]+]]: ; %loop
; CHECK: v_cmp_lt_f32_e32 vcc, [[SEVEN]], [[CTR]]
; CHECK: s_cbranch_vccz [[LOOPHDR]]

; CHECK: ; %break
; CHECK: ; return
define amdgpu_ps <4 x float> @test_loop_vcc(<4 x float> %in) nounwind {
entry:
  call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> %in, i32 15, i32 undef, <8 x i32> undef, i32 0, i32 0)
  br label %loop

loop:
  %ctr.iv = phi float [ 0.0, %entry ], [ %ctr.next, %body ]
  %c.iv = phi <4 x float> [ %in, %entry ], [ %c.next, %body ]
  %cc = fcmp ogt float %ctr.iv, 7.0
  br i1 %cc, label %break, label %body

body:
  %c.iv0 = extractelement <4 x float> %c.iv, i32 0
  %c.next = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.iv0, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0
  %ctr.next = fadd float %ctr.iv, 2.0
  br label %loop

break:
  ret <4 x float> %c.iv
}

; Only intrinsic stores need exact execution -- other stores do not have
; externally visible effects and may require WQM for correctness.
;
; CHECK-LABEL: {{^}}test_alloca:
; CHECK: s_mov_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], exec
; CHECK: s_wqm_b64 exec, exec

; CHECK: s_and_b64 exec, exec, [[LIVE]]
; CHECK: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0
; CHECK: s_wqm_b64 exec, exec
; CHECK: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:4{{$}}
; CHECK: s_and_b64 exec, exec, [[LIVE]]
; CHECK: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 idxen
; CHECK: s_wqm_b64 exec, exec
; CHECK: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen

; CHECK: s_and_b64 exec, exec, [[LIVE]]
; CHECK: image_sample
; CHECK: buffer_store_dwordx4
define amdgpu_ps void @test_alloca(float %data, i32 %a, i32 %idx) nounwind {
entry:
  %array = alloca [32 x i32], align 4, addrspace(5)

  call void @llvm.amdgcn.raw.buffer.store.f32(float %data, <4 x i32> undef, i32 0, i32 0, i32 0)

  %s.gep = getelementptr [32 x i32], [32 x i32] addrspace(5)* %array, i32 0, i32 0
  store volatile i32 %a, i32 addrspace(5)* %s.gep, align 4

  call void @llvm.amdgcn.struct.buffer.store.f32(float %data, <4 x i32> undef, i32 1, i32 0, i32 0, i32 0)

  %c.gep = getelementptr [32 x i32], [32 x i32] addrspace(5)* %array, i32 0, i32 %idx
  %c = load i32, i32 addrspace(5)* %c.gep, align 4
  %c.bc = bitcast i32 %c to float
  %t = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0
  call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %t, <4 x i32> undef, i32 0, i32 0, i32 0)

  ret void
}

; Must return to exact at the end of a non-void returning shader,
; otherwise the EXEC mask exported by the epilog will be wrong. This is true
; even if the shader has no kills, because a kill could have happened in a
; previous shader fragment.
;
; CHECK-LABEL: {{^}}test_nonvoid_return:
; CHECK: s_mov_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], exec
; CHECK: s_wqm_b64 exec, exec
;
; CHECK: s_and_b64 exec, exec, [[LIVE]]
; CHECK-NOT: exec
define amdgpu_ps <4 x float> @test_nonvoid_return() nounwind {
  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float undef, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0
  %tex0 = extractelement <4 x float> %tex, i32 0
  %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0
  ret <4 x float> %dtex
}

; CHECK-LABEL: {{^}}test_nonvoid_return_unreachable:
; CHECK: s_mov_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], exec
; CHECK: s_wqm_b64 exec, exec
;
; CHECK: s_and_b64 exec, exec, [[LIVE]]
; CHECK-NOT: exec
define amdgpu_ps <4 x float> @test_nonvoid_return_unreachable(i32 inreg %c) nounwind {
entry:
  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float undef, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0
  %tex0 = extractelement <4 x float> %tex, i32 0
  %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0
  %cc = icmp sgt i32 %c, 0
  br i1 %cc, label %if, label %else

if:
  store volatile <4 x float> %dtex, <4 x float> addrspace(1)* undef
  unreachable

else:
  ret <4 x float> %dtex
}

; Test awareness that s_wqm_b64 clobbers SCC.
;
; CHECK-LABEL: {{^}}test_scc:
; CHECK: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
; CHECK: s_wqm_b64 exec, exec
; CHECK: s_cmp_
; CHECK-NEXT: s_cbranch_scc
; CHECK: ; %else
; CHECK: image_sample
; CHECK: ; %if
; CHECK: image_sample
; CHECK: ; %end
; CHECK: s_and_b64 exec, exec, [[ORIG]]
define amdgpu_ps <4 x float> @test_scc(i32 inreg %sel, i32 %idx) #1 {
main_body:
  %cc = icmp sgt i32 %sel, 0
  br i1 %cc, label %if, label %else

if:
  %r.if = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float 0.0, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0
  br label %end

else:
  %r.else = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float 0.0, float bitcast (i32 1 to float), <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0
  br label %end

end:
  %r = phi <4 x float> [ %r.if, %if ], [ %r.else, %else ]
  call void @llvm.amdgcn.struct.buffer.store.f32(float 1.0, <4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0)
  ret <4 x float> %r
}

; Check a case of a block being entirely WQM except for a bit of WWM.
; There was a bug where it forgot to enter and leave WWM.
;
;CHECK-LABEL: {{^}}test_wwm_within_wqm:
;CHECK: %IF
;CHECK: s_or_saveexec_b64 {{.*}}, -1
;CHECK: ds_swizzle
;
define amdgpu_ps float @test_wwm_within_wqm(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) {
main_body:
  %c.bc = bitcast i32 %c to float
  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
  %tex0 = extractelement <4 x float> %tex, i32 0
  %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
  %cmp = icmp eq i32 %z, 0
  br i1 %cmp, label %IF, label %ENDIF

IF:
  %dataf = extractelement <4 x float> %dtex, i32 0
  %data1 = fptosi float %dataf to i32
  %data2 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %data1, i32 0)
  %data3 = call i32 @llvm.amdgcn.ds.swizzle(i32 %data2, i32 2079)
  %data4 = call i32 @llvm.amdgcn.wwm.i32(i32 %data3)
  %data4f = sitofp i32 %data4 to float
  br label %ENDIF

ENDIF:
  %r = phi float [ %data4f, %IF ], [ 0.0, %main_body ]
  ret float %r
}
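
; Note: ds_swizzle exchanges values between lanes, so it is wrapped in WWM via
; set.inactive (which gives the inactive lanes a defined input, 0 here) and the
; wwm intrinsic; the check verifies that the s_or_saveexec_b64 entry into WWM
; is actually emitted inside the %IF block.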

declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #1
declare void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float>, i32, i32, <8 x i32>, i32, i32) #1

declare void @llvm.amdgcn.struct.buffer.store.f32(float, <4 x i32>, i32, i32, i32, i32 immarg) #2
declare void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32 immarg) #2
declare void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32 immarg) #2
declare void @llvm.amdgcn.raw.buffer.store.f32(float, <4 x i32>, i32, i32, i32 immarg) #2
declare float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32>, i32, i32, i32) #3
declare float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32>, i32, i32, i32, i32) #3

declare <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32, i32, <8 x i32>, i32, i32) #3
declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #3
declare <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #3
declare void @llvm.amdgcn.kill(i1) #1
declare float @llvm.amdgcn.wqm.f32(float) #3
declare i32 @llvm.amdgcn.wqm.i32(i32) #3
declare float @llvm.amdgcn.wwm.f32(float) #3
declare i32 @llvm.amdgcn.wwm.i32(i32) #3
declare i32 @llvm.amdgcn.set.inactive.i32(i32, i32) #4
declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #3
declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) #3
declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #3
declare void @llvm.amdgcn.exp.compr.v2f16(i32, i32, <2 x half>, <2 x half>, i1, i1) #1
declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #2
declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #2
declare i32 @llvm.amdgcn.ds.swizzle(i32, i32)

attributes #1 = { nounwind }
attributes #2 = { nounwind readonly }
attributes #3 = { nounwind readnone }
attributes #4 = { nounwind readnone convergent }
attributes #5 = { "amdgpu-ps-wqm-outputs" }
attributes #6 = { nounwind "InitialPSInputAddr"="2" }