; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=tahiti -mattr=-fp32-denormals,+fast-fmaf -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-FLUSH,GCN-FLUSH-STRICT,GCN-FLUSH-MAD,SI-FLUSH,GCN-FLUSH-FASTFMA,GCN-FLUSH-FASTFMA-STRICT,SI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=tahiti -mattr=+fp32-denormals,+fast-fmaf -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-DENORM-STRICT,GCN-DENORM,SI-DENORM,GCN-DENORM-FASTFMA,GCN-DENORM-FASTFMA-STRICT,SI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=verde -mattr=-fp32-denormals,-fast-fmaf -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-FLUSH,GCN-FLUSH-STRICT,GCN-FLUSH-MAD,SI-FLUSH,GCN-FLUSH-SLOWFMA,GCN-FLUSH-SLOWFMA-STRICT,SI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=verde -mattr=+fp32-denormals,-fast-fmaf -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-DENORM-STRICT,GCN-DENORM,SI-DENORM,GCN-DENORM-SLOWFMA,GCN-DENORM-SLOWFMA-STRICT,SI %s

; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=tahiti -mattr=-fp32-denormals,+fast-fmaf -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-FLUSH,GCN-FLUSH-CONTRACT,GCN-FLUSH-MAD,SI-FLUSH,GCN-FLUSH-FASTFMA,GCN-FLUSH-FASTFMA-CONTRACT,SI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=tahiti -mattr=+fp32-denormals,+fast-fmaf -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-DENORM-CONTRACT,GCN-DENORM,SI-DENORM,GCN-DENORM-FASTFMA,GCN-DENORM-FASTFMA-CONTRACT,SI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=verde -mattr=-fp32-denormals,-fast-fmaf -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-FLUSH,GCN-FLUSH-CONTRACT,GCN-FLUSH-MAD,SI-FLUSH,GCN-FLUSH-SLOWFMA,GCN-FLUSH-SLOWFMA-CONTRACT,SI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=verde -mattr=+fp32-denormals,-fast-fmaf -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-DENORM-CONTRACT,GCN-DENORM,SI-DENORM,GCN-DENORM-SLOWFMA,GCN-DENORM-SLOWFMA-CONTRACT,SI %s


; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=gfx900 -mattr=-fp32-denormals -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-FLUSH,GCN-FLUSH-STRICT,GCN-FLUSH-MAD,GFX9-FLUSH,GCN-FLUSH-FASTFMA,GCN-FLUSH-FASTFMA-STRICT,GFX900 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=gfx900 -mattr=+fp32-denormals -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-DENORM-STRICT,GCN-DENORM,GFX9-DENORM,GCN-DENORM-FASTFMA,GCN-DENORM-FASTFMA-STRICT,GFX900 %s

; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=gfx906 -mattr=-fp32-denormals -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-FLUSH,GCN-FLUSH-STRICT,GCN-FLUSH-FMAC,GFX9-FLUSH,GCN-FLUSH-FASTFMA,GCN-FLUSH-FASTFMA-STRICT,GFX906 %s
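; gfx906 has v_fmac_f32, so its flush run above uses the GCN-FLUSH-FMAC checks
; (fmac/fma) where the older targets use GCN-FLUSH-MAD (mac/mad).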

; FIXME: Should probably test this, but sometimes selecting fmac is painful to match.
; XUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=gfx906 -mattr=+fp32-denormals -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-DENORM-STRICT,GCN-DENORM,GFX9-DENORM,GCN-DENORM-FASTFMA,GCN-DENORM-FASTFMA-STRICT,GFX906 %s


; Test all permutations of: fp32 denormals, fast fp contract, fp contract enabled for fmuladd, fmaf fast/slow.
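; A rough summary of the lowering these runs expect for
;   %r = call float @llvm.fmuladd.f32(float %a, float %b, float %c)
; as inferred from the checks below (a guide, not an exhaustive specification):
;   * fp32 denormals flushed:        v_mac_f32/v_mad_f32 (v_fmac_f32/v_fma_f32 on gfx906)
;   * fp32 denormals kept, fast fma: v_fma_f32
;   * fp32 denormals kept, slow fma: v_mul_f32 followed by v_add_f32
; Separate fmul/fadd pairs follow the same mapping, except that with denormals
; enabled they are only contracted under -fp-contract=fast; the strict runs keep
; the discrete v_mul_f32 + v_add_f32 sequence.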

target triple = "amdgcn--"


declare i32 @llvm.amdgcn.workitem.id.x() #1
declare float @llvm.fmuladd.f32(float, float, float) #1
declare half @llvm.fmuladd.f16(half, half, half) #1
declare float @llvm.fabs.f32(float) #1

; GCN-LABEL: {{^}}fmuladd_f32:
; GCN-FLUSH-MAD: v_mac_f32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
; GCN-FLUSH-FMAC: v_fmac_f32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}

; GCN-DENORM-FASTFMA: v_fma_f32 {{v[0-9]+, v[0-9]+, v[0-9]+}}

; GCN-DENORM-SLOWFMA: v_mul_f32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
; GCN-DENORM-SLOWFMA: v_add_f32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
define amdgpu_kernel void @fmuladd_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
                         float addrspace(1)* %in2, float addrspace(1)* %in3) #0 {
  %r0 = load float, float addrspace(1)* %in1
  %r1 = load float, float addrspace(1)* %in2
  %r2 = load float, float addrspace(1)* %in3
  %r3 = tail call float @llvm.fmuladd.f32(float %r0, float %r1, float %r2)
  store float %r3, float addrspace(1)* %out
  ret void
}

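; Same operation expressed as a separate fmul/fadd pair; with denormals enabled
; this is only fused when contraction is allowed (CONTRACT prefixes), while the
; STRICT runs keep the discrete mul/add.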
; GCN-LABEL: {{^}}fmul_fadd_f32:
; GCN-FLUSH: v_mac_f32

; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32

; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32
; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32

; GCN-DENORM-STRICT: v_mul_f32_e32
; GCN-DENORM-STRICT: v_add_f32_e32
define amdgpu_kernel void @fmul_fadd_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
                           float addrspace(1)* %in2, float addrspace(1)* %in3) #0 {
  %r0 = load volatile float, float addrspace(1)* %in1
  %r1 = load volatile float, float addrspace(1)* %in2
  %r2 = load volatile float, float addrspace(1)* %in3
  %mul = fmul float %r0, %r1
  %add = fadd float %mul, %r2
  store float %add, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}fmuladd_2.0_a_b_f32:
; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]],

; GCN-FLUSH-MAD: v_mac_f32_e32 [[R2]], 2.0, [[R1]]
; GCN-FLUSH-FMAC: v_fmac_f32_e32 [[R2]], 2.0, [[R1]]
; SI-FLUSH: buffer_store_dword [[R2]]
; VI-FLUSH: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]]

; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]

; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]

; SI-DENORM: buffer_store_dword [[RESULT]]
; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fmuladd_2.0_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %r1 = load volatile float, float addrspace(1)* %gep.0
  %r2 = load volatile float, float addrspace(1)* %gep.1

  %r3 = tail call float @llvm.fmuladd.f32(float 2.0, float %r1, float %r2)
  store float %r3, float addrspace(1)* %gep.out
  ret void
}

; GCN-LABEL: {{^}}fmuladd_a_2.0_b_f32:
; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]],

; GCN-FLUSH-MAD: v_mac_f32_e32 [[R2]], 2.0, [[R1]]
; GCN-FLUSH-FMAC: v_fmac_f32_e32 [[R2]], 2.0, [[R1]]

; SI-FLUSH: buffer_store_dword [[R2]]
; VI-FLUSH: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]]

; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]

; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]

; SI-DENORM: buffer_store_dword [[RESULT]]
; VI-DENORM: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fmuladd_a_2.0_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %r1 = load volatile float, float addrspace(1)* %gep.0
  %r2 = load volatile float, float addrspace(1)* %gep.1

  %r3 = tail call float @llvm.fmuladd.f32(float %r1, float 2.0, float %r2)
  store float %r3, float addrspace(1)* %gep.out
  ret void
}

; GCN-LABEL: {{^}}fadd_a_a_b_f32:
; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]],

; GCN-FLUSH: v_mac_f32_e32 [[R2]], 2.0, [[R1]]

; SI-FLUSH: buffer_store_dword [[R2]]
; VI-FLUSH: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]]

; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]

; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]

; GCN-DENORM-STRICT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-STRICT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]

; SI-DENORM: buffer_store_dword [[RESULT]]
; VI-DENORM: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fadd_a_a_b_f32(float addrspace(1)* %out,
                            float addrspace(1)* %in1,
                            float addrspace(1)* %in2) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %r0 = load volatile float, float addrspace(1)* %gep.0
  %r1 = load volatile float, float addrspace(1)* %gep.1

  %add.0 = fadd float %r0, %r0
  %add.1 = fadd float %add.0, %r1
  store float %add.1, float addrspace(1)* %gep.out
  ret void
}

; GCN-LABEL: {{^}}fadd_b_a_a_f32:
; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]],

; GCN-FLUSH: v_mac_f32_e32 [[R2]], 2.0, [[R1]]

; SI-FLUSH: buffer_store_dword [[R2]]
; VI-FLUSH: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]]

; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]

; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]

; GCN-DENORM-STRICT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-STRICT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]

; SI-DENORM: buffer_store_dword [[RESULT]]
; VI-DENORM: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fadd_b_a_a_f32(float addrspace(1)* %out,
                            float addrspace(1)* %in1,
                            float addrspace(1)* %in2) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %r0 = load volatile float, float addrspace(1)* %gep.0
  %r1 = load volatile float, float addrspace(1)* %gep.1

  %add.0 = fadd float %r0, %r0
  %add.1 = fadd float %r1, %add.0
  store float %add.1, float addrspace(1)* %gep.out
  ret void
}

; GCN-LABEL: {{^}}fmuladd_neg_2.0_a_b_f32:
; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]],
; GCN-FLUSH-MAD: v_mac_f32_e32 [[R2]], -2.0, [[R1]]
; GCN-FLUSH-FMAC: v_fmac_f32_e32 [[R2]], -2.0, [[R1]]

; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]]

; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-SLOWFMA: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]

; SI-DENORM: buffer_store_dword [[RESULT]]
; VI-DENORM: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %r1 = load volatile float, float addrspace(1)* %gep.0
  %r2 = load volatile float, float addrspace(1)* %gep.1

  %r3 = tail call float @llvm.fmuladd.f32(float -2.0, float %r1, float %r2)
  store float %r3, float addrspace(1)* %gep.out
  ret void
}

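; The explicit fneg of the loaded value cancels against the -2.0 multiplicand,
; so this is expected to lower the same way as fmuladd_2.0_a_b_f32 above.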
; GCN-LABEL: {{^}}fmuladd_neg_2.0_neg_a_b_f32:
; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]],

; GCN-FLUSH-MAD: v_mac_f32_e32 [[R2]], 2.0, [[R1]]
; GCN-FLUSH-FMAC: v_fmac_f32_e32 [[R2]], 2.0, [[R1]]

; SI-FLUSH: buffer_store_dword [[R2]]
; VI-FLUSH: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]]

; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]

; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]

; SI-DENORM: buffer_store_dword [[RESULT]]
; VI-DENORM: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %r1 = load volatile float, float addrspace(1)* %gep.0
  %r2 = load volatile float, float addrspace(1)* %gep.1

  %r1.fneg = fsub float -0.000000e+00, %r1

  %r3 = tail call float @llvm.fmuladd.f32(float -2.0, float %r1.fneg, float %r2)
  store float %r3, float addrspace(1)* %gep.out
  ret void
}

; GCN-LABEL: {{^}}fmuladd_2.0_neg_a_b_f32:
; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]],

; GCN-FLUSH-MAD: v_mac_f32_e32 [[R2]], -2.0, [[R1]]
; GCN-FLUSH-FMAC: v_fmac_f32_e32 [[R2]], -2.0, [[R1]]

; SI-FLUSH: buffer_store_dword [[R2]]
; VI-FLUSH: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]]

; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]]

; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-SLOWFMA: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]

; SI-DENORM: buffer_store_dword [[RESULT]]
; VI-DENORM: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %r1 = load volatile float, float addrspace(1)* %gep.0
  %r2 = load volatile float, float addrspace(1)* %gep.1

  %r1.fneg = fsub float -0.000000e+00, %r1

  %r3 = tail call float @llvm.fmuladd.f32(float 2.0, float %r1.fneg, float %r2)
  store float %r3, float addrspace(1)* %gep.out
  ret void
}

; GCN-LABEL: {{^}}fmuladd_2.0_a_neg_b_f32:
; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]],
; GCN-FLUSH-MAD: v_mad_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]
; GCN-FLUSH-FMAC: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]

; SI-FLUSH: buffer_store_dword [[RESULT]]
; VI-FLUSH: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]

; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]

; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-SLOWFMA: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]

; SI-DENORM: buffer_store_dword [[RESULT]]
; VI-DENORM: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %r1 = load volatile float, float addrspace(1)* %gep.0
  %r2 = load volatile float, float addrspace(1)* %gep.1

  %r2.fneg = fsub float -0.000000e+00, %r2

  %r3 = tail call float @llvm.fmuladd.f32(float 2.0, float %r1, float %r2.fneg)
  store float %r3, float addrspace(1)* %gep.out
  ret void
}

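; The mad_sub_* tests below cover fsub(fmul(a, b), c) and its commuted/fabs
; variants; where a mad/fma is formed, the negation and fabs are expected to
; fold into source modifiers (e.g. -[[REGC]], |[[REGC]]|) rather than being
; emitted as separate instructions.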
; GCN-LABEL: {{^}}mad_sub_f32:
; GCN: {{buffer|flat|global}}_load_dword [[REGA:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[REGB:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[REGC:v[0-9]+]]
; GCN-FLUSH: v_mad_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -[[REGC]]

; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -[[REGC]]

; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; GCN-DENORM-SLOWFMA-CONTRACT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]

; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; GCN-DENORM-STRICT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]

; SI: buffer_store_dword [[RESULT]]
; VI: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @mad_sub_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %tid.ext = sext i32 %tid to i64
  %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext
  %add1 = add i64 %tid.ext, 1
  %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1
  %add2 = add i64 %tid.ext, 2
  %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2
  %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %gep0, align 4
  %b = load volatile float, float addrspace(1)* %gep1, align 4
  %c = load volatile float, float addrspace(1)* %gep2, align 4
  %mul = fmul float %a, %b
  %sub = fsub float %mul, %c
  store float %sub, float addrspace(1)* %outgep, align 4
  ret void
}

; GCN-LABEL: {{^}}mad_sub_inv_f32:
; GCN: {{buffer|flat|global}}_load_dword [[REGA:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[REGB:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[REGC:v[0-9]+]]

; GCN-FLUSH: v_mad_f32 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], [[REGC]]

; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], [[REGC]]

; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; GCN-DENORM-SLOWFMA-CONTRACT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]

; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; GCN-DENORM-STRICT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]

; SI: buffer_store_dword [[RESULT]]
; VI: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @mad_sub_inv_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %tid.ext = sext i32 %tid to i64
  %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext
  %add1 = add i64 %tid.ext, 1
  %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1
  %add2 = add i64 %tid.ext, 2
  %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2
  %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %gep0, align 4
  %b = load volatile float, float addrspace(1)* %gep1, align 4
  %c = load volatile float, float addrspace(1)* %gep2, align 4
  %mul = fmul float %a, %b
  %sub = fsub float %c, %mul
  store float %sub, float addrspace(1)* %outgep, align 4
  ret void
}

; GCN-LABEL: {{^}}mad_sub_fabs_f32:
; GCN: {{buffer|flat|global}}_load_dword [[REGA:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[REGB:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[REGC:v[0-9]+]]
; GCN-FLUSH: v_mad_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -|[[REGC]]|

; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -|[[REGC]]|

; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; GCN-DENORM-SLOWFMA-CONTRACT: v_sub_f32_e64 [[RESULT:v[0-9]+]], [[TMP]], |[[REGC]]|

; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; GCN-DENORM-STRICT: v_sub_f32_e64 [[RESULT:v[0-9]+]], [[TMP]], |[[REGC]]|

; SI: buffer_store_dword [[RESULT]]
; VI: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @mad_sub_fabs_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %tid.ext = sext i32 %tid to i64
  %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext
  %add1 = add i64 %tid.ext, 1
  %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1
  %add2 = add i64 %tid.ext, 2
  %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2
  %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %gep0, align 4
  %b = load volatile float, float addrspace(1)* %gep1, align 4
  %c = load volatile float, float addrspace(1)* %gep2, align 4
  %c.abs = call float @llvm.fabs.f32(float %c) #0
  %mul = fmul float %a, %b
  %sub = fsub float %mul, %c.abs
  store float %sub, float addrspace(1)* %outgep, align 4
  ret void
}

; GCN-LABEL: {{^}}mad_sub_fabs_inv_f32:
; GCN: {{buffer|flat|global}}_load_dword [[REGA:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[REGB:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[REGC:v[0-9]+]]
; GCN-FLUSH-MAD: v_mad_f32 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], |[[REGC]]|
; GCN-FLUSH-FMAC: v_fma_f32 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], |[[REGC]]|

; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], |[[REGC]]|

; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; GCN-DENORM-SLOWFMA-CONTRACT: v_sub_f32_e64 [[RESULT:v[0-9]+]], |[[REGC]]|, [[TMP]]

; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; GCN-DENORM-STRICT: v_sub_f32_e64 [[RESULT:v[0-9]+]], |[[REGC]]|, [[TMP]]

; SI: buffer_store_dword [[RESULT]]
; VI: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @mad_sub_fabs_inv_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %tid.ext = sext i32 %tid to i64
  %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext
  %add1 = add i64 %tid.ext, 1
  %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1
  %add2 = add i64 %tid.ext, 2
  %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2
  %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %gep0, align 4
  %b = load volatile float, float addrspace(1)* %gep1, align 4
  %c = load volatile float, float addrspace(1)* %gep2, align 4
  %c.abs = call float @llvm.fabs.f32(float %c) #0
  %mul = fmul float %a, %b
  %sub = fsub float %c.abs, %mul
  store float %sub, float addrspace(1)* %outgep, align 4
  ret void
}

; GCN-LABEL: {{^}}neg_neg_mad_f32:
; GCN: {{buffer|flat|global}}_load_dword [[REGA:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[REGB:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[REGC:v[0-9]+]]

; GCN-FLUSH: v_mac_f32_e32 [[REGC]], [[REGA]], [[REGB]]
; SI-FLUSH: buffer_store_dword [[REGC]]
; VI-FLUSH: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REGC]]

; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], [[REGC]]

; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]

; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; GCN-DENORM-STRICT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]

; SI-DENORM: buffer_store_dword [[RESULT]]
; VI-DENORM: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @neg_neg_mad_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %tid.ext = sext i32 %tid to i64
  %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext
  %add1 = add i64 %tid.ext, 1
  %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1
  %add2 = add i64 %tid.ext, 2
  %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2
  %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %gep0, align 4
  %b = load volatile float, float addrspace(1)* %gep1, align 4
  %c = load volatile float, float addrspace(1)* %gep2, align 4
  %nega = fsub float -0.000000e+00, %a
  %negb = fsub float -0.000000e+00, %b
  %mul = fmul float %nega, %negb
  %sub = fadd float %mul, %c
  store float %sub, float addrspace(1)* %outgep, align 4
  ret void
}

; GCN-LABEL: {{^}}mad_fabs_sub_f32:
; GCN: {{buffer|flat|global}}_load_dword [[REGA:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[REGB:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[REGC:v[0-9]+]]
; GCN-FLUSH: v_mad_f32 [[RESULT:v[0-9]+]], [[REGA]], |[[REGB]]|, -[[REGC]]

; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[REGA]], |[[REGB]]|, -[[REGC]]

; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e64 [[TMP:v[0-9]+]], [[REGA]], |[[REGB]]|
; GCN-DENORM-SLOWFMA-CONTRACT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]

; GCN-DENORM-STRICT: v_mul_f32_e64 [[TMP:v[0-9]+]], [[REGA]], |[[REGB]]|
; GCN-DENORM-STRICT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]

; SI: buffer_store_dword [[RESULT]]
; VI: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @mad_fabs_sub_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %tid.ext = sext i32 %tid to i64
  %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext
  %add1 = add i64 %tid.ext, 1
  %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1
  %add2 = add i64 %tid.ext, 2
  %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2
  %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %gep0, align 4
  %b = load volatile float, float addrspace(1)* %gep1, align 4
  %c = load volatile float, float addrspace(1)* %gep2, align 4
  %b.abs = call float @llvm.fabs.f32(float %b) #0
  %mul = fmul float %a, %b.abs
  %sub = fsub float %mul, %c
  store float %sub, float addrspace(1)* %outgep, align 4
  ret void
}

; GCN-LABEL: {{^}}fsub_c_fadd_a_a_f32:
; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]],
; GCN-FLUSH: v_mac_f32_e32 [[R2]], -2.0, [[R1]]
; SI-FLUSH: buffer_store_dword [[R2]]
; VI-FLUSH: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]]

; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]]

; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-SLOWFMA-CONTRACT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]

; GCN-DENORM-STRICT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-STRICT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]

; SI-DENORM: buffer_store_dword [[RESULT]]
; VI-DENORM: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fsub_c_fadd_a_a_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %r1 = load volatile float, float addrspace(1)* %gep.0
  %r2 = load volatile float, float addrspace(1)* %gep.1

  %add = fadd float %r1, %r1
  %r3 = fsub float %r2, %add

  store float %r3, float addrspace(1)* %gep.out
  ret void
}

; GCN-LABEL: {{^}}fsub_fadd_a_a_c_f32:
; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]],
; GCN-FLUSH: v_mad_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]

; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]

; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-SLOWFMA-CONTRACT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]

; GCN-DENORM-STRICT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-STRICT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]

; SI: buffer_store_dword [[RESULT]]
; VI: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fsub_fadd_a_a_c_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %r1 = load volatile float, float addrspace(1)* %gep.0
  %r2 = load volatile float, float addrspace(1)* %gep.1

  %add = fadd float %r1, %r1
  %r3 = fsub float %add, %r2

  store float %r3, float addrspace(1)* %gep.out
  ret void
}

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }