; Make sure we still form mad, rather than fma, even when unsafe math or fp-contract is allowed.

; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI -check-prefix=SI-STD -check-prefix=SI-STD-SAFE -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefix=SI -check-prefix=SI-STD -check-prefix=SI-STD-SAFE -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefix=SI -check-prefix=SI-STD -check-prefix=SI-STD-UNSAFE -check-prefix=FUNC %s

; Make sure we don't form mad when fp32 denormals are enabled.
; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=+fp32-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI -check-prefix=SI-DENORM -check-prefix=SI-DENORM-FASTFMAF -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=verde -mattr=+fp32-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI -check-prefix=SI-DENORM -check-prefix=SI-DENORM-SLOWFMAF -check-prefix=FUNC %s
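; Informal summary of the check prefixes above: SI-STD covers the runs without
; fp32 denormals, where a plain v_mad_f32/v_mac_f32 is acceptable; SI-DENORM
; covers the +fp32-denormals runs, split into -FASTFMAF (tahiti) and -SLOWFMAF
; (verde) depending on whether f32 fma is expected to be fast enough to use in
; place of mad.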

declare i32 @llvm.amdgcn.workitem.id.x() #0
declare float @llvm.fabs.f32(float) #0
declare float @llvm.fma.f32(float, float, float) #0
declare float @llvm.fmuladd.f32(float, float, float) #0
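; Note on the IR below: each test indexes the input buffer by workitem.id.x and
; uses volatile loads, so every operand is loaded by its own buffer_load_dword
; that the checks can capture by name.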

; (fadd (fmul x, y), z) -> (fma x, y, z)
; FUNC-LABEL: {{^}}combine_to_mad_f32_0:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}

; SI-STD: v_mac_f32_e32 [[C]], [[A]], [[B]]

; SI-DENORM-FASTFMAF: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[C]]

; SI-DENORM-SLOWFMAF-NOT: v_fma
; SI-DENORM-SLOWFMAF-NOT: v_mad

; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[A]], [[B]]
; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[C]]

; SI-DENORM: buffer_store_dword [[RESULT]]
; SI-STD: buffer_store_dword [[C]]
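; v_mac_f32 accumulates into its destination operand, so in the SI-STD runs the
; result ends up in [[C]] and that is the register the store check looks for;
; the denormal runs keep the result in a separate register.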
define amdgpu_kernel void @combine_to_mad_f32_0(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %a = load volatile float, float addrspace(1)* %gep.0
  %b = load volatile float, float addrspace(1)* %gep.1
  %c = load volatile float, float addrspace(1)* %gep.2

  %mul = fmul float %a, %b
  %fma = fadd float %mul, %c
  store float %fma, float addrspace(1)* %gep.out
  ret void
}

; (fadd (fmul x, y), z) -> (fma x, y, z)
; FUNC-LABEL: {{^}}combine_to_mad_f32_0_2use:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}

; SI-STD-DAG: v_mac_f32_e32 [[C]], [[A]], [[B]]
; SI-STD-DAG: v_mac_f32_e32 [[D]], [[A]], [[B]]

; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[A]], [[B]], [[C]]
; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], [[D]]

; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[A]], [[B]]
; SI-DENORM-SLOWFMAF-DAG: v_add_f32_e32 [[RESULT0:v[0-9]+]], [[TMP]], [[C]]
; SI-DENORM-SLOWFMAF-DAG: v_add_f32_e32 [[RESULT1:v[0-9]+]], [[TMP]], [[D]]

; SI-DENORM-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DENORM-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-STD-DAG: buffer_store_dword [[C]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-STD-DAG: buffer_store_dword [[D]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI: s_endpgm
define amdgpu_kernel void @combine_to_mad_f32_0_2use(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
  %gep.out.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.out.1 = getelementptr float, float addrspace(1)* %gep.out.0, i32 1

  %a = load volatile float, float addrspace(1)* %gep.0
  %b = load volatile float, float addrspace(1)* %gep.1
  %c = load volatile float, float addrspace(1)* %gep.2
  %d = load volatile float, float addrspace(1)* %gep.3

  %mul = fmul float %a, %b
  %fma0 = fadd float %mul, %c
  %fma1 = fadd float %mul, %d

  store volatile float %fma0, float addrspace(1)* %gep.out.0
  store volatile float %fma1, float addrspace(1)* %gep.out.1
  ret void
}

; (fadd x, (fmul y, z)) -> (fma y, z, x)
; FUNC-LABEL: {{^}}combine_to_mad_f32_1:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}

; SI-STD: v_mac_f32_e32 [[C]], [[A]], [[B]]
; SI-DENORM-FASTFMAF: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[C]]

; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[A]], [[B]]
; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP]]

; SI-DENORM: buffer_store_dword [[RESULT]]
; SI-STD: buffer_store_dword [[C]]
define amdgpu_kernel void @combine_to_mad_f32_1(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %a = load volatile float, float addrspace(1)* %gep.0
  %b = load volatile float, float addrspace(1)* %gep.1
  %c = load volatile float, float addrspace(1)* %gep.2

  %mul = fmul float %a, %b
  %fma = fadd float %c, %mul
  store float %fma, float addrspace(1)* %gep.out
  ret void
}

; (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
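; For the fsub variants below, the subtracted operand is expected to be folded
; into the mad/fma as a negated source modifier (the -[[C]] and -[[D]] operands
; in the checks) rather than emitted as a separate subtract.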
; FUNC-LABEL: {{^}}combine_to_mad_fsub_0_f32:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}

; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], -[[C]]
; SI-DENORM-FASTFMAF: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], -[[C]]

; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[A]], [[B]]
; SI-DENORM-SLOWFMAF: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[C]]

; SI: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @combine_to_mad_fsub_0_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %a = load volatile float, float addrspace(1)* %gep.0
  %b = load volatile float, float addrspace(1)* %gep.1
  %c = load volatile float, float addrspace(1)* %gep.2

  %mul = fmul float %a, %b
  %fma = fsub float %mul, %c
  store float %fma, float addrspace(1)* %gep.out
  ret void
}

; (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
; FUNC-LABEL: {{^}}combine_to_mad_fsub_0_f32_2use:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}

; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], [[A]], [[B]], -[[C]]
; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], -[[D]]

; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[A]], [[B]], -[[C]]
; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], -[[D]]

; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[A]], [[B]]
; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e32 [[RESULT0:v[0-9]+]], [[TMP]], [[C]]
; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e32 [[RESULT1:v[0-9]+]], [[TMP]], [[D]]

; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI: s_endpgm
define amdgpu_kernel void @combine_to_mad_fsub_0_f32_2use(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
  %gep.out.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.out.1 = getelementptr float, float addrspace(1)* %gep.out.0, i32 1

  %a = load volatile float, float addrspace(1)* %gep.0
  %b = load volatile float, float addrspace(1)* %gep.1
  %c = load volatile float, float addrspace(1)* %gep.2
  %d = load volatile float, float addrspace(1)* %gep.3

  %mul = fmul float %a, %b
  %fma0 = fsub float %mul, %c
  %fma1 = fsub float %mul, %d
  store volatile float %fma0, float addrspace(1)* %gep.out.0
  store volatile float %fma1, float addrspace(1)* %gep.out.1
  ret void
}

; (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
; FUNC-LABEL: {{^}}combine_to_mad_fsub_1_f32:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}

; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], -[[A]], [[B]], [[C]]
; SI-DENORM-FASTFMAF: v_fma_f32 [[RESULT:v[0-9]+]], -[[A]], [[B]], [[C]]

; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[A]], [[B]]
; SI-DENORM-SLOWFMAF: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP]]

; SI: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @combine_to_mad_fsub_1_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %a = load volatile float, float addrspace(1)* %gep.0
  %b = load volatile float, float addrspace(1)* %gep.1
  %c = load volatile float, float addrspace(1)* %gep.2

  %mul = fmul float %a, %b
  %fma = fsub float %c, %mul
  store float %fma, float addrspace(1)* %gep.out
  ret void
}

; (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
; FUNC-LABEL: {{^}}combine_to_mad_fsub_1_f32_2use:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}

; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], [[C]]
; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], -[[A]], [[B]], [[D]]

; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], [[C]]
; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], -[[A]], [[B]], [[D]]

; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[A]], [[B]]
; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e32 [[RESULT0:v[0-9]+]], [[C]], [[TMP]]
; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e32 [[RESULT1:v[0-9]+]], [[D]], [[TMP]]

; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI: s_endpgm
define amdgpu_kernel void @combine_to_mad_fsub_1_f32_2use(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
  %gep.out.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.out.1 = getelementptr float, float addrspace(1)* %gep.out.0, i32 1

  %a = load volatile float, float addrspace(1)* %gep.0
  %b = load volatile float, float addrspace(1)* %gep.1
  %c = load volatile float, float addrspace(1)* %gep.2
  %d = load volatile float, float addrspace(1)* %gep.3

  %mul = fmul float %a, %b
  %fma0 = fsub float %c, %mul
  %fma1 = fsub float %d, %mul
  store volatile float %fma0, float addrspace(1)* %gep.out.0
  store volatile float %fma1, float addrspace(1)* %gep.out.1
  ret void
}

; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
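; In the negated-multiply cases below, the fneg of the product is expected to
; be absorbed as a negated multiplicand (the -[[A]] or -[[B]] source modifier)
; alongside the negated addend.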
; FUNC-LABEL: {{^}}combine_to_mad_fsub_2_f32:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}

; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], [[A]], -[[B]], -[[C]]

; SI-DENORM-FASTFMAF: v_fma_f32 [[RESULT:v[0-9]+]], -[[A]], [[B]], -[[C]]

; SI-DENORM-SLOWFMAF: v_mul_f32_e64 [[TMP:v[0-9]+]], [[A]], -[[B]]
; SI-DENORM-SLOWFMAF: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[C]]

; SI: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @combine_to_mad_fsub_2_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %a = load volatile float, float addrspace(1)* %gep.0
  %b = load volatile float, float addrspace(1)* %gep.1
  %c = load volatile float, float addrspace(1)* %gep.2

  %mul = fmul float %a, %b
  %mul.neg = fsub float -0.0, %mul
  %fma = fsub float %mul.neg, %c

  store float %fma, float addrspace(1)* %gep.out
  ret void
}

; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
; FUNC-LABEL: {{^}}combine_to_mad_fsub_2_f32_2uses_neg:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}

; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], [[A]], -[[B]], -[[C]]
; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], [[A]], -[[B]], -[[D]]

; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], -[[C]]
; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], -[[A]], [[B]], -[[D]]

; SI-DENORM-SLOWFMAF: v_mul_f32_e64 [[TMP:v[0-9]+]], [[A]], -[[B]]
; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e32 [[RESULT0:v[0-9]+]], [[TMP]], [[C]]
; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e32 [[RESULT1:v[0-9]+]], [[TMP]], [[D]]

; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI: s_endpgm
define amdgpu_kernel void @combine_to_mad_fsub_2_f32_2uses_neg(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
  %gep.out.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.out.1 = getelementptr float, float addrspace(1)* %gep.out.0, i32 1

  %a = load volatile float, float addrspace(1)* %gep.0
  %b = load volatile float, float addrspace(1)* %gep.1
  %c = load volatile float, float addrspace(1)* %gep.2
  %d = load volatile float, float addrspace(1)* %gep.3

  %mul = fmul float %a, %b
  %mul.neg = fsub float -0.0, %mul
  %fma0 = fsub float %mul.neg, %c
  %fma1 = fsub float %mul.neg, %d

  store volatile float %fma0, float addrspace(1)* %gep.out.0
  store volatile float %fma1, float addrspace(1)* %gep.out.1
  ret void
}

; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
; FUNC-LABEL: {{^}}combine_to_mad_fsub_2_f32_2uses_mul:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}

; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], -[[C]]
; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], -[[D]]

; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], -[[C]]
; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], -[[D]]

; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[A]], [[B]]
; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e64 [[RESULT0:v[0-9]+]], -[[TMP]], [[C]]
; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e32 [[RESULT1:v[0-9]+]], [[TMP]], [[D]]

; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI: s_endpgm
define amdgpu_kernel void @combine_to_mad_fsub_2_f32_2uses_mul(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
  %gep.out.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.out.1 = getelementptr float, float addrspace(1)* %gep.out.0, i32 1

  %a = load volatile float, float addrspace(1)* %gep.0
  %b = load volatile float, float addrspace(1)* %gep.1
  %c = load volatile float, float addrspace(1)* %gep.2
  %d = load volatile float, float addrspace(1)* %gep.3

  %mul = fmul float %a, %b
  %mul.neg = fsub float -0.0, %mul
  %fma0 = fsub float %mul.neg, %c
  %fma1 = fsub float %mul, %d

  store volatile float %fma0, float addrspace(1)* %gep.out.0
  store volatile float %fma1, float addrspace(1)* %gep.out.1
  ret void
}

; fold (fsub (fma x, y, (fmul u, v)), z) -> (fma x, y, (fma u, v, (fneg z)))
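; As the checks below show, this reassociation is not expected to fire for the
; llvm.fma form here; both the standard and denormal runs keep the fma and
; subtract [[C]] from its result afterwards.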

; FUNC-LABEL: {{^}}aggressive_combine_to_mad_fsub_0_f32:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}

; SI-STD: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]]
; SI-STD: v_fma_f32 [[TMP1:v[0-9]+]], [[A]], [[B]], [[TMP0]]
; SI-STD: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP1]], [[C]]

; SI-DENORM: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]]
; SI-DENORM: v_fma_f32 [[TMP1:v[0-9]+]], [[A]], [[B]], [[TMP0]]
; SI-DENORM: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP1]], [[C]]

; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
define amdgpu_kernel void @aggressive_combine_to_mad_fsub_0_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
  %gep.4 = getelementptr float, float addrspace(1)* %gep.0, i32 4
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %x = load volatile float, float addrspace(1)* %gep.0
  %y = load volatile float, float addrspace(1)* %gep.1
  %z = load volatile float, float addrspace(1)* %gep.2
  %u = load volatile float, float addrspace(1)* %gep.3
  %v = load volatile float, float addrspace(1)* %gep.4

  %tmp0 = fmul float %u, %v
  %tmp1 = call float @llvm.fma.f32(float %x, float %y, float %tmp0) #0
  %tmp2 = fsub float %tmp1, %z

  store float %tmp2, float addrspace(1)* %gep.out
  ret void
}

; fold (fsub x, (fma y, z, (fmul u, v)))
;   -> (fma (fneg y), z, (fma (fneg u), v, x))
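; Likewise, no reassociation is expected for this llvm.fma form: the multiply
; and fma are kept, and the result is subtracted from [[A]].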

; FUNC-LABEL: {{^}}aggressive_combine_to_mad_fsub_1_f32:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}

; SI-STD: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]]
; SI-STD: v_fma_f32 [[TMP1:v[0-9]+]], [[B]], [[C]], [[TMP0]]
; SI-STD: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[A]], [[TMP1]]

; SI-DENORM: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]]
; SI-DENORM: v_fma_f32 [[TMP1:v[0-9]+]], [[B]], [[C]], [[TMP0]]
; SI-DENORM: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[A]], [[TMP1]]

; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI: s_endpgm
define amdgpu_kernel void @aggressive_combine_to_mad_fsub_1_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
  %gep.4 = getelementptr float, float addrspace(1)* %gep.0, i32 4
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %x = load volatile float, float addrspace(1)* %gep.0
  %y = load volatile float, float addrspace(1)* %gep.1
  %z = load volatile float, float addrspace(1)* %gep.2
  %u = load volatile float, float addrspace(1)* %gep.3
  %v = load volatile float, float addrspace(1)* %gep.4

  %tmp0 = fmul float %u, %v
  %tmp1 = call float @llvm.fma.f32(float %y, float %z, float %tmp0) #0
  %tmp2 = fsub float %x, %tmp1

  store float %tmp2, float addrspace(1)* %gep.out
  ret void
}

; fold (fsub (fmuladd x, y, (fmul u, v)), z) -> (fmuladd x, y, (fmuladd u, v, (fneg z)))
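; This variant uses llvm.fmuladd rather than llvm.fma. Only the unsafe-math run
; (SI-STD-UNSAFE prefix) is expected to fold the final subtract of [[C]] into a
; mad; the other runs keep a separate subtract.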

; FUNC-LABEL: {{^}}aggressive_combine_to_mad_fsub_2_f32:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}

; SI-STD-SAFE: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]]
; SI-STD-SAFE: v_mac_f32_e32 [[TMP0]], [[A]], [[B]]
; SI-STD-SAFE: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP0]], [[C]]

; SI-STD-UNSAFE: v_mad_f32 [[RESULT:v[0-9]+]], [[D]], [[E]], -[[C]]
; SI-STD-UNSAFE: v_mac_f32_e32 [[RESULT]], [[A]], [[B]]

; SI-DENORM-FASTFMAF: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]]
; SI-DENORM-FASTFMAF: v_fma_f32 [[TMP1:v[0-9]+]], [[A]], [[B]], [[TMP0]]
; SI-DENORM-FASTFMAF: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP1]], [[C]]

; SI-DENORM-SLOWFMAF-DAG: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]]
; SI-DENORM-SLOWFMAF-DAG: v_mul_f32_e32 [[TMP1:v[0-9]+]], [[A]], [[B]]
; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[TMP2:v[0-9]+]], [[TMP1]], [[TMP0]]
; SI-DENORM-SLOWFMAF: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP2]], [[C]]

; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI: s_endpgm
define amdgpu_kernel void @aggressive_combine_to_mad_fsub_2_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
  %gep.4 = getelementptr float, float addrspace(1)* %gep.0, i32 4
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %x = load volatile float, float addrspace(1)* %gep.0
  %y = load volatile float, float addrspace(1)* %gep.1
  %z = load volatile float, float addrspace(1)* %gep.2
  %u = load volatile float, float addrspace(1)* %gep.3
  %v = load volatile float, float addrspace(1)* %gep.4

  %tmp0 = fmul float %u, %v
  %tmp1 = call float @llvm.fmuladd.f32(float %x, float %y, float %tmp0) #0
  %tmp2 = fsub float %tmp1, %z

  store float %tmp2, float addrspace(1)* %gep.out
  ret void
}

; fold (fsub x, (fmuladd y, z, (fmul u, v)))
;   -> (fmuladd (fneg y), z, (fmuladd (fneg u), v, x))
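; Here only the unsafe-math run reassociates: the SI-STD-UNSAFE checks expect
; two v_mad_f32 with negated multiplicands, matching the nested form above,
; while the other runs keep the multiply/accumulate and a final subtract from
; [[A]].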

; FUNC-LABEL: {{^}}aggressive_combine_to_mad_fsub_3_f32:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}

; SI-STD-SAFE: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]]
; SI-STD-SAFE: v_mac_f32_e32 [[TMP0]], [[B]], [[C]]
; SI-STD-SAFE: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[A]], [[TMP0]]

; SI-STD-UNSAFE: v_mad_f32 [[TMP:v[0-9]+]], -[[D]], [[E]], [[A]]
; SI-STD-UNSAFE: v_mad_f32 [[RESULT:v[0-9]+]], -[[B]], [[C]], [[TMP]]

; SI-DENORM-FASTFMAF: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]]
; SI-DENORM-FASTFMAF: v_fma_f32 [[TMP1:v[0-9]+]], [[B]], [[C]], [[TMP0]]
; SI-DENORM-FASTFMAF: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[A]], [[TMP1]]

; SI-DENORM-SLOWFMAF-DAG: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]]
; SI-DENORM-SLOWFMAF-DAG: v_mul_f32_e32 [[TMP1:v[0-9]+]], [[B]], [[C]]
; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[TMP2:v[0-9]+]], [[TMP1]], [[TMP0]]
; SI-DENORM-SLOWFMAF: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[A]], [[TMP2]]

; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI: s_endpgm
define amdgpu_kernel void @aggressive_combine_to_mad_fsub_3_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
  %gep.4 = getelementptr float, float addrspace(1)* %gep.0, i32 4
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %x = load volatile float, float addrspace(1)* %gep.0
  %y = load volatile float, float addrspace(1)* %gep.1
  %z = load volatile float, float addrspace(1)* %gep.2
  %u = load volatile float, float addrspace(1)* %gep.3
  %v = load volatile float, float addrspace(1)* %gep.4

  %tmp0 = fmul float %u, %v
  %tmp1 = call float @llvm.fmuladd.f32(float %y, float %z, float %tmp0) #0
  %tmp2 = fsub float %x, %tmp1

  store float %tmp2, float addrspace(1)* %gep.out
  ret void
}

attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }