1; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -denormal-fp-math-f32=preserve-sign -verify-machineinstrs -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefix=SI-NOFMA -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s
2; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -denormal-fp-math-f32=preserve-sign -verify-machineinstrs -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefix=SI-NOFMA -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s
3; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -denormal-fp-math-f32=ieee -verify-machineinstrs -fp-contract=fast -enable-no-infs-fp-math -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefix=SI-FMA -check-prefix=SI-UNSAFE -check-prefix=SI -check-prefix=FUNC %s
4
5; FIXME: Remove enable-unsafe-fp-math in RUN line and add flags to IR instrs
6
7; Note: The SI-FMA conversions of type x * (y + 1) --> x * y + x would be
8; beneficial even without fp32 denormals, but they do require no-infs-fp-math
9; for correctness.
10
11declare i32 @llvm.amdgcn.workitem.id.x() #0
12declare double @llvm.fabs.f64(double) #0
13declare double @llvm.fma.f64(double, double, double) #0
14declare float @llvm.fma.f32(float, float, float) #0
15declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) #0
16
17; (fadd (fmul x, y), z) -> (fma x, y, z)
18; FUNC-LABEL: {{^}}combine_to_fma_f64_0:
19; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
20; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}}
21; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16 glc{{$}}
22; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[C]]
23; SI: buffer_store_dwordx2 [[RESULT]]
24define amdgpu_kernel void @combine_to_fma_f64_0(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  ; Computes out[tid] = (a * b) + c, where a, b, c are the three consecutive
  ; doubles starting at in[tid]. The product is the first fadd operand.
25  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
26  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
27  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
28  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
29  %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid
30
  ; NOTE(review): the loads are volatile, presumably so each stays a distinct
  ; buffer_load with the glc bit that the checks above expect - confirm.
31  %a = load volatile double, double addrspace(1)* %gep.0
32  %b = load volatile double, double addrspace(1)* %gep.1
33  %c = load volatile double, double addrspace(1)* %gep.2
34
  ; Separate multiply and add; the checks expect them fused into one v_fma_f64.
35  %mul = fmul double %a, %b
36  %fma = fadd double %mul, %c
37  store double %fma, double addrspace(1)* %gep.out
38  ret void
39}
40
41; (fadd (fmul x, y), z) -> (fma x, y, z)
42; FUNC-LABEL: {{^}}combine_to_fma_f64_0_2use:
43; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
44; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}}
45; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16 glc{{$}}
46; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24 glc{{$}}
47; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[C]]
48; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[D]]
49; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
50; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
51; SI: s_endpgm
52define amdgpu_kernel void @combine_to_fma_f64_0_2use(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  ; Computes out[tid] = (a * b) + c and out[tid + 1] = (a * b) + d, where
  ; a, b, c, d are the four consecutive doubles starting at in[tid].
  ; The single fmul result has two fadd users.
53  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
54  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
55  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
56  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
57  %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
58  %gep.out.0 = getelementptr double, double addrspace(1)* %out, i32 %tid
59  %gep.out.1 = getelementptr double, double addrspace(1)* %gep.out.0, i32 1
60
61  %a = load volatile double, double addrspace(1)* %gep.0
62  %b = load volatile double, double addrspace(1)* %gep.1
63  %c = load volatile double, double addrspace(1)* %gep.2
64  %d = load volatile double, double addrspace(1)* %gep.3
65
  ; The checks expect the shared multiply to be folded into both adds,
  ; giving two v_fma_f64 instructions that reuse A and B.
66  %mul = fmul double %a, %b
67  %fma0 = fadd double %mul, %c
68  %fma1 = fadd double %mul, %d
  ; Both results are stored with volatile stores to adjacent output slots.
69  store volatile double %fma0, double addrspace(1)* %gep.out.0
70  store volatile double %fma1, double addrspace(1)* %gep.out.1
71  ret void
72}
73
74; (fadd x, (fmul y, z)) -> (fma y, z, x)
75; FUNC-LABEL: {{^}}combine_to_fma_f64_1:
76; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
77; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}}
78; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16 glc{{$}}
79; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[C]]
80; SI: buffer_store_dwordx2 [[RESULT]]
81define amdgpu_kernel void @combine_to_fma_f64_1(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  ; Computes out[tid] = c + (a * b), where a, b, c are the three consecutive
  ; doubles starting at in[tid]. Same as combine_to_fma_f64_0 but with the
  ; product as the second fadd operand.
82  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
83  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
84  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
85  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
86  %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid
87
88  %a = load volatile double, double addrspace(1)* %gep.0
89  %b = load volatile double, double addrspace(1)* %gep.1
90  %c = load volatile double, double addrspace(1)* %gep.2
91
  ; Commuted operand order relative to the _0 test; the expected v_fma_f64
  ; operands (A, B, C) are unchanged.
92  %mul = fmul double %a, %b
93  %fma = fadd double %c, %mul
94  store double %fma, double addrspace(1)* %gep.out
95  ret void
96}
97
98; (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
99; FUNC-LABEL: {{^}}combine_to_fma_fsub_0_f64:
100; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
101; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}}
102; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16 glc{{$}}
103; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[C]]
104; SI: buffer_store_dwordx2 [[RESULT]]
105define amdgpu_kernel void @combine_to_fma_fsub_0_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  ; Computes out[tid] = (a * b) - c, where a, b, c are the three consecutive
  ; doubles starting at in[tid].
106  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
107  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
108  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
109  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
110  %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid
111
112  %a = load volatile double, double addrspace(1)* %gep.0
113  %b = load volatile double, double addrspace(1)* %gep.1
114  %c = load volatile double, double addrspace(1)* %gep.2
115
  ; The subtraction is expected to become a negate modifier on the fma
  ; addend (-C in the check above).
116  %mul = fmul double %a, %b
117  %fma = fsub double %mul, %c
118  store double %fma, double addrspace(1)* %gep.out
119  ret void
120}
121
122; (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
123; FUNC-LABEL: {{^}}combine_to_fma_fsub_f64_0_2use:
124; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
125; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}}
126; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16 glc{{$}}
127; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24 glc{{$}}
128; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[C]]
129; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[D]]
130; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
131; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
132; SI: s_endpgm
133define amdgpu_kernel void @combine_to_fma_fsub_f64_0_2use(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  ; Computes out[tid] = (a * b) - c and out[tid + 1] = (a * b) - d, where
  ; a, b, c, d are the four consecutive doubles starting at in[tid].
  ; The single fmul result has two fsub users.
134  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
135  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
136  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
137  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
138  %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
139  %gep.out.0 = getelementptr double, double addrspace(1)* %out, i32 %tid
140  %gep.out.1 = getelementptr double, double addrspace(1)* %gep.out.0, i32 1
141
142  %a = load volatile double, double addrspace(1)* %gep.0
143  %b = load volatile double, double addrspace(1)* %gep.1
144  %c = load volatile double, double addrspace(1)* %gep.2
145  %d = load volatile double, double addrspace(1)* %gep.3
146
  ; Expected result per the checks: two v_fma_f64 with negated addends
  ; (-C and -D) that share the A and B multiplicands.
147  %mul = fmul double %a, %b
148  %fma0 = fsub double %mul, %c
149  %fma1 = fsub double %mul, %d
150  store volatile double %fma0, double addrspace(1)* %gep.out.0
151  store volatile double %fma1, double addrspace(1)* %gep.out.1
152  ret void
153}
154
155; (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
156; FUNC-LABEL: {{^}}combine_to_fma_fsub_1_f64:
157; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
158; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}}
159; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16 glc{{$}}
160; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], [[C]]
161; SI: buffer_store_dwordx2 [[RESULT]]
162define amdgpu_kernel void @combine_to_fma_fsub_1_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  ; Computes out[tid] = c - (a * b), where a, b, c are the three consecutive
  ; doubles starting at in[tid].
163  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
164  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
165  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
166  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
167  %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid
168
169  %a = load volatile double, double addrspace(1)* %gep.0
170  %b = load volatile double, double addrspace(1)* %gep.1
171  %c = load volatile double, double addrspace(1)* %gep.2
172
  ; The product is the subtrahend, so the checks expect the negation on the
  ; first multiplicand (-A) rather than on the addend.
173  %mul = fmul double %a, %b
174  %fma = fsub double %c, %mul
175  store double %fma, double addrspace(1)* %gep.out
176  ret void
177}
178
179; (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
180; FUNC-LABEL: {{^}}combine_to_fma_fsub_1_f64_2use:
181; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
182; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}}
183; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16 glc{{$}}
184; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24 glc{{$}}
185; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], [[C]]
186; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], [[D]]
187; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
188; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
189; SI: s_endpgm
190define amdgpu_kernel void @combine_to_fma_fsub_1_f64_2use(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  ; Computes out[tid] = c - (a * b) and out[tid + 1] = d - (a * b), where
  ; a, b, c, d are the four consecutive doubles starting at in[tid].
  ; The single fmul result is the subtrahend of two fsubs.
191  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
192  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
193  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
194  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
195  %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
196  %gep.out.0 = getelementptr double, double addrspace(1)* %out, i32 %tid
197  %gep.out.1 = getelementptr double, double addrspace(1)* %gep.out.0, i32 1
198
199  %a = load volatile double, double addrspace(1)* %gep.0
200  %b = load volatile double, double addrspace(1)* %gep.1
201  %c = load volatile double, double addrspace(1)* %gep.2
202  %d = load volatile double, double addrspace(1)* %gep.3
203
  ; Expected result per the checks: two v_fma_f64 that both negate A and
  ; use C and D respectively as the addend.
204  %mul = fmul double %a, %b
205  %fma0 = fsub double %c, %mul
206  %fma1 = fsub double %d, %mul
207  store volatile double %fma0, double addrspace(1)* %gep.out.0
208  store volatile double %fma1, double addrspace(1)* %gep.out.1
209  ret void
210}
211
212; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
213; FUNC-LABEL: {{^}}combine_to_fma_fsub_2_f64:
214; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
215; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}}
216; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16 glc{{$}}
217; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[C]]
218; SI: buffer_store_dwordx2 [[RESULT]]
219define amdgpu_kernel void @combine_to_fma_fsub_2_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  ; Computes out[tid] = (-(a * b)) - c, where a, b, c are the three
  ; consecutive doubles starting at in[tid].
220  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
221  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
222  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
223  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
224  %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid
225
226  %a = load volatile double, double addrspace(1)* %gep.0
227  %b = load volatile double, double addrspace(1)* %gep.1
228  %c = load volatile double, double addrspace(1)* %gep.2
229
  ; The negation is written as (fsub -0.0, %mul). The checks expect both
  ; the multiplicand and the addend to be negated in the fma (-A, -C).
230  %mul = fmul double %a, %b
231  %mul.neg = fsub double -0.0, %mul
232  %fma = fsub double %mul.neg, %c
233
234  store double %fma, double addrspace(1)* %gep.out
235  ret void
236}
237
238; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
239; FUNC-LABEL: {{^}}combine_to_fma_fsub_2_f64_2uses_neg:
240; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
241; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}}
242; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16 glc{{$}}
243; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24 glc{{$}}
244; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[C]]
245; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[D]]
246; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
247; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
248; SI: s_endpgm
249define amdgpu_kernel void @combine_to_fma_fsub_2_f64_2uses_neg(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  ; Computes out[tid] = (-(a * b)) - c and out[tid + 1] = (-(a * b)) - d,
  ; where a, b, c, d are the four consecutive doubles starting at in[tid].
  ; Here the negated product (%mul.neg) is the value with two users.
250  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
251  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
252  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
253  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
254  %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
255  %gep.out.0 = getelementptr double, double addrspace(1)* %out, i32 %tid
256  %gep.out.1 = getelementptr double, double addrspace(1)* %gep.out.0, i32 1
257
258  %a = load volatile double, double addrspace(1)* %gep.0
259  %b = load volatile double, double addrspace(1)* %gep.1
260  %c = load volatile double, double addrspace(1)* %gep.2
261  %d = load volatile double, double addrspace(1)* %gep.3
262
  ; Expected result per the checks: two v_fma_f64, each with -A and a
  ; negated addend (-C, -D).
263  %mul = fmul double %a, %b
264  %mul.neg = fsub double -0.0, %mul
265  %fma0 = fsub double %mul.neg, %c
266  %fma1 = fsub double %mul.neg, %d
267
268  store volatile double %fma0, double addrspace(1)* %gep.out.0
269  store volatile double %fma1, double addrspace(1)* %gep.out.1
270  ret void
271}
272
273; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
274; FUNC-LABEL: {{^}}combine_to_fma_fsub_2_f64_2uses_mul:
275; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
276; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}}
277; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16 glc{{$}}
278; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24 glc{{$}}
279; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[C]]
280; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[D]]
281; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
282; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
283; SI: s_endpgm
284define amdgpu_kernel void @combine_to_fma_fsub_2_f64_2uses_mul(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  ; Computes out[tid] = (-(a * b)) - c and out[tid + 1] = (a * b) - d,
  ; where a, b, c, d are the four consecutive doubles starting at in[tid].
  ; Here the un-negated product (%mul) has two users: the negation and
  ; the second fsub.
285  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
286  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
287  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
288  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
289  %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
290  %gep.out.0 = getelementptr double, double addrspace(1)* %out, i32 %tid
291  %gep.out.1 = getelementptr double, double addrspace(1)* %gep.out.0, i32 1
292
293  %a = load volatile double, double addrspace(1)* %gep.0
294  %b = load volatile double, double addrspace(1)* %gep.1
295  %c = load volatile double, double addrspace(1)* %gep.2
296  %d = load volatile double, double addrspace(1)* %gep.3
297
  ; Expected result per the checks: fma(-A, B, -C) for the first value and
  ; fma(A, B, -D) for the second.
298  %mul = fmul double %a, %b
299  %mul.neg = fsub double -0.0, %mul
300  %fma0 = fsub double %mul.neg, %c
301  %fma1 = fsub double %mul, %d
302
303  store volatile double %fma0, double addrspace(1)* %gep.out.0
304  store volatile double %fma1, double addrspace(1)* %gep.out.1
305  ret void
306}
307
308; fold (fsub (fma x, y, (fmul u, v)), z) -> (fma x, y, (fma u, v, (fneg z)))
309
310; FUNC-LABEL: {{^}}aggressive_combine_to_fma_fsub_0_f64:
311; SI-DAG: buffer_load_dwordx2 [[X:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
312; SI-DAG: buffer_load_dwordx2 [[Y:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}}
313; SI-DAG: buffer_load_dwordx2 [[Z:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16 glc{{$}}
314; SI-DAG: buffer_load_dwordx2 [[U:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24 glc{{$}}
315; SI-DAG: buffer_load_dwordx2 [[V:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:32 glc{{$}}
316
317; SI-SAFE: v_mul_f64 [[TMP0:v\[[0-9]+:[0-9]+\]]], [[U]], [[V]]
318; SI-SAFE: v_fma_f64 [[TMP1:v\[[0-9]+:[0-9]+\]]], [[X]], [[Y]], [[TMP0]]
319; SI-SAFE: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[TMP1]], -[[Z]]
320
321; SI-UNSAFE: v_fma_f64 [[FMA0:v\[[0-9]+:[0-9]+\]]], [[U]], [[V]], -[[Z]]
322; SI-UNSAFE: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[X]], [[Y]], [[FMA0]]
323
324; SI: buffer_store_dwordx2 [[RESULT]]
325define amdgpu_kernel void @aggressive_combine_to_fma_fsub_0_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  ; Computes out[tid] = fma(x, y, u * v) - z, where x, y, z, u, v are the
  ; five consecutive doubles starting at in[tid].
326  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
327  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
328  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
329  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
330  %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
331  %gep.4 = getelementptr double, double addrspace(1)* %gep.0, i32 4
332  %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid
333
334  %x = load volatile double, double addrspace(1)* %gep.0
335  %y = load volatile double, double addrspace(1)* %gep.1
336  %z = load volatile double, double addrspace(1)* %gep.2
337  %u = load volatile double, double addrspace(1)* %gep.3
338  %v = load volatile double, double addrspace(1)* %gep.4
339
  ; The safe runs are expected to keep mul + fma + add as three instructions.
  ; The unsafe run is expected to re-associate into two nested fmas, with
  ; the inner one computing fma(u, v, -z).
340  %tmp0 = fmul double %u, %v
341  %tmp1 = call double @llvm.fma.f64(double %x, double %y, double %tmp0) #0
342  %tmp2 = fsub double %tmp1, %z
343
344  store double %tmp2, double addrspace(1)* %gep.out
345  ret void
346}
347
348; fold (fsub x, (fma y, z, (fmul u, v)))
349;   -> (fma (fneg y), z, (fma (fneg u), v, x))
350
351; FUNC-LABEL: {{^}}aggressive_combine_to_fma_fsub_1_f64:
352; SI-DAG: buffer_load_dwordx2 [[X:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
353; SI-DAG: buffer_load_dwordx2 [[Y:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}}
354; SI-DAG: buffer_load_dwordx2 [[Z:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16 glc{{$}}
355; SI-DAG: buffer_load_dwordx2 [[U:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24 glc{{$}}
356; SI-DAG: buffer_load_dwordx2 [[V:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:32 glc{{$}}
357
358; SI-SAFE: v_mul_f64 [[TMP0:v\[[0-9]+:[0-9]+\]]], [[U]], [[V]]
359; SI-SAFE: v_fma_f64 [[TMP1:v\[[0-9]+:[0-9]+\]]], [[Y]], [[Z]], [[TMP0]]
360; SI-SAFE: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[X]], -[[TMP1]]
361
362; SI-UNSAFE: v_fma_f64 [[FMA0:v\[[0-9]+:[0-9]+\]]], -[[U]], [[V]], [[X]]
363; SI-UNSAFE: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[Y]], [[Z]], [[FMA0]]
364
365; SI: buffer_store_dwordx2 [[RESULT]]
366define amdgpu_kernel void @aggressive_combine_to_fma_fsub_1_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  ; Computes out[tid] = x - fma(y, z, u * v), where x, y, z, u, v are the
  ; five consecutive doubles starting at in[tid].
367  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
368  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
369  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
370  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
371  %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
372  %gep.4 = getelementptr double, double addrspace(1)* %gep.0, i32 4
373  %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid
374
375  %x = load volatile double, double addrspace(1)* %gep.0
376  %y = load volatile double, double addrspace(1)* %gep.1
377  %z = load volatile double, double addrspace(1)* %gep.2
378  %u = load volatile double, double addrspace(1)* %gep.3
379  %v = load volatile double, double addrspace(1)* %gep.4
380
  ; The safe runs are expected to keep mul + fma + add as three instructions.
  ; The unsafe run is expected to produce fma(-y, z, fma(-u, v, x)).
381  ; nsz flag is needed since this combine may change sign of zero
382  %tmp0 = fmul nsz double %u, %v
383  %tmp1 = call nsz double @llvm.fma.f64(double %y, double %z, double %tmp0) #0
384  %tmp2 = fsub nsz double %x, %tmp1
385
386  store double %tmp2, double addrspace(1)* %gep.out
387  ret void
388}
389
390;
391; Patterns (+ fneg variants): mul(add(1.0,x),y), mul(sub(1.0,x),y), mul(sub(x,1.0),y)
392;
393
394; FUNC-LABEL: {{^}}test_f32_mul_add_x_one_y:
395; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], 1.0, [[VX:v[0-9]]]
396; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
397;
398; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], [[VY]]
399define amdgpu_kernel void @test_f32_mul_add_x_one_y(float addrspace(1)* %out,
400                                        float addrspace(1)* %in1,
401                                        float addrspace(1)* %in2) {
  ; Computes *out = (x + 1.0) * y with x = *in1 and y = *in2.
  ; The no-FMA runs expect a separate add and mul. The FMA run expects
  ; fma(x, y, y), so the addend is checked to be the same register as the
  ; second multiplicand (a use of VY, not a fresh definition).
402  %x = load volatile float, float addrspace(1)* %in1
403  %y = load volatile float, float addrspace(1)* %in2
404  %a = fadd float %x, 1.0
405  %m = fmul float %a, %y
406  store float %m, float addrspace(1)* %out
407  ret void
408}
409
410; FUNC-LABEL: {{^}}test_f32_mul_y_add_x_one:
411; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], 1.0, [[VX:v[0-9]]]
412; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
413;
414; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], [[VY]]
415define amdgpu_kernel void @test_f32_mul_y_add_x_one(float addrspace(1)* %out,
416                                        float addrspace(1)* %in1,
417                                        float addrspace(1)* %in2) {
  ; Computes *out = y * (x + 1.0) with x = *in1 and y = *in2 (commuted
  ; fmul operands relative to test_f32_mul_add_x_one_y).
  ; The FMA run expects fma(x, y, y), so the addend is checked to be the
  ; same register as the second multiplicand.
418  %x = load volatile float, float addrspace(1)* %in1
419  %y = load volatile float, float addrspace(1)* %in2
420  %a = fadd float %x, 1.0
421  %m = fmul float %y, %a
422  store float %m, float addrspace(1)* %out
423  ret void
424}
425
426; FUNC-LABEL: {{^}}test_f32_mul_add_x_negone_y:
427; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], -1.0, [[VX:v[0-9]]]
428; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
429;
430; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY]]
431define amdgpu_kernel void @test_f32_mul_add_x_negone_y(float addrspace(1)* %out,
432                                           float addrspace(1)* %in1,
433                                           float addrspace(1)* %in2) {
  ; Computes *out = (x + -1.0) * y with x = *in1 and y = *in2.
  ; The FMA run expects fma(x, y, -y), so the negated addend is checked to
  ; be the same register as the second multiplicand.
434  %x = load float, float addrspace(1)* %in1
435  %y = load float, float addrspace(1)* %in2
436  %a = fadd float %x, -1.0
437  %m = fmul float %a, %y
438  store float %m, float addrspace(1)* %out
439  ret void
440}
441
442; FUNC-LABEL: {{^}}test_f32_mul_y_add_x_negone:
443; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], -1.0, [[VX:v[0-9]]]
444; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
445;
446; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY]]
447define amdgpu_kernel void @test_f32_mul_y_add_x_negone(float addrspace(1)* %out,
448                                           float addrspace(1)* %in1,
449                                           float addrspace(1)* %in2) {
  ; Computes *out = y * (x + -1.0) with x = *in1 and y = *in2.
  ; The FMA run expects fma(x, y, -y), so the negated addend is checked to
  ; be the same register as the second multiplicand.
450  %x = load float, float addrspace(1)* %in1
451  %y = load float, float addrspace(1)* %in2
452  %a = fadd float %x, -1.0
453  %m = fmul float %y, %a
454  store float %m, float addrspace(1)* %out
455  ret void
456}
457
458; FUNC-LABEL: {{^}}test_f32_mul_sub_one_x_y:
459; SI-NOFMA: v_sub_f32_e32 [[VS:v[0-9]]], 1.0, [[VX:v[0-9]]]
460; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
461;
462; SI-FMA: v_fma_f32 {{v[0-9]}}, -[[VX:v[0-9]]], [[VY:v[0-9]]], [[VY]]
463define amdgpu_kernel void @test_f32_mul_sub_one_x_y(float addrspace(1)* %out,
464                                        float addrspace(1)* %in1,
465                                        float addrspace(1)* %in2) {
  ; Computes *out = (1.0 - x) * y with x = *in1 and y = *in2.
  ; The FMA run expects fma(-x, y, y), so the addend is checked to be the
  ; same register as the second multiplicand.
466  %x = load float, float addrspace(1)* %in1
467  %y = load float, float addrspace(1)* %in2
468  %s = fsub float 1.0, %x
469  %m = fmul float %s, %y
470  store float %m, float addrspace(1)* %out
471  ret void
472}
473
474; FUNC-LABEL: {{^}}test_f32_mul_y_sub_one_x:
475; SI-NOFMA: v_sub_f32_e32 [[VS:v[0-9]]], 1.0, [[VX:v[0-9]]]
476; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
477;
478; SI-FMA: v_fma_f32 {{v[0-9]}}, -[[VX:v[0-9]]], [[VY:v[0-9]]], [[VY]]
479define amdgpu_kernel void @test_f32_mul_y_sub_one_x(float addrspace(1)* %out,
480                                        float addrspace(1)* %in1,
481                                        float addrspace(1)* %in2) {
  ; Computes *out = y * (1.0 - x) with x = *in1 and y = *in2.
  ; The FMA run expects fma(-x, y, y), so the addend is checked to be the
  ; same register as the second multiplicand.
482  %x = load float, float addrspace(1)* %in1
483  %y = load float, float addrspace(1)* %in2
484  %s = fsub float 1.0, %x
485  %m = fmul float %y, %s
486  store float %m, float addrspace(1)* %out
487  ret void
488}
489
490; FUNC-LABEL: {{^}}test_f32_mul_sub_negone_x_y:
491; SI-NOFMA: v_sub_f32_e32 [[VS:v[0-9]]], -1.0, [[VX:v[0-9]]]
492; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
493;
494; SI-FMA: v_fma_f32 {{v[0-9]}}, -[[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY]]
495define amdgpu_kernel void @test_f32_mul_sub_negone_x_y(float addrspace(1)* %out,
496                                           float addrspace(1)* %in1,
497                                           float addrspace(1)* %in2) {
  ; Computes *out = (-1.0 - x) * y with x = *in1 and y = *in2.
  ; The FMA run expects fma(-x, y, -y), so the negated addend is checked to
  ; be the same register as the second multiplicand.
498  %x = load float, float addrspace(1)* %in1
499  %y = load float, float addrspace(1)* %in2
500  %s = fsub float -1.0, %x
501  %m = fmul float %s, %y
502  store float %m, float addrspace(1)* %out
503  ret void
504}
505
506; FUNC-LABEL: {{^}}test_f32_mul_y_sub_negone_x:
507; SI-NOFMA: v_sub_f32_e32 [[VS:v[0-9]]], -1.0, [[VX:v[0-9]]]
508; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
509;
510; SI-FMA: v_fma_f32 {{v[0-9]}}, -[[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY]]
511define amdgpu_kernel void @test_f32_mul_y_sub_negone_x(float addrspace(1)* %out,
512                                         float addrspace(1)* %in1,
513                                         float addrspace(1)* %in2) {
  ; Computes *out = y * (-1.0 - x) with x = *in1 and y = *in2.
  ; The FMA run expects fma(-x, y, -y), so the negated addend is checked to
  ; be the same register as the second multiplicand.
514  %x = load float, float addrspace(1)* %in1
515  %y = load float, float addrspace(1)* %in2
516  %s = fsub float -1.0, %x
517  %m = fmul float %y, %s
518  store float %m, float addrspace(1)* %out
519  ret void
520}
521
522; FUNC-LABEL: {{^}}test_f32_mul_sub_x_one_y:
523; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], -1.0, [[VX:v[0-9]]]
524; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
525;
526; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY]]
527define amdgpu_kernel void @test_f32_mul_sub_x_one_y(float addrspace(1)* %out,
528                                        float addrspace(1)* %in1,
529                                        float addrspace(1)* %in2) {
  ; Computes *out = (x - 1.0) * y with x = *in1 and y = *in2.
  ; The no-FMA runs expect the subtraction to appear as an add of -1.0.
  ; The FMA run expects fma(x, y, -y), so the negated addend is checked to
  ; be the same register as the second multiplicand.
530  %x = load float, float addrspace(1)* %in1
531  %y = load float, float addrspace(1)* %in2
532  %s = fsub float %x, 1.0
533  %m = fmul float %s, %y
534  store float %m, float addrspace(1)* %out
535  ret void
536}
537
538; FUNC-LABEL: {{^}}test_f32_mul_y_sub_x_one:
539; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], -1.0, [[VX:v[0-9]]]
540; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
541;
542; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY]]
543define amdgpu_kernel void @test_f32_mul_y_sub_x_one(float addrspace(1)* %out,
544                                      float addrspace(1)* %in1,
545                                      float addrspace(1)* %in2) {
  ; Computes *out = y * (x - 1.0) with x = *in1 and y = *in2.
  ; The FMA run expects fma(x, y, -y), so the negated addend is checked to
  ; be the same register as the second multiplicand.
546  %x = load float, float addrspace(1)* %in1
547  %y = load float, float addrspace(1)* %in2
548  %s = fsub float %x, 1.0
549  %m = fmul float %y, %s
550  store float %m, float addrspace(1)* %out
551  ret void
552}
553
554; FUNC-LABEL: {{^}}test_f32_mul_sub_x_negone_y:
555; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], 1.0, [[VX:v[0-9]]]
556; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
557;
558; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], [[VY]]
559define amdgpu_kernel void @test_f32_mul_sub_x_negone_y(float addrspace(1)* %out,
560                                         float addrspace(1)* %in1,
561                                         float addrspace(1)* %in2) {
  ; Computes *out = (x - -1.0) * y with x = *in1 and y = *in2.
  ; The no-FMA runs expect the subtraction to appear as an add of 1.0.
  ; The FMA run expects fma(x, y, y), so the addend is checked to be the
  ; same register as the second multiplicand.
562  %x = load float, float addrspace(1)* %in1
563  %y = load float, float addrspace(1)* %in2
564  %s = fsub float %x, -1.0
565  %m = fmul float %s, %y
566  store float %m, float addrspace(1)* %out
567  ret void
568}
569
570; FUNC-LABEL: {{^}}test_f32_mul_y_sub_x_negone:
571; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], 1.0, [[VX:v[0-9]]]
572; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
573; Fused form of y * (x - -1.0) is fma(x, y, y): the addend must be the register already captured as VY, not a fresh capture.
574; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], [[VY]]
575define amdgpu_kernel void @test_f32_mul_y_sub_x_negone(float addrspace(1)* %out,
576                                         float addrspace(1)* %in1,
577                                         float addrspace(1)* %in2) {
578  %x = load float, float addrspace(1)* %in1
579  %y = load float, float addrspace(1)* %in2
580  %s = fsub float %x, -1.0 ; s = x - (-1.0)
581  %m = fmul float %y, %s ; m = y * (x - (-1.0)), fsub result is the right multiplicand
582  store float %m, float addrspace(1)* %out
583  ret void
584}
585
586;
587; Interpolation Patterns: add(mul(x,t),mul(sub(1.0,t),y))
588;
589
590; FUNC-LABEL: {{^}}test_f32_interp:
591; SI-NOFMA: v_sub_f32_e32 [[ONE_MINUS_T:v[0-9]]], 1.0, [[T:v[0-9]]]
592; SI-NOFMA: v_mul_f32_e32 [[ACC:v[0-9]]], [[Y:v[0-9]]], [[ONE_MINUS_T]]
593; SI-NOFMA: v_mac_f32_e32 [[ACC]], [[X:v[0-9]]], [[T]]
594; Fused configuration: the inner fma(-t, y, y) feeds the outer fma(x, t, inner).
595; SI-FMA: v_fma_f32 [[INNER:v[0-9]]], -[[T:v[0-9]]], [[Y:v[0-9]]], [[Y]]
596; SI-FMA: v_fma_f32 {{v[0-9]}}, [[X:v[0-9]]], [[T]], [[INNER]]
597define amdgpu_kernel void @test_f32_interp(float addrspace(1)* %out,
598                             float addrspace(1)* %in1,
599                             float addrspace(1)* %in2,
600                             float addrspace(1)* %in3) {
601  %a = load float, float addrspace(1)* %in1
602  %b = load float, float addrspace(1)* %in2
603  %w = load float, float addrspace(1)* %in3
604  %one.minus.w = fsub float 1.0, %w ; 1.0 - w
605  %a.scaled = fmul float %a, %w ; a * w
606  %b.scaled = fmul float %b, %one.minus.w ; b * (1.0 - w)
607  %lerp = fadd float %a.scaled, %b.scaled ; a * w + b * (1.0 - w)
608  store float %lerp, float addrspace(1)* %out
609  ret void
610}
611
612; FUNC-LABEL: {{^}}test_f64_interp:
613; SI-NOFMA: v_add_f64 [[ONE_MINUS_T:v\[[0-9]+:[0-9]+\]]], -[[T:v\[[0-9]+:[0-9]+\]]], 1.0
614; SI-NOFMA: v_mul_f64 [[PROD:v\[[0-9]+:[0-9]+\]]], [[Y:v\[[0-9]+:[0-9]+\]]], [[ONE_MINUS_T]]
615; SI-NOFMA: v_fma_f64 v{{\[[0-9]+:[0-9]+\]}}, [[X:v\[[0-9]+:[0-9]+\]]], [[T]], [[PROD]]
616; Fused configuration: the inner fma(-t, y, y) feeds the outer fma(x, t, inner).
617; SI-FMA: v_fma_f64 [[INNER:v\[[0-9]+:[0-9]+\]]], -[[T:v\[[0-9]+:[0-9]+\]]], [[Y:v\[[0-9]+:[0-9]+\]]], [[Y]]
618; SI-FMA: v_fma_f64 v{{\[[0-9]+:[0-9]+\]}}, [[X:v\[[0-9]+:[0-9]+\]]], [[T]], [[INNER]]
619define amdgpu_kernel void @test_f64_interp(double addrspace(1)* %out,
620                             double addrspace(1)* %in1,
621                             double addrspace(1)* %in2,
622                             double addrspace(1)* %in3) {
623  %a = load double, double addrspace(1)* %in1
624  %b = load double, double addrspace(1)* %in2
625  %w = load double, double addrspace(1)* %in3
626  %one.minus.w = fsub double 1.0, %w ; 1.0 - w
627  %a.scaled = fmul double %a, %w ; a * w
628  %b.scaled = fmul double %b, %one.minus.w ; b * (1.0 - w)
629  %lerp = fadd double %a.scaled, %b.scaled ; a * w + b * (1.0 - w)
630  store double %lerp, double addrspace(1)* %out
631  ret void
632}
633
634; Make sure the negative constant cancels out the fneg: fma(-2.0, -a, b) is expected to select as v_fma_f32 with operands a, 2.0, b.
635; SI-LABEL: {{^}}fma_neg_2.0_neg_a_b_f32:
636; SI: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
637; SI: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
638; SI-NOT: [[A]]
639; SI-NOT: [[B]]
640; SI: v_fma_f32 v{{[0-9]+}}, [[A]], 2.0, [[B]]
641define amdgpu_kernel void @fma_neg_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #1 {
642  %tid = call i32 @llvm.amdgcn.workitem.id.x()
643  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid ; NOTE(review): both loads are addressed from %out; %in is unused here - confirm intended
644  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
645  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
646
647  %r1 = load volatile float, float addrspace(1)* %gep.0 ; a
648  %r2 = load volatile float, float addrspace(1)* %gep.1 ; b
649
650  %r1.fneg = fneg float %r1 ; -a
651
652  %r3 = tail call float @llvm.fma.f32(float -2.0, float %r1.fneg, float %r2) ; (-2.0 * -a) + b
653  store float %r3, float addrspace(1)* %gep.out ; kernel reads and writes memory, so it uses the plain nounwind group rather than readnone
654  ret void
655}
656
657; SI-LABEL: {{^}}fma_2.0_neg_a_b_f32:
658; SI: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
659; SI: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
660; SI-NOT: [[A]]
661; SI-NOT: [[B]]
662; SI: v_fma_f32 v{{[0-9]+}}, [[A]], -2.0, [[B]]
663define amdgpu_kernel void @fma_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #1 {
664  %tid = call i32 @llvm.amdgcn.workitem.id.x()
665  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid ; NOTE(review): both loads are addressed from %out; %in is unused here - confirm intended
666  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
667  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
668
669  %r1 = load volatile float, float addrspace(1)* %gep.0 ; a
670  %r2 = load volatile float, float addrspace(1)* %gep.1 ; b
671
672  %r1.fneg = fneg float %r1 ; -a
673
674  %r3 = tail call float @llvm.fma.f32(float 2.0, float %r1.fneg, float %r2) ; (2.0 * -a) + b; the check expects the negation folded into the constant
675  store float %r3, float addrspace(1)* %gep.out ; kernel reads and writes memory, so it uses the plain nounwind group rather than readnone
676  ret void
677}
678
679; SI-LABEL: {{^}}fma_neg_b_c_v4f32:
680; SI: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}, -v{{[0-9]+}}
681; SI: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}, -v{{[0-9]+}}
682; SI: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}, -v{{[0-9]+}}
683; SI: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}, -v{{[0-9]+}}
684define amdgpu_kernel void @fma_neg_b_c_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #2 {
685  %lane = call i32 @llvm.amdgcn.workitem.id.x()
686  %ptr.b = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 %lane
687  %ptr.c = getelementptr <4 x float>, <4 x float> addrspace(1)* %ptr.b, i32 1
688  %ptr.a = getelementptr <4 x float>, <4 x float> addrspace(1)* %ptr.c, i32 2
689  %ptr.res = getelementptr <4 x float>, <4 x float> addrspace(1)* %out, i32 %lane
690
691  %b = load <4 x float>, <4 x float> addrspace(1)* %ptr.b
692  %c = load <4 x float>, <4 x float> addrspace(1)* %ptr.c
693  %a = load <4 x float>, <4 x float> addrspace(1)* %ptr.a
694
695  %b.neg = fneg fast <4 x float> %b
696  %c.neg = fneg fast <4 x float> %c
697  %res = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %a, <4 x float> %b.neg, <4 x float> %c.neg) ; a * -b + -c; the checks expect four scalar fmas that keep both negations as source modifiers
698
699  store <4 x float> %res, <4 x float> addrspace(1)* %ptr.res
700  ret void
701}
702
703attributes #0 = { nounwind readnone } ; group referenced by the intrinsic declarations at the top of the file
704attributes #1 = { nounwind } ; group referenced by @combine_to_fma_f64_0, among others
705attributes #2 = { nounwind "no-signed-zeros-fp-math"="true" } ; group referenced by @fma_neg_b_c_v4f32
706