; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=tahiti -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,SI %s
; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=verde  -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,SI %s
; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=tahiti -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,SI %s
; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=verde  -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,SI %s
; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=tonga -mattr=-flat-for-global  -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,VI %s
; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=tonga -mattr=-flat-for-global  -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,VI %s
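; llvm.fmuladd.f64 should select v_fma_f64 in both fp-contract modes, so a
; plain GCN check (shared by all RUN lines) suffices here.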
; GCN-LABEL: {{^}}fmuladd_f64:
; GCN: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @fmuladd_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
                         double addrspace(1)* %in2, double addrspace(1)* %in3) #0 {
  %r0 = load double, double addrspace(1)* %in1
  %r1 = load double, double addrspace(1)* %in2
  %r2 = load double, double addrspace(1)* %in3
  %r3 = tail call double @llvm.fmuladd.f64(double %r0, double %r1, double %r2)
  store double %r3, double addrspace(1)* %out
  ret void
}
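; A separate fmul + fadd pair may only be fused into v_fma_f64 when
; contraction is allowed (-fp-contract=fast); with -fp-contract=on it must
; stay as v_mul_f64 followed by v_add_f64.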
; GCN-LABEL: {{^}}fmul_fadd_f64:
; GCN-CONTRACT: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}

; GCN-STRICT: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
; GCN-STRICT: v_add_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @fmul_fadd_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
                           double addrspace(1)* %in2, double addrspace(1)* %in3) #0 {
  %r0 = load double, double addrspace(1)* %in1
  %r1 = load double, double addrspace(1)* %in2
  %r2 = load double, double addrspace(1)* %in3
  %tmp = fmul double %r0, %r1
  %r3 = fadd double %tmp, %r2
  store double %r3, double addrspace(1)* %out
  ret void
}
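; (a + a) + b: with contraction enabled, a + a is folded to a * 2.0 and fused
; into a single v_fma_f64 using the 2.0 inline constant; otherwise two
; v_add_f64 instructions remain.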
; GCN-LABEL: {{^}}fadd_a_a_b_f64:
; GCN: {{buffer|flat}}_load_dwordx2 [[R1:v\[[0-9]+:[0-9]+\]]],
; GCN: {{buffer|flat}}_load_dwordx2 [[R2:v\[[0-9]+:[0-9]+\]]],

; GCN-STRICT: v_add_f64 [[TMP:v\[[0-9]+:[0-9]+\]]], [[R1]], [[R1]]
; GCN-STRICT: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[TMP]], [[R2]]

; GCN-CONTRACT: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[R1]], 2.0, [[R2]]

; SI: buffer_store_dwordx2 [[RESULT]]
; VI: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fadd_a_a_b_f64(double addrspace(1)* %out,
                            double addrspace(1)* %in1,
                            double addrspace(1)* %in2) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
  %gep.0 = getelementptr double, double addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid

  %r0 = load volatile double, double addrspace(1)* %gep.0
  %r1 = load volatile double, double addrspace(1)* %gep.1

  %add.0 = fadd double %r0, %r0
  %add.1 = fadd double %add.0, %r1
  store double %add.1, double addrspace(1)* %gep.out
  ret void
}
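; Same as above, with the operands of the outer fadd commuted: b + (a + a).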
; GCN-LABEL: {{^}}fadd_b_a_a_f64:
; GCN: {{buffer|flat}}_load_dwordx2 [[R1:v\[[0-9]+:[0-9]+\]]],
; GCN: {{buffer|flat}}_load_dwordx2 [[R2:v\[[0-9]+:[0-9]+\]]],

; GCN-STRICT: v_add_f64 [[TMP:v\[[0-9]+:[0-9]+\]]], [[R1]], [[R1]]
; GCN-STRICT: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[R2]], [[TMP]]

; GCN-CONTRACT: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[R1]], 2.0, [[R2]]

; SI: buffer_store_dwordx2 [[RESULT]]
; VI: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fadd_b_a_a_f64(double addrspace(1)* %out,
                            double addrspace(1)* %in1,
                            double addrspace(1)* %in2) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
  %gep.0 = getelementptr double, double addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid

  %r0 = load volatile double, double addrspace(1)* %gep.0
  %r1 = load volatile double, double addrspace(1)* %gep.1

  %add.0 = fadd double %r0, %r0
  %add.1 = fadd double %r1, %add.0
  store double %add.1, double addrspace(1)* %gep.out
  ret void
}
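; a * b - c: contraction folds the fsub into v_fma_f64 with a negated source
; modifier on c; the strict variant keeps the multiply and applies the negate
; modifier on the v_add_f64 instead.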
; GCN-LABEL: {{^}}mad_sub_f64:
; GCN-STRICT: v_mul_f64 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}
; GCN-STRICT: v_add_f64 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, -v{{\[[0-9]+:[0-9]+\]}}

; GCN-CONTRACT: v_fma_f64 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, -v{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @mad_sub_f64(double addrspace(1)* noalias nocapture %out, double addrspace(1)* noalias nocapture readonly %ptr) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %tid.ext = sext i32 %tid to i64
  %gep0 = getelementptr double, double addrspace(1)* %ptr, i64 %tid.ext
  %add1 = add i64 %tid.ext, 1
  %gep1 = getelementptr double, double addrspace(1)* %ptr, i64 %add1
  %add2 = add i64 %tid.ext, 2
  %gep2 = getelementptr double, double addrspace(1)* %ptr, i64 %add2
  %outgep = getelementptr double, double addrspace(1)* %out, i64 %tid.ext
  %a = load volatile double, double addrspace(1)* %gep0, align 8
  %b = load volatile double, double addrspace(1)* %gep1, align 8
  %c = load volatile double, double addrspace(1)* %gep2, align 8
  %mul = fmul double %a, %b
  %sub = fsub double %mul, %c
  store double %sub, double addrspace(1)* %outgep, align 8
  ret void
}
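; Only the first fadd carries the fast flag, which is not enough to fuse under
; -fp-contract=on; fusion still happens under -fp-contract=fast.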
; GCN-LABEL: {{^}}fadd_a_a_b_f64_fast_add0:
; GCN-STRICT: v_add_f64
; GCN-STRICT: v_add_f64

; GCN-CONTRACT: v_fma_f64
define amdgpu_kernel void @fadd_a_a_b_f64_fast_add0(double addrspace(1)* %out,
                                      double addrspace(1)* %in1,
                                      double addrspace(1)* %in2) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
  %gep.0 = getelementptr double, double addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid

  %r0 = load volatile double, double addrspace(1)* %gep.0
  %r1 = load volatile double, double addrspace(1)* %gep.1

  %add.0 = fadd fast double %r0, %r0
  %add.1 = fadd double %add.0, %r1
  store double %add.1, double addrspace(1)* %gep.out
  ret void
}
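; As above, but with the fast flag on the second fadd instead.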
; GCN-LABEL: {{^}}fadd_a_a_b_f64_fast_add1:
; GCN-STRICT: v_add_f64
; GCN-STRICT: v_add_f64

; GCN-CONTRACT: v_fma_f64
define amdgpu_kernel void @fadd_a_a_b_f64_fast_add1(double addrspace(1)* %out,
                                      double addrspace(1)* %in1,
                                      double addrspace(1)* %in2) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
  %gep.0 = getelementptr double, double addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid

  %r0 = load volatile double, double addrspace(1)* %gep.0
  %r1 = load volatile double, double addrspace(1)* %gep.1

  %add.0 = fadd double %r0, %r0
  %add.1 = fadd fast double %add.0, %r1
  store double %add.1, double addrspace(1)* %gep.out
  ret void
}
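; With fast on both fadds, the sequence is fused into v_fma_f64 even under
; -fp-contract=on, so a plain GCN check suffices.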
; GCN-LABEL: {{^}}fadd_a_a_b_f64_fast:
; GCN: v_fma_f64
define amdgpu_kernel void @fadd_a_a_b_f64_fast(double addrspace(1)* %out,
                                 double addrspace(1)* %in1,
                                 double addrspace(1)* %in2) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
  %gep.0 = getelementptr double, double addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid

  %r0 = load volatile double, double addrspace(1)* %gep.0
  %r1 = load volatile double, double addrspace(1)* %gep.1

  %add.0 = fadd fast double %r0, %r0
  %add.1 = fadd fast double %add.0, %r1
  store double %add.1, double addrspace(1)* %gep.out
  ret void
}

declare i32 @llvm.amdgcn.workitem.id.x() #1
declare double @llvm.fmuladd.f64(double, double, double) #1

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }