; RUN: llc -march=amdgcn -mcpu=gfx900 -denormal-fp-math-f32=preserve-sign -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck %s  -check-prefixes=GCN,GFX900
; RUN: llc -march=amdgcn -mcpu=gfx906 -denormal-fp-math-f32=preserve-sign -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck %s  -check-prefixes=GCN,GCN-DL-UNSAFE,GFX906-DL-UNSAFE
; RUN: llc -march=amdgcn -mcpu=gfx1011 -denormal-fp-math-f32=preserve-sign -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck %s  -check-prefixes=GCN,GCN-DL-UNSAFE,GFX10-DL-UNSAFE,GFX10-CONTRACT
; RUN: llc -march=amdgcn -mcpu=gfx1012 -denormal-fp-math-f32=preserve-sign -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck %s  -check-prefixes=GCN,GCN-DL-UNSAFE,GFX10-DL-UNSAFE,GFX10-CONTRACT
; RUN: llc -march=amdgcn -mcpu=gfx906 -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck %s  -check-prefixes=GCN,GFX906
; RUN: llc -march=amdgcn -mcpu=gfx906 -denormal-fp-math=preserve-sign -fp-contract=fast -verify-machineinstrs < %s | FileCheck %s  -check-prefixes=GCN,GFX906-CONTRACT
; RUN: llc -march=amdgcn -mcpu=gfx906 -denormal-fp-math=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck %s  -check-prefixes=GCN,GFX906-DENORM-CONTRACT
; (fadd (fmul S1.x, S2.x), (fadd (fmul S1.y, S2.y), z)) -> (fdot2 S1, S2, z)

; Tests to make sure fdot2 is not generated when vector elements of dot-product expressions
; are not converted from f16 to f32.
; Both products and both additions below are performed directly in half
; precision (there is no fpext), so the checks look for f16 fma/mul/mac
; instructions.
; GCN-LABEL: {{^}}dotproduct_f16
; GFX900: v_fma_f16
; GFX900: v_fma_f16

; GFX906: v_mul_f16_e32
; GFX906: v_mul_f16_e32

; GFX906-DL-UNSAFE:  v_fma_f16
; GFX10-CONTRACT: v_fmac_f16

; GFX906-CONTRACT: v_mac_f16_e32
; GFX906-DENORM-CONTRACT: v_fma_f16
define amdgpu_kernel void @dotproduct_f16(<2 x half> addrspace(1)* %src1,
                                          <2 x half> addrspace(1)* %src2,
                                          half addrspace(1)* nocapture %dst) {
entry:
  ; Load both <2 x half> input vectors.
  %src1.vec = load <2 x half>, <2 x half> addrspace(1)* %src1
  %src2.vec = load <2 x half>, <2 x half> addrspace(1)* %src2

  ; Lane 0 of each input.
  %src1.el1 = extractelement <2 x half> %src1.vec, i64 0
  %src2.el1 = extractelement <2 x half> %src2.vec, i64 0

  ; Lane 1 of each input.
  %src1.el2 = extractelement <2 x half> %src1.vec, i64 1
  %src2.el2 = extractelement <2 x half> %src2.vec, i64 1

  ; Lane-wise products, kept in half precision.
  %mul2 = fmul half %src1.el2, %src2.el2
  %mul1 = fmul half %src1.el1, %src2.el1
  ; The accumulator is the value currently stored at %dst.
  %acc = load half, half addrspace(1)* %dst, align 2
  %acc1 = fadd half %mul2, %acc
  %acc2 = fadd half %mul1, %acc1
  store half %acc2, half addrspace(1)* %dst, align 2
  ret void
}


; We only want to generate fdot2 if vector element of dot product is converted from f16 to f32
; and the vectors are of type <2 x half>
; Here every lane of both <2 x half> inputs is extended to float before the
; multiplies, and the two products are accumulated into a float loaded from
; %dst.
; GCN-LABEL: {{^}}dotproduct_f16_f32
; GFX900: v_mad_mix_f32
; GFX900: v_mad_mix_f32

; GFX906: v_mad_f32
; GFX906: v_mac_f32_e32

; GFX906-DL-UNSAFE: v_dot2_f32_f16
; GFX10-DL-UNSAFE: v_dot2c_f32_f16_e32

; GFX906-CONTRACT: v_dot2_f32_f16

; GFX906-DENORM-CONTRACT: v_dot2_f32_f16
define amdgpu_kernel void @dotproduct_f16_f32(<2 x half> addrspace(1)* %src1,
                                              <2 x half> addrspace(1)* %src2,
                                              float addrspace(1)* nocapture %dst) {
entry:
  ; Load both <2 x half> input vectors.
  %src1.vec = load <2 x half>, <2 x half> addrspace(1)* %src1
  %src2.vec = load <2 x half>, <2 x half> addrspace(1)* %src2

  ; Lane 0 of each input, extended to float.
  %src1.el1 = extractelement <2 x half> %src1.vec, i64 0
  %csrc1.el1 = fpext half %src1.el1 to float
  %src2.el1 = extractelement <2 x half> %src2.vec, i64 0
  %csrc2.el1 = fpext half %src2.el1 to float

  ; Lane 1 of each input, extended to float.
  %src1.el2 = extractelement <2 x half> %src1.vec, i64 1
  %csrc1.el2 = fpext half %src1.el2 to float
  %src2.el2 = extractelement <2 x half> %src2.vec, i64 1
  %csrc2.el2 = fpext half %src2.el2 to float

  ; src1.y * src2.y and src1.x * src2.x, each as (src1, src2).
  %mul2 = fmul float %csrc1.el2, %csrc2.el2
  %mul1 = fmul float %csrc1.el1, %csrc2.el1
  ; The accumulator is the value currently stored at %dst.
  %acc = load float, float addrspace(1)* %dst, align 4
  %acc1 = fadd float %mul2, %acc
  %acc2 = fadd float %mul1, %acc1
  store float %acc2, float addrspace(1)* %dst, align 4
  ret void
}

; We only want to generate fdot2 if vector element of dot product is converted from f16 to f32
; and the vectors are of type <2 x half>
; Same expression as dotproduct_f16_f32, except that the lane-1 multiply lists
; its operands in the opposite vector order (src2 element first, then src1).
; The checks expect the same instructions as for dotproduct_f16_f32.
; GCN-LABEL: {{^}}dotproduct_diffvecorder
; GFX900: v_mad_mix_f32
; GFX900: v_mad_mix_f32

; GFX906: v_mad_f32
; GFX906: v_mac_f32_e32

; GFX906-DL-UNSAFE: v_dot2_f32_f16
; GFX10-DL-UNSAFE: v_dot2c_f32_f16_e32

; GFX906-CONTRACT: v_dot2_f32_f16
; GFX906-DENORM-CONTRACT: v_dot2_f32_f16
define amdgpu_kernel void @dotproduct_diffvecorder(<2 x half> addrspace(1)* %src1,
                                                   <2 x half> addrspace(1)* %src2,
                                                   float addrspace(1)* nocapture %dst) {
entry:
  ; Load both <2 x half> input vectors.
  %src1.vec = load <2 x half>, <2 x half> addrspace(1)* %src1
  %src2.vec = load <2 x half>, <2 x half> addrspace(1)* %src2

  ; Lane 0 of each input, extended to float.
  %src1.el1 = extractelement <2 x half> %src1.vec, i64 0
  %csrc1.el1 = fpext half %src1.el1 to float
  %src2.el1 = extractelement <2 x half> %src2.vec, i64 0
  %csrc2.el1 = fpext half %src2.el1 to float

  ; Lane 1 of each input, extended to float.
  %src1.el2 = extractelement <2 x half> %src1.vec, i64 1
  %csrc1.el2 = fpext half %src1.el2 to float
  %src2.el2 = extractelement <2 x half> %src2.vec, i64 1
  %csrc2.el2 = fpext half %src2.el2 to float

  ; Lane-1 product is written as (src2, src1); lane-0 product as (src1, src2).
  %mul2 = fmul float %csrc2.el2, %csrc1.el2
  %mul1 = fmul float %csrc1.el1, %csrc2.el1
  ; The accumulator is the value currently stored at %dst.
  %acc = load float, float addrspace(1)* %dst, align 4
  %acc1 = fadd float %mul2, %acc
  %acc2 = fadd float %mul1, %acc1
  store float %acc2, float addrspace(1)* %dst, align 4
  ret void
}

; Tests to make sure dot product is not generated when the vectors are not of <2 x half>.
; The inputs are <4 x half>; only lanes 0 and 1 are read, extended to float,
; multiplied pairwise and accumulated. The checks expect mad/mac/fma_mix
; instructions.
; GCN-LABEL: {{^}}dotproduct_v4f16
; GFX900: v_mad_mix_f32

; GFX906: v_mad_f32
; GFX906: v_mac_f32_e32

; GCN-DL-UNSAFE: v_fma_mix_f32

; GFX906-CONTRACT: v_fma_mix_f32
; GFX906-DENORM-CONTRACT: v_fma_mix_f32
define amdgpu_kernel void @dotproduct_v4f16(<4 x half> addrspace(1)* %src1,
                                            <4 x half> addrspace(1)* %src2,
                                            float addrspace(1)* nocapture %dst) {
entry:
  ; Load both <4 x half> input vectors.
  %src1.vec = load <4 x half>, <4 x half> addrspace(1)* %src1
  %src2.vec = load <4 x half>, <4 x half> addrspace(1)* %src2

  ; Lane 0 of each input, extended to float.
  %src1.el1 = extractelement <4 x half> %src1.vec, i64 0
  %csrc1.el1 = fpext half %src1.el1 to float
  %src2.el1 = extractelement <4 x half> %src2.vec, i64 0
  %csrc2.el1 = fpext half %src2.el1 to float

  ; Lane 1 of each input, extended to float.
  %src1.el2 = extractelement <4 x half> %src1.vec, i64 1
  %csrc1.el2 = fpext half %src1.el2 to float
  %src2.el2 = extractelement <4 x half> %src2.vec, i64 1
  %csrc2.el2 = fpext half %src2.el2 to float

  ; Lane-wise products in float.
  %mul2 = fmul float %csrc1.el2, %csrc2.el2
  %mul1 = fmul float %csrc1.el1, %csrc2.el1
  ; The accumulator is the value currently stored at %dst.
  %acc = load float, float addrspace(1)* %dst, align 4
  %acc1 = fadd float %mul2, %acc
  %acc2 = fadd float %mul1, %acc1
  store float %acc2, float addrspace(1)* %dst, align 4
  ret void
}

; Each product below multiplies two lanes of the same input vector
; (src1.y * src1.x and src2.x * src2.y), so the expression is not a dot product
; of src1 and src2. The checks expect mad/mac/fma_mix instructions.
; GCN-LABEL: {{^}}NotAdotproduct
; GFX900: v_mad_mix_f32
; GFX900: v_mad_mix_f32

; GFX906: v_mad_f32
; GFX906: v_mac_f32_e32

; GCN-DL-UNSAFE: v_fma_mix_f32

; GFX906-CONTRACT: v_fma_mix_f32
; GFX906-DENORM-CONTRACT: v_fma_mix_f32
define amdgpu_kernel void @NotAdotproduct(<2 x half> addrspace(1)* %src1,
                                          <2 x half> addrspace(1)* %src2,
                                          float addrspace(1)* nocapture %dst) {
entry:
  ; Load both <2 x half> input vectors.
  %src1.vec = load <2 x half>, <2 x half> addrspace(1)* %src1
  %src2.vec = load <2 x half>, <2 x half> addrspace(1)* %src2

  ; Lane 0 of each input, extended to float.
  %src1.el1 = extractelement <2 x half> %src1.vec, i64 0
  %csrc1.el1 = fpext half %src1.el1 to float
  %src2.el1 = extractelement <2 x half> %src2.vec, i64 0
  %csrc2.el1 = fpext half %src2.el1 to float

  ; Lane 1 of each input, extended to float.
  %src1.el2 = extractelement <2 x half> %src1.vec, i64 1
  %csrc1.el2 = fpext half %src1.el2 to float
  %src2.el2 = extractelement <2 x half> %src2.vec, i64 1
  %csrc2.el2 = fpext half %src2.el2 to float

  ; Both operands of each multiply come from the same source vector.
  %mul2 = fmul float %csrc1.el2, %csrc1.el1
  %mul1 = fmul float %csrc2.el1, %csrc2.el2
  ; The accumulator is the value currently stored at %dst.
  %acc = load float, float addrspace(1)* %dst, align 4
  %acc1 = fadd float %mul2, %acc
  %acc2 = fadd float %mul1, %acc1
  store float %acc2, float addrspace(1)* %dst, align 4
  ret void
}

; Each product below pairs lanes with different indices
; (src1.y * src2.x and src1.x * src2.y), so the expression is not a lane-wise
; dot product of src1 and src2. The checks expect mad/mac/fma_mix instructions.
; GCN-LABEL: {{^}}Diff_Idx_NotAdotproduct
; GFX900: v_mad_mix_f32
; GFX900: v_mad_mix_f32

; GFX906: v_mad_f32
; GFX906: v_mac_f32_e32

; GCN-DL-UNSAFE: v_fma_mix_f32

; GFX906-CONTRACT: v_fma_mix_f32
; GFX906-DENORM-CONTRACT: v_fma_mix_f32
define amdgpu_kernel void @Diff_Idx_NotAdotproduct(<2 x half> addrspace(1)* %src1,
                                                   <2 x half> addrspace(1)* %src2,
                                                   float addrspace(1)* nocapture %dst) {
entry:
  ; Load both <2 x half> input vectors.
  %src1.vec = load <2 x half>, <2 x half> addrspace(1)* %src1
  %src2.vec = load <2 x half>, <2 x half> addrspace(1)* %src2

  ; Lane 0 of each input, extended to float.
  %src1.el1 = extractelement <2 x half> %src1.vec, i64 0
  %csrc1.el1 = fpext half %src1.el1 to float
  %src2.el1 = extractelement <2 x half> %src2.vec, i64 0
  %csrc2.el1 = fpext half %src2.el1 to float

  ; Lane 1 of each input, extended to float.
  %src1.el2 = extractelement <2 x half> %src1.vec, i64 1
  %csrc1.el2 = fpext half %src1.el2 to float
  %src2.el2 = extractelement <2 x half> %src2.vec, i64 1
  %csrc2.el2 = fpext half %src2.el2 to float

  ; Lane indices are crossed: lane 1 of src1 with lane 0 of src2, and vice versa.
  %mul2 = fmul float %csrc1.el2, %csrc2.el1
  %mul1 = fmul float %csrc1.el1, %csrc2.el2
  ; The accumulator is the value currently stored at %dst.
  %acc = load float, float addrspace(1)* %dst, align 4
  %acc1 = fadd float %mul2, %acc
  %acc2 = fadd float %mul1, %acc1
  store float %acc2, float addrspace(1)* %dst, align 4
  ret void
}
