; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp,+fp64 -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FP
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve,+fullfp16,+fp64 -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-NOFP

define arm_aapcs_vfpcc float @fadd_v2f32(<2 x float> %x, float %y) {
; CHECK-LABEL: fadd_v2f32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vadd.f32 s0, s0, s1
; CHECK-NEXT:    vadd.f32 s0, s4, s0
; CHECK-NEXT:    bx lr
entry:
  %z = call fast float @llvm.vector.reduce.fadd.f32.v2f32(float %y, <2 x float> %x)
  ret float %z
}

define arm_aapcs_vfpcc float @fadd_v4f32(<4 x float> %x, float %y) {
; CHECK-FP-LABEL: fadd_v4f32:
; CHECK-FP:       @ %bb.0: @ %entry
; CHECK-FP-NEXT:    vadd.f32 s6, s2, s3
; CHECK-FP-NEXT:    vadd.f32 s0, s0, s1
; CHECK-FP-NEXT:    vadd.f32 s0, s0, s6
; CHECK-FP-NEXT:    vadd.f32 s0, s4, s0
; CHECK-FP-NEXT:    bx lr
;
; CHECK-NOFP-LABEL: fadd_v4f32:
; CHECK-NOFP:       @ %bb.0: @ %entry
; CHECK-NOFP-NEXT:    vadd.f32 s6, s0, s1
; CHECK-NOFP-NEXT:    vadd.f32 s6, s6, s2
; CHECK-NOFP-NEXT:    vadd.f32 s0, s6, s3
; CHECK-NOFP-NEXT:    vadd.f32 s0, s4, s0
; CHECK-NOFP-NEXT:    bx lr
entry:
  %z = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float %y, <4 x float> %x)
  ret float %z
}

define arm_aapcs_vfpcc float @fadd_v8f32(<8 x float> %x, float %y) {
; CHECK-FP-LABEL: fadd_v8f32:
; CHECK-FP:       @ %bb.0: @ %entry
; CHECK-FP-NEXT:    vadd.f32 q0, q0, q1
; CHECK-FP-NEXT:    vadd.f32 s4, s2, s3
; CHECK-FP-NEXT:    vadd.f32 s0, s0, s1
; CHECK-FP-NEXT:    vadd.f32 s0, s0, s4
; CHECK-FP-NEXT:    vadd.f32 s0, s8, s0
; CHECK-FP-NEXT:    bx lr
;
; CHECK-NOFP-LABEL: fadd_v8f32:
; CHECK-NOFP:       @ %bb.0: @ %entry
; CHECK-NOFP-NEXT:    vadd.f32 s12, s0, s4
; CHECK-NOFP-NEXT:    vadd.f32 s10, s1, s5
; CHECK-NOFP-NEXT:    vadd.f32 s14, s2, s6
; CHECK-NOFP-NEXT:    vadd.f32 s0, s3, s7
; CHECK-NOFP-NEXT:    vadd.f32 s10, s12, s10
; CHECK-NOFP-NEXT:    vadd.f32 s2, s10, s14
; CHECK-NOFP-NEXT:    vadd.f32 s0, s2, s0
; CHECK-NOFP-NEXT:    vadd.f32 s0, s8, s0
; CHECK-NOFP-NEXT:    bx lr
entry:
  %z = call fast float @llvm.vector.reduce.fadd.f32.v8f32(float %y, <8 x float> %x)
  ret float %z
}

define arm_aapcs_vfpcc half @fadd_v2f16(<2 x half> %x, half %y) {
; CHECK-LABEL: fadd_v2f16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmovx.f16 s6, s0
; CHECK-NEXT:    vadd.f16 s0, s0, s6
; CHECK-NEXT:    vadd.f16 s0, s4, s0
; CHECK-NEXT:    bx lr
entry:
  %z = call fast half @llvm.vector.reduce.fadd.f16.v2f16(half %y, <2 x half> %x)
  ret half %z
}

define arm_aapcs_vfpcc half @fadd_v4f16(<4 x half> %x, half %y) {
; CHECK-FP-LABEL: fadd_v4f16:
; CHECK-FP:       @ %bb.0: @ %entry
; CHECK-FP-NEXT:    vmovx.f16 s6, s1
; CHECK-FP-NEXT:    vmovx.f16 s8, s0
; CHECK-FP-NEXT:    vadd.f16 s6, s1, s6
; CHECK-FP-NEXT:    vadd.f16 s0, s0, s8
; CHECK-FP-NEXT:    vadd.f16 s0, s0, s6
; CHECK-FP-NEXT:    vadd.f16 s0, s4, s0
; CHECK-FP-NEXT:    bx lr
;
; CHECK-NOFP-LABEL: fadd_v4f16:
; CHECK-NOFP:       @ %bb.0: @ %entry
; CHECK-NOFP-NEXT:    vmovx.f16 s6, s0
; CHECK-NOFP-NEXT:    vadd.f16 s6, s0, s6
; CHECK-NOFP-NEXT:    vmovx.f16 s0, s1
; CHECK-NOFP-NEXT:    vadd.f16 s6, s6, s1
; CHECK-NOFP-NEXT:    vadd.f16 s0, s6, s0
; CHECK-NOFP-NEXT:    vadd.f16 s0, s4, s0
; CHECK-NOFP-NEXT:    bx lr
entry:
  %z = call fast half @llvm.vector.reduce.fadd.f16.v4f16(half %y, <4 x half> %x)
  ret half %z
}

define arm_aapcs_vfpcc half @fadd_v8f16(<8 x half> %x, half %y) {
; CHECK-FP-LABEL: fadd_v8f16:
; CHECK-FP:       @ %bb.0: @ %entry
; CHECK-FP-NEXT:    vrev32.16 q2, q0
; CHECK-FP-NEXT:    vadd.f16 q0, q0, q2
; CHECK-FP-NEXT:    vadd.f16 s6, s2, s3
; CHECK-FP-NEXT:    vadd.f16 s0, s0, s1
; CHECK-FP-NEXT:    vadd.f16 s0, s0, s6
; CHECK-FP-NEXT:    vadd.f16 s0, s4, s0
; CHECK-FP-NEXT:    bx lr
;
; CHECK-NOFP-LABEL: fadd_v8f16:
; CHECK-NOFP:       @ %bb.0: @ %entry
; CHECK-NOFP-NEXT:    vmovx.f16 s6, s0
; CHECK-NOFP-NEXT:    vmovx.f16 s8, s1
; CHECK-NOFP-NEXT:    vadd.f16 s6, s0, s6
; CHECK-NOFP-NEXT:    vmovx.f16 s0, s3
; CHECK-NOFP-NEXT:    vadd.f16 s6, s6, s1
; CHECK-NOFP-NEXT:    vadd.f16 s6, s6, s8
; CHECK-NOFP-NEXT:    vmovx.f16 s8, s2
; CHECK-NOFP-NEXT:    vadd.f16 s6, s6, s2
; CHECK-NOFP-NEXT:    vadd.f16 s6, s6, s8
; CHECK-NOFP-NEXT:    vadd.f16 s6, s6, s3
; CHECK-NOFP-NEXT:    vadd.f16 s0, s6, s0
; CHECK-NOFP-NEXT:    vadd.f16 s0, s4, s0
; CHECK-NOFP-NEXT:    bx lr
entry:
  %z = call fast half @llvm.vector.reduce.fadd.f16.v8f16(half %y, <8 x half> %x)
  ret half %z
}

define arm_aapcs_vfpcc half @fadd_v16f16(<16 x half> %x, half %y) {
; CHECK-FP-LABEL: fadd_v16f16:
; CHECK-FP:       @ %bb.0: @ %entry
; CHECK-FP-NEXT:    vadd.f16 q0, q0, q1
; CHECK-FP-NEXT:    vrev32.16 q1, q0
; CHECK-FP-NEXT:    vadd.f16 q0, q0, q1
; CHECK-FP-NEXT:    vadd.f16 s4, s2, s3
; CHECK-FP-NEXT:    vadd.f16 s0, s0, s1
; CHECK-FP-NEXT:    vadd.f16 s0, s0, s4
; CHECK-FP-NEXT:    vadd.f16 s0, s8, s0
; CHECK-FP-NEXT:    bx lr
;
; CHECK-NOFP-LABEL: fadd_v16f16:
; CHECK-NOFP:       @ %bb.0: @ %entry
; CHECK-NOFP-NEXT:    vmovx.f16 s10, s4
; CHECK-NOFP-NEXT:    vmovx.f16 s12, s0
; CHECK-NOFP-NEXT:    vadd.f16 s10, s12, s10
; CHECK-NOFP-NEXT:    vadd.f16 s12, s0, s4
; CHECK-NOFP-NEXT:    vadd.f16 s10, s12, s10
; CHECK-NOFP-NEXT:    vadd.f16 s12, s1, s5
; CHECK-NOFP-NEXT:    vadd.f16 s10, s10, s12
; CHECK-NOFP-NEXT:    vmovx.f16 s12, s5
; CHECK-NOFP-NEXT:    vmovx.f16 s14, s1
; CHECK-NOFP-NEXT:    vmovx.f16 s4, s7
; CHECK-NOFP-NEXT:    vadd.f16 s12, s14, s12
; CHECK-NOFP-NEXT:    vmovx.f16 s14, s2
; CHECK-NOFP-NEXT:    vadd.f16 s10, s10, s12
; CHECK-NOFP-NEXT:    vadd.f16 s12, s2, s6
; CHECK-NOFP-NEXT:    vadd.f16 s10, s10, s12
; CHECK-NOFP-NEXT:    vmovx.f16 s12, s6
; CHECK-NOFP-NEXT:    vadd.f16 s12, s14, s12
; CHECK-NOFP-NEXT:    vmovx.f16 s0, s3
; CHECK-NOFP-NEXT:    vadd.f16 s10, s10, s12
; CHECK-NOFP-NEXT:    vadd.f16 s12, s3, s7
; CHECK-NOFP-NEXT:    vadd.f16 s10, s10, s12
; CHECK-NOFP-NEXT:    vadd.f16 s0, s0, s4
; CHECK-NOFP-NEXT:    vadd.f16 s0, s10, s0
; CHECK-NOFP-NEXT:    vadd.f16 s0, s8, s0
; CHECK-NOFP-NEXT:    bx lr
entry:
  %z = call fast half @llvm.vector.reduce.fadd.f16.v16f16(half %y, <16 x half> %x)
  ret half %z
}

define arm_aapcs_vfpcc double @fadd_v1f64(<1 x double> %x, double %y) {
; CHECK-LABEL: fadd_v1f64:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vadd.f64 d0, d1, d0
; CHECK-NEXT:    bx lr
entry:
  %z = call fast double @llvm.vector.reduce.fadd.f64.v1f64(double %y, <1 x double> %x)
  ret double %z
}

define arm_aapcs_vfpcc double @fadd_v2f64(<2 x double> %x, double %y) {
; CHECK-LABEL: fadd_v2f64:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vadd.f64 d0, d0, d1
; CHECK-NEXT:    vadd.f64 d0, d2, d0
; CHECK-NEXT:    bx lr
entry:
  %z = call fast double @llvm.vector.reduce.fadd.f64.v2f64(double %y, <2 x double> %x)
  ret double %z
}

define arm_aapcs_vfpcc double @fadd_v4f64(<4 x double> %x, double %y) {
; CHECK-LABEL: fadd_v4f64:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vadd.f64 d5, d1, d3
; CHECK-NEXT:    vadd.f64 d0, d0, d2
; CHECK-NEXT:    vadd.f64 d0, d0, d5
; CHECK-NEXT:    vadd.f64 d0, d4, d0
; CHECK-NEXT:    bx lr
entry:
  %z = call fast double @llvm.vector.reduce.fadd.f64.v4f64(double %y, <4 x double> %x)
  ret double %z
}

define arm_aapcs_vfpcc float @fadd_v2f32_nofast(<2 x float> %x, float %y) {
; CHECK-LABEL: fadd_v2f32_nofast:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vadd.f32 s4, s4, s0
; CHECK-NEXT:    vadd.f32 s0, s4, s1
; CHECK-NEXT:    bx lr
entry:
  %z = call float @llvm.vector.reduce.fadd.f32.v2f32(float %y, <2 x float> %x)
  ret float %z
}

define arm_aapcs_vfpcc float @fadd_v4f32_nofast(<4 x float> %x, float %y) {
; CHECK-LABEL: fadd_v4f32_nofast:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vadd.f32 s4, s4, s0
; CHECK-NEXT:    vadd.f32 s4, s4, s1
; CHECK-NEXT:    vadd.f32 s4, s4, s2
; CHECK-NEXT:    vadd.f32 s0, s4, s3
; CHECK-NEXT:    bx lr
entry:
  %z = call float @llvm.vector.reduce.fadd.f32.v4f32(float %y, <4 x float> %x)
  ret float %z
}

define arm_aapcs_vfpcc float @fadd_v8f32_nofast(<8 x float> %x, float %y) {
; CHECK-LABEL: fadd_v8f32_nofast:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vadd.f32 s8, s8, s0
; CHECK-NEXT:    vadd.f32 s8, s8, s1
; CHECK-NEXT:    vadd.f32 s8, s8, s2
; CHECK-NEXT:    vadd.f32 s0, s8, s3
; CHECK-NEXT:    vadd.f32 s0, s0, s4
; CHECK-NEXT:    vadd.f32 s0, s0, s5
; CHECK-NEXT:    vadd.f32 s0, s0, s6
; CHECK-NEXT:    vadd.f32 s0, s0, s7
; CHECK-NEXT:    bx lr
entry:
  %z = call float @llvm.vector.reduce.fadd.f32.v8f32(float %y, <8 x float> %x)
  ret float %z
}

define arm_aapcs_vfpcc half @fadd_v4f16_nofast(<4 x half> %x, half %y) {
; CHECK-LABEL: fadd_v4f16_nofast:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vadd.f16 s4, s4, s0
; CHECK-NEXT:    vmovx.f16 s6, s0
; CHECK-NEXT:    vadd.f16 s4, s4, s6
; CHECK-NEXT:    vmovx.f16 s0, s1
; CHECK-NEXT:    vadd.f16 s4, s4, s1
; CHECK-NEXT:    vadd.f16 s0, s4, s0
; CHECK-NEXT:    bx lr
entry:
  %z = call half @llvm.vector.reduce.fadd.f16.v4f16(half %y, <4 x half> %x)
  ret half %z
}

define arm_aapcs_vfpcc half @fadd_v8f16_nofast(<8 x half> %x, half %y) {
; CHECK-LABEL: fadd_v8f16_nofast:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vadd.f16 s4, s4, s0
; CHECK-NEXT:    vmovx.f16 s6, s0
; CHECK-NEXT:    vadd.f16 s4, s4, s6
; CHECK-NEXT:    vmovx.f16 s6, s1
; CHECK-NEXT:    vadd.f16 s4, s4, s1
; CHECK-NEXT:    vmovx.f16 s0, s3
; CHECK-NEXT:    vadd.f16 s4, s4, s6
; CHECK-NEXT:    vmovx.f16 s6, s2
; CHECK-NEXT:    vadd.f16 s4, s4, s2
; CHECK-NEXT:    vadd.f16 s4, s4, s6
; CHECK-NEXT:    vadd.f16 s4, s4, s3
; CHECK-NEXT:    vadd.f16 s0, s4, s0
; CHECK-NEXT:    bx lr
entry:
  %z = call half @llvm.vector.reduce.fadd.f16.v8f16(half %y, <8 x half> %x)
  ret half %z
}

define arm_aapcs_vfpcc half @fadd_v16f16_nofast(<16 x half> %x, half %y) {
; CHECK-LABEL: fadd_v16f16_nofast:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vadd.f16 s8, s8, s0
; CHECK-NEXT:    vmovx.f16 s10, s0
; CHECK-NEXT:    vadd.f16 s8, s8, s10
; CHECK-NEXT:    vmovx.f16 s10, s1
; CHECK-NEXT:    vadd.f16 s8, s8, s1
; CHECK-NEXT:    vmovx.f16 s0, s3
; CHECK-NEXT:    vadd.f16 s8, s8, s10
; CHECK-NEXT:    vmovx.f16 s10, s2
; CHECK-NEXT:    vadd.f16 s8, s8, s2
; CHECK-NEXT:    vmovx.f16 s2, s4
; CHECK-NEXT:    vadd.f16 s8, s8, s10
; CHECK-NEXT:    vadd.f16 s8, s8, s3
; CHECK-NEXT:    vadd.f16 s0, s8, s0
; CHECK-NEXT:    vadd.f16 s0, s0, s4
; CHECK-NEXT:    vadd.f16 s0, s0, s2
; CHECK-NEXT:    vmovx.f16 s2, s5
; CHECK-NEXT:    vadd.f16 s0, s0, s5
; CHECK-NEXT:    vadd.f16 s0, s0, s2
; CHECK-NEXT:    vmovx.f16 s2, s6
; CHECK-NEXT:    vadd.f16 s0, s0, s6
; CHECK-NEXT:    vadd.f16 s0, s0, s2
; CHECK-NEXT:    vmovx.f16 s2, s7
; CHECK-NEXT:    vadd.f16 s0, s0, s7
; CHECK-NEXT:    vadd.f16 s0, s0, s2
; CHECK-NEXT:    bx lr
entry:
  %z = call half @llvm.vector.reduce.fadd.f16.v16f16(half %y, <16 x half> %x)
  ret half %z
}

define arm_aapcs_vfpcc double @fadd_v1f64_nofast(<1 x double> %x, double %y) {
; CHECK-LABEL: fadd_v1f64_nofast:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vadd.f64 d0, d1, d0
; CHECK-NEXT:    bx lr
entry:
  %z = call double @llvm.vector.reduce.fadd.f64.v1f64(double %y, <1 x double> %x)
  ret double %z
}

define arm_aapcs_vfpcc double @fadd_v2f64_nofast(<2 x double> %x, double %y) {
; CHECK-LABEL: fadd_v2f64_nofast:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vadd.f64 d2, d2, d0
; CHECK-NEXT:    vadd.f64 d0, d2, d1
; CHECK-NEXT:    bx lr
entry:
  %z = call double @llvm.vector.reduce.fadd.f64.v2f64(double %y, <2 x double> %x)
  ret double %z
}

define arm_aapcs_vfpcc double @fadd_v4f64_nofast(<4 x double> %x, double %y) {
; CHECK-LABEL: fadd_v4f64_nofast:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vadd.f64 d4, d4, d0
; CHECK-NEXT:    vadd.f64 d0, d4, d1
; CHECK-NEXT:    vadd.f64 d0, d0, d2
; CHECK-NEXT:    vadd.f64 d0, d0, d3
; CHECK-NEXT:    bx lr
entry:
  %z = call double @llvm.vector.reduce.fadd.f64.v4f64(double %y, <4 x double> %x)
  ret double %z
}

declare double @llvm.vector.reduce.fadd.f64.v1f64(double, <1 x double>)
declare double @llvm.vector.reduce.fadd.f64.v2f64(double, <2 x double>)
declare double @llvm.vector.reduce.fadd.f64.v4f64(double, <4 x double>)
declare float @llvm.vector.reduce.fadd.f32.v2f32(float, <2 x float>)
declare float @llvm.vector.reduce.fadd.f32.v4f32(float, <4 x float>)
declare float @llvm.vector.reduce.fadd.f32.v8f32(float, <8 x float>)
declare half @llvm.vector.reduce.fadd.f16.v16f16(half, <16 x half>)
declare half @llvm.vector.reduce.fadd.f16.v2f16(half, <2 x half>)
declare half @llvm.vector.reduce.fadd.f16.v4f16(half, <4 x half>)
declare half @llvm.vector.reduce.fadd.f16.v8f16(half, <8 x half>)