; RUN: llc -aarch64-sve-vector-bits-min=128  -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=16 -check-prefix=NO_SVE
; RUN: llc -aarch64-sve-vector-bits-min=256  -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_EQ_256
; RUN: llc -aarch64-sve-vector-bits-min=384  -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK
; RUN: llc -aarch64-sve-vector-bits-min=512  -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=640  -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=768  -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=896  -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=1024 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1152 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1280 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1408 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1536 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1664 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1792 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1920 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=2048 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=256 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048
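; The RUN lines above sweep the minimum SVE register width, enabling every
; VBITS_GE_* prefix that width satisfies. -D#VBYTES defines the FileCheck
; numeric variable VBYTES (the usable vector length in bytes); the checks
; below spell element counts out explicitly, but a check line could consume
; it as [[#VBYTES]]. Note (our reading of the clamping rule): only
; power-of-two widths are exploited, which is why e.g. the 384-bit run still
; tests the 256-bit configuration and the 640-bit run the 512-bit one.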

target triple = "aarch64-unknown-linux-gnu"

; Don't use SVE when its registers are no bigger than NEON.
; NO_SVE-NOT: ptrue
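; (ptrue exists only in SVE, so its absence is a compact proxy for "no SVE
; instructions were generated" in the NO_SVE run.)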

;
; FADDA
;
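; FADDA is SVE's strictly ordered floating-point add reduction: it folds the
; elements into the scalar start operand from left to right. Without
; reassociation flags, llvm.vector.reduce.fadd with a start value requires
; exactly this sequential evaluation order, so even NEON-sized vectors are
; lowered via SVE.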

; No single instruction NEON support. Use SVE.
define half @fadda_v4f16(half %start, <4 x half> %a) #0 {
; CHECK-LABEL: fadda_v4f16:
; CHECK: ptrue [[PG:p[0-9]+]].h, vl4
; CHECK-NEXT: fadda h0, [[PG]], h0, z1.h
; CHECK-NEXT: ret
  %res = call half @llvm.vector.reduce.fadd.v4f16(half %start, <4 x half> %a)
  ret half %res
}

; No single instruction NEON support. Use SVE.
define half @fadda_v8f16(half %start, <8 x half> %a) #0 {
; CHECK-LABEL: fadda_v8f16:
; CHECK: ptrue [[PG:p[0-9]+]].h, vl8
; CHECK-NEXT: fadda h0, [[PG]], h0, z1.h
; CHECK-NEXT: ret
  %res = call half @llvm.vector.reduce.fadd.v8f16(half %start, <8 x half> %a)
  ret half %res
}

define half @fadda_v16f16(half %start, <16 x half>* %a) #0 {
; CHECK-LABEL: fadda_v16f16:
; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
; CHECK-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; CHECK-NEXT: fadda h0, [[PG]], h0, [[OP]].h
; CHECK-NEXT: ret
  %op = load <16 x half>, <16 x half>* %a
  %res = call half @llvm.vector.reduce.fadd.v16f16(half %start, <16 x half> %op)
  ret half %res
}

define half @fadda_v32f16(half %start, <32 x half>* %a) #0 {
; CHECK-LABEL: fadda_v32f16:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
; VBITS_GE_512-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: fadda h0, [[PG]], h0, [[OP]].h
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16
; VBITS_EQ_256-DAG: ld1h { [[LO:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1h { [[HI:z[0-9]+]].h }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #1]
; VBITS_EQ_256-NEXT: fadda h0, [[PG]], h0, [[LO]].h
; VBITS_EQ_256-NEXT: fadda h0, [[PG]], h0, [[HI]].h
; VBITS_EQ_256-NEXT: ret
  %op = load <32 x half>, <32 x half>* %a
  %res = call half @llvm.vector.reduce.fadd.v32f16(half %start, <32 x half> %op)
  ret half %res
}
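; The VBITS_EQ_256 checks above encode the expected split: a 512-bit v32f16
; is legalised into two 256-bit halves sharing one vl16 predicate. The high
; half is loaded at [x0, x<NUMELTS>, lsl #1] (base + 16 elements * 2 bytes),
; and the two FADDA instructions chain through h0 so the strict left-to-right
; ordering is preserved. The v16f32 and v8f64 cases below follow the same
; pattern with lsl #2 and lsl #3 scaling.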

define half @fadda_v64f16(half %start, <64 x half>* %a) #0 {
; CHECK-LABEL: fadda_v64f16:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
; VBITS_GE_1024-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: fadda h0, [[PG]], h0, [[OP]].h
; VBITS_GE_1024-NEXT: ret
  %op = load <64 x half>, <64 x half>* %a
  %res = call half @llvm.vector.reduce.fadd.v64f16(half %start, <64 x half> %op)
  ret half %res
}

define half @fadda_v128f16(half %start, <128 x half>* %a) #0 {
; CHECK-LABEL: fadda_v128f16:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
; VBITS_GE_2048-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: fadda h0, [[PG]], h0, [[OP]].h
; VBITS_GE_2048-NEXT: ret
  %op = load <128 x half>, <128 x half>* %a
  %res = call half @llvm.vector.reduce.fadd.v128f16(half %start, <128 x half> %op)
  ret half %res
}

; No single instruction NEON support. Use SVE.
define float @fadda_v2f32(float %start, <2 x float> %a) #0 {
; CHECK-LABEL: fadda_v2f32:
; CHECK: ptrue [[PG:p[0-9]+]].s, vl2
; CHECK-NEXT: fadda s0, [[PG]], s0, z1.s
; CHECK-NEXT: ret
  %res = call float @llvm.vector.reduce.fadd.v2f32(float %start, <2 x float> %a)
  ret float %res
}

; No single instruction NEON support. Use SVE.
define float @fadda_v4f32(float %start, <4 x float> %a) #0 {
; CHECK-LABEL: fadda_v4f32:
; CHECK: ptrue [[PG:p[0-9]+]].s, vl4
; CHECK-NEXT: fadda s0, [[PG]], s0, z1.s
; CHECK-NEXT: ret
  %res = call float @llvm.vector.reduce.fadd.v4f32(float %start, <4 x float> %a)
  ret float %res
}

define float @fadda_v8f32(float %start, <8 x float>* %a) #0 {
; CHECK-LABEL: fadda_v8f32:
; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
; CHECK-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; CHECK-NEXT: fadda s0, [[PG]], s0, [[OP]].s
; CHECK-NEXT: ret
  %op = load <8 x float>, <8 x float>* %a
  %res = call float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %op)
  ret float %res
}

define float @fadda_v16f32(float %start, <16 x float>* %a) #0 {
; CHECK-LABEL: fadda_v16f32:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
; VBITS_GE_512-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: fadda s0, [[PG]], s0, [[OP]].s
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
; VBITS_EQ_256-DAG: ld1w { [[LO:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1w { [[HI:z[0-9]+]].s }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #2]
; VBITS_EQ_256-NEXT: fadda s0, [[PG]], s0, [[LO]].s
; VBITS_EQ_256-NEXT: fadda s0, [[PG]], s0, [[HI]].s
; VBITS_EQ_256-NEXT: ret
  %op = load <16 x float>, <16 x float>* %a
  %res = call float @llvm.vector.reduce.fadd.v16f32(float %start, <16 x float> %op)
  ret float %res
}

define float @fadda_v32f32(float %start, <32 x float>* %a) #0 {
; CHECK-LABEL: fadda_v32f32:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
; VBITS_GE_1024-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: fadda s0, [[PG]], s0, [[OP]].s
; VBITS_GE_1024-NEXT: ret
  %op = load <32 x float>, <32 x float>* %a
  %res = call float @llvm.vector.reduce.fadd.v32f32(float %start, <32 x float> %op)
  ret float %res
}

define float @fadda_v64f32(float %start, <64 x float>* %a) #0 {
; CHECK-LABEL: fadda_v64f32:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
; VBITS_GE_2048-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: fadda s0, [[PG]], s0, [[OP]].s
; VBITS_GE_2048-NEXT: ret
  %op = load <64 x float>, <64 x float>* %a
  %res = call float @llvm.vector.reduce.fadd.v64f32(float %start, <64 x float> %op)
  ret float %res
}

; No single instruction NEON support. Use SVE.
define double @fadda_v1f64(double %start, <1 x double> %a) #0 {
; CHECK-LABEL: fadda_v1f64:
; CHECK: ptrue [[PG:p[0-9]+]].d, vl1
; CHECK-NEXT: fadda d0, [[PG]], d0, z1.d
; CHECK-NEXT: ret
  %res = call double @llvm.vector.reduce.fadd.v1f64(double %start, <1 x double> %a)
  ret double %res
}

; No single instruction NEON support. Use SVE.
define double @fadda_v2f64(double %start, <2 x double> %a) #0 {
; CHECK-LABEL: fadda_v2f64:
; CHECK: ptrue [[PG:p[0-9]+]].d, vl2
; CHECK-NEXT: fadda d0, [[PG]], d0, z1.d
; CHECK-NEXT: ret
  %res = call double @llvm.vector.reduce.fadd.v2f64(double %start, <2 x double> %a)
  ret double %res
}

define double @fadda_v4f64(double %start, <4 x double>* %a) #0 {
; CHECK-LABEL: fadda_v4f64:
; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
; CHECK-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; CHECK-NEXT: fadda d0, [[PG]], d0, [[OP]].d
; CHECK-NEXT: ret
  %op = load <4 x double>, <4 x double>* %a
  %res = call double @llvm.vector.reduce.fadd.v4f64(double %start, <4 x double> %op)
  ret double %res
}

define double @fadda_v8f64(double %start, <8 x double>* %a) #0 {
; CHECK-LABEL: fadda_v8f64:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
; VBITS_GE_512-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: fadda d0, [[PG]], d0, [[OP]].d
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
; VBITS_EQ_256-DAG: ld1d { [[LO:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1d { [[HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
; VBITS_EQ_256-NEXT: fadda d0, [[PG]], d0, [[LO]].d
; VBITS_EQ_256-NEXT: fadda d0, [[PG]], d0, [[HI]].d
; VBITS_EQ_256-NEXT: ret
  %op = load <8 x double>, <8 x double>* %a
  %res = call double @llvm.vector.reduce.fadd.v8f64(double %start, <8 x double> %op)
  ret double %res
}

define double @fadda_v16f64(double %start, <16 x double>* %a) #0 {
; CHECK-LABEL: fadda_v16f64:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
; VBITS_GE_1024-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: fadda d0, [[PG]], d0, [[OP]].d
; VBITS_GE_1024-NEXT: ret
  %op = load <16 x double>, <16 x double>* %a
  %res = call double @llvm.vector.reduce.fadd.v16f64(double %start, <16 x double> %op)
  ret double %res
}

define double @fadda_v32f64(double %start, <32 x double>* %a) #0 {
; CHECK-LABEL: fadda_v32f64:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
; VBITS_GE_2048-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: fadda d0, [[PG]], d0, [[OP]].d
; VBITS_GE_2048-NEXT: ret
  %op = load <32 x double>, <32 x double>* %a
  %res = call double @llvm.vector.reduce.fadd.v32f64(double %start, <32 x double> %op)
  ret double %res
}

;
; FADDV
;
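; FADDV is the unordered (tree) add reduction. All of the calls below carry
; the 'fast' flag, so reassociation is permitted and the cheaper tree
; reduction followed by a single scalar fadd with the start value is legal.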

; No single instruction NEON support for 4 element vectors.
define half @faddv_v4f16(half %start, <4 x half> %a) #0 {
; CHECK-LABEL: faddv_v4f16:
; CHECK: ptrue [[PG:p[0-9]+]].h, vl4
; CHECK-NEXT: faddv [[RDX:h[0-9]+]], [[PG]], z1.h
; CHECK-NEXT: fadd h0, h0, [[RDX]]
; CHECK-NEXT: ret
  %res = call fast half @llvm.vector.reduce.fadd.v4f16(half %start, <4 x half> %a)
  ret half %res
}

; No single instruction NEON support for 8 element vectors.
define half @faddv_v8f16(half %start, <8 x half> %a) #0 {
; CHECK-LABEL: faddv_v8f16:
; CHECK: ptrue [[PG:p[0-9]+]].h, vl8
; CHECK-NEXT: faddv [[RDX:h[0-9]+]], [[PG]], z1.h
; CHECK-NEXT: fadd h0, h0, [[RDX]]
; CHECK-NEXT: ret
  %res = call fast half @llvm.vector.reduce.fadd.v8f16(half %start, <8 x half> %a)
  ret half %res
}

define half @faddv_v16f16(half %start, <16 x half>* %a) #0 {
; CHECK-LABEL: faddv_v16f16:
; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
; CHECK-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; CHECK-NEXT: faddv [[RDX:h[0-9]+]], [[PG]], [[OP]].h
; CHECK-NEXT: fadd h0, h0, [[RDX]]
; CHECK-NEXT: ret
  %op = load <16 x half>, <16 x half>* %a
  %res = call fast half @llvm.vector.reduce.fadd.v16f16(half %start, <16 x half> %op)
  ret half %res
}

define half @faddv_v32f16(half %start, <32 x half>* %a) #0 {
; CHECK-LABEL: faddv_v32f16:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
; VBITS_GE_512-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: faddv [[RDX:h[0-9]+]], [[PG]], [[OP]].h
; VBITS_GE_512-NEXT: fadd h0, h0, [[RDX]]
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16
; VBITS_EQ_256-DAG: ld1h { [[LO:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1h { [[HI:z[0-9]+]].h }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #1]
; VBITS_EQ_256-DAG: fadd [[ADD:z[0-9]+]].h, [[PG]]/m, [[HI]].h, [[LO]].h
; VBITS_EQ_256-DAG: faddv [[RDX:h[0-9]+]], [[PG]], [[ADD]].h
; VBITS_EQ_256-DAG: fadd h0, h0, [[RDX]]
; VBITS_EQ_256-NEXT: ret
  %op = load <32 x half>, <32 x half>* %a
  %res = call fast half @llvm.vector.reduce.fadd.v32f16(half %start, <32 x half> %op)
  ret half %res
}

define half @faddv_v64f16(half %start, <64 x half>* %a) #0 {
; CHECK-LABEL: faddv_v64f16:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
; VBITS_GE_1024-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: faddv [[RDX:h[0-9]+]], [[PG]], [[OP]].h
; VBITS_GE_1024-NEXT: fadd h0, h0, [[RDX]]
; VBITS_GE_1024-NEXT: ret
  %op = load <64 x half>, <64 x half>* %a
  %res = call fast half @llvm.vector.reduce.fadd.v64f16(half %start, <64 x half> %op)
  ret half %res
}

define half @faddv_v128f16(half %start, <128 x half>* %a) #0 {
; CHECK-LABEL: faddv_v128f16:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
; VBITS_GE_2048-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: faddv [[RDX:h[0-9]+]], [[PG]], [[OP]].h
; VBITS_GE_2048-NEXT: fadd h0, h0, [[RDX]]
; VBITS_GE_2048-NEXT: ret
  %op = load <128 x half>, <128 x half>* %a
  %res = call fast half @llvm.vector.reduce.fadd.v128f16(half %start, <128 x half> %op)
  ret half %res
}

; Don't use SVE for 2 element vectors.
define float @faddv_v2f32(float %start, <2 x float> %a) #0 {
; CHECK-LABEL: faddv_v2f32:
; CHECK: faddp s1, v1.2s
; CHECK-NEXT: fadd s0, s0, s1
; CHECK-NEXT: ret
  %res = call fast float @llvm.vector.reduce.fadd.v2f32(float %start, <2 x float> %a)
  ret float %res
}
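; (A two-element reduction is one pairwise add, so NEON's faddp already does
; the whole job in a single instruction and SVE offers no benefit here.)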

; No single instruction NEON support for 4 element vectors.
define float @faddv_v4f32(float %start, <4 x float> %a) #0 {
; CHECK-LABEL: faddv_v4f32:
; CHECK: ptrue [[PG:p[0-9]+]].s, vl4
; CHECK-NEXT: faddv [[RDX:s[0-9]+]], [[PG]], z1.s
; CHECK-NEXT: fadd s0, s0, [[RDX]]
; CHECK-NEXT: ret
  %res = call fast float @llvm.vector.reduce.fadd.v4f32(float %start, <4 x float> %a)
  ret float %res
}

define float @faddv_v8f32(float %start, <8 x float>* %a) #0 {
; CHECK-LABEL: faddv_v8f32:
; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
; CHECK-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; CHECK-NEXT: faddv [[RDX:s[0-9]+]], [[PG]], [[OP]].s
; CHECK-NEXT: fadd s0, s0, [[RDX]]
; CHECK-NEXT: ret
  %op = load <8 x float>, <8 x float>* %a
  %res = call fast float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %op)
  ret float %res
}

define float @faddv_v16f32(float %start, <16 x float>* %a) #0 {
; CHECK-LABEL: faddv_v16f32:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
; VBITS_GE_512-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: faddv [[RDX:s[0-9]+]], [[PG]], [[OP]].s
; VBITS_GE_512-NEXT: fadd s0, s0, [[RDX]]
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
; VBITS_EQ_256-DAG: ld1w { [[LO:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1w { [[HI:z[0-9]+]].s }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #2]
; VBITS_EQ_256-DAG: fadd [[ADD:z[0-9]+]].s, [[PG]]/m, [[HI]].s, [[LO]].s
; VBITS_EQ_256-DAG: faddv [[RDX:s[0-9]+]], [[PG]], [[ADD]].s
; VBITS_EQ_256-DAG: fadd s0, s0, [[RDX]]
; VBITS_EQ_256-NEXT: ret
  %op = load <16 x float>, <16 x float>* %a
  %res = call fast float @llvm.vector.reduce.fadd.v16f32(float %start, <16 x float> %op)
  ret float %res
}

define float @faddv_v32f32(float %start, <32 x float>* %a) #0 {
; CHECK-LABEL: faddv_v32f32:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
; VBITS_GE_1024-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: faddv [[RDX:s[0-9]+]], [[PG]], [[OP]].s
; VBITS_GE_1024-NEXT: fadd s0, s0, [[RDX]]
; VBITS_GE_1024-NEXT: ret
  %op = load <32 x float>, <32 x float>* %a
  %res = call fast float @llvm.vector.reduce.fadd.v32f32(float %start, <32 x float> %op)
  ret float %res
}

define float @faddv_v64f32(float %start, <64 x float>* %a) #0 {
; CHECK-LABEL: faddv_v64f32:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
; VBITS_GE_2048-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: faddv [[RDX:s[0-9]+]], [[PG]], [[OP]].s
; VBITS_GE_2048-NEXT: fadd s0, s0, [[RDX]]
; VBITS_GE_2048-NEXT: ret
  %op = load <64 x float>, <64 x float>* %a
  %res = call fast float @llvm.vector.reduce.fadd.v64f32(float %start, <64 x float> %op)
  ret float %res
}

; Don't use SVE for 1 element vectors.
define double @faddv_v1f64(double %start, <1 x double> %a) #0 {
; CHECK-LABEL: faddv_v1f64:
; CHECK: fadd d0, d0, d1
; CHECK-NEXT: ret
  %res = call fast double @llvm.vector.reduce.fadd.v1f64(double %start, <1 x double> %a)
  ret double %res
}

; Don't use SVE for 2 element vectors.
define double @faddv_v2f64(double %start, <2 x double> %a) #0 {
; CHECK-LABEL: faddv_v2f64:
; CHECK: faddp d1, v1.2d
; CHECK-NEXT: fadd d0, d0, d1
; CHECK-NEXT: ret
  %res = call fast double @llvm.vector.reduce.fadd.v2f64(double %start, <2 x double> %a)
  ret double %res
}

define double @faddv_v4f64(double %start, <4 x double>* %a) #0 {
; CHECK-LABEL: faddv_v4f64:
; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
; CHECK-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; CHECK-NEXT: faddv [[RDX:d[0-9]+]], [[PG]], [[OP]].d
; CHECK-NEXT: fadd d0, d0, [[RDX]]
; CHECK-NEXT: ret
  %op = load <4 x double>, <4 x double>* %a
  %res = call fast double @llvm.vector.reduce.fadd.v4f64(double %start, <4 x double> %op)
  ret double %res
}

define double @faddv_v8f64(double %start, <8 x double>* %a) #0 {
; CHECK-LABEL: faddv_v8f64:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
; VBITS_GE_512-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: faddv [[RDX:d[0-9]+]], [[PG]], [[OP]].d
; VBITS_GE_512-NEXT: fadd d0, d0, [[RDX]]
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
; VBITS_EQ_256-DAG: ld1d { [[LO:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1d { [[HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
; VBITS_EQ_256-DAG: fadd [[ADD:z[0-9]+]].d, [[PG]]/m, [[HI]].d, [[LO]].d
; VBITS_EQ_256-DAG: faddv [[RDX:d[0-9]+]], [[PG]], [[ADD]].d
; VBITS_EQ_256-DAG: fadd d0, d0, [[RDX]]
; VBITS_EQ_256-NEXT: ret
  %op = load <8 x double>, <8 x double>* %a
  %res = call fast double @llvm.vector.reduce.fadd.v8f64(double %start, <8 x double> %op)
  ret double %res
}

define double @faddv_v16f64(double %start, <16 x double>* %a) #0 {
; CHECK-LABEL: faddv_v16f64:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
; VBITS_GE_1024-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: faddv [[RDX:d[0-9]+]], [[PG]], [[OP]].d
; VBITS_GE_1024-NEXT: fadd d0, d0, [[RDX]]
; VBITS_GE_1024-NEXT: ret
  %op = load <16 x double>, <16 x double>* %a
  %res = call fast double @llvm.vector.reduce.fadd.v16f64(double %start, <16 x double> %op)
  ret double %res
}

define double @faddv_v32f64(double %start, <32 x double>* %a) #0 {
; CHECK-LABEL: faddv_v32f64:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
; VBITS_GE_2048-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: faddv [[RDX:d[0-9]+]], [[PG]], [[OP]].d
; VBITS_GE_2048-NEXT: fadd d0, d0, [[RDX]]
; VBITS_GE_2048-NEXT: ret
  %op = load <32 x double>, <32 x double>* %a
  %res = call fast double @llvm.vector.reduce.fadd.v32f64(double %start, <32 x double> %op)
  ret double %res
}

;
; FMAXV
;
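; llvm.vector.reduce.fmax follows llvm.maxnum (IEEE maxNum) semantics: a
; quiet NaN operand is ignored unless every element is NaN. It therefore
; lowers to the NaN-suppressing FMAXNM family rather than FMAX.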

; NEON FMAXNMV handles 16-bit vectors directly (+sve implies +fullfp16).
define half @fmaxv_v4f16(<4 x half> %a) #0 {
; CHECK-LABEL: fmaxv_v4f16:
; CHECK: fmaxnmv h0, v0.4h
; CHECK-NEXT: ret
  %res = call half @llvm.vector.reduce.fmax.v4f16(<4 x half> %a)
  ret half %res
}

; NEON FMAXNMV handles 16-bit vectors directly (+sve implies +fullfp16).
define half @fmaxv_v8f16(<8 x half> %a) #0 {
; CHECK-LABEL: fmaxv_v8f16:
; CHECK: fmaxnmv h0, v0.8h
; CHECK-NEXT: ret
  %res = call half @llvm.vector.reduce.fmax.v8f16(<8 x half> %a)
  ret half %res
}

define half @fmaxv_v16f16(<16 x half>* %a) #0 {
; CHECK-LABEL: fmaxv_v16f16:
; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
; CHECK-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; CHECK-NEXT: fmaxnmv h0, [[PG]], [[OP]].h
; CHECK-NEXT: ret
  %op = load <16 x half>, <16 x half>* %a
  %res = call half @llvm.vector.reduce.fmax.v16f16(<16 x half> %op)
  ret half %res
}

define half @fmaxv_v32f16(<32 x half>* %a) #0 {
; CHECK-LABEL: fmaxv_v32f16:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
; VBITS_GE_512-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: fmaxnmv h0, [[PG]], [[OP]].h
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16
; VBITS_EQ_256-DAG: ld1h { [[LO:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1h { [[HI:z[0-9]+]].h }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #1]
; VBITS_EQ_256-DAG: fmaxnm [[MAX:z[0-9]+]].h, [[PG]]/m, [[HI]].h, [[LO]].h
; VBITS_EQ_256-DAG: fmaxnmv h0, [[PG]], [[MAX]].h
; VBITS_EQ_256-NEXT: ret
  %op = load <32 x half>, <32 x half>* %a
  %res = call half @llvm.vector.reduce.fmax.v32f16(<32 x half> %op)
  ret half %res
}

define half @fmaxv_v64f16(<64 x half>* %a) #0 {
; CHECK-LABEL: fmaxv_v64f16:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
; VBITS_GE_1024-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: fmaxnmv h0, [[PG]], [[OP]].h
; VBITS_GE_1024-NEXT: ret
  %op = load <64 x half>, <64 x half>* %a
  %res = call half @llvm.vector.reduce.fmax.v64f16(<64 x half> %op)
  ret half %res
}

define half @fmaxv_v128f16(<128 x half>* %a) #0 {
; CHECK-LABEL: fmaxv_v128f16:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
; VBITS_GE_2048-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: fmaxnmv h0, [[PG]], [[OP]].h
; VBITS_GE_2048-NEXT: ret
  %op = load <128 x half>, <128 x half>* %a
  %res = call half @llvm.vector.reduce.fmax.v128f16(<128 x half> %op)
  ret half %res
}

; Don't use SVE for 64-bit f32 vectors.
define float @fmaxv_v2f32(<2 x float> %a) #0 {
; CHECK-LABEL: fmaxv_v2f32:
; CHECK: fmaxnmp s0, v0.2s
; CHECK: ret
  %res = call float @llvm.vector.reduce.fmax.v2f32(<2 x float> %a)
  ret float %res
}

; Don't use SVE for 128-bit f32 vectors.
define float @fmaxv_v4f32(<4 x float> %a) #0 {
; CHECK-LABEL: fmaxv_v4f32:
; CHECK: fmaxnmv s0, v0.4s
; CHECK: ret
  %res = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> %a)
  ret float %res
}

define float @fmaxv_v8f32(<8 x float>* %a) #0 {
; CHECK-LABEL: fmaxv_v8f32:
; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
; CHECK-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; CHECK-NEXT: fmaxnmv s0, [[PG]], [[OP]].s
; CHECK-NEXT: ret
  %op = load <8 x float>, <8 x float>* %a
  %res = call float @llvm.vector.reduce.fmax.v8f32(<8 x float> %op)
  ret float %res
}

define float @fmaxv_v16f32(<16 x float>* %a) #0 {
; CHECK-LABEL: fmaxv_v16f32:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
; VBITS_GE_512-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: fmaxnmv s0, [[PG]], [[OP]].s
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
; VBITS_EQ_256-DAG: ld1w { [[LO:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1w { [[HI:z[0-9]+]].s }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #2]
; VBITS_EQ_256-DAG: fmaxnm [[MAX:z[0-9]+]].s, [[PG]]/m, [[HI]].s, [[LO]].s
; VBITS_EQ_256-DAG: fmaxnmv s0, [[PG]], [[MAX]].s
; VBITS_EQ_256-NEXT: ret
  %op = load <16 x float>, <16 x float>* %a
  %res = call float @llvm.vector.reduce.fmax.v16f32(<16 x float> %op)
  ret float %res
}

define float @fmaxv_v32f32(<32 x float>* %a) #0 {
; CHECK-LABEL: fmaxv_v32f32:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
; VBITS_GE_1024-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: fmaxnmv s0, [[PG]], [[OP]].s
; VBITS_GE_1024-NEXT: ret
  %op = load <32 x float>, <32 x float>* %a
  %res = call float @llvm.vector.reduce.fmax.v32f32(<32 x float> %op)
  ret float %res
}

define float @fmaxv_v64f32(<64 x float>* %a) #0 {
; CHECK-LABEL: fmaxv_v64f32:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
; VBITS_GE_2048-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: fmaxnmv s0, [[PG]], [[OP]].s
; VBITS_GE_2048-NEXT: ret
  %op = load <64 x float>, <64 x float>* %a
  %res = call float @llvm.vector.reduce.fmax.v64f32(<64 x float> %op)
  ret float %res
}

; Nothing to do for single element vectors.
define double @fmaxv_v1f64(<1 x double> %a) #0 {
; CHECK-LABEL: fmaxv_v1f64:
; CHECK-NOT: fmax
; CHECK: ret
  %res = call double @llvm.vector.reduce.fmax.v1f64(<1 x double> %a)
  ret double %res
}
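; (For <1 x double> the reduction is the identity: the sole element already
; arrives in d0 under the AAPCS, so no instruction is needed, hence the
; negative check above.)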

; Don't use SVE for 128-bit f64 vectors.
define double @fmaxv_v2f64(<2 x double> %a) #0 {
; CHECK-LABEL: fmaxv_v2f64:
; CHECK: fmaxnmp d0, v0.2d
; CHECK-NEXT: ret
  %res = call double @llvm.vector.reduce.fmax.v2f64(<2 x double> %a)
  ret double %res
}

define double @fmaxv_v4f64(<4 x double>* %a) #0 {
; CHECK-LABEL: fmaxv_v4f64:
; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
; CHECK-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; CHECK-NEXT: fmaxnmv d0, [[PG]], [[OP]].d
; CHECK-NEXT: ret
  %op = load <4 x double>, <4 x double>* %a
  %res = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> %op)
  ret double %res
}

define double @fmaxv_v8f64(<8 x double>* %a) #0 {
; CHECK-LABEL: fmaxv_v8f64:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
; VBITS_GE_512-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: fmaxnmv d0, [[PG]], [[OP]].d
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
; VBITS_EQ_256-DAG: ld1d { [[LO:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1d { [[HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
; VBITS_EQ_256-DAG: fmaxnm [[MAX:z[0-9]+]].d, [[PG]]/m, [[HI]].d, [[LO]].d
; VBITS_EQ_256-DAG: fmaxnmv d0, [[PG]], [[MAX]].d
; VBITS_EQ_256-NEXT: ret
  %op = load <8 x double>, <8 x double>* %a
  %res = call double @llvm.vector.reduce.fmax.v8f64(<8 x double> %op)
  ret double %res
}

define double @fmaxv_v16f64(<16 x double>* %a) #0 {
; CHECK-LABEL: fmaxv_v16f64:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
; VBITS_GE_1024-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: fmaxnmv d0, [[PG]], [[OP]].d
; VBITS_GE_1024-NEXT: ret
  %op = load <16 x double>, <16 x double>* %a
  %res = call double @llvm.vector.reduce.fmax.v16f64(<16 x double> %op)
  ret double %res
}

define double @fmaxv_v32f64(<32 x double>* %a) #0 {
; CHECK-LABEL: fmaxv_v32f64:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
; VBITS_GE_2048-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: fmaxnmv d0, [[PG]], [[OP]].d
; VBITS_GE_2048-NEXT: ret
  %op = load <32 x double>, <32 x double>* %a
  %res = call double @llvm.vector.reduce.fmax.v32f64(<32 x double> %op)
  ret double %res
}

;
; FMINV
;
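; As with FMAXV above, llvm.vector.reduce.fmin follows llvm.minnum (IEEE
; minNum) semantics and so maps to the FMINNM family.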

; NEON FMINNMV handles 16-bit vectors directly (+sve implies +fullfp16).
define half @fminv_v4f16(<4 x half> %a) #0 {
; CHECK-LABEL: fminv_v4f16:
; CHECK: fminnmv h0, v0.4h
; CHECK-NEXT: ret
  %res = call half @llvm.vector.reduce.fmin.v4f16(<4 x half> %a)
  ret half %res
}

; NEON FMINNMV handles 16-bit vectors directly (+sve implies +fullfp16).
define half @fminv_v8f16(<8 x half> %a) #0 {
; CHECK-LABEL: fminv_v8f16:
; CHECK: fminnmv h0, v0.8h
; CHECK-NEXT: ret
  %res = call half @llvm.vector.reduce.fmin.v8f16(<8 x half> %a)
  ret half %res
}

define half @fminv_v16f16(<16 x half>* %a) #0 {
; CHECK-LABEL: fminv_v16f16:
; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
; CHECK-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; CHECK-NEXT: fminnmv h0, [[PG]], [[OP]].h
; CHECK-NEXT: ret
  %op = load <16 x half>, <16 x half>* %a
  %res = call half @llvm.vector.reduce.fmin.v16f16(<16 x half> %op)
  ret half %res
}

define half @fminv_v32f16(<32 x half>* %a) #0 {
; CHECK-LABEL: fminv_v32f16:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
; VBITS_GE_512-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: fminnmv h0, [[PG]], [[OP]].h
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16
; VBITS_EQ_256-DAG: ld1h { [[LO:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1h { [[HI:z[0-9]+]].h }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #1]
; VBITS_EQ_256-DAG: fminnm [[MIN:z[0-9]+]].h, [[PG]]/m, [[HI]].h, [[LO]].h
; VBITS_EQ_256-DAG: fminnmv h0, [[PG]], [[MIN]].h
; VBITS_EQ_256-NEXT: ret
  %op = load <32 x half>, <32 x half>* %a
  %res = call half @llvm.vector.reduce.fmin.v32f16(<32 x half> %op)
  ret half %res
}

define half @fminv_v64f16(<64 x half>* %a) #0 {
; CHECK-LABEL: fminv_v64f16:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
; VBITS_GE_1024-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: fminnmv h0, [[PG]], [[OP]].h
; VBITS_GE_1024-NEXT: ret
  %op = load <64 x half>, <64 x half>* %a
  %res = call half @llvm.vector.reduce.fmin.v64f16(<64 x half> %op)
  ret half %res
}

define half @fminv_v128f16(<128 x half>* %a) #0 {
; CHECK-LABEL: fminv_v128f16:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
; VBITS_GE_2048-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: fminnmv h0, [[PG]], [[OP]].h
; VBITS_GE_2048-NEXT: ret
  %op = load <128 x half>, <128 x half>* %a
  %res = call half @llvm.vector.reduce.fmin.v128f16(<128 x half> %op)
  ret half %res
}

; Don't use SVE for 64-bit f32 vectors.
define float @fminv_v2f32(<2 x float> %a) #0 {
; CHECK-LABEL: fminv_v2f32:
; CHECK: fminnmp s0, v0.2s
; CHECK: ret
  %res = call float @llvm.vector.reduce.fmin.v2f32(<2 x float> %a)
  ret float %res
}

; Don't use SVE for 128-bit f32 vectors.
define float @fminv_v4f32(<4 x float> %a) #0 {
; CHECK-LABEL: fminv_v4f32:
; CHECK: fminnmv s0, v0.4s
; CHECK: ret
  %res = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> %a)
  ret float %res
}

define float @fminv_v8f32(<8 x float>* %a) #0 {
; CHECK-LABEL: fminv_v8f32:
; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
; CHECK-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; CHECK-NEXT: fminnmv s0, [[PG]], [[OP]].s
; CHECK-NEXT: ret
  %op = load <8 x float>, <8 x float>* %a
  %res = call float @llvm.vector.reduce.fmin.v8f32(<8 x float> %op)
  ret float %res
}

define float @fminv_v16f32(<16 x float>* %a) #0 {
; CHECK-LABEL: fminv_v16f32:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
; VBITS_GE_512-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: fminnmv s0, [[PG]], [[OP]].s
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
; VBITS_EQ_256-DAG: ld1w { [[LO:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1w { [[HI:z[0-9]+]].s }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #2]
; VBITS_EQ_256-DAG: fminnm [[MIN:z[0-9]+]].s, [[PG]]/m, [[HI]].s, [[LO]].s
; VBITS_EQ_256-DAG: fminnmv s0, [[PG]], [[MIN]].s
; VBITS_EQ_256-NEXT: ret
  %op = load <16 x float>, <16 x float>* %a
  %res = call float @llvm.vector.reduce.fmin.v16f32(<16 x float> %op)
  ret float %res
}

define float @fminv_v32f32(<32 x float>* %a) #0 {
; CHECK-LABEL: fminv_v32f32:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
; VBITS_GE_1024-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: fminnmv s0, [[PG]], [[OP]].s
; VBITS_GE_1024-NEXT: ret
  %op = load <32 x float>, <32 x float>* %a
  %res = call float @llvm.vector.reduce.fmin.v32f32(<32 x float> %op)
  ret float %res
}

define float @fminv_v64f32(<64 x float>* %a) #0 {
; CHECK-LABEL: fminv_v64f32:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
; VBITS_GE_2048-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: fminnmv s0, [[PG]], [[OP]].s
; VBITS_GE_2048-NEXT: ret
  %op = load <64 x float>, <64 x float>* %a
  %res = call float @llvm.vector.reduce.fmin.v64f32(<64 x float> %op)
  ret float %res
}

; Nothing to do for single element vectors.
define double @fminv_v1f64(<1 x double> %a) #0 {
; CHECK-LABEL: fminv_v1f64:
; CHECK-NOT: fmin
; CHECK: ret
  %res = call double @llvm.vector.reduce.fmin.v1f64(<1 x double> %a)
  ret double %res
}

; Don't use SVE for 128-bit f64 vectors.
define double @fminv_v2f64(<2 x double> %a) #0 {
; CHECK-LABEL: fminv_v2f64:
; CHECK: fminnmp d0, v0.2d
; CHECK-NEXT: ret
  %res = call double @llvm.vector.reduce.fmin.v2f64(<2 x double> %a)
  ret double %res
}

define double @fminv_v4f64(<4 x double>* %a) #0 {
; CHECK-LABEL: fminv_v4f64:
; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
; CHECK-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; CHECK-NEXT: fminnmv d0, [[PG]], [[OP]].d
; CHECK-NEXT: ret
  %op = load <4 x double>, <4 x double>* %a
  %res = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> %op)
  ret double %res
}

define double @fminv_v8f64(<8 x double>* %a) #0 {
; CHECK-LABEL: fminv_v8f64:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
; VBITS_GE_512-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: fminnmv d0, [[PG]], [[OP]].d
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
; VBITS_EQ_256-DAG: ld1d { [[LO:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1d { [[HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
; VBITS_EQ_256-DAG: fminnm [[MIN:z[0-9]+]].d, [[PG]]/m, [[HI]].d, [[LO]].d
; VBITS_EQ_256-DAG: fminnmv d0, [[PG]], [[MIN]].d
; VBITS_EQ_256-NEXT: ret
  %op = load <8 x double>, <8 x double>* %a
  %res = call double @llvm.vector.reduce.fmin.v8f64(<8 x double> %op)
  ret double %res
}

define double @fminv_v16f64(<16 x double>* %a) #0 {
; CHECK-LABEL: fminv_v16f64:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
; VBITS_GE_1024-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: fminnmv d0, [[PG]], [[OP]].d
; VBITS_GE_1024-NEXT: ret
  %op = load <16 x double>, <16 x double>* %a
  %res = call double @llvm.vector.reduce.fmin.v16f64(<16 x double> %op)
  ret double %res
}

define double @fminv_v32f64(<32 x double>* %a) #0 {
; CHECK-LABEL: fminv_v32f64:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
; VBITS_GE_2048-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: fminnmv d0, [[PG]], [[OP]].d
; VBITS_GE_2048-NEXT: ret
  %op = load <32 x double>, <32 x double>* %a
  %res = call double @llvm.vector.reduce.fmin.v32f64(<32 x double> %op)
  ret double %res
}

attributes #0 = { "target-features"="+sve" }
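; The tests only require "+sve"; in LLVM this feature implicitly enables
; "+fullfp16", which the NEON f16 FMAXNMV/FMINNMV checks above rely on.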

declare half @llvm.vector.reduce.fadd.v4f16(half, <4 x half>)
declare half @llvm.vector.reduce.fadd.v8f16(half, <8 x half>)
declare half @llvm.vector.reduce.fadd.v16f16(half, <16 x half>)
declare half @llvm.vector.reduce.fadd.v32f16(half, <32 x half>)
declare half @llvm.vector.reduce.fadd.v64f16(half, <64 x half>)
declare half @llvm.vector.reduce.fadd.v128f16(half, <128 x half>)

declare float @llvm.vector.reduce.fadd.v2f32(float, <2 x float>)
declare float @llvm.vector.reduce.fadd.v4f32(float, <4 x float>)
declare float @llvm.vector.reduce.fadd.v8f32(float, <8 x float>)
declare float @llvm.vector.reduce.fadd.v16f32(float, <16 x float>)
declare float @llvm.vector.reduce.fadd.v32f32(float, <32 x float>)
declare float @llvm.vector.reduce.fadd.v64f32(float, <64 x float>)

declare double @llvm.vector.reduce.fadd.v1f64(double, <1 x double>)
declare double @llvm.vector.reduce.fadd.v2f64(double, <2 x double>)
declare double @llvm.vector.reduce.fadd.v4f64(double, <4 x double>)
declare double @llvm.vector.reduce.fadd.v8f64(double, <8 x double>)
declare double @llvm.vector.reduce.fadd.v16f64(double, <16 x double>)
declare double @llvm.vector.reduce.fadd.v32f64(double, <32 x double>)

declare half @llvm.vector.reduce.fmax.v4f16(<4 x half>)
declare half @llvm.vector.reduce.fmax.v8f16(<8 x half>)
declare half @llvm.vector.reduce.fmax.v16f16(<16 x half>)
declare half @llvm.vector.reduce.fmax.v32f16(<32 x half>)
declare half @llvm.vector.reduce.fmax.v64f16(<64 x half>)
declare half @llvm.vector.reduce.fmax.v128f16(<128 x half>)

declare float @llvm.vector.reduce.fmax.v2f32(<2 x float>)
declare float @llvm.vector.reduce.fmax.v4f32(<4 x float>)
declare float @llvm.vector.reduce.fmax.v8f32(<8 x float>)
declare float @llvm.vector.reduce.fmax.v16f32(<16 x float>)
declare float @llvm.vector.reduce.fmax.v32f32(<32 x float>)
declare float @llvm.vector.reduce.fmax.v64f32(<64 x float>)

declare double @llvm.vector.reduce.fmax.v1f64(<1 x double>)
declare double @llvm.vector.reduce.fmax.v2f64(<2 x double>)
declare double @llvm.vector.reduce.fmax.v4f64(<4 x double>)
declare double @llvm.vector.reduce.fmax.v8f64(<8 x double>)
declare double @llvm.vector.reduce.fmax.v16f64(<16 x double>)
declare double @llvm.vector.reduce.fmax.v32f64(<32 x double>)

declare half @llvm.vector.reduce.fmin.v4f16(<4 x half>)
declare half @llvm.vector.reduce.fmin.v8f16(<8 x half>)
declare half @llvm.vector.reduce.fmin.v16f16(<16 x half>)
declare half @llvm.vector.reduce.fmin.v32f16(<32 x half>)
declare half @llvm.vector.reduce.fmin.v64f16(<64 x half>)
declare half @llvm.vector.reduce.fmin.v128f16(<128 x half>)

declare float @llvm.vector.reduce.fmin.v2f32(<2 x float>)
declare float @llvm.vector.reduce.fmin.v4f32(<4 x float>)
declare float @llvm.vector.reduce.fmin.v8f32(<8 x float>)
declare float @llvm.vector.reduce.fmin.v16f32(<16 x float>)
declare float @llvm.vector.reduce.fmin.v32f32(<32 x float>)
declare float @llvm.vector.reduce.fmin.v64f32(<64 x float>)

declare double @llvm.vector.reduce.fmin.v1f64(<1 x double>)
declare double @llvm.vector.reduce.fmin.v2f64(<2 x double>)
declare double @llvm.vector.reduce.fmin.v4f64(<4 x double>)
declare double @llvm.vector.reduce.fmin.v8f64(<8 x double>)
declare double @llvm.vector.reduce.fmin.v16f64(<16 x double>)
declare double @llvm.vector.reduce.fmin.v32f64(<32 x double>)