; RUN: llc -aarch64-sve-vector-bits-min=128  -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=16 -check-prefix=NO_SVE
; RUN: llc -aarch64-sve-vector-bits-min=256  -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_EQ_256
; RUN: llc -aarch64-sve-vector-bits-min=384  -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK
; RUN: llc -aarch64-sve-vector-bits-min=512  -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=640  -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=768  -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=896  -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=1024 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1152 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1280 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1408 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1536 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1664 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1792 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1920 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=2048 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=256 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048
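
; Each RUN line states the minimum vector length codegen may assume, and the
; FileCheck prefixes accumulate so that wider runs also verify every narrower
; VBITS_GE_* pattern. For minimums that are not a power of two (384, 640,
; etc.) only the largest power-of-two width below them can be assumed, which
; is why -D#VBYTES rounds down (e.g. 384 bits -> 32 bytes).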

target triple = "aarch64-unknown-linux-gnu"

; Don't use SVE when its registers are no bigger than NEON.
; NO_SVE-NOT: ptrue
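; With only a 128-bit guarantee, fixed-length SVE lowering has nothing to
; offer over NEON, so this run just checks that no SVE predicate is ever
; materialised.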

;
; UADDV
;

; Don't use SVE for 64-bit vectors.
define i8 @uaddv_v8i8(<8 x i8> %a) #0 {
; CHECK-LABEL: uaddv_v8i8:
; CHECK: addv b0, v0.8b
; CHECK: ret
  %res = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %a)
  ret i8 %res
}

; Don't use SVE for 128-bit vectors.
define i8 @uaddv_v16i8(<16 x i8> %a) #0 {
; CHECK-LABEL: uaddv_v16i8:
; CHECK: addv b0, v0.16b
; CHECK: ret
  %res = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %a)
  ret i8 %res
}

define i8 @uaddv_v32i8(<32 x i8>* %a) #0 {
; CHECK-LABEL: uaddv_v32i8:
; CHECK: ptrue [[PG:p[0-9]+]].b, vl32
; CHECK-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
; CHECK-NEXT: uaddv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].b
; CHECK-NEXT: fmov x0, [[REDUCE]]
; CHECK-NEXT: ret
  %op = load <32 x i8>, <32 x i8>* %a
  %res = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> %op)
  ret i8 %res
}

define i8 @uaddv_v64i8(<64 x i8>* %a) #0 {
; CHECK-LABEL: uaddv_v64i8:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].b, vl64
; VBITS_GE_512-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: uaddv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].b
; VBITS_GE_512-NEXT: fmov x0, [[REDUCE]]
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].b, vl32
; VBITS_EQ_256-DAG: mov w[[NUMELTS:[0-9]+]], #32
; VBITS_EQ_256-DAG: ld1b { [[LO:z[0-9]+]].b }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1b { [[HI:z[0-9]+]].b }, [[PG]]/z, [x0, x[[NUMELTS]]]
; VBITS_EQ_256-DAG: add [[ADD:z[0-9]+]].b, [[PG]]/m, [[HI]].b, [[LO]].b
; VBITS_EQ_256-DAG: uaddv [[REDUCE:d[0-9]+]], [[PG]], [[ADD]].b
; VBITS_EQ_256-NEXT: fmov x0, [[REDUCE]]
; VBITS_EQ_256-NEXT: ret
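; (On a 256-bit implementation the <64 x i8> operand is legalised as two
; VL-32 halves: the high half is loaded at the element-count offset
; materialised above, both halves are combined with a single predicated add,
; and one uaddv reduces the result. The VBITS_EQ_256 blocks below follow the
; same pattern for the wider element types.)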
  %op = load <64 x i8>, <64 x i8>* %a
  %res = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> %op)
  ret i8 %res
}

define i8 @uaddv_v128i8(<128 x i8>* %a) #0 {
; CHECK-LABEL: uaddv_v128i8:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].b, vl128
; VBITS_GE_1024-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: uaddv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].b
; VBITS_GE_1024-NEXT: fmov x0, [[REDUCE]]
; VBITS_GE_1024-NEXT: ret
  %op = load <128 x i8>, <128 x i8>* %a
  %res = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> %op)
  ret i8 %res
}

define i8 @uaddv_v256i8(<256 x i8>* %a) #0 {
; CHECK-LABEL: uaddv_v256i8:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].b, vl256
; VBITS_GE_2048-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: uaddv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].b
; VBITS_GE_2048-NEXT: fmov x0, [[REDUCE]]
; VBITS_GE_2048-NEXT: ret
  %op = load <256 x i8>, <256 x i8>* %a
  %res = call i8 @llvm.vector.reduce.add.v256i8(<256 x i8> %op)
  ret i8 %res
}

; Don't use SVE for 64-bit vectors.
define i16 @uaddv_v4i16(<4 x i16> %a) #0 {
; CHECK-LABEL: uaddv_v4i16:
; CHECK: addv h0, v0.4h
; CHECK: ret
  %res = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %a)
  ret i16 %res
}

; Don't use SVE for 128-bit vectors.
define i16 @uaddv_v8i16(<8 x i16> %a) #0 {
; CHECK-LABEL: uaddv_v8i16:
; CHECK: addv h0, v0.8h
; CHECK: ret
  %res = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %a)
  ret i16 %res
}

define i16 @uaddv_v16i16(<16 x i16>* %a) #0 {
; CHECK-LABEL: uaddv_v16i16:
; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
; CHECK-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; CHECK-NEXT: uaddv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].h
; CHECK-NEXT: fmov x0, [[REDUCE]]
; CHECK-NEXT: ret
  %op = load <16 x i16>, <16 x i16>* %a
  %res = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %op)
  ret i16 %res
}

define i16 @uaddv_v32i16(<32 x i16>* %a) #0 {
; CHECK-LABEL: uaddv_v32i16:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
; VBITS_GE_512-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: uaddv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].h
; VBITS_GE_512-NEXT: fmov x0, [[REDUCE]]
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16
; VBITS_EQ_256-DAG: ld1h { [[LO:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1h { [[HI:z[0-9]+]].h }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #1]
; VBITS_EQ_256-DAG: add [[ADD:z[0-9]+]].h, [[PG]]/m, [[HI]].h, [[LO]].h
; VBITS_EQ_256-DAG: uaddv [[REDUCE:d[0-9]+]], [[PG]], [[ADD]].h
; VBITS_EQ_256-NEXT: fmov x0, [[REDUCE]]
; VBITS_EQ_256-NEXT: ret
  %op = load <32 x i16>, <32 x i16>* %a
  %res = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> %op)
  ret i16 %res
}

define i16 @uaddv_v64i16(<64 x i16>* %a) #0 {
; CHECK-LABEL: uaddv_v64i16:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
; VBITS_GE_1024-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: uaddv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].h
; VBITS_GE_1024-NEXT: fmov x0, [[REDUCE]]
; VBITS_GE_1024-NEXT: ret
  %op = load <64 x i16>, <64 x i16>* %a
  %res = call i16 @llvm.vector.reduce.add.v64i16(<64 x i16> %op)
  ret i16 %res
}

define i16 @uaddv_v128i16(<128 x i16>* %a) #0 {
; CHECK-LABEL: uaddv_v128i16:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
; VBITS_GE_2048-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: uaddv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].h
; VBITS_GE_2048-NEXT: fmov x0, [[REDUCE]]
; VBITS_GE_2048-NEXT: ret
  %op = load <128 x i16>, <128 x i16>* %a
  %res = call i16 @llvm.vector.reduce.add.v128i16(<128 x i16> %op)
  ret i16 %res
}

; Don't use SVE for 64-bit vectors.
define i32 @uaddv_v2i32(<2 x i32> %a) #0 {
; CHECK-LABEL: uaddv_v2i32:
; CHECK: addp v0.2s, v0.2s
; CHECK: ret
  %res = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a)
  ret i32 %res
}

; Don't use SVE for 128-bit vectors.
define i32 @uaddv_v4i32(<4 x i32> %a) #0 {
; CHECK-LABEL: uaddv_v4i32:
; CHECK: addv s0, v0.4s
; CHECK: ret
  %res = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a)
  ret i32 %res
}

define i32 @uaddv_v8i32(<8 x i32>* %a) #0 {
; CHECK-LABEL: uaddv_v8i32:
; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
; CHECK-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; CHECK-NEXT: uaddv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].s
; CHECK-NEXT: fmov x0, [[REDUCE]]
; CHECK-NEXT: ret
  %op = load <8 x i32>, <8 x i32>* %a
  %res = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %op)
  ret i32 %res
}

define i32 @uaddv_v16i32(<16 x i32>* %a) #0 {
; CHECK-LABEL: uaddv_v16i32:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
; VBITS_GE_512-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: uaddv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].s
; VBITS_GE_512-NEXT: fmov x0, [[REDUCE]]
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
; VBITS_EQ_256-DAG: ld1w { [[LO:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1w { [[HI:z[0-9]+]].s }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #2]
; VBITS_EQ_256-DAG: add [[ADD:z[0-9]+]].s, [[PG]]/m, [[HI]].s, [[LO]].s
; VBITS_EQ_256-DAG: uaddv [[REDUCE:d[0-9]+]], [[PG]], [[ADD]].s
; VBITS_EQ_256-NEXT: fmov x0, [[REDUCE]]
; VBITS_EQ_256-NEXT: ret
  %op = load <16 x i32>, <16 x i32>* %a
  %res = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %op)
  ret i32 %res
}

define i32 @uaddv_v32i32(<32 x i32>* %a) #0 {
; CHECK-LABEL: uaddv_v32i32:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
; VBITS_GE_1024-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: uaddv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].s
; VBITS_GE_1024-NEXT: fmov x0, [[REDUCE]]
; VBITS_GE_1024-NEXT: ret
  %op = load <32 x i32>, <32 x i32>* %a
  %res = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %op)
  ret i32 %res
}

define i32 @uaddv_v64i32(<64 x i32>* %a) #0 {
; CHECK-LABEL: uaddv_v64i32:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
; VBITS_GE_2048-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: uaddv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].s
; VBITS_GE_2048-NEXT: fmov x0, [[REDUCE]]
; VBITS_GE_2048-NEXT: ret
  %op = load <64 x i32>, <64 x i32>* %a
  %res = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> %op)
  ret i32 %res
}

; Nothing to do for single element vectors.
define i64 @uaddv_v1i64(<1 x i64> %a) #0 {
; CHECK-LABEL: uaddv_v1i64:
; CHECK: fmov x0, d0
; CHECK: ret
  %res = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a)
  ret i64 %res
}

; Don't use SVE for 128-bit vectors.
define i64 @uaddv_v2i64(<2 x i64> %a) #0 {
; CHECK-LABEL: uaddv_v2i64:
; CHECK: addp d0, v0.2d
; CHECK: ret
  %res = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a)
  ret i64 %res
}

define i64 @uaddv_v4i64(<4 x i64>* %a) #0 {
; CHECK-LABEL: uaddv_v4i64:
; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
; CHECK-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; CHECK-NEXT: uaddv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
; CHECK-NEXT: fmov x0, [[REDUCE]]
; CHECK-NEXT: ret
  %op = load <4 x i64>, <4 x i64>* %a
  %res = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %op)
  ret i64 %res
}

define i64 @uaddv_v8i64(<8 x i64>* %a) #0 {
; CHECK-LABEL: uaddv_v8i64:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
; VBITS_GE_512-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: uaddv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
; VBITS_GE_512-NEXT: fmov x0, [[REDUCE]]
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
; VBITS_EQ_256-DAG: ld1d { [[LO:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1d { [[HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
; VBITS_EQ_256-DAG: add [[ADD:z[0-9]+]].d, [[PG]]/m, [[HI]].d, [[LO]].d
; VBITS_EQ_256-DAG: uaddv [[REDUCE:d[0-9]+]], [[PG]], [[ADD]].d
; VBITS_EQ_256-NEXT: fmov x0, [[REDUCE]]
; VBITS_EQ_256-NEXT: ret
  %op = load <8 x i64>, <8 x i64>* %a
  %res = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %op)
  ret i64 %res
}

define i64 @uaddv_v16i64(<16 x i64>* %a) #0 {
; CHECK-LABEL: uaddv_v16i64:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
; VBITS_GE_1024-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: uaddv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
; VBITS_GE_1024-NEXT: fmov x0, [[REDUCE]]
; VBITS_GE_1024-NEXT: ret
  %op = load <16 x i64>, <16 x i64>* %a
  %res = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %op)
  ret i64 %res
}

define i64 @uaddv_v32i64(<32 x i64>* %a) #0 {
; CHECK-LABEL: uaddv_v32i64:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
; VBITS_GE_2048-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: uaddv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
; VBITS_GE_2048-NEXT: fmov x0, [[REDUCE]]
; VBITS_GE_2048-NEXT: ret
  %op = load <32 x i64>, <32 x i64>* %a
  %res = call i64 @llvm.vector.reduce.add.v32i64(<32 x i64> %op)
  ret i64 %res
}
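
; Note that SVE UADDV always accumulates into a 64-bit scalar in a D register,
; which is why every uaddv test above reads its result with "fmov x0"
; regardless of element type. The min/max reductions below instead produce an
; element-sized result, read via "fmov w0" for sub-doubleword elements.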

;
; SMAXV
;

; Don't use SVE for 64-bit vectors.
define i8 @smaxv_v8i8(<8 x i8> %a) #0 {
; CHECK-LABEL: smaxv_v8i8:
; CHECK: smaxv b0, v0.8b
; CHECK: ret
  %res = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> %a)
  ret i8 %res
}

; Don't use SVE for 128-bit vectors.
define i8 @smaxv_v16i8(<16 x i8> %a) #0 {
; CHECK-LABEL: smaxv_v16i8:
; CHECK: smaxv b0, v0.16b
; CHECK: ret
  %res = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> %a)
  ret i8 %res
}

define i8 @smaxv_v32i8(<32 x i8>* %a) #0 {
; CHECK-LABEL: smaxv_v32i8:
; CHECK: ptrue [[PG:p[0-9]+]].b, vl32
; CHECK-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
; CHECK-NEXT: smaxv b[[REDUCE:[0-9]+]], [[PG]], [[OP]].b
; CHECK-NEXT: fmov w0, s[[REDUCE]]
; CHECK-NEXT: ret
  %op = load <32 x i8>, <32 x i8>* %a
  %res = call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> %op)
  ret i8 %res
}

define i8 @smaxv_v64i8(<64 x i8>* %a) #0 {
; CHECK-LABEL: smaxv_v64i8:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].b, vl64
; VBITS_GE_512-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: smaxv b[[REDUCE:[0-9]+]], [[PG]], [[OP]].b
; VBITS_GE_512-NEXT: fmov w0, s[[REDUCE]]
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].b, vl32
; VBITS_EQ_256-DAG: mov w[[NUMELTS:[0-9]+]], #32
; VBITS_EQ_256-DAG: ld1b { [[LO:z[0-9]+]].b }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1b { [[HI:z[0-9]+]].b }, [[PG]]/z, [x0, x[[NUMELTS]]]
; VBITS_EQ_256-DAG: smax [[MAX:z[0-9]+]].b, [[PG]]/m, [[HI]].b, [[LO]].b
; VBITS_EQ_256-DAG: smaxv b[[REDUCE:[0-9]+]], [[PG]], [[MAX]].b
; VBITS_EQ_256-NEXT: fmov w0, s[[REDUCE]]
; VBITS_EQ_256-NEXT: ret
  %op = load <64 x i8>, <64 x i8>* %a
  %res = call i8 @llvm.vector.reduce.smax.v64i8(<64 x i8> %op)
  ret i8 %res
}

define i8 @smaxv_v128i8(<128 x i8>* %a) #0 {
; CHECK-LABEL: smaxv_v128i8:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].b, vl128
; VBITS_GE_1024-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: smaxv b[[REDUCE:[0-9]+]], [[PG]], [[OP]].b
; VBITS_GE_1024-NEXT: fmov w0, s[[REDUCE]]
; VBITS_GE_1024-NEXT: ret
  %op = load <128 x i8>, <128 x i8>* %a
  %res = call i8 @llvm.vector.reduce.smax.v128i8(<128 x i8> %op)
  ret i8 %res
}

define i8 @smaxv_v256i8(<256 x i8>* %a) #0 {
; CHECK-LABEL: smaxv_v256i8:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].b, vl256
; VBITS_GE_2048-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: smaxv b[[REDUCE:[0-9]+]], [[PG]], [[OP]].b
; VBITS_GE_2048-NEXT: fmov w0, s[[REDUCE]]
; VBITS_GE_2048-NEXT: ret
  %op = load <256 x i8>, <256 x i8>* %a
  %res = call i8 @llvm.vector.reduce.smax.v256i8(<256 x i8> %op)
  ret i8 %res
}

; Don't use SVE for 64-bit vectors.
define i16 @smaxv_v4i16(<4 x i16> %a) #0 {
; CHECK-LABEL: smaxv_v4i16:
; CHECK: smaxv h0, v0.4h
; CHECK: ret
  %res = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> %a)
  ret i16 %res
}

; Don't use SVE for 128-bit vectors.
define i16 @smaxv_v8i16(<8 x i16> %a) #0 {
; CHECK-LABEL: smaxv_v8i16:
; CHECK: smaxv h0, v0.8h
; CHECK: ret
  %res = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> %a)
  ret i16 %res
}

define i16 @smaxv_v16i16(<16 x i16>* %a) #0 {
; CHECK-LABEL: smaxv_v16i16:
; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
; CHECK-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; CHECK-NEXT: smaxv h[[REDUCE:[0-9]+]], [[PG]], [[OP]].h
; CHECK-NEXT: fmov w0, s[[REDUCE]]
; CHECK-NEXT: ret
  %op = load <16 x i16>, <16 x i16>* %a
  %res = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> %op)
  ret i16 %res
}

define i16 @smaxv_v32i16(<32 x i16>* %a) #0 {
; CHECK-LABEL: smaxv_v32i16:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
; VBITS_GE_512-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: smaxv h[[REDUCE:[0-9]+]], [[PG]], [[OP]].h
; VBITS_GE_512-NEXT: fmov w0, s[[REDUCE]]
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16
; VBITS_EQ_256-DAG: ld1h { [[LO:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1h { [[HI:z[0-9]+]].h }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #1]
; VBITS_EQ_256-DAG: smax [[MAX:z[0-9]+]].h, [[PG]]/m, [[HI]].h, [[LO]].h
; VBITS_EQ_256-DAG: smaxv h[[REDUCE:[0-9]+]], [[PG]], [[MAX]].h
; VBITS_EQ_256-NEXT: fmov w0, s[[REDUCE]]
; VBITS_EQ_256-NEXT: ret
  %op = load <32 x i16>, <32 x i16>* %a
  %res = call i16 @llvm.vector.reduce.smax.v32i16(<32 x i16> %op)
  ret i16 %res
}

define i16 @smaxv_v64i16(<64 x i16>* %a) #0 {
; CHECK-LABEL: smaxv_v64i16:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
; VBITS_GE_1024-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: smaxv h[[REDUCE:[0-9]+]], [[PG]], [[OP]].h
; VBITS_GE_1024-NEXT: fmov w0, s[[REDUCE]]
; VBITS_GE_1024-NEXT: ret
  %op = load <64 x i16>, <64 x i16>* %a
  %res = call i16 @llvm.vector.reduce.smax.v64i16(<64 x i16> %op)
  ret i16 %res
}

define i16 @smaxv_v128i16(<128 x i16>* %a) #0 {
; CHECK-LABEL: smaxv_v128i16:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
; VBITS_GE_2048-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: smaxv h[[REDUCE:[0-9]+]], [[PG]], [[OP]].h
; VBITS_GE_2048-NEXT: fmov w0, s[[REDUCE]]
; VBITS_GE_2048-NEXT: ret
  %op = load <128 x i16>, <128 x i16>* %a
  %res = call i16 @llvm.vector.reduce.smax.v128i16(<128 x i16> %op)
  ret i16 %res
}

; Don't use SVE for 64-bit vectors.
define i32 @smaxv_v2i32(<2 x i32> %a) #0 {
; CHECK-LABEL: smaxv_v2i32:
; CHECK: smaxp v0.2s, v0.2s
; CHECK: ret
  %res = call i32 @llvm.vector.reduce.smax.v2i32(<2 x i32> %a)
  ret i32 %res
}

; Don't use SVE for 128-bit vectors.
define i32 @smaxv_v4i32(<4 x i32> %a) #0 {
; CHECK-LABEL: smaxv_v4i32:
; CHECK: smaxv s0, v0.4s
; CHECK: ret
  %res = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %a)
  ret i32 %res
}

define i32 @smaxv_v8i32(<8 x i32>* %a) #0 {
; CHECK-LABEL: smaxv_v8i32:
; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
; CHECK-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; CHECK-NEXT: smaxv [[REDUCE:s[0-9]+]], [[PG]], [[OP]].s
; CHECK-NEXT: fmov w0, [[REDUCE]]
; CHECK-NEXT: ret
  %op = load <8 x i32>, <8 x i32>* %a
  %res = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> %op)
  ret i32 %res
}

define i32 @smaxv_v16i32(<16 x i32>* %a) #0 {
; CHECK-LABEL: smaxv_v16i32:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
; VBITS_GE_512-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: smaxv [[REDUCE:s[0-9]+]], [[PG]], [[OP]].s
; VBITS_GE_512-NEXT: fmov w0, [[REDUCE]]
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
; VBITS_EQ_256-DAG: ld1w { [[LO:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1w { [[HI:z[0-9]+]].s }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #2]
; VBITS_EQ_256-DAG: smax [[MAX:z[0-9]+]].s, [[PG]]/m, [[HI]].s, [[LO]].s
; VBITS_EQ_256-DAG: smaxv [[REDUCE:s[0-9]+]], [[PG]], [[MAX]].s
; VBITS_EQ_256-NEXT: fmov w0, [[REDUCE]]
; VBITS_EQ_256-NEXT: ret
  %op = load <16 x i32>, <16 x i32>* %a
  %res = call i32 @llvm.vector.reduce.smax.v16i32(<16 x i32> %op)
  ret i32 %res
}

define i32 @smaxv_v32i32(<32 x i32>* %a) #0 {
; CHECK-LABEL: smaxv_v32i32:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
; VBITS_GE_1024-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: smaxv [[REDUCE:s[0-9]+]], [[PG]], [[OP]].s
; VBITS_GE_1024-NEXT: fmov w0, [[REDUCE]]
; VBITS_GE_1024-NEXT: ret
  %op = load <32 x i32>, <32 x i32>* %a
  %res = call i32 @llvm.vector.reduce.smax.v32i32(<32 x i32> %op)
  ret i32 %res
}

define i32 @smaxv_v64i32(<64 x i32>* %a) #0 {
; CHECK-LABEL: smaxv_v64i32:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
; VBITS_GE_2048-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: smaxv [[REDUCE:s[0-9]+]], [[PG]], [[OP]].s
; VBITS_GE_2048-NEXT: fmov w0, [[REDUCE]]
; VBITS_GE_2048-NEXT: ret
  %op = load <64 x i32>, <64 x i32>* %a
  %res = call i32 @llvm.vector.reduce.smax.v64i32(<64 x i32> %op)
  ret i32 %res
}

; Nothing to do for single element vectors.
define i64 @smaxv_v1i64(<1 x i64> %a) #0 {
; CHECK-LABEL: smaxv_v1i64:
; CHECK: fmov x0, d0
; CHECK: ret
  %res = call i64 @llvm.vector.reduce.smax.v1i64(<1 x i64> %a)
  ret i64 %res
}

; No NEON 64-bit vector SMAXV support. Use SVE.
define i64 @smaxv_v2i64(<2 x i64> %a) #0 {
; CHECK-LABEL: smaxv_v2i64:
; CHECK: ptrue [[PG:p[0-9]+]].d, vl2
; CHECK-NEXT: smaxv [[REDUCE:d[0-9]+]], [[PG]], z0.d
; CHECK-NEXT: fmov x0, [[REDUCE]]
; CHECK-NEXT: ret
  %res = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> %a)
  ret i64 %res
}
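; (The <2 x i64> argument arrives in q0, which aliases the low 128 bits of
; z0, so the reduction reads z0.d directly with no load. The same applies to
; the sminv/umaxv/uminv v2i64 cases below.)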

define i64 @smaxv_v4i64(<4 x i64>* %a) #0 {
; CHECK-LABEL: smaxv_v4i64:
; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
; CHECK-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; CHECK-NEXT: smaxv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
; CHECK-NEXT: fmov x0, [[REDUCE]]
; CHECK-NEXT: ret
  %op = load <4 x i64>, <4 x i64>* %a
  %res = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> %op)
  ret i64 %res
}

define i64 @smaxv_v8i64(<8 x i64>* %a) #0 {
; CHECK-LABEL: smaxv_v8i64:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
; VBITS_GE_512-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: smaxv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
; VBITS_GE_512-NEXT: fmov x0, [[REDUCE]]
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
; VBITS_EQ_256-DAG: ld1d { [[LO:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1d { [[HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
; VBITS_EQ_256-DAG: smax [[MAX:z[0-9]+]].d, [[PG]]/m, [[HI]].d, [[LO]].d
; VBITS_EQ_256-DAG: smaxv [[REDUCE:d[0-9]+]], [[PG]], [[MAX]].d
; VBITS_EQ_256-NEXT: fmov x0, [[REDUCE]]
; VBITS_EQ_256-NEXT: ret
  %op = load <8 x i64>, <8 x i64>* %a
  %res = call i64 @llvm.vector.reduce.smax.v8i64(<8 x i64> %op)
  ret i64 %res
}

define i64 @smaxv_v16i64(<16 x i64>* %a) #0 {
; CHECK-LABEL: smaxv_v16i64:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
; VBITS_GE_1024-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: smaxv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
; VBITS_GE_1024-NEXT: fmov x0, [[REDUCE]]
; VBITS_GE_1024-NEXT: ret
  %op = load <16 x i64>, <16 x i64>* %a
  %res = call i64 @llvm.vector.reduce.smax.v16i64(<16 x i64> %op)
  ret i64 %res
}

define i64 @smaxv_v32i64(<32 x i64>* %a) #0 {
; CHECK-LABEL: smaxv_v32i64:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
; VBITS_GE_2048-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: smaxv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
; VBITS_GE_2048-NEXT: fmov x0, [[REDUCE]]
; VBITS_GE_2048-NEXT: ret
  %op = load <32 x i64>, <32 x i64>* %a
  %res = call i64 @llvm.vector.reduce.smax.v32i64(<32 x i64> %op)
  ret i64 %res
}

;
; SMINV
;

; Don't use SVE for 64-bit vectors.
define i8 @sminv_v8i8(<8 x i8> %a) #0 {
; CHECK-LABEL: sminv_v8i8:
; CHECK: sminv b0, v0.8b
; CHECK: ret
  %res = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> %a)
  ret i8 %res
}

; Don't use SVE for 128-bit vectors.
define i8 @sminv_v16i8(<16 x i8> %a) #0 {
; CHECK-LABEL: sminv_v16i8:
; CHECK: sminv b0, v0.16b
; CHECK: ret
  %res = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> %a)
  ret i8 %res
}

define i8 @sminv_v32i8(<32 x i8>* %a) #0 {
; CHECK-LABEL: sminv_v32i8:
; CHECK: ptrue [[PG:p[0-9]+]].b, vl32
; CHECK-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
; CHECK-NEXT: sminv b[[REDUCE:[0-9]+]], [[PG]], [[OP]].b
; CHECK-NEXT: fmov w0, s[[REDUCE]]
; CHECK-NEXT: ret
  %op = load <32 x i8>, <32 x i8>* %a
  %res = call i8 @llvm.vector.reduce.smin.v32i8(<32 x i8> %op)
  ret i8 %res
}

define i8 @sminv_v64i8(<64 x i8>* %a) #0 {
; CHECK-LABEL: sminv_v64i8:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].b, vl64
; VBITS_GE_512-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: sminv b[[REDUCE:[0-9]+]], [[PG]], [[OP]].b
; VBITS_GE_512-NEXT: fmov w0, s[[REDUCE]]
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].b, vl32
; VBITS_EQ_256-DAG: mov w[[NUMELTS:[0-9]+]], #32
; VBITS_EQ_256-DAG: ld1b { [[LO:z[0-9]+]].b }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1b { [[HI:z[0-9]+]].b }, [[PG]]/z, [x0, x[[NUMELTS]]]
; VBITS_EQ_256-DAG: smin [[MIN:z[0-9]+]].b, [[PG]]/m, [[HI]].b, [[LO]].b
; VBITS_EQ_256-DAG: sminv b[[REDUCE:[0-9]+]], [[PG]], [[MIN]].b
; VBITS_EQ_256-NEXT: fmov w0, s[[REDUCE]]
; VBITS_EQ_256-NEXT: ret
  %op = load <64 x i8>, <64 x i8>* %a
  %res = call i8 @llvm.vector.reduce.smin.v64i8(<64 x i8> %op)
  ret i8 %res
}

define i8 @sminv_v128i8(<128 x i8>* %a) #0 {
; CHECK-LABEL: sminv_v128i8:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].b, vl128
; VBITS_GE_1024-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: sminv b[[REDUCE:[0-9]+]], [[PG]], [[OP]].b
; VBITS_GE_1024-NEXT: fmov w0, s[[REDUCE]]
; VBITS_GE_1024-NEXT: ret
  %op = load <128 x i8>, <128 x i8>* %a
  %res = call i8 @llvm.vector.reduce.smin.v128i8(<128 x i8> %op)
  ret i8 %res
}

define i8 @sminv_v256i8(<256 x i8>* %a) #0 {
; CHECK-LABEL: sminv_v256i8:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].b, vl256
; VBITS_GE_2048-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: sminv b[[REDUCE:[0-9]+]], [[PG]], [[OP]].b
; VBITS_GE_2048-NEXT: fmov w0, s[[REDUCE]]
; VBITS_GE_2048-NEXT: ret
  %op = load <256 x i8>, <256 x i8>* %a
  %res = call i8 @llvm.vector.reduce.smin.v256i8(<256 x i8> %op)
  ret i8 %res
}

; Don't use SVE for 64-bit vectors.
define i16 @sminv_v4i16(<4 x i16> %a) #0 {
; CHECK-LABEL: sminv_v4i16:
; CHECK: sminv h0, v0.4h
; CHECK: ret
  %res = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> %a)
  ret i16 %res
}

; Don't use SVE for 128-bit vectors.
define i16 @sminv_v8i16(<8 x i16> %a) #0 {
; CHECK-LABEL: sminv_v8i16:
; CHECK: sminv h0, v0.8h
; CHECK: ret
  %res = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> %a)
  ret i16 %res
}

define i16 @sminv_v16i16(<16 x i16>* %a) #0 {
; CHECK-LABEL: sminv_v16i16:
; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
; CHECK-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; CHECK-NEXT: sminv h[[REDUCE:[0-9]+]], [[PG]], [[OP]].h
; CHECK-NEXT: fmov w0, s[[REDUCE]]
; CHECK-NEXT: ret
  %op = load <16 x i16>, <16 x i16>* %a
  %res = call i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> %op)
  ret i16 %res
}

define i16 @sminv_v32i16(<32 x i16>* %a) #0 {
; CHECK-LABEL: sminv_v32i16:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
; VBITS_GE_512-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: sminv h[[REDUCE:[0-9]+]], [[PG]], [[OP]].h
; VBITS_GE_512-NEXT: fmov w0, s[[REDUCE]]
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16
; VBITS_EQ_256-DAG: ld1h { [[LO:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1h { [[HI:z[0-9]+]].h }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #1]
; VBITS_EQ_256-DAG: smin [[MIN:z[0-9]+]].h, [[PG]]/m, [[HI]].h, [[LO]].h
; VBITS_EQ_256-DAG: sminv h[[REDUCE:[0-9]+]], [[PG]], [[MIN]].h
; VBITS_EQ_256-NEXT: fmov w0, s[[REDUCE]]
; VBITS_EQ_256-NEXT: ret
  %op = load <32 x i16>, <32 x i16>* %a
  %res = call i16 @llvm.vector.reduce.smin.v32i16(<32 x i16> %op)
  ret i16 %res
}

define i16 @sminv_v64i16(<64 x i16>* %a) #0 {
; CHECK-LABEL: sminv_v64i16:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
; VBITS_GE_1024-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: sminv h[[REDUCE:[0-9]+]], [[PG]], [[OP]].h
; VBITS_GE_1024-NEXT: fmov w0, s[[REDUCE]]
; VBITS_GE_1024-NEXT: ret
  %op = load <64 x i16>, <64 x i16>* %a
  %res = call i16 @llvm.vector.reduce.smin.v64i16(<64 x i16> %op)
  ret i16 %res
}

define i16 @sminv_v128i16(<128 x i16>* %a) #0 {
; CHECK-LABEL: sminv_v128i16:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
; VBITS_GE_2048-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: sminv h[[REDUCE:[0-9]+]], [[PG]], [[OP]].h
; VBITS_GE_2048-NEXT: fmov w0, s[[REDUCE]]
; VBITS_GE_2048-NEXT: ret
  %op = load <128 x i16>, <128 x i16>* %a
  %res = call i16 @llvm.vector.reduce.smin.v128i16(<128 x i16> %op)
  ret i16 %res
}

; Don't use SVE for 64-bit vectors.
define i32 @sminv_v2i32(<2 x i32> %a) #0 {
; CHECK-LABEL: sminv_v2i32:
; CHECK: sminp v0.2s, v0.2s
; CHECK: ret
  %res = call i32 @llvm.vector.reduce.smin.v2i32(<2 x i32> %a)
  ret i32 %res
}

; Don't use SVE for 128-bit vectors.
define i32 @sminv_v4i32(<4 x i32> %a) #0 {
; CHECK-LABEL: sminv_v4i32:
; CHECK: sminv s0, v0.4s
; CHECK: ret
  %res = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %a)
  ret i32 %res
}

define i32 @sminv_v8i32(<8 x i32>* %a) #0 {
; CHECK-LABEL: sminv_v8i32:
; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
; CHECK-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; CHECK-NEXT: sminv [[REDUCE:s[0-9]+]], [[PG]], [[OP]].s
; CHECK-NEXT: fmov w0, [[REDUCE]]
; CHECK-NEXT: ret
  %op = load <8 x i32>, <8 x i32>* %a
  %res = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> %op)
  ret i32 %res
}

define i32 @sminv_v16i32(<16 x i32>* %a) #0 {
; CHECK-LABEL: sminv_v16i32:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
; VBITS_GE_512-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: sminv [[REDUCE:s[0-9]+]], [[PG]], [[OP]].s
; VBITS_GE_512-NEXT: fmov w0, [[REDUCE]]
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
; VBITS_EQ_256-DAG: ld1w { [[LO:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1w { [[HI:z[0-9]+]].s }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #2]
; VBITS_EQ_256-DAG: smin [[MIN:z[0-9]+]].s, [[PG]]/m, [[HI]].s, [[LO]].s
; VBITS_EQ_256-DAG: sminv [[REDUCE:s[0-9]+]], [[PG]], [[MIN]].s
; VBITS_EQ_256-NEXT: fmov w0, [[REDUCE]]
; VBITS_EQ_256-NEXT: ret
  %op = load <16 x i32>, <16 x i32>* %a
  %res = call i32 @llvm.vector.reduce.smin.v16i32(<16 x i32> %op)
  ret i32 %res
}

define i32 @sminv_v32i32(<32 x i32>* %a) #0 {
; CHECK-LABEL: sminv_v32i32:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
; VBITS_GE_1024-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: sminv [[REDUCE:s[0-9]+]], [[PG]], [[OP]].s
; VBITS_GE_1024-NEXT: fmov w0, [[REDUCE]]
; VBITS_GE_1024-NEXT: ret
  %op = load <32 x i32>, <32 x i32>* %a
  %res = call i32 @llvm.vector.reduce.smin.v32i32(<32 x i32> %op)
  ret i32 %res
}

define i32 @sminv_v64i32(<64 x i32>* %a) #0 {
; CHECK-LABEL: sminv_v64i32:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
; VBITS_GE_2048-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: sminv [[REDUCE:s[0-9]+]], [[PG]], [[OP]].s
; VBITS_GE_2048-NEXT: fmov w0, [[REDUCE]]
; VBITS_GE_2048-NEXT: ret
  %op = load <64 x i32>, <64 x i32>* %a
  %res = call i32 @llvm.vector.reduce.smin.v64i32(<64 x i32> %op)
  ret i32 %res
}

; Nothing to do for single element vectors.
define i64 @sminv_v1i64(<1 x i64> %a) #0 {
; CHECK-LABEL: sminv_v1i64:
; CHECK: fmov x0, d0
; CHECK: ret
  %res = call i64 @llvm.vector.reduce.smin.v1i64(<1 x i64> %a)
  ret i64 %res
}

; No NEON 64-bit vector SMINV support. Use SVE.
define i64 @sminv_v2i64(<2 x i64> %a) #0 {
; CHECK-LABEL: sminv_v2i64:
; CHECK: ptrue [[PG:p[0-9]+]].d, vl2
; CHECK-NEXT: sminv [[REDUCE:d[0-9]+]], [[PG]], z0.d
; CHECK-NEXT: fmov x0, [[REDUCE]]
; CHECK-NEXT: ret
  %res = call i64 @llvm.vector.reduce.smin.v2i64(<2 x i64> %a)
  ret i64 %res
}

define i64 @sminv_v4i64(<4 x i64>* %a) #0 {
; CHECK-LABEL: sminv_v4i64:
; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
; CHECK-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; CHECK-NEXT: sminv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
; CHECK-NEXT: fmov x0, [[REDUCE]]
; CHECK-NEXT: ret
  %op = load <4 x i64>, <4 x i64>* %a
  %res = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> %op)
  ret i64 %res
}

define i64 @sminv_v8i64(<8 x i64>* %a) #0 {
; CHECK-LABEL: sminv_v8i64:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
; VBITS_GE_512-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: sminv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
; VBITS_GE_512-NEXT: fmov x0, [[REDUCE]]
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
; VBITS_EQ_256-DAG: ld1d { [[LO:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1d { [[HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
; VBITS_EQ_256-DAG: smin [[MIN:z[0-9]+]].d, [[PG]]/m, [[HI]].d, [[LO]].d
; VBITS_EQ_256-DAG: sminv [[REDUCE:d[0-9]+]], [[PG]], [[MIN]].d
; VBITS_EQ_256-NEXT: fmov x0, [[REDUCE]]
; VBITS_EQ_256-NEXT: ret
  %op = load <8 x i64>, <8 x i64>* %a
  %res = call i64 @llvm.vector.reduce.smin.v8i64(<8 x i64> %op)
  ret i64 %res
}

define i64 @sminv_v16i64(<16 x i64>* %a) #0 {
; CHECK-LABEL: sminv_v16i64:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
; VBITS_GE_1024-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: sminv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
; VBITS_GE_1024-NEXT: fmov x0, [[REDUCE]]
; VBITS_GE_1024-NEXT: ret
  %op = load <16 x i64>, <16 x i64>* %a
  %res = call i64 @llvm.vector.reduce.smin.v16i64(<16 x i64> %op)
  ret i64 %res
}

define i64 @sminv_v32i64(<32 x i64>* %a) #0 {
; CHECK-LABEL: sminv_v32i64:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
; VBITS_GE_2048-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: sminv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
; VBITS_GE_2048-NEXT: fmov x0, [[REDUCE]]
; VBITS_GE_2048-NEXT: ret
  %op = load <32 x i64>, <32 x i64>* %a
  %res = call i64 @llvm.vector.reduce.smin.v32i64(<32 x i64> %op)
  ret i64 %res
}

;
; UMAXV
;

; Don't use SVE for 64-bit vectors.
define i8 @umaxv_v8i8(<8 x i8> %a) #0 {
; CHECK-LABEL: umaxv_v8i8:
; CHECK: umaxv b0, v0.8b
; CHECK: ret
  %res = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> %a)
  ret i8 %res
}

; Don't use SVE for 128-bit vectors.
define i8 @umaxv_v16i8(<16 x i8> %a) #0 {
; CHECK-LABEL: umaxv_v16i8:
; CHECK: umaxv b0, v0.16b
; CHECK: ret
  %res = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> %a)
  ret i8 %res
}

define i8 @umaxv_v32i8(<32 x i8>* %a) #0 {
; CHECK-LABEL: umaxv_v32i8:
; CHECK: ptrue [[PG:p[0-9]+]].b, vl32
; CHECK-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
; CHECK-NEXT: umaxv b[[REDUCE:[0-9]+]], [[PG]], [[OP]].b
; CHECK-NEXT: fmov w0, s[[REDUCE]]
; CHECK-NEXT: ret
  %op = load <32 x i8>, <32 x i8>* %a
  %res = call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> %op)
  ret i8 %res
}

define i8 @umaxv_v64i8(<64 x i8>* %a) #0 {
; CHECK-LABEL: umaxv_v64i8:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].b, vl64
; VBITS_GE_512-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: umaxv b[[REDUCE:[0-9]+]], [[PG]], [[OP]].b
; VBITS_GE_512-NEXT: fmov w0, s[[REDUCE]]
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].b, vl32
; VBITS_EQ_256-DAG: mov w[[NUMELTS:[0-9]+]], #32
; VBITS_EQ_256-DAG: ld1b { [[LO:z[0-9]+]].b }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1b { [[HI:z[0-9]+]].b }, [[PG]]/z, [x0, x[[NUMELTS]]]
; VBITS_EQ_256-DAG: umax [[MAX:z[0-9]+]].b, [[PG]]/m, [[HI]].b, [[LO]].b
; VBITS_EQ_256-DAG: umaxv b[[REDUCE:[0-9]+]], [[PG]], [[MAX]].b
; VBITS_EQ_256-NEXT: fmov w0, s[[REDUCE]]
; VBITS_EQ_256-NEXT: ret
  %op = load <64 x i8>, <64 x i8>* %a
  %res = call i8 @llvm.vector.reduce.umax.v64i8(<64 x i8> %op)
  ret i8 %res
}

define i8 @umaxv_v128i8(<128 x i8>* %a) #0 {
; CHECK-LABEL: umaxv_v128i8:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].b, vl128
; VBITS_GE_1024-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: umaxv b[[REDUCE:[0-9]+]], [[PG]], [[OP]].b
; VBITS_GE_1024-NEXT: fmov w0, s[[REDUCE]]
; VBITS_GE_1024-NEXT: ret
  %op = load <128 x i8>, <128 x i8>* %a
  %res = call i8 @llvm.vector.reduce.umax.v128i8(<128 x i8> %op)
  ret i8 %res
}

define i8 @umaxv_v256i8(<256 x i8>* %a) #0 {
; CHECK-LABEL: umaxv_v256i8:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].b, vl256
; VBITS_GE_2048-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: umaxv b[[REDUCE:[0-9]+]], [[PG]], [[OP]].b
; VBITS_GE_2048-NEXT: fmov w0, s[[REDUCE]]
; VBITS_GE_2048-NEXT: ret
  %op = load <256 x i8>, <256 x i8>* %a
  %res = call i8 @llvm.vector.reduce.umax.v256i8(<256 x i8> %op)
  ret i8 %res
}

; Don't use SVE for 64-bit vectors.
define i16 @umaxv_v4i16(<4 x i16> %a) #0 {
; CHECK-LABEL: umaxv_v4i16:
; CHECK: umaxv h0, v0.4h
; CHECK: ret
  %res = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> %a)
  ret i16 %res
}

; Don't use SVE for 128-bit vectors.
define i16 @umaxv_v8i16(<8 x i16> %a) #0 {
; CHECK-LABEL: umaxv_v8i16:
; CHECK: umaxv h0, v0.8h
; CHECK: ret
  %res = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> %a)
  ret i16 %res
}

define i16 @umaxv_v16i16(<16 x i16>* %a) #0 {
; CHECK-LABEL: umaxv_v16i16:
; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
; CHECK-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; CHECK-NEXT: umaxv h[[REDUCE:[0-9]+]], [[PG]], [[OP]].h
; CHECK-NEXT: fmov w0, s[[REDUCE]]
; CHECK-NEXT: ret
  %op = load <16 x i16>, <16 x i16>* %a
  %res = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> %op)
  ret i16 %res
}

define i16 @umaxv_v32i16(<32 x i16>* %a) #0 {
; CHECK-LABEL: umaxv_v32i16:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
; VBITS_GE_512-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: umaxv h[[REDUCE:[0-9]+]], [[PG]], [[OP]].h
; VBITS_GE_512-NEXT: fmov w0, s[[REDUCE]]
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16
; VBITS_EQ_256-DAG: ld1h { [[LO:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1h { [[HI:z[0-9]+]].h }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #1]
; VBITS_EQ_256-DAG: umax [[MAX:z[0-9]+]].h, [[PG]]/m, [[HI]].h, [[LO]].h
; VBITS_EQ_256-DAG: umaxv h[[REDUCE:[0-9]+]], [[PG]], [[MAX]].h
; VBITS_EQ_256-NEXT: fmov w0, s[[REDUCE]]
; VBITS_EQ_256-NEXT: ret
  %op = load <32 x i16>, <32 x i16>* %a
  %res = call i16 @llvm.vector.reduce.umax.v32i16(<32 x i16> %op)
  ret i16 %res
}

define i16 @umaxv_v64i16(<64 x i16>* %a) #0 {
; CHECK-LABEL: umaxv_v64i16:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
; VBITS_GE_1024-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: umaxv h[[REDUCE:[0-9]+]], [[PG]], [[OP]].h
; VBITS_GE_1024-NEXT: fmov w0, s[[REDUCE]]
; VBITS_GE_1024-NEXT: ret
  %op = load <64 x i16>, <64 x i16>* %a
  %res = call i16 @llvm.vector.reduce.umax.v64i16(<64 x i16> %op)
  ret i16 %res
}

define i16 @umaxv_v128i16(<128 x i16>* %a) #0 {
; CHECK-LABEL: umaxv_v128i16:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
; VBITS_GE_2048-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: umaxv h[[REDUCE:[0-9]+]], [[PG]], [[OP]].h
; VBITS_GE_2048-NEXT: fmov w0, s[[REDUCE]]
; VBITS_GE_2048-NEXT: ret
  %op = load <128 x i16>, <128 x i16>* %a
  %res = call i16 @llvm.vector.reduce.umax.v128i16(<128 x i16> %op)
  ret i16 %res
}

; Don't use SVE for 64-bit vectors.
define i32 @umaxv_v2i32(<2 x i32> %a) #0 {
; CHECK-LABEL: umaxv_v2i32:
; CHECK: umaxp v0.2s, v0.2s
; CHECK: ret
  %res = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> %a)
  ret i32 %res
}

; Don't use SVE for 128-bit vectors.
define i32 @umaxv_v4i32(<4 x i32> %a) #0 {
; CHECK-LABEL: umaxv_v4i32:
; CHECK: umaxv s0, v0.4s
; CHECK: ret
  %res = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %a)
  ret i32 %res
}

define i32 @umaxv_v8i32(<8 x i32>* %a) #0 {
; CHECK-LABEL: umaxv_v8i32:
; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
; CHECK-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; CHECK-NEXT: umaxv [[REDUCE:s[0-9]+]], [[PG]], [[OP]].s
; CHECK-NEXT: fmov w0, [[REDUCE]]
; CHECK-NEXT: ret
  %op = load <8 x i32>, <8 x i32>* %a
  %res = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> %op)
  ret i32 %res
}

define i32 @umaxv_v16i32(<16 x i32>* %a) #0 {
; CHECK-LABEL: umaxv_v16i32:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
; VBITS_GE_512-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: umaxv [[REDUCE:s[0-9]+]], [[PG]], [[OP]].s
; VBITS_GE_512-NEXT: fmov w0, [[REDUCE]]
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
; VBITS_EQ_256-DAG: ld1w { [[LO:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1w { [[HI:z[0-9]+]].s }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #2]
; VBITS_EQ_256-DAG: umax [[MAX:z[0-9]+]].s, [[PG]]/m, [[HI]].s, [[LO]].s
; VBITS_EQ_256-DAG: umaxv [[REDUCE:s[0-9]+]], [[PG]], [[MAX]].s
; VBITS_EQ_256-NEXT: fmov w0, [[REDUCE]]
; VBITS_EQ_256-NEXT: ret
  %op = load <16 x i32>, <16 x i32>* %a
  %res = call i32 @llvm.vector.reduce.umax.v16i32(<16 x i32> %op)
  ret i32 %res
}

define i32 @umaxv_v32i32(<32 x i32>* %a) #0 {
; CHECK-LABEL: umaxv_v32i32:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
; VBITS_GE_1024-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: umaxv [[REDUCE:s[0-9]+]], [[PG]], [[OP]].s
; VBITS_GE_1024-NEXT: fmov w0, [[REDUCE]]
; VBITS_GE_1024-NEXT: ret
  %op = load <32 x i32>, <32 x i32>* %a
  %res = call i32 @llvm.vector.reduce.umax.v32i32(<32 x i32> %op)
  ret i32 %res
}

define i32 @umaxv_v64i32(<64 x i32>* %a) #0 {
; CHECK-LABEL: umaxv_v64i32:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
; VBITS_GE_2048-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: umaxv [[REDUCE:s[0-9]+]], [[PG]], [[OP]].s
; VBITS_GE_2048-NEXT: fmov w0, [[REDUCE]]
; VBITS_GE_2048-NEXT: ret
  %op = load <64 x i32>, <64 x i32>* %a
  %res = call i32 @llvm.vector.reduce.umax.v64i32(<64 x i32> %op)
  ret i32 %res
}

; Nothing to do for single element vectors.
define i64 @umaxv_v1i64(<1 x i64> %a) #0 {
; CHECK-LABEL: umaxv_v1i64:
; CHECK: fmov x0, d0
; CHECK: ret
  %res = call i64 @llvm.vector.reduce.umax.v1i64(<1 x i64> %a)
  ret i64 %res
}

; No NEON 64-bit vector UMAXV support. Use SVE.
define i64 @umaxv_v2i64(<2 x i64> %a) #0 {
; CHECK-LABEL: umaxv_v2i64:
; CHECK: ptrue [[PG:p[0-9]+]].d, vl2
; CHECK-NEXT: umaxv [[REDUCE:d[0-9]+]], [[PG]], z0.d
; CHECK-NEXT: fmov x0, [[REDUCE]]
; CHECK-NEXT: ret
  %res = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> %a)
  ret i64 %res
}

define i64 @umaxv_v4i64(<4 x i64>* %a) #0 {
; CHECK-LABEL: umaxv_v4i64:
; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
; CHECK-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; CHECK-NEXT: umaxv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
; CHECK-NEXT: fmov x0, [[REDUCE]]
; CHECK-NEXT: ret
  %op = load <4 x i64>, <4 x i64>* %a
  %res = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> %op)
  ret i64 %res
}

define i64 @umaxv_v8i64(<8 x i64>* %a) #0 {
; CHECK-LABEL: umaxv_v8i64:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
; VBITS_GE_512-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: umaxv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
; VBITS_GE_512-NEXT: fmov x0, [[REDUCE]]
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
; VBITS_EQ_256-DAG: ld1d { [[LO:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1d { [[HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
; VBITS_EQ_256-DAG: umax [[MAX:z[0-9]+]].d, [[PG]]/m, [[HI]].d, [[LO]].d
; VBITS_EQ_256-DAG: umaxv [[REDUCE:d[0-9]+]], [[PG]], [[MAX]].d
; VBITS_EQ_256-NEXT: fmov x0, [[REDUCE]]
; VBITS_EQ_256-NEXT: ret
  %op = load <8 x i64>, <8 x i64>* %a
  %res = call i64 @llvm.vector.reduce.umax.v8i64(<8 x i64> %op)
  ret i64 %res
}

define i64 @umaxv_v16i64(<16 x i64>* %a) #0 {
; CHECK-LABEL: umaxv_v16i64:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
; VBITS_GE_1024-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: umaxv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
; VBITS_GE_1024-NEXT: fmov x0, [[REDUCE]]
; VBITS_GE_1024-NEXT: ret
  %op = load <16 x i64>, <16 x i64>* %a
  %res = call i64 @llvm.vector.reduce.umax.v16i64(<16 x i64> %op)
  ret i64 %res
}

define i64 @umaxv_v32i64(<32 x i64>* %a) #0 {
; CHECK-LABEL: umaxv_v32i64:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
; VBITS_GE_2048-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: umaxv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
; VBITS_GE_2048-NEXT: fmov x0, [[REDUCE]]
; VBITS_GE_2048-NEXT: ret
  %op = load <32 x i64>, <32 x i64>* %a
  %res = call i64 @llvm.vector.reduce.umax.v32i64(<32 x i64> %op)
  ret i64 %res
}

;
; UMINV
;

; Don't use SVE for 64-bit vectors.
define i8 @uminv_v8i8(<8 x i8> %a) #0 {
; CHECK-LABEL: uminv_v8i8:
; CHECK: uminv b0, v0.8b
; CHECK: ret
  %res = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> %a)
  ret i8 %res
}

; Don't use SVE for 128-bit vectors.
define i8 @uminv_v16i8(<16 x i8> %a) #0 {
; CHECK-LABEL: uminv_v16i8:
; CHECK: uminv b0, v0.16b
; CHECK: ret
  %res = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> %a)
  ret i8 %res
}

define i8 @uminv_v32i8(<32 x i8>* %a) #0 {
; CHECK-LABEL: uminv_v32i8:
; CHECK: ptrue [[PG:p[0-9]+]].b, vl32
; CHECK-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
; CHECK-NEXT: uminv b[[REDUCE:[0-9]+]], [[PG]], [[OP]].b
; CHECK-NEXT: fmov w0, s[[REDUCE]]
; CHECK-NEXT: ret
  %op = load <32 x i8>, <32 x i8>* %a
  %res = call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> %op)
  ret i8 %res
}

define i8 @uminv_v64i8(<64 x i8>* %a) #0 {
; CHECK-LABEL: uminv_v64i8:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].b, vl64
; VBITS_GE_512-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: uminv b[[REDUCE:[0-9]+]], [[PG]], [[OP]].b
; VBITS_GE_512-NEXT: fmov w0, s[[REDUCE]]
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].b, vl32
; VBITS_EQ_256-DAG: mov w[[NUMELTS:[0-9]+]], #32
; VBITS_EQ_256-DAG: ld1b { [[LO:z[0-9]+]].b }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1b { [[HI:z[0-9]+]].b }, [[PG]]/z, [x0, x[[NUMELTS]]]
; VBITS_EQ_256-DAG: umin [[MIN:z[0-9]+]].b, [[PG]]/m, [[HI]].b, [[LO]].b
; VBITS_EQ_256-DAG: uminv b[[REDUCE:[0-9]+]], [[PG]], [[MIN]].b
; VBITS_EQ_256-NEXT: fmov w0, s[[REDUCE]]
; VBITS_EQ_256-NEXT: ret
  %op = load <64 x i8>, <64 x i8>* %a
  %res = call i8 @llvm.vector.reduce.umin.v64i8(<64 x i8> %op)
  ret i8 %res
}

define i8 @uminv_v128i8(<128 x i8>* %a) #0 {
; CHECK-LABEL: uminv_v128i8:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].b, vl128
; VBITS_GE_1024-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: uminv b[[REDUCE:[0-9]+]], [[PG]], [[OP]].b
; VBITS_GE_1024-NEXT: fmov w0, s[[REDUCE]]
; VBITS_GE_1024-NEXT: ret
  %op = load <128 x i8>, <128 x i8>* %a
  %res = call i8 @llvm.vector.reduce.umin.v128i8(<128 x i8> %op)
  ret i8 %res
}

define i8 @uminv_v256i8(<256 x i8>* %a) #0 {
; CHECK-LABEL: uminv_v256i8:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].b, vl256
; VBITS_GE_2048-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: uminv b[[REDUCE:[0-9]+]], [[PG]], [[OP]].b
; VBITS_GE_2048-NEXT: fmov w0, s[[REDUCE]]
; VBITS_GE_2048-NEXT: ret
  %op = load <256 x i8>, <256 x i8>* %a
  %res = call i8 @llvm.vector.reduce.umin.v256i8(<256 x i8> %op)
  ret i8 %res
}

; Don't use SVE for 64-bit vectors.
define i16 @uminv_v4i16(<4 x i16> %a) #0 {
; CHECK-LABEL: uminv_v4i16:
; CHECK: uminv h0, v0.4h
; CHECK: ret
  %res = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> %a)
  ret i16 %res
}

; Don't use SVE for 128-bit vectors.
define i16 @uminv_v8i16(<8 x i16> %a) #0 {
; CHECK-LABEL: uminv_v8i16:
; CHECK: uminv h0, v0.8h
; CHECK: ret
  %res = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> %a)
  ret i16 %res
}

define i16 @uminv_v16i16(<16 x i16>* %a) #0 {
; CHECK-LABEL: uminv_v16i16:
; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
; CHECK-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; CHECK-NEXT: uminv h[[REDUCE:[0-9]+]], [[PG]], [[OP]].h
; CHECK-NEXT: fmov w0, s[[REDUCE]]
; CHECK-NEXT: ret
  %op = load <16 x i16>, <16 x i16>* %a
  %res = call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> %op)
  ret i16 %res
}

define i16 @uminv_v32i16(<32 x i16>* %a) #0 {
; CHECK-LABEL: uminv_v32i16:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
; VBITS_GE_512-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: uminv h[[REDUCE:[0-9]+]], [[PG]], [[OP]].h
; VBITS_GE_512-NEXT: fmov w0, s[[REDUCE]]
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16
; VBITS_EQ_256-DAG: ld1h { [[LO:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1h { [[HI:z[0-9]+]].h }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #1]
; VBITS_EQ_256-DAG: umin [[MIN:z[0-9]+]].h, [[PG]]/m, [[HI]].h, [[LO]].h
; VBITS_EQ_256-DAG: uminv h[[REDUCE:[0-9]+]], [[PG]], [[MIN]].h
; VBITS_EQ_256-NEXT: fmov w0, s[[REDUCE]]
; VBITS_EQ_256-NEXT: ret
  %op = load <32 x i16>, <32 x i16>* %a
  %res = call i16 @llvm.vector.reduce.umin.v32i16(<32 x i16> %op)
  ret i16 %res
}

define i16 @uminv_v64i16(<64 x i16>* %a) #0 {
; CHECK-LABEL: uminv_v64i16:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
; VBITS_GE_1024-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: uminv h[[REDUCE:[0-9]+]], [[PG]], [[OP]].h
; VBITS_GE_1024-NEXT: fmov w0, s[[REDUCE]]
; VBITS_GE_1024-NEXT: ret
  %op = load <64 x i16>, <64 x i16>* %a
  %res = call i16 @llvm.vector.reduce.umin.v64i16(<64 x i16> %op)
  ret i16 %res
}

define i16 @uminv_v128i16(<128 x i16>* %a) #0 {
; CHECK-LABEL: uminv_v128i16:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
; VBITS_GE_2048-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: uminv h[[REDUCE:[0-9]+]], [[PG]], [[OP]].h
; VBITS_GE_2048-NEXT: fmov w0, s[[REDUCE]]
; VBITS_GE_2048-NEXT: ret
  %op = load <128 x i16>, <128 x i16>* %a
  %res = call i16 @llvm.vector.reduce.umin.v128i16(<128 x i16> %op)
  ret i16 %res
}

; Don't use SVE for 64-bit vectors.
define i32 @uminv_v2i32(<2 x i32> %a) #0 {
; CHECK-LABEL: uminv_v2i32:
; CHECK: uminp v0.2s, v0.2s
1421; CHECK: ret
1422  %res = call i32 @llvm.vector.reduce.umin.v2i32(<2 x i32> %a)
1423  ret i32 %res
1424}
1425
1426; Don't use SVE for 128-bit vectors.
1427define i32 @uminv_v4i32(<4 x i32> %a) #0 {
1428; CHECK-LABEL: uminv_v4i32:
1429; CHECK: uminv s0, v0.4s
1430; CHECK: ret
1431  %res = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %a)
1432  ret i32 %res
1433}
1434
1435define i32 @uminv_v8i32(<8 x i32>* %a) #0 {
1436; CHECK-LABEL: uminv_v8i32:
1437; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
1438; CHECK-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
1439; CHECK-NEXT: uminv [[REDUCE:s[0-9]+]], [[PG]], [[OP]].s
1440; CHECK-NEXT: fmov w0, [[REDUCE]]
1441; CHECK-NEXT: ret
1442  %op = load <8 x i32>, <8 x i32>* %a
1443  %res = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> %op)
1444  ret i32 %res
1445}
1446
1447define i32 @uminv_v16i32(<16 x i32>* %a) #0 {
1448; CHECK-LABEL: uminv_v16i32:
1449; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
1450; VBITS_GE_512-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
1451; VBITS_GE_512-NEXT: uminv [[REDUCE:s[0-9]+]], [[PG]], [[OP]].s
1452; VBITS_GE_512-NEXT: fmov w0, [[REDUCE]]
1453; VBITS_GE_512-NEXT: ret
1454
1455; Ensure sensible type legalisation.
1456; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
1457; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
1458; VBITS_EQ_256-DAG: ld1w { [[LO:z[0-9]+]].s }, [[PG]]/z, [x0]
1459; VBITS_EQ_256-DAG: ld1w { [[HI:z[0-9]+]].s }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #2]
1460; VBITS_EQ_256-DAG: umin [[MIN:z[0-9]+]].s, [[PG]]/m, [[HI]].s, [[LO]].s
1461; VBITS_EQ_256-DAG: uminv [[REDUCE:s[0-9]+]], [[PG]], [[MIN]].s
1462; VBITS_EQ_256-NEXT: fmov w0, [[REDUCE]]
1463; VBITS_EQ_256-NEXT: ret
  %op = load <16 x i32>, <16 x i32>* %a
  %res = call i32 @llvm.vector.reduce.umin.v16i32(<16 x i32> %op)
  ret i32 %res
}

define i32 @uminv_v32i32(<32 x i32>* %a) #0 {
; CHECK-LABEL: uminv_v32i32:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
; VBITS_GE_1024-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: uminv [[REDUCE:s[0-9]+]], [[PG]], [[OP]].s
; VBITS_GE_1024-NEXT: fmov w0, [[REDUCE]]
; VBITS_GE_1024-NEXT: ret
  %op = load <32 x i32>, <32 x i32>* %a
  %res = call i32 @llvm.vector.reduce.umin.v32i32(<32 x i32> %op)
  ret i32 %res
}

define i32 @uminv_v64i32(<64 x i32>* %a) #0 {
; CHECK-LABEL: uminv_v64i32:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
; VBITS_GE_2048-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: uminv [[REDUCE:s[0-9]+]], [[PG]], [[OP]].s
; VBITS_GE_2048-NEXT: fmov w0, [[REDUCE]]
; VBITS_GE_2048-NEXT: ret
  %op = load <64 x i32>, <64 x i32>* %a
  %res = call i32 @llvm.vector.reduce.umin.v64i32(<64 x i32> %op)
  ret i32 %res
}

; Nothing to do for single element vectors.
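; The reduction of one element is the element itself, so only the move from
; the SIMD register to the integer result register remains.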
define i64 @uminv_v1i64(<1 x i64> %a) #0 {
; CHECK-LABEL: uminv_v1i64:
; CHECK: fmov x0, d0
; CHECK: ret
  %res = call i64 @llvm.vector.reduce.umin.v1i64(<1 x i64> %a)
  ret i64 %res
}

; No NEON 64-bit vector UMINV support. Use SVE.
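; NEON's UMINV only reduces 8-, 16- and 32-bit elements, so the <2 x i64>
; case uses the SVE form directly: the low 128 bits of z0 alias v0, so the
; operand is already in place and no load is required.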
define i64 @uminv_v2i64(<2 x i64> %a) #0 {
; CHECK-LABEL: uminv_v2i64:
; CHECK: ptrue [[PG:p[0-9]+]].d, vl2
; CHECK-NEXT: uminv [[REDUCE:d[0-9]+]], [[PG]], z0.d
; CHECK-NEXT: fmov x0, [[REDUCE]]
; CHECK-NEXT: ret
  %res = call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> %a)
  ret i64 %res
}

define i64 @uminv_v4i64(<4 x i64>* %a) #0 {
; CHECK-LABEL: uminv_v4i64:
; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
; CHECK-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; CHECK-NEXT: uminv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
; CHECK-NEXT: fmov x0, [[REDUCE]]
; CHECK-NEXT: ret
  %op = load <4 x i64>, <4 x i64>* %a
  %res = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> %op)
  ret i64 %res
}

define i64 @uminv_v8i64(<8 x i64>* %a) #0 {
; CHECK-LABEL: uminv_v8i64:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
; VBITS_GE_512-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: uminv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
; VBITS_GE_512-NEXT: fmov x0, [[REDUCE]]
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
; VBITS_EQ_256-DAG: ld1d { [[LO:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1d { [[HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
; VBITS_EQ_256-DAG: umin [[MIN:z[0-9]+]].d, [[PG]]/m, [[HI]].d, [[LO]].d
; VBITS_EQ_256-DAG: uminv [[REDUCE:d[0-9]+]], [[PG]], [[MIN]].d
; VBITS_EQ_256-NEXT: fmov x0, [[REDUCE]]
; VBITS_EQ_256-NEXT: ret
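; Note: as above, but with <4 x i64> halves; "lsl #3" scales the index by
; the 8-byte element size.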
  %op = load <8 x i64>, <8 x i64>* %a
  %res = call i64 @llvm.vector.reduce.umin.v8i64(<8 x i64> %op)
  ret i64 %res
}

define i64 @uminv_v16i64(<16 x i64>* %a) #0 {
; CHECK-LABEL: uminv_v16i64:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
; VBITS_GE_1024-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: uminv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
; VBITS_GE_1024-NEXT: fmov x0, [[REDUCE]]
; VBITS_GE_1024-NEXT: ret
  %op = load <16 x i64>, <16 x i64>* %a
  %res = call i64 @llvm.vector.reduce.umin.v16i64(<16 x i64> %op)
  ret i64 %res
}

define i64 @uminv_v32i64(<32 x i64>* %a) #0 {
; CHECK-LABEL: uminv_v32i64:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
; VBITS_GE_2048-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: uminv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
; VBITS_GE_2048-NEXT: fmov x0, [[REDUCE]]
; VBITS_GE_2048-NEXT: ret
  %op = load <32 x i64>, <32 x i64>* %a
  %res = call i64 @llvm.vector.reduce.umin.v32i64(<32 x i64> %op)
  ret i64 %res
}

attributes #0 = { "target-features"="+sve" }

declare i8 @llvm.vector.reduce.add.v8i8(<8 x i8>)
declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>)
declare i8 @llvm.vector.reduce.add.v32i8(<32 x i8>)
declare i8 @llvm.vector.reduce.add.v64i8(<64 x i8>)
declare i8 @llvm.vector.reduce.add.v128i8(<128 x i8>)
declare i8 @llvm.vector.reduce.add.v256i8(<256 x i8>)

declare i16 @llvm.vector.reduce.add.v4i16(<4 x i16>)
declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>)
declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>)
declare i16 @llvm.vector.reduce.add.v32i16(<32 x i16>)
declare i16 @llvm.vector.reduce.add.v64i16(<64 x i16>)
declare i16 @llvm.vector.reduce.add.v128i16(<128 x i16>)

declare i32 @llvm.vector.reduce.add.v2i32(<2 x i32>)
declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>)
declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)
declare i32 @llvm.vector.reduce.add.v32i32(<32 x i32>)
declare i32 @llvm.vector.reduce.add.v64i32(<64 x i32>)

declare i64 @llvm.vector.reduce.add.v1i64(<1 x i64>)
declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64>)
declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>)
declare i64 @llvm.vector.reduce.add.v8i64(<8 x i64>)
declare i64 @llvm.vector.reduce.add.v16i64(<16 x i64>)
declare i64 @llvm.vector.reduce.add.v32i64(<32 x i64>)

declare i8 @llvm.vector.reduce.smax.v8i8(<8 x i8>)
declare i8 @llvm.vector.reduce.smax.v16i8(<16 x i8>)
declare i8 @llvm.vector.reduce.smax.v32i8(<32 x i8>)
declare i8 @llvm.vector.reduce.smax.v64i8(<64 x i8>)
declare i8 @llvm.vector.reduce.smax.v128i8(<128 x i8>)
declare i8 @llvm.vector.reduce.smax.v256i8(<256 x i8>)

declare i16 @llvm.vector.reduce.smax.v4i16(<4 x i16>)
declare i16 @llvm.vector.reduce.smax.v8i16(<8 x i16>)
declare i16 @llvm.vector.reduce.smax.v16i16(<16 x i16>)
declare i16 @llvm.vector.reduce.smax.v32i16(<32 x i16>)
declare i16 @llvm.vector.reduce.smax.v64i16(<64 x i16>)
declare i16 @llvm.vector.reduce.smax.v128i16(<128 x i16>)

declare i32 @llvm.vector.reduce.smax.v2i32(<2 x i32>)
declare i32 @llvm.vector.reduce.smax.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.smax.v8i32(<8 x i32>)
declare i32 @llvm.vector.reduce.smax.v16i32(<16 x i32>)
declare i32 @llvm.vector.reduce.smax.v32i32(<32 x i32>)
declare i32 @llvm.vector.reduce.smax.v64i32(<64 x i32>)

declare i64 @llvm.vector.reduce.smax.v1i64(<1 x i64>)
declare i64 @llvm.vector.reduce.smax.v2i64(<2 x i64>)
declare i64 @llvm.vector.reduce.smax.v4i64(<4 x i64>)
declare i64 @llvm.vector.reduce.smax.v8i64(<8 x i64>)
declare i64 @llvm.vector.reduce.smax.v16i64(<16 x i64>)
declare i64 @llvm.vector.reduce.smax.v32i64(<32 x i64>)

declare i8 @llvm.vector.reduce.smin.v8i8(<8 x i8>)
declare i8 @llvm.vector.reduce.smin.v16i8(<16 x i8>)
declare i8 @llvm.vector.reduce.smin.v32i8(<32 x i8>)
declare i8 @llvm.vector.reduce.smin.v64i8(<64 x i8>)
declare i8 @llvm.vector.reduce.smin.v128i8(<128 x i8>)
declare i8 @llvm.vector.reduce.smin.v256i8(<256 x i8>)

declare i16 @llvm.vector.reduce.smin.v4i16(<4 x i16>)
declare i16 @llvm.vector.reduce.smin.v8i16(<8 x i16>)
declare i16 @llvm.vector.reduce.smin.v16i16(<16 x i16>)
declare i16 @llvm.vector.reduce.smin.v32i16(<32 x i16>)
declare i16 @llvm.vector.reduce.smin.v64i16(<64 x i16>)
declare i16 @llvm.vector.reduce.smin.v128i16(<128 x i16>)

declare i32 @llvm.vector.reduce.smin.v2i32(<2 x i32>)
declare i32 @llvm.vector.reduce.smin.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.smin.v8i32(<8 x i32>)
declare i32 @llvm.vector.reduce.smin.v16i32(<16 x i32>)
declare i32 @llvm.vector.reduce.smin.v32i32(<32 x i32>)
declare i32 @llvm.vector.reduce.smin.v64i32(<64 x i32>)

declare i64 @llvm.vector.reduce.smin.v1i64(<1 x i64>)
declare i64 @llvm.vector.reduce.smin.v2i64(<2 x i64>)
declare i64 @llvm.vector.reduce.smin.v4i64(<4 x i64>)
declare i64 @llvm.vector.reduce.smin.v8i64(<8 x i64>)
declare i64 @llvm.vector.reduce.smin.v16i64(<16 x i64>)
declare i64 @llvm.vector.reduce.smin.v32i64(<32 x i64>)

declare i8 @llvm.vector.reduce.umax.v8i8(<8 x i8>)
declare i8 @llvm.vector.reduce.umax.v16i8(<16 x i8>)
declare i8 @llvm.vector.reduce.umax.v32i8(<32 x i8>)
declare i8 @llvm.vector.reduce.umax.v64i8(<64 x i8>)
declare i8 @llvm.vector.reduce.umax.v128i8(<128 x i8>)
declare i8 @llvm.vector.reduce.umax.v256i8(<256 x i8>)

declare i16 @llvm.vector.reduce.umax.v4i16(<4 x i16>)
declare i16 @llvm.vector.reduce.umax.v8i16(<8 x i16>)
declare i16 @llvm.vector.reduce.umax.v16i16(<16 x i16>)
declare i16 @llvm.vector.reduce.umax.v32i16(<32 x i16>)
declare i16 @llvm.vector.reduce.umax.v64i16(<64 x i16>)
declare i16 @llvm.vector.reduce.umax.v128i16(<128 x i16>)

declare i32 @llvm.vector.reduce.umax.v2i32(<2 x i32>)
declare i32 @llvm.vector.reduce.umax.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.umax.v8i32(<8 x i32>)
declare i32 @llvm.vector.reduce.umax.v16i32(<16 x i32>)
declare i32 @llvm.vector.reduce.umax.v32i32(<32 x i32>)
declare i32 @llvm.vector.reduce.umax.v64i32(<64 x i32>)

declare i64 @llvm.vector.reduce.umax.v1i64(<1 x i64>)
declare i64 @llvm.vector.reduce.umax.v2i64(<2 x i64>)
declare i64 @llvm.vector.reduce.umax.v4i64(<4 x i64>)
declare i64 @llvm.vector.reduce.umax.v8i64(<8 x i64>)
declare i64 @llvm.vector.reduce.umax.v16i64(<16 x i64>)
declare i64 @llvm.vector.reduce.umax.v32i64(<32 x i64>)

declare i8 @llvm.vector.reduce.umin.v8i8(<8 x i8>)
declare i8 @llvm.vector.reduce.umin.v16i8(<16 x i8>)
declare i8 @llvm.vector.reduce.umin.v32i8(<32 x i8>)
declare i8 @llvm.vector.reduce.umin.v64i8(<64 x i8>)
declare i8 @llvm.vector.reduce.umin.v128i8(<128 x i8>)
declare i8 @llvm.vector.reduce.umin.v256i8(<256 x i8>)

declare i16 @llvm.vector.reduce.umin.v4i16(<4 x i16>)
declare i16 @llvm.vector.reduce.umin.v8i16(<8 x i16>)
declare i16 @llvm.vector.reduce.umin.v16i16(<16 x i16>)
declare i16 @llvm.vector.reduce.umin.v32i16(<32 x i16>)
declare i16 @llvm.vector.reduce.umin.v64i16(<64 x i16>)
declare i16 @llvm.vector.reduce.umin.v128i16(<128 x i16>)

declare i32 @llvm.vector.reduce.umin.v2i32(<2 x i32>)
declare i32 @llvm.vector.reduce.umin.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.umin.v8i32(<8 x i32>)
declare i32 @llvm.vector.reduce.umin.v16i32(<16 x i32>)
declare i32 @llvm.vector.reduce.umin.v32i32(<32 x i32>)
declare i32 @llvm.vector.reduce.umin.v64i32(<64 x i32>)

declare i64 @llvm.vector.reduce.umin.v1i64(<1 x i64>)
declare i64 @llvm.vector.reduce.umin.v2i64(<2 x i64>)
declare i64 @llvm.vector.reduce.umin.v4i64(<4 x i64>)
declare i64 @llvm.vector.reduce.umin.v8i64(<8 x i64>)
declare i64 @llvm.vector.reduce.umin.v16i64(<16 x i64>)
declare i64 @llvm.vector.reduce.umin.v32i64(<32 x i64>)