; RUN: llc -aarch64-sve-vector-bits-min=128 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=16 -check-prefix=NO_SVE
; RUN: llc -aarch64-sve-vector-bits-min=256 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_EQ_256
; RUN: llc -aarch64-sve-vector-bits-min=384 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK
; RUN: llc -aarch64-sve-vector-bits-min=512 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=640 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=768 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=896 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=1024 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1152 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1280 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1408 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1536 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1664 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1792 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1920 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=2048 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=256 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048

; Code generation checks for integer vector reductions (llvm.vector.reduce.*)
; on fixed-length vectors, run once per -aarch64-sve-vector-bits-min value
; given in the RUN lines above.

target triple = "aarch64-unknown-linux-gnu"

; Don't use SVE when its registers are no bigger than NEON.
; NO_SVE-NOT: ptrue

;
; UADDV
;

; Don't use SVE for 64-bit vectors.
define i8 @uaddv_v8i8(<8 x i8> %a) #0 {
; CHECK-LABEL: uaddv_v8i8:
; CHECK: addv b0, v0.8b
; CHECK: ret
  %res = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %a)
  ret i8 %res
}

; Don't use SVE for 128-bit vectors.
define i8 @uaddv_v16i8(<16 x i8> %a) #0 {
; CHECK-LABEL: uaddv_v16i8:
; CHECK: addv b0, v0.16b
; CHECK: ret
  %res = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %a)
  ret i8 %res
}

; 256-bit vector: one predicated load feeding one SVE reduction.
define i8 @uaddv_v32i8(<32 x i8>* %a) #0 {
; CHECK-LABEL: uaddv_v32i8:
; CHECK: ptrue [[PG:p[0-9]+]].b, vl32
; CHECK-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
; CHECK-NEXT: uaddv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].b
; CHECK-NEXT: fmov x0, [[REDUCE]]
; CHECK-NEXT: ret
  %op = load <32 x i8>, <32 x i8>* %a
  %res = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> %op)
  ret i8 %res
}

; 512-bit vector: one SVE reduction when registers hold at least 512 bits;
; at 256 bits two halves are loaded and combined before a single reduction.
define i8 @uaddv_v64i8(<64 x i8>* %a) #0 {
; CHECK-LABEL: uaddv_v64i8:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].b, vl64
; VBITS_GE_512-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: uaddv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].b
; VBITS_GE_512-NEXT: fmov x0, [[REDUCE]]
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].b, vl32
; VBITS_EQ_256-DAG: mov w[[NUMELTS:[0-9]+]], #32
; VBITS_EQ_256-DAG: ld1b { [[LO:z[0-9]+]].b }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1b { [[HI:z[0-9]+]].b }, [[PG]]/z, [x0, x[[NUMELTS]]]
; VBITS_EQ_256-DAG: add [[ADD:z[0-9]+]].b, [[PG]]/m, [[HI]].b, [[LO]].b
; VBITS_EQ_256-DAG: uaddv [[REDUCE:d[0-9]+]], [[PG]], [[ADD]].b
; VBITS_EQ_256-NEXT: fmov x0, [[REDUCE]]
; VBITS_EQ_256-NEXT: ret
  %op = load <64 x i8>, <64 x i8>* %a
  %res = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> %op)
  ret i8 %res
}

; 1024-bit vector: only checked when registers hold at least 1024 bits.
define i8 @uaddv_v128i8(<128 x i8>* %a) #0 {
; CHECK-LABEL: uaddv_v128i8:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].b, vl128
; VBITS_GE_1024-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: uaddv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].b
; VBITS_GE_1024-NEXT: fmov x0, [[REDUCE]]
; VBITS_GE_1024-NEXT: ret
  %op = load <128 x i8>, <128 x i8>* %a
  %res = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> %op)
  ret i8 %res
}

; 2048-bit vector: only checked for the 2048-bit run.
define i8 @uaddv_v256i8(<256 x i8>* %a) #0 {
; CHECK-LABEL: uaddv_v256i8:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].b, vl256
; VBITS_GE_2048-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: uaddv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].b
; VBITS_GE_2048-NEXT: fmov x0, [[REDUCE]]
; VBITS_GE_2048-NEXT: ret
  %op = load <256 x i8>, <256 x i8>* %a
  %res = call i8 @llvm.vector.reduce.add.v256i8(<256 x i8> %op)
  ret i8 %res
}

; Don't use SVE for 64-bit vectors.
define i16 @uaddv_v4i16(<4 x i16> %a) #0 {
; CHECK-LABEL: uaddv_v4i16:
; CHECK: addv h0, v0.4h
; CHECK: ret
  %res = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %a)
  ret i16 %res
}

; Don't use SVE for 128-bit vectors.
define i16 @uaddv_v8i16(<8 x i16> %a) #0 {
; CHECK-LABEL: uaddv_v8i16:
; CHECK: addv h0, v0.8h
; CHECK: ret
  %res = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %a)
  ret i16 %res
}

; 256-bit vector: one predicated load feeding one SVE reduction.
define i16 @uaddv_v16i16(<16 x i16>* %a) #0 {
; CHECK-LABEL: uaddv_v16i16:
; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
; CHECK-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; CHECK-NEXT: uaddv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].h
; CHECK-NEXT: fmov x0, [[REDUCE]]
; CHECK-NEXT: ret
  %op = load <16 x i16>, <16 x i16>* %a
  %res = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %op)
  ret i16 %res
}

; 512-bit vector: one SVE reduction when registers hold at least 512 bits;
; at 256 bits two halves are loaded and combined before a single reduction.
define i16 @uaddv_v32i16(<32 x i16>* %a) #0 {
; CHECK-LABEL: uaddv_v32i16:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
; VBITS_GE_512-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: uaddv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].h
; VBITS_GE_512-NEXT: fmov x0, [[REDUCE]]
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16
; VBITS_EQ_256-DAG: ld1h { [[LO:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1h { [[HI:z[0-9]+]].h }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #1]
; VBITS_EQ_256-DAG: add [[ADD:z[0-9]+]].h, [[PG]]/m, [[HI]].h, [[LO]].h
; VBITS_EQ_256-DAG: uaddv [[REDUCE:d[0-9]+]], [[PG]], [[ADD]].h
; VBITS_EQ_256-NEXT: fmov x0, [[REDUCE]]
; VBITS_EQ_256-NEXT: ret
  %op = load <32 x i16>, <32 x i16>* %a
  %res = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> %op)
  ret i16 %res
}

; 1024-bit vector: only checked when registers hold at least 1024 bits.
define i16 @uaddv_v64i16(<64 x i16>* %a) #0 {
; CHECK-LABEL: uaddv_v64i16:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
; VBITS_GE_1024-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: uaddv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].h
; VBITS_GE_1024-NEXT: fmov x0, [[REDUCE]]
; VBITS_GE_1024-NEXT: ret
  %op = load <64 x i16>, <64 x i16>* %a
  %res = call i16 @llvm.vector.reduce.add.v64i16(<64 x i16> %op)
  ret i16 %res
}

; 2048-bit vector: only checked for the 2048-bit run.
define i16 @uaddv_v128i16(<128 x i16>* %a) #0 {
; CHECK-LABEL: uaddv_v128i16:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
; VBITS_GE_2048-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: uaddv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].h
; VBITS_GE_2048-NEXT: fmov x0, [[REDUCE]]
; VBITS_GE_2048-NEXT: ret
  %op = load <128 x i16>, <128 x i16>* %a
  %res = call i16 @llvm.vector.reduce.add.v128i16(<128 x i16> %op)
  ret i16 %res
}

; Don't use SVE for 64-bit vectors.
define i32 @uaddv_v2i32(<2 x i32> %a) #0 {
; CHECK-LABEL: uaddv_v2i32:
; CHECK: addp v0.2s, v0.2s
; CHECK: ret
  %res = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a)
  ret i32 %res
}

; Don't use SVE for 128-bit vectors.
define i32 @uaddv_v4i32(<4 x i32> %a) #0 {
; CHECK-LABEL: uaddv_v4i32:
; CHECK: addv s0, v0.4s
; CHECK: ret
  %res = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a)
  ret i32 %res
}

; 256-bit vector: one predicated load feeding one SVE reduction.
define i32 @uaddv_v8i32(<8 x i32>* %a) #0 {
; CHECK-LABEL: uaddv_v8i32:
; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
; CHECK-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; CHECK-NEXT: uaddv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].s
; CHECK-NEXT: fmov x0, [[REDUCE]]
; CHECK-NEXT: ret
  %op = load <8 x i32>, <8 x i32>* %a
  %res = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %op)
  ret i32 %res
}

; 512-bit vector: one SVE reduction when registers hold at least 512 bits;
; at 256 bits two halves are loaded and combined before a single reduction.
define i32 @uaddv_v16i32(<16 x i32>* %a) #0 {
; CHECK-LABEL: uaddv_v16i32:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
; VBITS_GE_512-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: uaddv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].s
; VBITS_GE_512-NEXT: fmov x0, [[REDUCE]]
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
; VBITS_EQ_256-DAG: ld1w { [[LO:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1w { [[HI:z[0-9]+]].s }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #2]
; VBITS_EQ_256-DAG: add [[ADD:z[0-9]+]].s, [[PG]]/m, [[HI]].s, [[LO]].s
; VBITS_EQ_256-DAG: uaddv [[REDUCE:d[0-9]+]], [[PG]], [[ADD]].s
; VBITS_EQ_256-NEXT: fmov x0, [[REDUCE]]
; VBITS_EQ_256-NEXT: ret
  %op = load <16 x i32>, <16 x i32>* %a
  %res = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %op)
  ret i32 %res
}

; 1024-bit vector: only checked when registers hold at least 1024 bits.
define i32 @uaddv_v32i32(<32 x i32>* %a) #0 {
; CHECK-LABEL: uaddv_v32i32:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
; VBITS_GE_1024-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: uaddv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].s
; VBITS_GE_1024-NEXT: fmov x0, [[REDUCE]]
; VBITS_GE_1024-NEXT: ret
  %op = load <32 x i32>, <32 x i32>* %a
  %res = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %op)
  ret i32 %res
}

; 2048-bit vector: only checked for the 2048-bit run.
define i32 @uaddv_v64i32(<64 x i32>* %a) #0 {
; CHECK-LABEL: uaddv_v64i32:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
; VBITS_GE_2048-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: uaddv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].s
; VBITS_GE_2048-NEXT: fmov x0, [[REDUCE]]
; VBITS_GE_2048-NEXT: ret
  %op = load <64 x i32>, <64 x i32>* %a
  %res = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> %op)
  ret i32 %res
}

; Nothing to do for single element vectors.
define i64 @uaddv_v1i64(<1 x i64> %a) #0 {
; CHECK-LABEL: uaddv_v1i64:
; CHECK: fmov x0, d0
; CHECK: ret
  %res = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a)
  ret i64 %res
}

; Don't use SVE for 128-bit vectors.
define i64 @uaddv_v2i64(<2 x i64> %a) #0 {
; CHECK-LABEL: uaddv_v2i64:
; CHECK: addp d0, v0.2d
; CHECK: ret
  %res = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a)
  ret i64 %res
}

; 256-bit vector: one predicated load feeding one SVE reduction.
define i64 @uaddv_v4i64(<4 x i64>* %a) #0 {
; CHECK-LABEL: uaddv_v4i64:
; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
; CHECK-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; CHECK-NEXT: uaddv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
; CHECK-NEXT: fmov x0, [[REDUCE]]
; CHECK-NEXT: ret
  %op = load <4 x i64>, <4 x i64>* %a
  %res = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %op)
  ret i64 %res
}

; 512-bit vector: one SVE reduction when registers hold at least 512 bits;
; at 256 bits two halves are loaded and combined before a single reduction.
define i64 @uaddv_v8i64(<8 x i64>* %a) #0 {
; CHECK-LABEL: uaddv_v8i64:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
; VBITS_GE_512-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: uaddv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
; VBITS_GE_512-NEXT: fmov x0, [[REDUCE]]
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
; VBITS_EQ_256-DAG: ld1d { [[LO:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1d { [[HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
; VBITS_EQ_256-DAG: add [[ADD:z[0-9]+]].d, [[PG]]/m, [[HI]].d, [[LO]].d
; VBITS_EQ_256-DAG: uaddv [[REDUCE:d[0-9]+]], [[PG]], [[ADD]].d
; VBITS_EQ_256-NEXT: fmov x0, [[REDUCE]]
; VBITS_EQ_256-NEXT: ret
  %op = load <8 x i64>, <8 x i64>* %a
  %res = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %op)
  ret i64 %res
}

; 1024-bit vector: only checked when registers hold at least 1024 bits.
define i64 @uaddv_v16i64(<16 x i64>* %a) #0 {
; CHECK-LABEL: uaddv_v16i64:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
; VBITS_GE_1024-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: uaddv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
; VBITS_GE_1024-NEXT: fmov x0, [[REDUCE]]
; VBITS_GE_1024-NEXT: ret
  %op = load <16 x i64>, <16 x i64>* %a
  %res = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %op)
  ret i64 %res
}

; 2048-bit vector: only checked for the 2048-bit run.
define i64 @uaddv_v32i64(<32 x i64>* %a) #0 {
; CHECK-LABEL: uaddv_v32i64:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
; VBITS_GE_2048-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: uaddv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
; VBITS_GE_2048-NEXT: fmov x0, [[REDUCE]]
; VBITS_GE_2048-NEXT: ret
  %op = load <32 x i64>, <32 x i64>* %a
  %res = call i64 @llvm.vector.reduce.add.v32i64(<32 x i64> %op)
  ret i64 %res
}

;
; SMAXV
;

; Don't use SVE for 64-bit vectors.
define i8 @smaxv_v8i8(<8 x i8> %a) #0 {
; CHECK-LABEL: smaxv_v8i8:
; CHECK: smaxv b0, v0.8b
; CHECK: ret
  %res = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> %a)
  ret i8 %res
}

; Don't use SVE for 128-bit vectors.
define i8 @smaxv_v16i8(<16 x i8> %a) #0 {
; CHECK-LABEL: smaxv_v16i8:
; CHECK: smaxv b0, v0.16b
; CHECK: ret
  %res = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> %a)
  ret i8 %res
}

; 256-bit vector: one predicated load feeding one SVE reduction.
define i8 @smaxv_v32i8(<32 x i8>* %a) #0 {
; CHECK-LABEL: smaxv_v32i8:
; CHECK: ptrue [[PG:p[0-9]+]].b, vl32
; CHECK-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
; CHECK-NEXT: smaxv b[[REDUCE:[0-9]+]], [[PG]], [[OP]].b
; CHECK-NEXT: fmov w0, s[[REDUCE]]
; CHECK-NEXT: ret
  %op = load <32 x i8>, <32 x i8>* %a
  %res = call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> %op)
  ret i8 %res
}

; 512-bit vector: one SVE reduction when registers hold at least 512 bits;
; at 256 bits two halves are loaded and combined before a single reduction.
define i8 @smaxv_v64i8(<64 x i8>* %a) #0 {
; CHECK-LABEL: smaxv_v64i8:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].b, vl64
; VBITS_GE_512-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: smaxv b[[REDUCE:[0-9]+]], [[PG]], [[OP]].b
; VBITS_GE_512-NEXT: fmov w0, s[[REDUCE]]
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].b, vl32
; VBITS_EQ_256-DAG: mov w[[NUMELTS:[0-9]+]], #32
; VBITS_EQ_256-DAG: ld1b { [[LO:z[0-9]+]].b }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1b { [[HI:z[0-9]+]].b }, [[PG]]/z, [x0, x[[NUMELTS]]]
; VBITS_EQ_256-DAG: smax [[MAX:z[0-9]+]].b, [[PG]]/m, [[HI]].b, [[LO]].b
; VBITS_EQ_256-DAG: smaxv b[[REDUCE:[0-9]+]], [[PG]], [[MAX]].b
; VBITS_EQ_256-NEXT: fmov w0, s[[REDUCE]]
; VBITS_EQ_256-NEXT: ret
  %op = load <64 x i8>, <64 x i8>* %a
  %res = call i8 @llvm.vector.reduce.smax.v64i8(<64 x i8> %op)
  ret i8 %res
}

; 1024-bit vector: only checked when registers hold at least 1024 bits.
define i8 @smaxv_v128i8(<128 x i8>* %a) #0 {
; CHECK-LABEL: smaxv_v128i8:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].b, vl128
; VBITS_GE_1024-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: smaxv b[[REDUCE:[0-9]+]], [[PG]], [[OP]].b
; VBITS_GE_1024-NEXT: fmov w0, s[[REDUCE]]
; VBITS_GE_1024-NEXT: ret
  %op = load <128 x i8>, <128 x i8>* %a
  %res = call i8 @llvm.vector.reduce.smax.v128i8(<128 x i8> %op)
  ret i8 %res
}

; 2048-bit vector: only checked for the 2048-bit run.
define i8 @smaxv_v256i8(<256 x i8>* %a) #0 {
; CHECK-LABEL: smaxv_v256i8:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].b, vl256
; VBITS_GE_2048-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: smaxv b[[REDUCE:[0-9]+]], [[PG]], [[OP]].b
; VBITS_GE_2048-NEXT: fmov w0, s[[REDUCE]]
; VBITS_GE_2048-NEXT: ret
  %op = load <256 x i8>, <256 x i8>* %a
  %res = call i8 @llvm.vector.reduce.smax.v256i8(<256 x i8> %op)
  ret i8 %res
}

; Don't use SVE for 64-bit vectors.
define i16 @smaxv_v4i16(<4 x i16> %a) #0 {
; CHECK-LABEL: smaxv_v4i16:
; CHECK: smaxv h0, v0.4h
; CHECK: ret
  %res = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> %a)
  ret i16 %res
}

; Don't use SVE for 128-bit vectors.
define i16 @smaxv_v8i16(<8 x i16> %a) #0 {
; CHECK-LABEL: smaxv_v8i16:
; CHECK: smaxv h0, v0.8h
; CHECK: ret
  %res = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> %a)
  ret i16 %res
}

; 256-bit vector: one predicated load feeding one SVE reduction.
define i16 @smaxv_v16i16(<16 x i16>* %a) #0 {
; CHECK-LABEL: smaxv_v16i16:
; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
; CHECK-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; CHECK-NEXT: smaxv h[[REDUCE:[0-9]+]], [[PG]], [[OP]].h
; CHECK-NEXT: fmov w0, s[[REDUCE]]
; CHECK-NEXT: ret
  %op = load <16 x i16>, <16 x i16>* %a
  %res = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> %op)
  ret i16 %res
}

; 512-bit vector: one SVE reduction when registers hold at least 512 bits;
; at 256 bits two halves are loaded and combined before a single reduction.
define i16 @smaxv_v32i16(<32 x i16>* %a) #0 {
; CHECK-LABEL: smaxv_v32i16:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
; VBITS_GE_512-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: smaxv h[[REDUCE:[0-9]+]], [[PG]], [[OP]].h
; VBITS_GE_512-NEXT: fmov w0, s[[REDUCE]]
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16
; VBITS_EQ_256-DAG: ld1h { [[LO:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1h { [[HI:z[0-9]+]].h }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #1]
; VBITS_EQ_256-DAG: smax [[MAX:z[0-9]+]].h, [[PG]]/m, [[HI]].h, [[LO]].h
; VBITS_EQ_256-DAG: smaxv h[[REDUCE:[0-9]+]], [[PG]], [[MAX]].h
; VBITS_EQ_256-NEXT: fmov w0, s[[REDUCE]]
; VBITS_EQ_256-NEXT: ret
  %op = load <32 x i16>, <32 x i16>* %a
  %res = call i16 @llvm.vector.reduce.smax.v32i16(<32 x i16> %op)
  ret i16 %res
}

; 1024-bit vector: only checked when registers hold at least 1024 bits.
define i16 @smaxv_v64i16(<64 x i16>* %a) #0 {
; CHECK-LABEL: smaxv_v64i16:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
; VBITS_GE_1024-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: smaxv h[[REDUCE:[0-9]+]], [[PG]], [[OP]].h
; VBITS_GE_1024-NEXT: fmov w0, s[[REDUCE]]
; VBITS_GE_1024-NEXT: ret
  %op = load <64 x i16>, <64 x i16>* %a
  %res = call i16 @llvm.vector.reduce.smax.v64i16(<64 x i16> %op)
  ret i16 %res
}

; 2048-bit vector: only checked for the 2048-bit run.
define i16 @smaxv_v128i16(<128 x i16>* %a) #0 {
; CHECK-LABEL: smaxv_v128i16:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
; VBITS_GE_2048-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: smaxv h[[REDUCE:[0-9]+]], [[PG]], [[OP]].h
; VBITS_GE_2048-NEXT: fmov w0, s[[REDUCE]]
; VBITS_GE_2048-NEXT: ret
  %op = load <128 x i16>, <128 x i16>* %a
  %res = call i16 @llvm.vector.reduce.smax.v128i16(<128 x i16> %op)
  ret i16 %res
}

; Don't use SVE for 64-bit vectors.
define i32 @smaxv_v2i32(<2 x i32> %a) #0 {
; CHECK-LABEL: smaxv_v2i32:
; CHECK: smaxp v0.2s, v0.2s
; CHECK: ret
  %res = call i32 @llvm.vector.reduce.smax.v2i32(<2 x i32> %a)
  ret i32 %res
}

; Don't use SVE for 128-bit vectors.
define i32 @smaxv_v4i32(<4 x i32> %a) #0 {
; CHECK-LABEL: smaxv_v4i32:
; CHECK: smaxv s0, v0.4s
; CHECK: ret
  %res = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %a)
  ret i32 %res
}

; 256-bit vector: one predicated load feeding one SVE reduction.
define i32 @smaxv_v8i32(<8 x i32>* %a) #0 {
; CHECK-LABEL: smaxv_v8i32:
; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
; CHECK-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; CHECK-NEXT: smaxv [[REDUCE:s[0-9]+]], [[PG]], [[OP]].s
; CHECK-NEXT: fmov w0, [[REDUCE]]
; CHECK-NEXT: ret
  %op = load <8 x i32>, <8 x i32>* %a
  %res = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> %op)
  ret i32 %res
}

; 512-bit vector: one SVE reduction when registers hold at least 512 bits;
; at 256 bits two halves are loaded and combined before a single reduction.
define i32 @smaxv_v16i32(<16 x i32>* %a) #0 {
; CHECK-LABEL: smaxv_v16i32:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
; VBITS_GE_512-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: smaxv [[REDUCE:s[0-9]+]], [[PG]], [[OP]].s
; VBITS_GE_512-NEXT: fmov w0, [[REDUCE]]
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
; VBITS_EQ_256-DAG: ld1w { [[LO:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1w { [[HI:z[0-9]+]].s }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #2]
; VBITS_EQ_256-DAG: smax [[MAX:z[0-9]+]].s, [[PG]]/m, [[HI]].s, [[LO]].s
; VBITS_EQ_256-DAG: smaxv [[REDUCE:s[0-9]+]], [[PG]], [[MAX]].s
; VBITS_EQ_256-NEXT: fmov w0, [[REDUCE]]
; VBITS_EQ_256-NEXT: ret
  %op = load <16 x i32>, <16 x i32>* %a
  %res = call i32 @llvm.vector.reduce.smax.v16i32(<16 x i32> %op)
  ret i32 %res
}

; 1024-bit vector: only checked when registers hold at least 1024 bits.
define i32 @smaxv_v32i32(<32 x i32>* %a) #0 {
; CHECK-LABEL: smaxv_v32i32:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
; VBITS_GE_1024-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: smaxv [[REDUCE:s[0-9]+]], [[PG]], [[OP]].s
; VBITS_GE_1024-NEXT: fmov w0, [[REDUCE]]
; VBITS_GE_1024-NEXT: ret
  %op = load <32 x i32>, <32 x i32>* %a
  %res = call i32 @llvm.vector.reduce.smax.v32i32(<32 x i32> %op)
  ret i32 %res
}

; 2048-bit vector: only checked for the 2048-bit run.
define i32 @smaxv_v64i32(<64 x i32>* %a) #0 {
; CHECK-LABEL: smaxv_v64i32:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
; VBITS_GE_2048-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: smaxv [[REDUCE:s[0-9]+]], [[PG]], [[OP]].s
; VBITS_GE_2048-NEXT: fmov w0, [[REDUCE]]
; VBITS_GE_2048-NEXT: ret
  %op = load <64 x i32>, <64 x i32>* %a
  %res = call i32 @llvm.vector.reduce.smax.v64i32(<64 x i32> %op)
  ret i32 %res
}

; Nothing to do for single element vectors.
define i64 @smaxv_v1i64(<1 x i64> %a) #0 {
; CHECK-LABEL: smaxv_v1i64:
; CHECK: fmov x0, d0
; CHECK: ret
  %res = call i64 @llvm.vector.reduce.smax.v1i64(<1 x i64> %a)
  ret i64 %res
}

; No NEON SMAXV support for 64-bit elements. Use SVE.
define i64 @smaxv_v2i64(<2 x i64> %a) #0 {
; The operand arrives by value, so the reduction reads z0 directly and no
; load is expected between the predicate setup and the reduction.
; CHECK-LABEL: smaxv_v2i64:
; CHECK: ptrue [[PG:p[0-9]+]].d, vl2
; CHECK-NEXT: smaxv [[REDUCE:d[0-9]+]], [[PG]], z0.d
; CHECK-NEXT: fmov x0, [[REDUCE]]
; CHECK-NEXT: ret
  %res = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> %a)
  ret i64 %res
}

; 256-bit vector: one predicated load feeding one SVE reduction.
define i64 @smaxv_v4i64(<4 x i64>* %a) #0 {
; CHECK-LABEL: smaxv_v4i64:
; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
; CHECK-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; CHECK-NEXT: smaxv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
; CHECK-NEXT: fmov x0, [[REDUCE]]
; CHECK-NEXT: ret
  %op = load <4 x i64>, <4 x i64>* %a
  %res = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> %op)
  ret i64 %res
}

; 512-bit vector: one SVE reduction when registers hold at least 512 bits;
; at 256 bits two halves are loaded and combined before a single reduction.
define i64 @smaxv_v8i64(<8 x i64>* %a) #0 {
; CHECK-LABEL: smaxv_v8i64:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
; VBITS_GE_512-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: smaxv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
; VBITS_GE_512-NEXT: fmov x0, [[REDUCE]]
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
; VBITS_EQ_256-DAG: ld1d { [[LO:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1d { [[HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
; VBITS_EQ_256-DAG: smax [[MAX:z[0-9]+]].d, [[PG]]/m, [[HI]].d, [[LO]].d
; VBITS_EQ_256-DAG: smaxv [[REDUCE:d[0-9]+]], [[PG]], [[MAX]].d
; VBITS_EQ_256-NEXT: fmov x0, [[REDUCE]]
; VBITS_EQ_256-NEXT: ret
  %op = load <8 x i64>, <8 x i64>* %a
  %res = call i64 @llvm.vector.reduce.smax.v8i64(<8 x i64> %op)
  ret i64 %res
}

; 1024-bit vector: only checked when registers hold at least 1024 bits.
define i64 @smaxv_v16i64(<16 x i64>* %a) #0 {
; CHECK-LABEL: smaxv_v16i64:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
; VBITS_GE_1024-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: smaxv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
; VBITS_GE_1024-NEXT: fmov x0, [[REDUCE]]
; VBITS_GE_1024-NEXT: ret
  %op = load <16 x i64>, <16 x i64>* %a
  %res = call i64 @llvm.vector.reduce.smax.v16i64(<16 x i64> %op)
  ret i64 %res
}

; 2048-bit vector: only checked for the 2048-bit run.
define i64 @smaxv_v32i64(<32 x i64>* %a) #0 {
; CHECK-LABEL: smaxv_v32i64:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
; VBITS_GE_2048-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: smaxv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
; VBITS_GE_2048-NEXT: fmov x0, [[REDUCE]]
; VBITS_GE_2048-NEXT: ret
  %op = load <32 x i64>, <32 x i64>* %a
  %res = call i64 @llvm.vector.reduce.smax.v32i64(<32 x i64> %op)
  ret i64 %res
}

;
; SMINV
;

; Don't use SVE for 64-bit vectors.
define i8 @sminv_v8i8(<8 x i8> %a) #0 {
; CHECK-LABEL: sminv_v8i8:
; CHECK: sminv b0, v0.8b
; CHECK: ret
  %res = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> %a)
  ret i8 %res
}

; Don't use SVE for 128-bit vectors.
define i8 @sminv_v16i8(<16 x i8> %a) #0 {
; CHECK-LABEL: sminv_v16i8:
; CHECK: sminv b0, v0.16b
; CHECK: ret
  %res = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> %a)
  ret i8 %res
}

; 256-bit vector: one predicated load feeding one SVE reduction.
define i8 @sminv_v32i8(<32 x i8>* %a) #0 {
; CHECK-LABEL: sminv_v32i8:
; CHECK: ptrue [[PG:p[0-9]+]].b, vl32
; CHECK-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
; CHECK-NEXT: sminv b[[REDUCE:[0-9]+]], [[PG]], [[OP]].b
; CHECK-NEXT: fmov w0, s[[REDUCE]]
; CHECK-NEXT: ret
  %op = load <32 x i8>, <32 x i8>* %a
  %res = call i8 @llvm.vector.reduce.smin.v32i8(<32 x i8> %op)
  ret i8 %res
}

; 512-bit vector: one SVE reduction when registers hold at least 512 bits;
; at 256 bits two halves are loaded and combined before a single reduction.
define i8 @sminv_v64i8(<64 x i8>* %a) #0 {
; CHECK-LABEL: sminv_v64i8:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].b, vl64
; VBITS_GE_512-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: sminv b[[REDUCE:[0-9]+]], [[PG]], [[OP]].b
; VBITS_GE_512-NEXT: fmov w0, s[[REDUCE]]
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].b, vl32
; VBITS_EQ_256-DAG: mov w[[NUMELTS:[0-9]+]], #32
; VBITS_EQ_256-DAG: ld1b { [[LO:z[0-9]+]].b }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1b { [[HI:z[0-9]+]].b }, [[PG]]/z, [x0, x[[NUMELTS]]]
; VBITS_EQ_256-DAG: smin [[MIN:z[0-9]+]].b, [[PG]]/m, [[HI]].b, [[LO]].b
; VBITS_EQ_256-DAG: sminv b[[REDUCE:[0-9]+]], [[PG]], [[MIN]].b
; VBITS_EQ_256-NEXT: fmov w0, s[[REDUCE]]
; VBITS_EQ_256-NEXT: ret
  %op = load <64 x i8>, <64 x i8>* %a
  %res = call i8 @llvm.vector.reduce.smin.v64i8(<64 x i8> %op)
  ret i8 %res
}

; 1024-bit vector: only checked when registers hold at least 1024 bits.
define i8 @sminv_v128i8(<128 x i8>* %a) #0 {
; CHECK-LABEL: sminv_v128i8:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].b, vl128
; VBITS_GE_1024-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: sminv b[[REDUCE:[0-9]+]], [[PG]], [[OP]].b
; VBITS_GE_1024-NEXT: fmov w0, s[[REDUCE]]
; VBITS_GE_1024-NEXT: ret
  %op = load <128 x i8>, <128 x i8>* %a
  %res = call i8 @llvm.vector.reduce.smin.v128i8(<128 x i8> %op)
  ret i8 %res
}

; 2048-bit vector: only checked for the 2048-bit run.
define i8 @sminv_v256i8(<256 x i8>* %a) #0 {
; CHECK-LABEL: sminv_v256i8:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].b, vl256
; VBITS_GE_2048-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: sminv b[[REDUCE:[0-9]+]], [[PG]], [[OP]].b
; VBITS_GE_2048-NEXT: fmov w0, s[[REDUCE]]
; VBITS_GE_2048-NEXT: ret
  %op = load <256 x i8>, <256 x i8>* %a
  %res = call i8 @llvm.vector.reduce.smin.v256i8(<256 x i8> %op)
  ret i8 %res
}

; Don't use SVE for 64-bit vectors.
define i16 @sminv_v4i16(<4 x i16> %a) #0 {
; CHECK-LABEL: sminv_v4i16:
; CHECK: sminv h0, v0.4h
; CHECK: ret
  %res = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> %a)
  ret i16 %res
}

; Don't use SVE for 128-bit vectors.
define i16 @sminv_v8i16(<8 x i16> %a) #0 {
; CHECK-LABEL: sminv_v8i16:
; CHECK: sminv h0, v0.8h
; CHECK: ret
  %res = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> %a)
  ret i16 %res
}

; 256-bit vector: one predicated load feeding one SVE reduction.
define i16 @sminv_v16i16(<16 x i16>* %a) #0 {
; CHECK-LABEL: sminv_v16i16:
; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
; CHECK-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; CHECK-NEXT: sminv h[[REDUCE:[0-9]+]], [[PG]], [[OP]].h
; CHECK-NEXT: fmov w0, s[[REDUCE]]
; CHECK-NEXT: ret
  %op = load <16 x i16>, <16 x i16>* %a
  %res = call i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> %op)
  ret i16 %res
}

; 512-bit vector: one SVE reduction when registers hold at least 512 bits;
; at 256 bits two halves are loaded and combined before a single reduction.
define i16 @sminv_v32i16(<32 x i16>* %a) #0 {
; CHECK-LABEL: sminv_v32i16:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
; VBITS_GE_512-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: sminv h[[REDUCE:[0-9]+]], [[PG]], [[OP]].h
; VBITS_GE_512-NEXT: fmov w0, s[[REDUCE]]
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16
; VBITS_EQ_256-DAG: ld1h { [[LO:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1h { [[HI:z[0-9]+]].h }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #1]
; VBITS_EQ_256-DAG: smin [[MIN:z[0-9]+]].h, [[PG]]/m, [[HI]].h, [[LO]].h
; VBITS_EQ_256-DAG: sminv h[[REDUCE:[0-9]+]], [[PG]], [[MIN]].h
; VBITS_EQ_256-NEXT: fmov w0, s[[REDUCE]]
; VBITS_EQ_256-NEXT: ret
  %op = load <32 x i16>, <32 x i16>* %a
  %res = call i16 @llvm.vector.reduce.smin.v32i16(<32 x i16> %op)
  ret i16 %res
}

; 1024-bit vector: only checked when registers hold at least 1024 bits.
define i16 @sminv_v64i16(<64 x i16>* %a) #0 {
; CHECK-LABEL: sminv_v64i16:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
; VBITS_GE_1024-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: sminv h[[REDUCE:[0-9]+]], [[PG]], [[OP]].h
; VBITS_GE_1024-NEXT: fmov w0, s[[REDUCE]]
; VBITS_GE_1024-NEXT: ret
  %op = load <64 x i16>, <64 x i16>* %a
  %res = call i16 @llvm.vector.reduce.smin.v64i16(<64 x i16> %op)
  ret i16 %res
}

; 2048-bit vector: only checked for the 2048-bit run.
define i16 @sminv_v128i16(<128 x i16>* %a) #0 {
; CHECK-LABEL: sminv_v128i16:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
; VBITS_GE_2048-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: sminv h[[REDUCE:[0-9]+]], [[PG]], [[OP]].h
; VBITS_GE_2048-NEXT: fmov w0, s[[REDUCE]]
; VBITS_GE_2048-NEXT: ret
  %op = load <128 x i16>, <128 x i16>* %a
  %res = call i16 @llvm.vector.reduce.smin.v128i16(<128 x i16> %op)
  ret i16 %res
}

; Don't use SVE for 64-bit vectors.
define i32 @sminv_v2i32(<2 x i32> %a) #0 {
; The signed pairwise mnemonic is spelled in full so that an unsigned
; pairwise minimum cannot satisfy the check.
; CHECK-LABEL: sminv_v2i32:
; CHECK: sminp v0.2s, v0.2s
; CHECK: ret
  %res = call i32 @llvm.vector.reduce.smin.v2i32(<2 x i32> %a)
  ret i32 %res
}

; Don't use SVE for 128-bit vectors.
define i32 @sminv_v4i32(<4 x i32> %a) #0 {
; CHECK-LABEL: sminv_v4i32:
; CHECK: sminv s0, v0.4s
; CHECK: ret
  %res = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %a)
  ret i32 %res
}

; 256-bit vector: one predicated load feeding one SVE reduction.
define i32 @sminv_v8i32(<8 x i32>* %a) #0 {
; CHECK-LABEL: sminv_v8i32:
; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
; CHECK-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; CHECK-NEXT: sminv [[REDUCE:s[0-9]+]], [[PG]], [[OP]].s
; CHECK-NEXT: fmov w0, [[REDUCE]]
; CHECK-NEXT: ret
  %op = load <8 x i32>, <8 x i32>* %a
  %res = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> %op)
  ret i32 %res
}

; 512-bit vector: one SVE reduction when registers hold at least 512 bits;
; at 256 bits two halves are loaded and combined before a single reduction.
define i32 @sminv_v16i32(<16 x i32>* %a) #0 {
; CHECK-LABEL: sminv_v16i32:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
; VBITS_GE_512-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: sminv [[REDUCE:s[0-9]+]], [[PG]], [[OP]].s
; VBITS_GE_512-NEXT: fmov w0, [[REDUCE]]
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
; VBITS_EQ_256-DAG: ld1w { [[LO:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1w { [[HI:z[0-9]+]].s }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #2]
; VBITS_EQ_256-DAG: smin [[MIN:z[0-9]+]].s, [[PG]]/m, [[HI]].s, [[LO]].s
; VBITS_EQ_256-DAG: sminv [[REDUCE:s[0-9]+]], [[PG]], [[MIN]].s
; VBITS_EQ_256-NEXT: fmov w0, [[REDUCE]]
; VBITS_EQ_256-NEXT: ret
  %op = load <16 x i32>, <16 x i32>* %a
  %res = call i32 @llvm.vector.reduce.smin.v16i32(<16 x i32> %op)
  ret i32 %res
}

; 1024-bit vector: only checked when registers hold at least 1024 bits.
define i32 @sminv_v32i32(<32 x i32>* %a) #0 {
; CHECK-LABEL: sminv_v32i32:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
; VBITS_GE_1024-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: sminv [[REDUCE:s[0-9]+]], [[PG]], [[OP]].s
; VBITS_GE_1024-NEXT: fmov w0, [[REDUCE]]
; VBITS_GE_1024-NEXT: ret
  %op = load <32 x i32>, <32 x i32>* %a
  %res = call i32 @llvm.vector.reduce.smin.v32i32(<32 x i32> %op)
  ret i32 %res
}

; 2048-bit vector: only checked for the 2048-bit run.
define i32 @sminv_v64i32(<64 x i32>* %a) #0 {
; CHECK-LABEL: sminv_v64i32:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
; VBITS_GE_2048-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: sminv [[REDUCE:s[0-9]+]], [[PG]], [[OP]].s
; VBITS_GE_2048-NEXT: fmov w0, [[REDUCE]]
; VBITS_GE_2048-NEXT: ret
  %op = load <64 x i32>, <64 x i32>* %a
  %res = call i32 @llvm.vector.reduce.smin.v64i32(<64 x i32> %op)
  ret i32 %res
}

; Nothing to do for single element vectors.
define i64 @sminv_v1i64(<1 x i64> %a) #0 {
; CHECK-LABEL: sminv_v1i64:
; CHECK: fmov x0, d0
; CHECK: ret
  %res = call i64 @llvm.vector.reduce.smin.v1i64(<1 x i64> %a)
  ret i64 %res
}

; No NEON SMINV support for 64-bit elements. Use SVE.
; The operand is passed by value, so the reduction is expected to read z0
; directly with no load in between.
define i64 @sminv_v2i64(<2 x i64> %a) #0 {
; CHECK-LABEL: sminv_v2i64:
; CHECK: ptrue [[PG:p[0-9]+]].d, vl2
; CHECK-NEXT: sminv [[REDUCE:d[0-9]+]], [[PG]], z0.d
; CHECK-NEXT: fmov x0, [[REDUCE]]
; CHECK-NEXT: ret
  %res = call i64 @llvm.vector.reduce.smin.v2i64(<2 x i64> %a)
  ret i64 %res
}

; 256 bits of data: covered by the common CHECK prefix, which is enabled on
; every RUN line from 256 bits upwards.
define i64 @sminv_v4i64(<4 x i64>* %a) #0 {
; CHECK-LABEL: sminv_v4i64:
; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
; CHECK-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; CHECK-NEXT: sminv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
; CHECK-NEXT: fmov x0, [[REDUCE]]
; CHECK-NEXT: ret
  %op = load <4 x i64>, <4 x i64>* %a
  %res = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> %op)
  ret i64 %res
}

; 512 bits of data: one predicated load when the minimum SVE width is >= 512
; bits; at exactly 256 bits the two halves are loaded separately and combined
; with a vector SMIN before the final reduction.
define i64 @sminv_v8i64(<8 x i64>* %a) #0 {
; CHECK-LABEL: sminv_v8i64:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
; VBITS_GE_512-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: sminv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
; VBITS_GE_512-NEXT: fmov x0, [[REDUCE]]
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
; VBITS_EQ_256-DAG: ld1d { [[LO:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1d { [[HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
; VBITS_EQ_256-DAG: smin [[MIN:z[0-9]+]].d, [[PG]]/m, [[HI]].d, [[LO]].d
; VBITS_EQ_256-DAG: sminv [[REDUCE:d[0-9]+]], [[PG]], [[MIN]].d
; VBITS_EQ_256-NEXT: fmov x0, [[REDUCE]]
; VBITS_EQ_256-NEXT: ret
  %op = load <8 x i64>, <8 x i64>* %a
  %res = call i64 @llvm.vector.reduce.smin.v8i64(<8 x i64> %op)
  ret i64 %res
}

; 1024 bits of data: only checked when the minimum SVE width is >= 1024 bits.
define i64 @sminv_v16i64(<16 x i64>* %a) #0 {
; CHECK-LABEL: sminv_v16i64:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
; VBITS_GE_1024-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: sminv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
; VBITS_GE_1024-NEXT: fmov x0, [[REDUCE]]
; VBITS_GE_1024-NEXT: ret
  %op = load <16 x i64>, <16 x i64>* %a
  %res = call i64 @llvm.vector.reduce.smin.v16i64(<16 x i64> %op)
  ret i64 %res
}

; 2048 bits of data: only checked when the minimum SVE width is 2048 bits.
define i64 @sminv_v32i64(<32 x i64>* %a) #0 {
; CHECK-LABEL: sminv_v32i64:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
; VBITS_GE_2048-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: sminv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
; VBITS_GE_2048-NEXT: fmov x0, [[REDUCE]]
; VBITS_GE_2048-NEXT: ret
  %op = load <32 x i64>, <32 x i64>* %a
  %res = call i64 @llvm.vector.reduce.smin.v32i64(<32 x i64> %op)
  ret i64 %res
}

;
; UMAXV
;

; Don't use SVE for 64-bit vectors.
define i8 @umaxv_v8i8(<8 x i8> %a) #0 {
; CHECK-LABEL: umaxv_v8i8:
; CHECK: umaxv b0, v0.8b
; CHECK: ret
  %res = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> %a)
  ret i8 %res
}

; Don't use SVE for 128-bit vectors.
define i8 @umaxv_v16i8(<16 x i8> %a) #0 {
; CHECK-LABEL: umaxv_v16i8:
; CHECK: umaxv b0, v0.16b
; CHECK: ret
  %res = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> %a)
  ret i8 %res
}

; 256 bits of data: covered by the common CHECK prefix, which is enabled on
; every RUN line from 256 bits upwards.
define i8 @umaxv_v32i8(<32 x i8>* %a) #0 {
; CHECK-LABEL: umaxv_v32i8:
; CHECK: ptrue [[PG:p[0-9]+]].b, vl32
; CHECK-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
; CHECK-NEXT: umaxv b[[REDUCE:[0-9]+]], [[PG]], [[OP]].b
; CHECK-NEXT: fmov w0, s[[REDUCE]]
; CHECK-NEXT: ret
  %op = load <32 x i8>, <32 x i8>* %a
  %res = call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> %op)
  ret i8 %res
}

; 512 bits of data: one predicated load when the minimum SVE width is >= 512
; bits; at exactly 256 bits the two halves are loaded separately and combined
; with a vector UMAX before the final reduction.
define i8 @umaxv_v64i8(<64 x i8>* %a) #0 {
; CHECK-LABEL: umaxv_v64i8:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].b, vl64
; VBITS_GE_512-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: umaxv b[[REDUCE:[0-9]+]], [[PG]], [[OP]].b
; VBITS_GE_512-NEXT: fmov w0, s[[REDUCE]]
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].b, vl32
; VBITS_EQ_256-DAG: mov w[[NUMELTS:[0-9]+]], #32
; VBITS_EQ_256-DAG: ld1b { [[LO:z[0-9]+]].b }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1b { [[HI:z[0-9]+]].b }, [[PG]]/z, [x0, x[[NUMELTS]]]
; VBITS_EQ_256-DAG: umax [[MAX:z[0-9]+]].b, [[PG]]/m, [[HI]].b, [[LO]].b
; VBITS_EQ_256-DAG: umaxv b[[REDUCE:[0-9]+]], [[PG]], [[MAX]].b
; VBITS_EQ_256-NEXT: fmov w0, s[[REDUCE]]
; VBITS_EQ_256-NEXT: ret
  %op = load <64 x i8>, <64 x i8>* %a
  %res = call i8 @llvm.vector.reduce.umax.v64i8(<64 x i8> %op)
  ret i8 %res
}

; 1024 bits of data: only checked when the minimum SVE width is >= 1024 bits.
define i8 @umaxv_v128i8(<128 x i8>* %a) #0 {
; CHECK-LABEL: umaxv_v128i8:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].b, vl128
; VBITS_GE_1024-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: umaxv b[[REDUCE:[0-9]+]], [[PG]], [[OP]].b
; VBITS_GE_1024-NEXT: fmov w0, s[[REDUCE]]
; VBITS_GE_1024-NEXT: ret
  %op = load <128 x i8>, <128 x i8>* %a
  %res = call i8 @llvm.vector.reduce.umax.v128i8(<128 x i8> %op)
  ret i8 %res
}

; 2048 bits of data: only checked when the minimum SVE width is 2048 bits.
define i8 @umaxv_v256i8(<256 x i8>* %a) #0 {
; CHECK-LABEL: umaxv_v256i8:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].b, vl256
; VBITS_GE_2048-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: umaxv b[[REDUCE:[0-9]+]], [[PG]], [[OP]].b
; VBITS_GE_2048-NEXT: fmov w0, s[[REDUCE]]
; VBITS_GE_2048-NEXT: ret
  %op = load <256 x i8>, <256 x i8>* %a
  %res = call i8 @llvm.vector.reduce.umax.v256i8(<256 x i8> %op)
  ret i8 %res
}

; Don't use SVE for 64-bit vectors.
define i16 @umaxv_v4i16(<4 x i16> %a) #0 {
; CHECK-LABEL: umaxv_v4i16:
; CHECK: umaxv h0, v0.4h
; CHECK: ret
  %res = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> %a)
  ret i16 %res
}

; Don't use SVE for 128-bit vectors.
define i16 @umaxv_v8i16(<8 x i16> %a) #0 {
; CHECK-LABEL: umaxv_v8i16:
; CHECK: umaxv h0, v0.8h
; CHECK: ret
  %res = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> %a)
  ret i16 %res
}

; 256 bits of data: covered by the common CHECK prefix, which is enabled on
; every RUN line from 256 bits upwards.
define i16 @umaxv_v16i16(<16 x i16>* %a) #0 {
; CHECK-LABEL: umaxv_v16i16:
; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
; CHECK-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; CHECK-NEXT: umaxv h[[REDUCE:[0-9]+]], [[PG]], [[OP]].h
; CHECK-NEXT: fmov w0, s[[REDUCE]]
; CHECK-NEXT: ret
  %op = load <16 x i16>, <16 x i16>* %a
  %res = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> %op)
  ret i16 %res
}

; 512 bits of data: one predicated load when the minimum SVE width is >= 512
; bits; at exactly 256 bits the two halves are loaded separately and combined
; with a vector UMAX before the final reduction.
define i16 @umaxv_v32i16(<32 x i16>* %a) #0 {
; CHECK-LABEL: umaxv_v32i16:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
; VBITS_GE_512-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: umaxv h[[REDUCE:[0-9]+]], [[PG]], [[OP]].h
; VBITS_GE_512-NEXT: fmov w0, s[[REDUCE]]
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16
; VBITS_EQ_256-DAG: ld1h { [[LO:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1h { [[HI:z[0-9]+]].h }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #1]
; VBITS_EQ_256-DAG: umax [[MAX:z[0-9]+]].h, [[PG]]/m, [[HI]].h, [[LO]].h
; VBITS_EQ_256-DAG: umaxv h[[REDUCE:[0-9]+]], [[PG]], [[MAX]].h
; VBITS_EQ_256-NEXT: fmov w0, s[[REDUCE]]
; VBITS_EQ_256-NEXT: ret
  %op = load <32 x i16>, <32 x i16>* %a
  %res = call i16 @llvm.vector.reduce.umax.v32i16(<32 x i16> %op)
  ret i16 %res
}

; 1024 bits of data: only checked when the minimum SVE width is >= 1024 bits.
define i16 @umaxv_v64i16(<64 x i16>* %a) #0 {
; CHECK-LABEL: umaxv_v64i16:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
; VBITS_GE_1024-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: umaxv h[[REDUCE:[0-9]+]], [[PG]], [[OP]].h
; VBITS_GE_1024-NEXT: fmov w0, s[[REDUCE]]
; VBITS_GE_1024-NEXT: ret
  %op = load <64 x i16>, <64 x i16>* %a
  %res = call i16 @llvm.vector.reduce.umax.v64i16(<64 x i16> %op)
  ret i16 %res
}

; 2048 bits of data: only checked when the minimum SVE width is 2048 bits.
define i16 @umaxv_v128i16(<128 x i16>* %a) #0 {
; CHECK-LABEL: umaxv_v128i16:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
; VBITS_GE_2048-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: umaxv h[[REDUCE:[0-9]+]], [[PG]], [[OP]].h
; VBITS_GE_2048-NEXT: fmov w0, s[[REDUCE]]
; VBITS_GE_2048-NEXT: ret
  %op = load <128 x i16>, <128 x i16>* %a
  %res = call i16 @llvm.vector.reduce.umax.v128i16(<128 x i16> %op)
  ret i16 %res
}

; Don't use SVE for 64-bit vectors.
define i32 @umaxv_v2i32(<2 x i32> %a) #0 {
; CHECK-LABEL: umaxv_v2i32:
; CHECK: umaxp v0.2s, v0.2s
; CHECK: ret
  %res = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> %a)
  ret i32 %res
}

; Don't use SVE for 128-bit vectors.
define i32 @umaxv_v4i32(<4 x i32> %a) #0 {
; CHECK-LABEL: umaxv_v4i32:
; CHECK: umaxv s0, v0.4s
; CHECK: ret
  %res = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %a)
  ret i32 %res
}

; 256 bits of data: covered by the common CHECK prefix, which is enabled on
; every RUN line from 256 bits upwards.
define i32 @umaxv_v8i32(<8 x i32>* %a) #0 {
; CHECK-LABEL: umaxv_v8i32:
; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
; CHECK-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; CHECK-NEXT: umaxv [[REDUCE:s[0-9]+]], [[PG]], [[OP]].s
; CHECK-NEXT: fmov w0, [[REDUCE]]
; CHECK-NEXT: ret
  %op = load <8 x i32>, <8 x i32>* %a
  %res = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> %op)
  ret i32 %res
}

; 512 bits of data: one predicated load when the minimum SVE width is >= 512
; bits; at exactly 256 bits the two halves are loaded separately and combined
; with a vector UMAX before the final reduction.
define i32 @umaxv_v16i32(<16 x i32>* %a) #0 {
; CHECK-LABEL: umaxv_v16i32:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
; VBITS_GE_512-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: umaxv [[REDUCE:s[0-9]+]], [[PG]], [[OP]].s
; VBITS_GE_512-NEXT: fmov w0, [[REDUCE]]
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
; VBITS_EQ_256-DAG: ld1w { [[LO:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1w { [[HI:z[0-9]+]].s }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #2]
; VBITS_EQ_256-DAG: umax [[MAX:z[0-9]+]].s, [[PG]]/m, [[HI]].s, [[LO]].s
; VBITS_EQ_256-DAG: umaxv [[REDUCE:s[0-9]+]], [[PG]], [[MAX]].s
; VBITS_EQ_256-NEXT: fmov w0, [[REDUCE]]
; VBITS_EQ_256-NEXT: ret
  %op = load <16 x i32>, <16 x i32>* %a
  %res = call i32 @llvm.vector.reduce.umax.v16i32(<16 x i32> %op)
  ret i32 %res
}

; 1024 bits of data: only checked when the minimum SVE width is >= 1024 bits.
define i32 @umaxv_v32i32(<32 x i32>* %a) #0 {
; CHECK-LABEL: umaxv_v32i32:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
; VBITS_GE_1024-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: umaxv [[REDUCE:s[0-9]+]], [[PG]], [[OP]].s
; VBITS_GE_1024-NEXT: fmov w0, [[REDUCE]]
; VBITS_GE_1024-NEXT: ret
  %op = load <32 x i32>, <32 x i32>* %a
  %res = call i32 @llvm.vector.reduce.umax.v32i32(<32 x i32> %op)
  ret i32 %res
}

; 2048 bits of data: only checked when the minimum SVE width is 2048 bits.
define i32 @umaxv_v64i32(<64 x i32>* %a) #0 {
; CHECK-LABEL: umaxv_v64i32:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
; VBITS_GE_2048-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: umaxv [[REDUCE:s[0-9]+]], [[PG]], [[OP]].s
; VBITS_GE_2048-NEXT: fmov w0, [[REDUCE]]
; VBITS_GE_2048-NEXT: ret
  %op = load <64 x i32>, <64 x i32>* %a
  %res = call i32 @llvm.vector.reduce.umax.v64i32(<64 x i32> %op)
  ret i32 %res
}

; Nothing to do for single element vectors.
define i64 @umaxv_v1i64(<1 x i64> %a) #0 {
; CHECK-LABEL: umaxv_v1i64:
; CHECK: fmov x0, d0
; CHECK: ret
  %res = call i64 @llvm.vector.reduce.umax.v1i64(<1 x i64> %a)
  ret i64 %res
}

; No NEON 64-bit vector UMAXV support. Use SVE.
; The operand is passed by value, so the reduction is expected to read z0
; directly with no load in between.
define i64 @umaxv_v2i64(<2 x i64> %a) #0 {
; CHECK-LABEL: umaxv_v2i64:
; CHECK: ptrue [[PG:p[0-9]+]].d, vl2
; CHECK-NEXT: umaxv [[REDUCE:d[0-9]+]], [[PG]], z0.d
; CHECK-NEXT: fmov x0, [[REDUCE]]
; CHECK-NEXT: ret
  %res = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> %a)
  ret i64 %res
}

; 256 bits of data: covered by the common CHECK prefix, which is enabled on
; every RUN line from 256 bits upwards.
define i64 @umaxv_v4i64(<4 x i64>* %a) #0 {
; CHECK-LABEL: umaxv_v4i64:
; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
; CHECK-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; CHECK-NEXT: umaxv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
; CHECK-NEXT: fmov x0, [[REDUCE]]
; CHECK-NEXT: ret
  %op = load <4 x i64>, <4 x i64>* %a
  %res = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> %op)
  ret i64 %res
}

; 512 bits of data: one predicated load when the minimum SVE width is >= 512
; bits; at exactly 256 bits the two halves are loaded separately and combined
; with a vector UMAX before the final reduction.
define i64 @umaxv_v8i64(<8 x i64>* %a) #0 {
; CHECK-LABEL: umaxv_v8i64:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
; VBITS_GE_512-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: umaxv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
; VBITS_GE_512-NEXT: fmov x0, [[REDUCE]]
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
; VBITS_EQ_256-DAG: ld1d { [[LO:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1d { [[HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
; VBITS_EQ_256-DAG: umax [[MAX:z[0-9]+]].d, [[PG]]/m, [[HI]].d, [[LO]].d
; VBITS_EQ_256-DAG: umaxv [[REDUCE:d[0-9]+]], [[PG]], [[MAX]].d
; VBITS_EQ_256-NEXT: fmov x0, [[REDUCE]]
; VBITS_EQ_256-NEXT: ret
  %op = load <8 x i64>, <8 x i64>* %a
  %res = call i64 @llvm.vector.reduce.umax.v8i64(<8 x i64> %op)
  ret i64 %res
}

; 1024 bits of data: only checked when the minimum SVE width is >= 1024 bits.
define i64 @umaxv_v16i64(<16 x i64>* %a) #0 {
; CHECK-LABEL: umaxv_v16i64:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
; VBITS_GE_1024-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: umaxv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
; VBITS_GE_1024-NEXT: fmov x0, [[REDUCE]]
; VBITS_GE_1024-NEXT: ret
  %op = load <16 x i64>, <16 x i64>* %a
  %res = call i64 @llvm.vector.reduce.umax.v16i64(<16 x i64> %op)
  ret i64 %res
}

; 2048 bits of data: only checked when the minimum SVE width is 2048 bits.
define i64 @umaxv_v32i64(<32 x i64>* %a) #0 {
; CHECK-LABEL: umaxv_v32i64:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
; VBITS_GE_2048-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: umaxv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
; VBITS_GE_2048-NEXT: fmov x0, [[REDUCE]]
; VBITS_GE_2048-NEXT: ret
  %op = load <32 x i64>, <32 x i64>* %a
  %res = call i64 @llvm.vector.reduce.umax.v32i64(<32 x i64> %op)
  ret i64 %res
}

;
; UMINV
;

; Don't use SVE for 64-bit vectors.
define i8 @uminv_v8i8(<8 x i8> %a) #0 {
; CHECK-LABEL: uminv_v8i8:
; CHECK: uminv b0, v0.8b
; CHECK: ret
  %res = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> %a)
  ret i8 %res
}

; Don't use SVE for 128-bit vectors.
define i8 @uminv_v16i8(<16 x i8> %a) #0 {
; CHECK-LABEL: uminv_v16i8:
; CHECK: uminv b0, v0.16b
; CHECK: ret
  %res = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> %a)
  ret i8 %res
}

; 256 bits of data: covered by the common CHECK prefix, which is enabled on
; every RUN line from 256 bits upwards.
define i8 @uminv_v32i8(<32 x i8>* %a) #0 {
; CHECK-LABEL: uminv_v32i8:
; CHECK: ptrue [[PG:p[0-9]+]].b, vl32
; CHECK-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
; CHECK-NEXT: uminv b[[REDUCE:[0-9]+]], [[PG]], [[OP]].b
; CHECK-NEXT: fmov w0, s[[REDUCE]]
; CHECK-NEXT: ret
  %op = load <32 x i8>, <32 x i8>* %a
  %res = call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> %op)
  ret i8 %res
}

; 512 bits of data: one predicated load when the minimum SVE width is >= 512
; bits; at exactly 256 bits the two halves are loaded separately and combined
; with a vector UMIN before the final reduction.
define i8 @uminv_v64i8(<64 x i8>* %a) #0 {
; CHECK-LABEL: uminv_v64i8:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].b, vl64
; VBITS_GE_512-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: uminv b[[REDUCE:[0-9]+]], [[PG]], [[OP]].b
; VBITS_GE_512-NEXT: fmov w0, s[[REDUCE]]
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].b, vl32
; VBITS_EQ_256-DAG: mov w[[NUMELTS:[0-9]+]], #32
; VBITS_EQ_256-DAG: ld1b { [[LO:z[0-9]+]].b }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1b { [[HI:z[0-9]+]].b }, [[PG]]/z, [x0, x[[NUMELTS]]]
; VBITS_EQ_256-DAG: umin [[MIN:z[0-9]+]].b, [[PG]]/m, [[HI]].b, [[LO]].b
; VBITS_EQ_256-DAG: uminv b[[REDUCE:[0-9]+]], [[PG]], [[MIN]].b
; VBITS_EQ_256-NEXT: fmov w0, s[[REDUCE]]
; VBITS_EQ_256-NEXT: ret
  %op = load <64 x i8>, <64 x i8>* %a
  %res = call i8 @llvm.vector.reduce.umin.v64i8(<64 x i8> %op)
  ret i8 %res
}

; 1024 bits of data: only checked when the minimum SVE width is >= 1024 bits.
define i8 @uminv_v128i8(<128 x i8>* %a) #0 {
; CHECK-LABEL: uminv_v128i8:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].b, vl128
; VBITS_GE_1024-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: uminv b[[REDUCE:[0-9]+]], [[PG]], [[OP]].b
; VBITS_GE_1024-NEXT: fmov w0, s[[REDUCE]]
; VBITS_GE_1024-NEXT: ret
  %op = load <128 x i8>, <128 x i8>* %a
  %res = call i8 @llvm.vector.reduce.umin.v128i8(<128 x i8> %op)
  ret i8 %res
}

; 2048 bits of data: only checked when the minimum SVE width is 2048 bits.
define i8 @uminv_v256i8(<256 x i8>* %a) #0 {
; CHECK-LABEL: uminv_v256i8:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].b, vl256
; VBITS_GE_2048-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: uminv b[[REDUCE:[0-9]+]], [[PG]], [[OP]].b
; VBITS_GE_2048-NEXT: fmov w0, s[[REDUCE]]
; VBITS_GE_2048-NEXT: ret
  %op = load <256 x i8>, <256 x i8>* %a
  %res = call i8 @llvm.vector.reduce.umin.v256i8(<256 x i8> %op)
  ret i8 %res
}

; Don't use SVE for 64-bit vectors.
define i16 @uminv_v4i16(<4 x i16> %a) #0 {
; CHECK-LABEL: uminv_v4i16:
; CHECK: uminv h0, v0.4h
; CHECK: ret
  %res = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> %a)
  ret i16 %res
}

; Don't use SVE for 128-bit vectors.
define i16 @uminv_v8i16(<8 x i16> %a) #0 {
; CHECK-LABEL: uminv_v8i16:
; CHECK: uminv h0, v0.8h
; CHECK: ret
  %res = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> %a)
  ret i16 %res
}

; 256 bits of data: covered by the common CHECK prefix, which is enabled on
; every RUN line from 256 bits upwards.
define i16 @uminv_v16i16(<16 x i16>* %a) #0 {
; CHECK-LABEL: uminv_v16i16:
; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
; CHECK-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; CHECK-NEXT: uminv h[[REDUCE:[0-9]+]], [[PG]], [[OP]].h
; CHECK-NEXT: fmov w0, s[[REDUCE]]
; CHECK-NEXT: ret
  %op = load <16 x i16>, <16 x i16>* %a
  %res = call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> %op)
  ret i16 %res
}

; 512 bits of data: one predicated load when the minimum SVE width is >= 512
; bits; at exactly 256 bits the two halves are loaded separately and combined
; with a vector UMIN before the final reduction.
define i16 @uminv_v32i16(<32 x i16>* %a) #0 {
; CHECK-LABEL: uminv_v32i16:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
; VBITS_GE_512-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: uminv h[[REDUCE:[0-9]+]], [[PG]], [[OP]].h
; VBITS_GE_512-NEXT: fmov w0, s[[REDUCE]]
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16
; VBITS_EQ_256-DAG: ld1h { [[LO:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1h { [[HI:z[0-9]+]].h }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #1]
; VBITS_EQ_256-DAG: umin [[MIN:z[0-9]+]].h, [[PG]]/m, [[HI]].h, [[LO]].h
; VBITS_EQ_256-DAG: uminv h[[REDUCE:[0-9]+]], [[PG]], [[MIN]].h
; VBITS_EQ_256-NEXT: fmov w0, s[[REDUCE]]
; VBITS_EQ_256-NEXT: ret
  %op = load <32 x i16>, <32 x i16>* %a
  %res = call i16 @llvm.vector.reduce.umin.v32i16(<32 x i16> %op)
  ret i16 %res
}

; 1024 bits of data: only checked when the minimum SVE width is >= 1024 bits.
define i16 @uminv_v64i16(<64 x i16>* %a) #0 {
; CHECK-LABEL: uminv_v64i16:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
; VBITS_GE_1024-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: uminv h[[REDUCE:[0-9]+]], [[PG]], [[OP]].h
; VBITS_GE_1024-NEXT: fmov w0, s[[REDUCE]]
; VBITS_GE_1024-NEXT: ret
  %op = load <64 x i16>, <64 x i16>* %a
  %res = call i16 @llvm.vector.reduce.umin.v64i16(<64 x i16> %op)
  ret i16 %res
}

; 2048 bits of data: only checked when the minimum SVE width is 2048 bits.
define i16 @uminv_v128i16(<128 x i16>* %a) #0 {
; CHECK-LABEL: uminv_v128i16:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
; VBITS_GE_2048-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: uminv h[[REDUCE:[0-9]+]], [[PG]], [[OP]].h
; VBITS_GE_2048-NEXT: fmov w0, s[[REDUCE]]
; VBITS_GE_2048-NEXT: ret
  %op = load <128 x i16>, <128 x i16>* %a
  %res = call i16 @llvm.vector.reduce.umin.v128i16(<128 x i16> %op)
  ret i16 %res
}

; Don't use SVE for 64-bit vectors.
; The unsigned pairwise minimum is spelled out in full: a bare "minp" pattern
; is matched as a substring and would also accept the signed sminp.
define i32 @uminv_v2i32(<2 x i32> %a) #0 {
; CHECK-LABEL: uminv_v2i32:
; CHECK: uminp v0.2s, v0.2s
; CHECK: ret
  %res = call i32 @llvm.vector.reduce.umin.v2i32(<2 x i32> %a)
  ret i32 %res
}

; Don't use SVE for 128-bit vectors.
define i32 @uminv_v4i32(<4 x i32> %a) #0 {
; CHECK-LABEL: uminv_v4i32:
; CHECK: uminv s0, v0.4s
; CHECK: ret
  %res = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %a)
  ret i32 %res
}

; 256 bits of data: covered by the common CHECK prefix, which is enabled on
; every RUN line from 256 bits upwards.
define i32 @uminv_v8i32(<8 x i32>* %a) #0 {
; CHECK-LABEL: uminv_v8i32:
; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
; CHECK-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; CHECK-NEXT: uminv [[REDUCE:s[0-9]+]], [[PG]], [[OP]].s
; CHECK-NEXT: fmov w0, [[REDUCE]]
; CHECK-NEXT: ret
  %op = load <8 x i32>, <8 x i32>* %a
  %res = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> %op)
  ret i32 %res
}

; 512 bits of data: one predicated load when the minimum SVE width is >= 512
; bits; at exactly 256 bits the two halves are loaded separately and combined
; with a vector UMIN before the final reduction.
define i32 @uminv_v16i32(<16 x i32>* %a) #0 {
; CHECK-LABEL: uminv_v16i32:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
; VBITS_GE_512-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: uminv [[REDUCE:s[0-9]+]], [[PG]], [[OP]].s
; VBITS_GE_512-NEXT: fmov w0, [[REDUCE]]
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
; VBITS_EQ_256-DAG: ld1w { [[LO:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1w { [[HI:z[0-9]+]].s }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #2]
; VBITS_EQ_256-DAG: umin [[MIN:z[0-9]+]].s, [[PG]]/m, [[HI]].s, [[LO]].s
; VBITS_EQ_256-DAG: uminv [[REDUCE:s[0-9]+]], [[PG]], [[MIN]].s
; VBITS_EQ_256-NEXT: fmov w0, [[REDUCE]]
; VBITS_EQ_256-NEXT: ret
  %op = load <16 x i32>, <16 x i32>* %a
  %res = call i32 @llvm.vector.reduce.umin.v16i32(<16 x i32> %op)
  ret i32 %res
}

; 1024 bits of data: only checked when the minimum SVE width is >= 1024 bits.
define i32 @uminv_v32i32(<32 x i32>* %a) #0 {
; CHECK-LABEL: uminv_v32i32:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
; VBITS_GE_1024-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: uminv [[REDUCE:s[0-9]+]], [[PG]], [[OP]].s
; VBITS_GE_1024-NEXT: fmov w0, [[REDUCE]]
; VBITS_GE_1024-NEXT: ret
  %op = load <32 x i32>, <32 x i32>* %a
  %res = call i32 @llvm.vector.reduce.umin.v32i32(<32 x i32> %op)
  ret i32 %res
}

; 2048 bits of data: only checked when the minimum SVE width is 2048 bits.
define i32 @uminv_v64i32(<64 x i32>* %a) #0 {
; CHECK-LABEL: uminv_v64i32:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
; VBITS_GE_2048-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: uminv [[REDUCE:s[0-9]+]], [[PG]], [[OP]].s
; VBITS_GE_2048-NEXT: fmov w0, [[REDUCE]]
; VBITS_GE_2048-NEXT: ret
  %op = load <64 x i32>, <64 x i32>* %a
  %res = call i32 @llvm.vector.reduce.umin.v64i32(<64 x i32> %op)
  ret i32 %res
}

; Nothing to do for single element vectors.
define i64 @uminv_v1i64(<1 x i64> %a) #0 {
; CHECK-LABEL: uminv_v1i64:
; CHECK: fmov x0, d0
; CHECK: ret
  %res = call i64 @llvm.vector.reduce.umin.v1i64(<1 x i64> %a)
  ret i64 %res
}

; No NEON 64-bit vector UMINV support. Use SVE.
; The operand is passed by value, so the reduction is expected to read z0
; directly with no load in between.
define i64 @uminv_v2i64(<2 x i64> %a) #0 {
; CHECK-LABEL: uminv_v2i64:
; CHECK: ptrue [[PG:p[0-9]+]].d, vl2
; CHECK-NEXT: uminv [[REDUCE:d[0-9]+]], [[PG]], z0.d
; CHECK-NEXT: fmov x0, [[REDUCE]]
; CHECK-NEXT: ret
  %res = call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> %a)
  ret i64 %res
}

; 256 bits of data: covered by the common CHECK prefix, which is enabled on
; every RUN line from 256 bits upwards.
define i64 @uminv_v4i64(<4 x i64>* %a) #0 {
; CHECK-LABEL: uminv_v4i64:
; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
; CHECK-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; CHECK-NEXT: uminv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
; CHECK-NEXT: fmov x0, [[REDUCE]]
; CHECK-NEXT: ret
  %op = load <4 x i64>, <4 x i64>* %a
  %res = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> %op)
  ret i64 %res
}

; 512 bits of data: one predicated load when the minimum SVE width is >= 512
; bits; at exactly 256 bits the two halves are loaded separately and combined
; with a vector UMIN before the final reduction.
define i64 @uminv_v8i64(<8 x i64>* %a) #0 {
; CHECK-LABEL: uminv_v8i64:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
; VBITS_GE_512-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: uminv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
; VBITS_GE_512-NEXT: fmov x0, [[REDUCE]]
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
; VBITS_EQ_256-DAG: ld1d { [[LO:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1d { [[HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
; VBITS_EQ_256-DAG: umin [[MIN:z[0-9]+]].d, [[PG]]/m, [[HI]].d, [[LO]].d
; VBITS_EQ_256-DAG: uminv [[REDUCE:d[0-9]+]], [[PG]], [[MIN]].d
; VBITS_EQ_256-NEXT: fmov x0, [[REDUCE]]
; VBITS_EQ_256-NEXT: ret
  %op = load <8 x i64>, <8 x i64>* %a
  %res = call i64 @llvm.vector.reduce.umin.v8i64(<8 x i64> %op)
  ret i64 %res
}

; 1024 bits of data: only checked when the minimum SVE width is >= 1024 bits.
define i64 @uminv_v16i64(<16 x i64>* %a) #0 {
; CHECK-LABEL: uminv_v16i64:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
; VBITS_GE_1024-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: uminv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
; VBITS_GE_1024-NEXT: fmov x0, [[REDUCE]]
; VBITS_GE_1024-NEXT: ret
  %op = load <16 x i64>, <16 x i64>* %a
  %res = call i64 @llvm.vector.reduce.umin.v16i64(<16 x i64> %op)
  ret i64 %res
}

; 2048 bits of data: only checked when the minimum SVE width is 2048 bits.
define i64 @uminv_v32i64(<32 x i64>* %a) #0 {
; CHECK-LABEL: uminv_v32i64:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
; VBITS_GE_2048-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: uminv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
; VBITS_GE_2048-NEXT: fmov x0, [[REDUCE]]
; VBITS_GE_2048-NEXT: ret
  %op = load <32 x i64>, <32 x i64>* %a
  %res = call i64 @llvm.vector.reduce.umin.v32i64(<32 x i64> %op)
  ret i64 %res
}

; Attribute group #0, referenced by the test functions, enables SVE.
attributes #0 = { "target-features"="+sve" }

; Integer reduction intrinsics used by the tests, grouped by operation
; (add, smax, smin, umax, umin) and then by element type.
declare i8 @llvm.vector.reduce.add.v8i8(<8 x i8>)
declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>)
declare i8 @llvm.vector.reduce.add.v32i8(<32 x i8>)
declare i8 @llvm.vector.reduce.add.v64i8(<64 x i8>)
declare i8 @llvm.vector.reduce.add.v128i8(<128 x i8>)
declare i8 @llvm.vector.reduce.add.v256i8(<256 x i8>)

declare i16 @llvm.vector.reduce.add.v4i16(<4 x i16>)
declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>)
declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>)
declare i16 @llvm.vector.reduce.add.v32i16(<32 x i16>)
declare i16 @llvm.vector.reduce.add.v64i16(<64 x i16>)
declare i16 @llvm.vector.reduce.add.v128i16(<128 x i16>)

declare i32 @llvm.vector.reduce.add.v2i32(<2 x i32>)
declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>)
declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)
declare i32 @llvm.vector.reduce.add.v32i32(<32 x i32>)
declare i32 @llvm.vector.reduce.add.v64i32(<64 x i32>)

declare i64 @llvm.vector.reduce.add.v1i64(<1 x i64>)
declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64>)
declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>)
declare i64 @llvm.vector.reduce.add.v8i64(<8 x i64>)
declare i64 @llvm.vector.reduce.add.v16i64(<16 x i64>)
declare i64 @llvm.vector.reduce.add.v32i64(<32 x i64>)

declare i8 @llvm.vector.reduce.smax.v8i8(<8 x i8>)
declare i8 @llvm.vector.reduce.smax.v16i8(<16 x i8>)
declare i8 @llvm.vector.reduce.smax.v32i8(<32 x i8>)
declare i8 @llvm.vector.reduce.smax.v64i8(<64 x i8>)
declare i8 @llvm.vector.reduce.smax.v128i8(<128 x i8>)
declare i8 @llvm.vector.reduce.smax.v256i8(<256 x i8>)

declare i16 @llvm.vector.reduce.smax.v4i16(<4 x i16>)
declare i16 @llvm.vector.reduce.smax.v8i16(<8 x i16>)
declare i16 @llvm.vector.reduce.smax.v16i16(<16 x i16>)
declare i16 @llvm.vector.reduce.smax.v32i16(<32 x i16>)
declare i16 @llvm.vector.reduce.smax.v64i16(<64 x i16>)
declare i16 @llvm.vector.reduce.smax.v128i16(<128 x i16>)

declare i32 @llvm.vector.reduce.smax.v2i32(<2 x i32>)
declare i32 @llvm.vector.reduce.smax.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.smax.v8i32(<8 x i32>)
declare i32 @llvm.vector.reduce.smax.v16i32(<16 x i32>)
declare i32 @llvm.vector.reduce.smax.v32i32(<32 x i32>)
declare i32 @llvm.vector.reduce.smax.v64i32(<64 x i32>)

declare i64 @llvm.vector.reduce.smax.v1i64(<1 x i64>)
declare i64 @llvm.vector.reduce.smax.v2i64(<2 x i64>)
declare i64 @llvm.vector.reduce.smax.v4i64(<4 x i64>)
declare i64 @llvm.vector.reduce.smax.v8i64(<8 x i64>)
declare i64 @llvm.vector.reduce.smax.v16i64(<16 x i64>)
declare i64 @llvm.vector.reduce.smax.v32i64(<32 x i64>)

declare i8 @llvm.vector.reduce.smin.v8i8(<8 x i8>)
declare i8 @llvm.vector.reduce.smin.v16i8(<16 x i8>)
declare i8 @llvm.vector.reduce.smin.v32i8(<32 x i8>)
declare i8 @llvm.vector.reduce.smin.v64i8(<64 x i8>)
declare i8 @llvm.vector.reduce.smin.v128i8(<128 x i8>)
declare i8 @llvm.vector.reduce.smin.v256i8(<256 x i8>)

declare i16 @llvm.vector.reduce.smin.v4i16(<4 x i16>)
declare i16 @llvm.vector.reduce.smin.v8i16(<8 x i16>)
declare i16 @llvm.vector.reduce.smin.v16i16(<16 x i16>)
declare i16 @llvm.vector.reduce.smin.v32i16(<32 x i16>)
declare i16 @llvm.vector.reduce.smin.v64i16(<64 x i16>)
declare i16 @llvm.vector.reduce.smin.v128i16(<128 x i16>)

declare i32 @llvm.vector.reduce.smin.v2i32(<2 x i32>)
declare i32 @llvm.vector.reduce.smin.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.smin.v8i32(<8 x i32>)
declare i32 @llvm.vector.reduce.smin.v16i32(<16 x i32>)
declare i32 @llvm.vector.reduce.smin.v32i32(<32 x i32>)
declare i32 @llvm.vector.reduce.smin.v64i32(<64 x i32>)

declare i64 @llvm.vector.reduce.smin.v1i64(<1 x i64>)
declare i64 @llvm.vector.reduce.smin.v2i64(<2 x i64>)
declare i64 @llvm.vector.reduce.smin.v4i64(<4 x i64>)
declare i64 @llvm.vector.reduce.smin.v8i64(<8 x i64>)
declare i64 @llvm.vector.reduce.smin.v16i64(<16 x i64>)
declare i64 @llvm.vector.reduce.smin.v32i64(<32 x i64>)

declare i8 @llvm.vector.reduce.umax.v8i8(<8 x i8>)
declare i8 @llvm.vector.reduce.umax.v16i8(<16 x i8>)
declare i8 @llvm.vector.reduce.umax.v32i8(<32 x i8>)
declare i8 @llvm.vector.reduce.umax.v64i8(<64 x i8>)
declare i8 @llvm.vector.reduce.umax.v128i8(<128 x i8>)
declare i8 @llvm.vector.reduce.umax.v256i8(<256 x i8>)

declare i16 @llvm.vector.reduce.umax.v4i16(<4 x i16>)
declare i16 @llvm.vector.reduce.umax.v8i16(<8 x i16>)
declare i16 @llvm.vector.reduce.umax.v16i16(<16 x i16>)
declare i16 @llvm.vector.reduce.umax.v32i16(<32 x i16>)
declare i16 @llvm.vector.reduce.umax.v64i16(<64 x i16>)
declare i16 @llvm.vector.reduce.umax.v128i16(<128 x i16>)

declare i32 @llvm.vector.reduce.umax.v2i32(<2 x i32>)
declare i32 @llvm.vector.reduce.umax.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.umax.v8i32(<8 x i32>)
declare i32 @llvm.vector.reduce.umax.v16i32(<16 x i32>)
declare i32 @llvm.vector.reduce.umax.v32i32(<32 x i32>)
declare i32 @llvm.vector.reduce.umax.v64i32(<64 x i32>)

declare i64 @llvm.vector.reduce.umax.v1i64(<1 x i64>)
declare i64 @llvm.vector.reduce.umax.v2i64(<2 x i64>)
declare i64 @llvm.vector.reduce.umax.v4i64(<4 x i64>)
declare i64 @llvm.vector.reduce.umax.v8i64(<8 x i64>)
declare i64 @llvm.vector.reduce.umax.v16i64(<16 x i64>)
declare i64 @llvm.vector.reduce.umax.v32i64(<32 x i64>)

declare i8 @llvm.vector.reduce.umin.v8i8(<8 x i8>)
declare i8 @llvm.vector.reduce.umin.v16i8(<16 x i8>)
declare i8 @llvm.vector.reduce.umin.v32i8(<32 x i8>)
declare i8 @llvm.vector.reduce.umin.v64i8(<64 x i8>)
declare i8 @llvm.vector.reduce.umin.v128i8(<128 x i8>)
declare i8 @llvm.vector.reduce.umin.v256i8(<256 x i8>)

declare i16 @llvm.vector.reduce.umin.v4i16(<4 x i16>)
declare i16 @llvm.vector.reduce.umin.v8i16(<8 x i16>)
declare i16 @llvm.vector.reduce.umin.v16i16(<16 x i16>)
declare i16 @llvm.vector.reduce.umin.v32i16(<32 x i16>)
declare i16 @llvm.vector.reduce.umin.v64i16(<64 x i16>)
declare i16 @llvm.vector.reduce.umin.v128i16(<128 x i16>)

declare i32 @llvm.vector.reduce.umin.v2i32(<2 x i32>)
declare i32 @llvm.vector.reduce.umin.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.umin.v8i32(<8 x i32>)
declare i32 @llvm.vector.reduce.umin.v16i32(<16 x i32>)
declare i32 @llvm.vector.reduce.umin.v32i32(<32 x i32>)
declare i32 @llvm.vector.reduce.umin.v64i32(<64 x i32>)

declare i64 @llvm.vector.reduce.umin.v1i64(<1 x i64>)
declare i64 @llvm.vector.reduce.umin.v2i64(<2 x i64>)
declare i64 @llvm.vector.reduce.umin.v4i64(<4 x i64>)
declare i64 @llvm.vector.reduce.umin.v8i64(<8 x i64>)
declare i64 @llvm.vector.reduce.umin.v16i64(<16 x i64>)
declare i64 @llvm.vector.reduce.umin.v32i64(<32 x i64>)