; RUN: llc -aarch64-sve-vector-bits-min=128 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=16 -check-prefix=NO_SVE
; RUN: llc -aarch64-sve-vector-bits-min=256 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_EQ_256
; RUN: llc -aarch64-sve-vector-bits-min=384 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK
; RUN: llc -aarch64-sve-vector-bits-min=512 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=640 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=768 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=896 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=1024 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1152 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1280 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1408 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1536 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1664 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1792 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1920 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=2048 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=256 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048

target triple = "aarch64-unknown-linux-gnu"

; Don't use SVE when its registers are no bigger than NEON.
; NO_SVE-NOT: ptrue

;
; FADDA
;

; No single instruction NEON support. Use SVE.
define half @fadda_v4f16(half %start, <4 x half> %a) #0 {
; CHECK-LABEL: fadda_v4f16:
; CHECK: ptrue [[PG:p[0-9]+]].h, vl4
; CHECK-NEXT: fadda h0, [[PG]], h0, z1.h
; CHECK-NEXT: ret
  %res = call half @llvm.vector.reduce.fadd.v4f16(half %start, <4 x half> %a)
  ret half %res
}

; No single instruction NEON support. Use SVE.
define half @fadda_v8f16(half %start, <8 x half> %a) #0 {
; CHECK-LABEL: fadda_v8f16:
; CHECK: ptrue [[PG:p[0-9]+]].h, vl8
; CHECK-NEXT: fadda h0, [[PG]], h0, z1.h
; CHECK-NEXT: ret
  %res = call half @llvm.vector.reduce.fadd.v8f16(half %start, <8 x half> %a)
  ret half %res
}

define half @fadda_v16f16(half %start, <16 x half>* %a) #0 {
; CHECK-LABEL: fadda_v16f16:
; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
; CHECK-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; CHECK-NEXT: fadda h0, [[PG]], h0, [[OP]].h
; CHECK-NEXT: ret
  %op = load <16 x half>, <16 x half>* %a
  %res = call half @llvm.vector.reduce.fadd.v16f16(half %start, <16 x half> %op)
  ret half %res
}

define half @fadda_v32f16(half %start, <32 x half>* %a) #0 {
; CHECK-LABEL: fadda_v32f16:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
; VBITS_GE_512-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: fadda h0, [[PG]], h0, [[OP]].h
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16
; VBITS_EQ_256-DAG: ld1h { [[LO:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1h { [[HI:z[0-9]+]].h }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #1]
; VBITS_EQ_256-NEXT: fadda h0, [[PG]], h0, [[LO]].h
; VBITS_EQ_256-NEXT: fadda h0, [[PG]], h0, [[HI]].h
; VBITS_EQ_256-NEXT: ret
  %op = load <32 x half>, <32 x half>* %a
  %res = call half @llvm.vector.reduce.fadd.v32f16(half %start, <32 x half> %op)
  ret half %res
}

define half @fadda_v64f16(half %start, <64 x half>* %a) #0 {
; CHECK-LABEL: fadda_v64f16:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
; VBITS_GE_1024-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: fadda h0, [[PG]], h0, [[OP]].h
; VBITS_GE_1024-NEXT: ret
  %op = load <64 x half>, <64 x half>* %a
  %res = call half @llvm.vector.reduce.fadd.v64f16(half %start, <64 x half> %op)
  ret half %res
}

define half @fadda_v128f16(half %start, <128 x half>* %a) #0 {
; CHECK-LABEL: fadda_v128f16:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
; VBITS_GE_2048-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: fadda h0, [[PG]], h0, [[OP]].h
; VBITS_GE_2048-NEXT: ret
  %op = load <128 x half>, <128 x half>* %a
  %res = call half @llvm.vector.reduce.fadd.v128f16(half %start, <128 x half> %op)
  ret half %res
}

; No single instruction NEON support. Use SVE.
define float @fadda_v2f32(float %start, <2 x float> %a) #0 {
; CHECK-LABEL: fadda_v2f32:
; CHECK: ptrue [[PG:p[0-9]+]].s, vl2
; CHECK-NEXT: fadda s0, [[PG]], s0, z1.s
; CHECK-NEXT: ret
  %res = call float @llvm.vector.reduce.fadd.v2f32(float %start, <2 x float> %a)
  ret float %res
}

; No single instruction NEON support. Use SVE.
define float @fadda_v4f32(float %start, <4 x float> %a) #0 {
; CHECK-LABEL: fadda_v4f32:
; CHECK: ptrue [[PG:p[0-9]+]].s, vl4
; CHECK-NEXT: fadda s0, [[PG]], s0, z1.s
; CHECK-NEXT: ret
  %res = call float @llvm.vector.reduce.fadd.v4f32(float %start, <4 x float> %a)
  ret float %res
}

define float @fadda_v8f32(float %start, <8 x float>* %a) #0 {
; CHECK-LABEL: fadda_v8f32:
; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
; CHECK-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; CHECK-NEXT: fadda s0, [[PG]], s0, [[OP]].s
; CHECK-NEXT: ret
  %op = load <8 x float>, <8 x float>* %a
  %res = call float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %op)
  ret float %res
}

define float @fadda_v16f32(float %start, <16 x float>* %a) #0 {
; CHECK-LABEL: fadda_v16f32:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
; VBITS_GE_512-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: fadda s0, [[PG]], s0, [[OP]].s
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
; VBITS_EQ_256-DAG: ld1w { [[LO:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1w { [[HI:z[0-9]+]].s }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #2]
; VBITS_EQ_256-NEXT: fadda s0, [[PG]], s0, [[LO]].s
; VBITS_EQ_256-NEXT: fadda s0, [[PG]], s0, [[HI]].s
; VBITS_EQ_256-NEXT: ret
  %op = load <16 x float>, <16 x float>* %a
  %res = call float @llvm.vector.reduce.fadd.v16f32(float %start, <16 x float> %op)
  ret float %res
}

define float @fadda_v32f32(float %start, <32 x float>* %a) #0 {
; CHECK-LABEL: fadda_v32f32:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
; VBITS_GE_1024-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: fadda s0, [[PG]], s0, [[OP]].s
; VBITS_GE_1024-NEXT: ret
  %op = load <32 x float>, <32 x float>* %a
  %res = call float @llvm.vector.reduce.fadd.v32f32(float %start, <32 x float> %op)
  ret float %res
}

define float @fadda_v64f32(float %start, <64 x float>* %a) #0 {
; CHECK-LABEL: fadda_v64f32:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
; VBITS_GE_2048-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: fadda s0, [[PG]], s0, [[OP]].s
; VBITS_GE_2048-NEXT: ret
  %op = load <64 x float>, <64 x float>* %a
  %res = call float @llvm.vector.reduce.fadd.v64f32(float %start, <64 x float> %op)
  ret float %res
}

; No single instruction NEON support. Use SVE.
define double @fadda_v1f64(double %start, <1 x double> %a) #0 {
; CHECK-LABEL: fadda_v1f64:
; CHECK: ptrue [[PG:p[0-9]+]].d, vl1
; CHECK-NEXT: fadda d0, [[PG]], d0, z1.d
; CHECK-NEXT: ret
  %res = call double @llvm.vector.reduce.fadd.v1f64(double %start, <1 x double> %a)
  ret double %res
}

; No single instruction NEON support. Use SVE.
define double @fadda_v2f64(double %start, <2 x double> %a) #0 {
; CHECK-LABEL: fadda_v2f64:
; CHECK: ptrue [[PG:p[0-9]+]].d, vl2
; CHECK-NEXT: fadda d0, [[PG]], d0, z1.d
; CHECK-NEXT: ret
  %res = call double @llvm.vector.reduce.fadd.v2f64(double %start, <2 x double> %a)
  ret double %res
}

define double @fadda_v4f64(double %start, <4 x double>* %a) #0 {
; CHECK-LABEL: fadda_v4f64:
; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
; CHECK-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; CHECK-NEXT: fadda d0, [[PG]], d0, [[OP]].d
; CHECK-NEXT: ret
  %op = load <4 x double>, <4 x double>* %a
  %res = call double @llvm.vector.reduce.fadd.v4f64(double %start, <4 x double> %op)
  ret double %res
}

define double @fadda_v8f64(double %start, <8 x double>* %a) #0 {
; CHECK-LABEL: fadda_v8f64:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
; VBITS_GE_512-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: fadda d0, [[PG]], d0, [[OP]].d
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
; VBITS_EQ_256-DAG: ld1d { [[LO:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1d { [[HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
; VBITS_EQ_256-NEXT: fadda d0, [[PG]], d0, [[LO]].d
; VBITS_EQ_256-NEXT: fadda d0, [[PG]], d0, [[HI]].d
; VBITS_EQ_256-NEXT: ret
  %op = load <8 x double>, <8 x double>* %a
  %res = call double @llvm.vector.reduce.fadd.v8f64(double %start, <8 x double> %op)
  ret double %res
}

define double @fadda_v16f64(double %start, <16 x double>* %a) #0 {
; CHECK-LABEL: fadda_v16f64:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
; VBITS_GE_1024-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: fadda d0, [[PG]], d0, [[OP]].d
; VBITS_GE_1024-NEXT: ret
  %op = load <16 x double>, <16 x double>* %a
  %res = call double @llvm.vector.reduce.fadd.v16f64(double %start, <16 x double> %op)
  ret double %res
}

define double @fadda_v32f64(double %start, <32 x double>* %a) #0 {
; CHECK-LABEL: fadda_v32f64:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
; VBITS_GE_2048-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: fadda d0, [[PG]], d0, [[OP]].d
; VBITS_GE_2048-NEXT: ret
  %op = load <32 x double>, <32 x double>* %a
  %res = call double @llvm.vector.reduce.fadd.v32f64(double %start, <32 x double> %op)
  ret double %res
}

;
; FADDV
;

; No single instruction NEON support for 4 element vectors.
define half @faddv_v4f16(half %start, <4 x half> %a) #0 {
; CHECK-LABEL: faddv_v4f16:
; CHECK: ptrue [[PG:p[0-9]+]].h, vl4
; CHECK-NEXT: faddv [[RDX:h[0-9]+]], [[PG]], z1.h
; CHECK-NEXT: fadd h0, h0, [[RDX]]
; CHECK-NEXT: ret
  %res = call fast half @llvm.vector.reduce.fadd.v4f16(half %start, <4 x half> %a)
  ret half %res
}

; No single instruction NEON support for 8 element vectors.
define half @faddv_v8f16(half %start, <8 x half> %a) #0 {
; CHECK-LABEL: faddv_v8f16:
; CHECK: ptrue [[PG:p[0-9]+]].h, vl8
; CHECK-NEXT: faddv [[RDX:h[0-9]+]], [[PG]], z1.h
; CHECK-NEXT: fadd h0, h0, [[RDX]]
; CHECK-NEXT: ret
  %res = call fast half @llvm.vector.reduce.fadd.v8f16(half %start, <8 x half> %a)
  ret half %res
}

define half @faddv_v16f16(half %start, <16 x half>* %a) #0 {
; CHECK-LABEL: faddv_v16f16:
; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
; CHECK-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; CHECK-NEXT: faddv [[RDX:h[0-9]+]], [[PG]], [[OP]].h
; CHECK-NEXT: fadd h0, h0, [[RDX]]
; CHECK-NEXT: ret
  %op = load <16 x half>, <16 x half>* %a
  %res = call fast half @llvm.vector.reduce.fadd.v16f16(half %start, <16 x half> %op)
  ret half %res
}

define half @faddv_v32f16(half %start, <32 x half>* %a) #0 {
; CHECK-LABEL: faddv_v32f16:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
; VBITS_GE_512-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: faddv [[RDX:h[0-9]+]], [[PG]], [[OP]].h
; VBITS_GE_512-NEXT: fadd h0, h0, [[RDX]]
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16
; VBITS_EQ_256-DAG: ld1h { [[LO:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1h { [[HI:z[0-9]+]].h }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #1]
; VBITS_EQ_256-DAG: fadd [[ADD:z[0-9]+]].h, [[PG]]/m, [[HI]].h, [[LO]].h
; VBITS_EQ_256-DAG: faddv [[RDX:h[0-9]+]], [[PG]], [[ADD]].h
; VBITS_EQ_256-DAG: fadd h0, h0, [[RDX]]
; VBITS_EQ_256-NEXT: ret
  %op = load <32 x half>, <32 x half>* %a
  %res = call fast half @llvm.vector.reduce.fadd.v32f16(half %start, <32 x half> %op)
  ret half %res
}

define half @faddv_v64f16(half %start, <64 x half>* %a) #0 {
; CHECK-LABEL: faddv_v64f16:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
; VBITS_GE_1024-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: faddv [[RDX:h[0-9]+]], [[PG]], [[OP]].h
; VBITS_GE_1024-NEXT: fadd h0, h0, [[RDX]]
; VBITS_GE_1024-NEXT: ret
  %op = load <64 x half>, <64 x half>* %a
  %res = call fast half @llvm.vector.reduce.fadd.v64f16(half %start, <64 x half> %op)
  ret half %res
}

define half @faddv_v128f16(half %start, <128 x half>* %a) #0 {
; CHECK-LABEL: faddv_v128f16:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
; VBITS_GE_2048-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: faddv [[RDX:h[0-9]+]], [[PG]], [[OP]].h
; VBITS_GE_2048-NEXT: fadd h0, h0, [[RDX]]
; VBITS_GE_2048-NEXT: ret
  %op = load <128 x half>, <128 x half>* %a
  %res = call fast half @llvm.vector.reduce.fadd.v128f16(half %start, <128 x half> %op)
  ret half %res
}

; Don't use SVE for 2 element vectors.
define float @faddv_v2f32(float %start, <2 x float> %a) #0 {
; CHECK-LABEL: faddv_v2f32:
; CHECK: faddp s1, v1.2s
; CHECK-NEXT: fadd s0, s0, s1
; CHECK-NEXT: ret
  %res = call fast float @llvm.vector.reduce.fadd.v2f32(float %start, <2 x float> %a)
  ret float %res
}

; No single instruction NEON support for 4 element vectors.
define float @faddv_v4f32(float %start, <4 x float> %a) #0 {
; CHECK-LABEL: faddv_v4f32:
; CHECK: ptrue [[PG:p[0-9]+]].s, vl4
; CHECK-NEXT: faddv [[RDX:s[0-9]+]], [[PG]], z1.s
; CHECK-NEXT: fadd s0, s0, [[RDX]]
; CHECK-NEXT: ret
  %res = call fast float @llvm.vector.reduce.fadd.v4f32(float %start, <4 x float> %a)
  ret float %res
}

define float @faddv_v8f32(float %start, <8 x float>* %a) #0 {
; CHECK-LABEL: faddv_v8f32:
; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
; CHECK-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; CHECK-NEXT: faddv [[RDX:s[0-9]+]], [[PG]], [[OP]].s
; CHECK-NEXT: fadd s0, s0, [[RDX]]
; CHECK-NEXT: ret
  %op = load <8 x float>, <8 x float>* %a
  %res = call fast float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %op)
  ret float %res
}

define float @faddv_v16f32(float %start, <16 x float>* %a) #0 {
; CHECK-LABEL: faddv_v16f32:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
; VBITS_GE_512-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: faddv [[RDX:s[0-9]+]], [[PG]], [[OP]].s
; VBITS_GE_512-NEXT: fadd s0, s0, [[RDX]]
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
; VBITS_EQ_256-DAG: ld1w { [[LO:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1w { [[HI:z[0-9]+]].s }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #2]
; VBITS_EQ_256-DAG: fadd [[ADD:z[0-9]+]].s, [[PG]]/m, [[HI]].s, [[LO]].s
; VBITS_EQ_256-DAG: faddv [[RDX:s[0-9]+]], [[PG]], [[ADD]].s
; VBITS_EQ_256-DAG: fadd s0, s0, [[RDX]]
; VBITS_EQ_256-NEXT: ret
  %op = load <16 x float>, <16 x float>* %a
  %res = call fast float @llvm.vector.reduce.fadd.v16f32(float %start, <16 x float> %op)
  ret float %res
}

define float @faddv_v32f32(float %start, <32 x float>* %a) #0 {
; CHECK-LABEL: faddv_v32f32:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
; VBITS_GE_1024-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: faddv [[RDX:s[0-9]+]], [[PG]], [[OP]].s
; VBITS_GE_1024-NEXT: fadd s0, s0, [[RDX]]
; VBITS_GE_1024-NEXT: ret
  %op = load <32 x float>, <32 x float>* %a
  %res = call fast float @llvm.vector.reduce.fadd.v32f32(float %start, <32 x float> %op)
  ret float %res
}

define float @faddv_v64f32(float %start, <64 x float>* %a) #0 {
; CHECK-LABEL: faddv_v64f32:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
; VBITS_GE_2048-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: faddv [[RDX:s[0-9]+]], [[PG]], [[OP]].s
; VBITS_GE_2048-NEXT: fadd s0, s0, [[RDX]]
; VBITS_GE_2048-NEXT: ret
  %op = load <64 x float>, <64 x float>* %a
  %res = call fast float @llvm.vector.reduce.fadd.v64f32(float %start, <64 x float> %op)
  ret float %res
}

; Don't use SVE for 1 element vectors.
define double @faddv_v1f64(double %start, <1 x double> %a) #0 {
; CHECK-LABEL: faddv_v1f64:
; CHECK: fadd d0, d0, d1
; CHECK-NEXT: ret
  %res = call fast double @llvm.vector.reduce.fadd.v1f64(double %start, <1 x double> %a)
  ret double %res
}

; Don't use SVE for 2 element vectors.
define double @faddv_v2f64(double %start, <2 x double> %a) #0 {
; CHECK-LABEL: faddv_v2f64:
; CHECK: faddp d1, v1.2d
; CHECK-NEXT: fadd d0, d0, d1
; CHECK-NEXT: ret
  %res = call fast double @llvm.vector.reduce.fadd.v2f64(double %start, <2 x double> %a)
  ret double %res
}

define double @faddv_v4f64(double %start, <4 x double>* %a) #0 {
; CHECK-LABEL: faddv_v4f64:
; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
; CHECK-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; CHECK-NEXT: faddv [[RDX:d[0-9]+]], [[PG]], [[OP]].d
; CHECK-NEXT: fadd d0, d0, [[RDX]]
; CHECK-NEXT: ret
  %op = load <4 x double>, <4 x double>* %a
  %res = call fast double @llvm.vector.reduce.fadd.v4f64(double %start, <4 x double> %op)
  ret double %res
}

define double @faddv_v8f64(double %start, <8 x double>* %a) #0 {
; CHECK-LABEL: faddv_v8f64:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
; VBITS_GE_512-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: faddv [[RDX:d[0-9]+]], [[PG]], [[OP]].d
; VBITS_GE_512-NEXT: fadd d0, d0, [[RDX]]
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
; VBITS_EQ_256-DAG: ld1d { [[LO:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1d { [[HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
; VBITS_EQ_256-DAG: fadd [[ADD:z[0-9]+]].d, [[PG]]/m, [[HI]].d, [[LO]].d
; VBITS_EQ_256-DAG: faddv [[RDX:d[0-9]+]], [[PG]], [[ADD]].d
; VBITS_EQ_256-DAG: fadd d0, d0, [[RDX]]
; VBITS_EQ_256-NEXT: ret
  %op = load <8 x double>, <8 x double>* %a
  %res = call fast double @llvm.vector.reduce.fadd.v8f64(double %start, <8 x double> %op)
  ret double %res
}

define double @faddv_v16f64(double %start, <16 x double>* %a) #0 {
; CHECK-LABEL: faddv_v16f64:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
; VBITS_GE_1024-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: faddv [[RDX:d[0-9]+]], [[PG]], [[OP]].d
; VBITS_GE_1024-NEXT: fadd d0, d0, [[RDX]]
; VBITS_GE_1024-NEXT: ret
  %op = load <16 x double>, <16 x double>* %a
  %res = call fast double @llvm.vector.reduce.fadd.v16f64(double %start, <16 x double> %op)
  ret double %res
}

define double @faddv_v32f64(double %start, <32 x double>* %a) #0 {
; CHECK-LABEL: faddv_v32f64:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
; VBITS_GE_2048-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: faddv [[RDX:d[0-9]+]], [[PG]], [[OP]].d
; VBITS_GE_2048-NEXT: fadd d0, d0, [[RDX]]
; VBITS_GE_2048-NEXT: ret
  %op = load <32 x double>, <32 x double>* %a
  %res = call fast double @llvm.vector.reduce.fadd.v32f64(double %start, <32 x double> %op)
  ret double %res
}

;
; FMAXV
;

; No NEON 16-bit vector FMAXNMV support. Use SVE.
define half @fmaxv_v4f16(<4 x half> %a) #0 {
; CHECK-LABEL: fmaxv_v4f16:
; CHECK: fmaxnmv h0, v0.4h
; CHECK-NEXT: ret
  %res = call half @llvm.vector.reduce.fmax.v4f16(<4 x half> %a)
  ret half %res
}

; No NEON 16-bit vector FMAXNMV support. Use SVE.
define half @fmaxv_v8f16(<8 x half> %a) #0 {
; CHECK-LABEL: fmaxv_v8f16:
; CHECK: fmaxnmv h0, v0.8h
; CHECK-NEXT: ret
  %res = call half @llvm.vector.reduce.fmax.v8f16(<8 x half> %a)
  ret half %res
}

define half @fmaxv_v16f16(<16 x half>* %a) #0 {
; CHECK-LABEL: fmaxv_v16f16:
; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
; CHECK-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; CHECK-NEXT: fmaxnmv h0, [[PG]], [[OP]].h
; CHECK-NEXT: ret
  %op = load <16 x half>, <16 x half>* %a
  %res = call half @llvm.vector.reduce.fmax.v16f16(<16 x half> %op)
  ret half %res
}

define half @fmaxv_v32f16(<32 x half>* %a) #0 {
; CHECK-LABEL: fmaxv_v32f16:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
; VBITS_GE_512-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: fmaxnmv h0, [[PG]], [[OP]].h
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16
; VBITS_EQ_256-DAG: ld1h { [[LO:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1h { [[HI:z[0-9]+]].h }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #1]
; VBITS_EQ_256-DAG: fmaxnm [[MAX:z[0-9]+]].h, [[PG]]/m, [[HI]].h, [[LO]].h
; VBITS_EQ_256-DAG: fmaxnmv h0, [[PG]], [[MAX]].h
; VBITS_EQ_256-NEXT: ret
  %op = load <32 x half>, <32 x half>* %a
  %res = call half @llvm.vector.reduce.fmax.v32f16(<32 x half> %op)
  ret half %res
}

define half @fmaxv_v64f16(<64 x half>* %a) #0 {
; CHECK-LABEL: fmaxv_v64f16:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
; VBITS_GE_1024-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: fmaxnmv h0, [[PG]], [[OP]].h
; VBITS_GE_1024-NEXT: ret
  %op = load <64 x half>, <64 x half>* %a
  %res = call half @llvm.vector.reduce.fmax.v64f16(<64 x half> %op)
  ret half %res
}

define half @fmaxv_v128f16(<128 x half>* %a) #0 {
; CHECK-LABEL: fmaxv_v128f16:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
; VBITS_GE_2048-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: fmaxnmv h0, [[PG]], [[OP]].h
; VBITS_GE_2048-NEXT: ret
  %op = load <128 x half>, <128 x half>* %a
  %res = call half @llvm.vector.reduce.fmax.v128f16(<128 x half> %op)
  ret half %res
}

; Don't use SVE for 64-bit f32 vectors.
define float @fmaxv_v2f32(<2 x float> %a) #0 {
; CHECK-LABEL: fmaxv_v2f32:
; CHECK: fmaxnmp s0, v0.2s
; CHECK: ret
  %res = call float @llvm.vector.reduce.fmax.v2f32(<2 x float> %a)
  ret float %res
}

; Don't use SVE for 128-bit f32 vectors.
define float @fmaxv_v4f32(<4 x float> %a) #0 {
; CHECK-LABEL: fmaxv_v4f32:
; CHECK: fmaxnmv s0, v0.4s
; CHECK: ret
  %res = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> %a)
  ret float %res
}

define float @fmaxv_v8f32(<8 x float>* %a) #0 {
; CHECK-LABEL: fmaxv_v8f32:
; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
; CHECK-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; CHECK-NEXT: fmaxnmv s0, [[PG]], [[OP]].s
; CHECK-NEXT: ret
  %op = load <8 x float>, <8 x float>* %a
  %res = call float @llvm.vector.reduce.fmax.v8f32(<8 x float> %op)
  ret float %res
}

define float @fmaxv_v16f32(<16 x float>* %a) #0 {
; CHECK-LABEL: fmaxv_v16f32:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
; VBITS_GE_512-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: fmaxnmv s0, [[PG]], [[OP]].s
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
; VBITS_EQ_256-DAG: ld1w { [[LO:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1w { [[HI:z[0-9]+]].s }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #2]
; VBITS_EQ_256-DAG: fmaxnm [[MAX:z[0-9]+]].s, [[PG]]/m, [[HI]].s, [[LO]].s
; VBITS_EQ_256-DAG: fmaxnmv s0, [[PG]], [[MAX]].s
; VBITS_EQ_256-NEXT: ret
  %op = load <16 x float>, <16 x float>* %a
  %res = call float @llvm.vector.reduce.fmax.v16f32(<16 x float> %op)
  ret float %res
}

define float @fmaxv_v32f32(<32 x float>* %a) #0 {
; CHECK-LABEL: fmaxv_v32f32:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
; VBITS_GE_1024-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: fmaxnmv s0, [[PG]], [[OP]].s
; VBITS_GE_1024-NEXT: ret
  %op = load <32 x float>, <32 x float>* %a
  %res = call float @llvm.vector.reduce.fmax.v32f32(<32 x float> %op)
  ret float %res
}

define float @fmaxv_v64f32(<64 x float>* %a) #0 {
; CHECK-LABEL: fmaxv_v64f32:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
; VBITS_GE_2048-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: fmaxnmv s0, [[PG]], [[OP]].s
; VBITS_GE_2048-NEXT: ret
  %op = load <64 x float>, <64 x float>* %a
  %res = call float @llvm.vector.reduce.fmax.v64f32(<64 x float> %op)
  ret float %res
}

; Nothing to do for single element vectors.
define double @fmaxv_v1f64(<1 x double> %a) #0 {
; CHECK-LABEL: fmaxv_v1f64:
; CHECK-NOT: fmax
; CHECK: ret
  %res = call double @llvm.vector.reduce.fmax.v1f64(<1 x double> %a)
  ret double %res
}

; Don't use SVE for 128-bit f64 vectors.
define double @fmaxv_v2f64(<2 x double> %a) #0 {
; CHECK-LABEL: fmaxv_v2f64:
; CHECK: fmaxnmp d0, v0.2d
; CHECK-NEXT: ret
  %res = call double @llvm.vector.reduce.fmax.v2f64(<2 x double> %a)
  ret double %res
}

define double @fmaxv_v4f64(<4 x double>* %a) #0 {
; CHECK-LABEL: fmaxv_v4f64:
; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
; CHECK-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; CHECK-NEXT: fmaxnmv d0, [[PG]], [[OP]].d
; CHECK-NEXT: ret
  %op = load <4 x double>, <4 x double>* %a
  %res = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> %op)
  ret double %res
}

define double @fmaxv_v8f64(<8 x double>* %a) #0 {
; CHECK-LABEL: fmaxv_v8f64:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
; VBITS_GE_512-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: fmaxnmv d0, [[PG]], [[OP]].d
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
; VBITS_EQ_256-DAG: ld1d { [[LO:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1d { [[HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
; VBITS_EQ_256-DAG: fmaxnm [[MAX:z[0-9]+]].d, [[PG]]/m, [[HI]].d, [[LO]].d
; VBITS_EQ_256-DAG: fmaxnmv d0, [[PG]], [[MAX]].d
; VBITS_EQ_256-NEXT: ret
  %op = load <8 x double>, <8 x double>* %a
  %res = call double @llvm.vector.reduce.fmax.v8f64(<8 x double> %op)
  ret double %res
}

define double @fmaxv_v16f64(<16 x double>* %a) #0 {
; CHECK-LABEL: fmaxv_v16f64:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
; VBITS_GE_1024-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: fmaxnmv d0, [[PG]], [[OP]].d
; VBITS_GE_1024-NEXT: ret
  %op = load <16 x double>, <16 x double>* %a
  %res = call double @llvm.vector.reduce.fmax.v16f64(<16 x double> %op)
  ret double %res
}

define double @fmaxv_v32f64(<32 x double>* %a) #0 {
; CHECK-LABEL: fmaxv_v32f64:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
; VBITS_GE_2048-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: fmaxnmv d0, [[PG]], [[OP]].d
; VBITS_GE_2048-NEXT: ret
  %op = load <32 x double>, <32 x double>* %a
  %res = call double @llvm.vector.reduce.fmax.v32f64(<32 x double> %op)
  ret double %res
}

;
; FMINV
;

; No NEON 16-bit vector FMINNMV support. Use SVE.
define half @fminv_v4f16(<4 x half> %a) #0 {
; CHECK-LABEL: fminv_v4f16:
; CHECK: fminnmv h0, v0.4h
; CHECK-NEXT: ret
  %res = call half @llvm.vector.reduce.fmin.v4f16(<4 x half> %a)
  ret half %res
}

; No NEON 16-bit vector FMINNMV support. Use SVE.
define half @fminv_v8f16(<8 x half> %a) #0 {
; CHECK-LABEL: fminv_v8f16:
; CHECK: fminnmv h0, v0.8h
; CHECK-NEXT: ret
  %res = call half @llvm.vector.reduce.fmin.v8f16(<8 x half> %a)
  ret half %res
}

define half @fminv_v16f16(<16 x half>* %a) #0 {
; CHECK-LABEL: fminv_v16f16:
; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
; CHECK-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; CHECK-NEXT: fminnmv h0, [[PG]], [[OP]].h
; CHECK-NEXT: ret
  %op = load <16 x half>, <16 x half>* %a
  %res = call half @llvm.vector.reduce.fmin.v16f16(<16 x half> %op)
  ret half %res
}

define half @fminv_v32f16(<32 x half>* %a) #0 {
; CHECK-LABEL: fminv_v32f16:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
; VBITS_GE_512-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: fminnmv h0, [[PG]], [[OP]].h
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16
; VBITS_EQ_256-DAG: ld1h { [[LO:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1h { [[HI:z[0-9]+]].h }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #1]
; VBITS_EQ_256-DAG: fminnm [[MIN:z[0-9]+]].h, [[PG]]/m, [[HI]].h, [[LO]].h
; VBITS_EQ_256-DAG: fminnmv h0, [[PG]], [[MIN]].h
; VBITS_EQ_256-NEXT: ret
  %op = load <32 x half>, <32 x half>* %a
  %res = call half @llvm.vector.reduce.fmin.v32f16(<32 x half> %op)
  ret half %res
}

define half @fminv_v64f16(<64 x half>* %a) #0 {
; CHECK-LABEL: fminv_v64f16:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
; VBITS_GE_1024-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: fminnmv h0, [[PG]], [[OP]].h
; VBITS_GE_1024-NEXT: ret
  %op = load <64 x half>, <64 x half>* %a
  %res = call half @llvm.vector.reduce.fmin.v64f16(<64 x half> %op)
  ret half %res
}

define half @fminv_v128f16(<128 x half>* %a) #0 {
; CHECK-LABEL: fminv_v128f16:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
; VBITS_GE_2048-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: fminnmv h0, [[PG]], [[OP]].h
; VBITS_GE_2048-NEXT: ret
  %op = load <128 x half>, <128 x half>* %a
  %res = call half @llvm.vector.reduce.fmin.v128f16(<128 x half> %op)
  ret half %res
}

; Don't use SVE for 64-bit f32 vectors.
define float @fminv_v2f32(<2 x float> %a) #0 {
; CHECK-LABEL: fminv_v2f32:
; CHECK: fminnmp s0, v0.2s
; CHECK: ret
  %res = call float @llvm.vector.reduce.fmin.v2f32(<2 x float> %a)
  ret float %res
}

; Don't use SVE for 128-bit f32 vectors.
define float @fminv_v4f32(<4 x float> %a) #0 {
; CHECK-LABEL: fminv_v4f32:
; CHECK: fminnmv s0, v0.4s
; CHECK: ret
  %res = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> %a)
  ret float %res
}

define float @fminv_v8f32(<8 x float>* %a) #0 {
; CHECK-LABEL: fminv_v8f32:
; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
; CHECK-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; CHECK-NEXT: fminnmv s0, [[PG]], [[OP]].s
; CHECK-NEXT: ret
  %op = load <8 x float>, <8 x float>* %a
  %res = call float @llvm.vector.reduce.fmin.v8f32(<8 x float> %op)
  ret float %res
}

define float @fminv_v16f32(<16 x float>* %a) #0 {
; CHECK-LABEL: fminv_v16f32:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
; VBITS_GE_512-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: fminnmv s0, [[PG]], [[OP]].s
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
; VBITS_EQ_256-DAG: ld1w { [[LO:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1w { [[HI:z[0-9]+]].s }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #2]
; VBITS_EQ_256-DAG: fminnm [[MIN:z[0-9]+]].s, [[PG]]/m, [[HI]].s, [[LO]].s
; VBITS_EQ_256-DAG: fminnmv s0, [[PG]], [[MIN]].s
; VBITS_EQ_256-NEXT: ret
  %op = load <16 x float>, <16 x float>* %a
  %res = call float @llvm.vector.reduce.fmin.v16f32(<16 x float> %op)
  ret float %res
}

define float @fminv_v32f32(<32 x float>* %a) #0 {
; CHECK-LABEL: fminv_v32f32:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
; VBITS_GE_1024-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: fminnmv s0, [[PG]], [[OP]].s
; VBITS_GE_1024-NEXT: ret
  %op = load <32 x float>, <32 x float>* %a
  %res = call float @llvm.vector.reduce.fmin.v32f32(<32 x float> %op)
  ret float %res
}

define float @fminv_v64f32(<64 x float>* %a) #0 {
; CHECK-LABEL: fminv_v64f32:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
; VBITS_GE_2048-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: fminnmv s0, [[PG]], [[OP]].s
; VBITS_GE_2048-NEXT: ret
  %op = load <64 x float>, <64 x float>* %a
  %res = call float @llvm.vector.reduce.fmin.v64f32(<64 x float> %op)
  ret float %res
}

; Nothing to do for single element vectors.
define double @fminv_v1f64(<1 x double> %a) #0 {
; CHECK-LABEL: fminv_v1f64:
; CHECK-NOT: fmin
; CHECK: ret
  %res = call double @llvm.vector.reduce.fmin.v1f64(<1 x double> %a)
  ret double %res
}

; Don't use SVE for 128-bit f64 vectors.
define double @fminv_v2f64(<2 x double> %a) #0 {
; CHECK-LABEL: fminv_v2f64:
; CHECK: fminnmp d0, v0.2d
; CHECK-NEXT: ret
  %res = call double @llvm.vector.reduce.fmin.v2f64(<2 x double> %a)
  ret double %res
}

define double @fminv_v4f64(<4 x double>* %a) #0 {
; CHECK-LABEL: fminv_v4f64:
; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
; CHECK-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; CHECK-NEXT: fminnmv d0, [[PG]], [[OP]].d
; CHECK-NEXT: ret
  %op = load <4 x double>, <4 x double>* %a
  %res = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> %op)
  ret double %res
}

define double @fminv_v8f64(<8 x double>* %a) #0 {
; CHECK-LABEL: fminv_v8f64:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
; VBITS_GE_512-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: fminnmv d0, [[PG]], [[OP]].d
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
; VBITS_EQ_256-DAG: ld1d { [[LO:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1d { [[HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
; VBITS_EQ_256-DAG: fminnm [[MIN:z[0-9]+]].d, [[PG]]/m, [[HI]].d, [[LO]].d
; VBITS_EQ_256-DAG: fminnmv d0, [[PG]], [[MIN]].d
; VBITS_EQ_256-NEXT: ret
  %op = load <8 x double>, <8 x double>* %a
  %res = call double @llvm.vector.reduce.fmin.v8f64(<8 x double> %op)
  ret double %res
}

define double @fminv_v16f64(<16 x double>* %a) #0 {
; CHECK-LABEL: fminv_v16f64:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
; VBITS_GE_1024-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: fminnmv d0, [[PG]], [[OP]].d
; VBITS_GE_1024-NEXT: ret
  %op = load <16 x double>, <16 x double>* %a
  %res = call double @llvm.vector.reduce.fmin.v16f64(<16 x double> %op)
  ret double %res
}

define double @fminv_v32f64(<32 x double>* %a) #0 {
; CHECK-LABEL: fminv_v32f64:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
; VBITS_GE_2048-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: fminnmv d0, [[PG]], [[OP]].d
; VBITS_GE_2048-NEXT: ret
  %op = load <32 x double>, <32 x double>* %a
  %res = call double @llvm.vector.reduce.fmin.v32f64(<32 x double> %op)
  ret double %res
}

attributes #0 = { "target-features"="+sve" }

declare half @llvm.vector.reduce.fadd.v4f16(half, <4 x half>)
declare half @llvm.vector.reduce.fadd.v8f16(half, <8 x half>)
declare half @llvm.vector.reduce.fadd.v16f16(half, <16 x half>)
declare half @llvm.vector.reduce.fadd.v32f16(half, <32 x half>)
declare half @llvm.vector.reduce.fadd.v64f16(half, <64 x half>)
declare half @llvm.vector.reduce.fadd.v128f16(half, <128 x half>)

declare float @llvm.vector.reduce.fadd.v2f32(float, <2 x float>)
declare float @llvm.vector.reduce.fadd.v4f32(float, <4 x float>)
declare float @llvm.vector.reduce.fadd.v8f32(float, <8 x float>)
declare float @llvm.vector.reduce.fadd.v16f32(float, <16 x float>)
declare float @llvm.vector.reduce.fadd.v32f32(float, <32 x float>)
declare float @llvm.vector.reduce.fadd.v64f32(float, <64 x float>)

declare double @llvm.vector.reduce.fadd.v1f64(double, <1 x double>)
declare double @llvm.vector.reduce.fadd.v2f64(double, <2 x double>)
declare double @llvm.vector.reduce.fadd.v4f64(double, <4 x double>)
declare double @llvm.vector.reduce.fadd.v8f64(double, <8 x double>)
declare double @llvm.vector.reduce.fadd.v16f64(double, <16 x double>)
declare double @llvm.vector.reduce.fadd.v32f64(double, <32 x double>)

declare half @llvm.vector.reduce.fmax.v4f16(<4 x half>)
declare half @llvm.vector.reduce.fmax.v8f16(<8 x half>)
declare half @llvm.vector.reduce.fmax.v16f16(<16 x half>)
declare half @llvm.vector.reduce.fmax.v32f16(<32 x half>)
declare half @llvm.vector.reduce.fmax.v64f16(<64 x half>)
declare half @llvm.vector.reduce.fmax.v128f16(<128 x half>)

declare float @llvm.vector.reduce.fmax.v2f32(<2 x float>)
declare float @llvm.vector.reduce.fmax.v4f32(<4 x float>)
declare float @llvm.vector.reduce.fmax.v8f32(<8 x float>)
declare float @llvm.vector.reduce.fmax.v16f32(<16 x float>)
declare float @llvm.vector.reduce.fmax.v32f32(<32 x float>)
declare float @llvm.vector.reduce.fmax.v64f32(<64 x float>)

declare double @llvm.vector.reduce.fmax.v1f64(<1 x double>)
declare double @llvm.vector.reduce.fmax.v2f64(<2 x double>)
declare double @llvm.vector.reduce.fmax.v4f64(<4 x double>)
declare double @llvm.vector.reduce.fmax.v8f64(<8 x double>)
declare double @llvm.vector.reduce.fmax.v16f64(<16 x double>)
declare double @llvm.vector.reduce.fmax.v32f64(<32 x double>)

declare half @llvm.vector.reduce.fmin.v4f16(<4 x half>)
declare half @llvm.vector.reduce.fmin.v8f16(<8 x half>)
declare half @llvm.vector.reduce.fmin.v16f16(<16 x half>)
declare half @llvm.vector.reduce.fmin.v32f16(<32 x half>)
declare half @llvm.vector.reduce.fmin.v64f16(<64 x half>)
declare half @llvm.vector.reduce.fmin.v128f16(<128 x half>)

declare float @llvm.vector.reduce.fmin.v2f32(<2 x float>)
declare float @llvm.vector.reduce.fmin.v4f32(<4 x float>)
declare float @llvm.vector.reduce.fmin.v8f32(<8 x float>)
declare float @llvm.vector.reduce.fmin.v16f32(<16 x float>)
declare float @llvm.vector.reduce.fmin.v32f32(<32 x float>)
declare float @llvm.vector.reduce.fmin.v64f32(<64 x float>)

declare double @llvm.vector.reduce.fmin.v1f64(<1 x double>)
declare double @llvm.vector.reduce.fmin.v2f64(<2 x double>)
declare double @llvm.vector.reduce.fmin.v4f64(<4 x double>)
declare double @llvm.vector.reduce.fmin.v8f64(<8 x double>)
declare double @llvm.vector.reduce.fmin.v16f64(<16 x double>)
declare double @llvm.vector.reduce.fmin.v32f64(<32 x double>)