; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp,+fp64 -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FP
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve,+fullfp16,+fp64 -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-NOFP

define arm_aapcs_vfpcc float @fadd_v2f32(<2 x float> %x, float %y) {
; CHECK-LABEL: fadd_v2f32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vadd.f32 s0, s0, s1
; CHECK-NEXT:    vadd.f32 s0, s4, s0
; CHECK-NEXT:    bx lr
entry:
  %z = call fast float @llvm.vector.reduce.fadd.f32.v2f32(float %y, <2 x float> %x)
  ret float %z
}

define arm_aapcs_vfpcc float @fadd_v4f32(<4 x float> %x, float %y) {
; CHECK-FP-LABEL: fadd_v4f32:
; CHECK-FP:       @ %bb.0: @ %entry
; CHECK-FP-NEXT:    vadd.f32 s6, s2, s3
; CHECK-FP-NEXT:    vadd.f32 s0, s0, s1
; CHECK-FP-NEXT:    vadd.f32 s0, s0, s6
; CHECK-FP-NEXT:    vadd.f32 s0, s4, s0
; CHECK-FP-NEXT:    bx lr
;
; CHECK-NOFP-LABEL: fadd_v4f32:
; CHECK-NOFP:       @ %bb.0: @ %entry
; CHECK-NOFP-NEXT:    vadd.f32 s6, s0, s1
; CHECK-NOFP-NEXT:    vadd.f32 s6, s6, s2
; CHECK-NOFP-NEXT:    vadd.f32 s0, s6, s3
; CHECK-NOFP-NEXT:    vadd.f32 s0, s4, s0
; CHECK-NOFP-NEXT:    bx lr
entry:
  %z = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float %y, <4 x float> %x)
  ret float %z
}

define arm_aapcs_vfpcc float @fadd_v8f32(<8 x float> %x, float %y) {
; CHECK-FP-LABEL: fadd_v8f32:
; CHECK-FP:       @ %bb.0: @ %entry
; CHECK-FP-NEXT:    vadd.f32 q0, q0, q1
; CHECK-FP-NEXT:    vadd.f32 s4, s2, s3
; CHECK-FP-NEXT:    vadd.f32 s0, s0, s1
; CHECK-FP-NEXT:    vadd.f32 s0, s0, s4
; CHECK-FP-NEXT:    vadd.f32 s0, s8, s0
; CHECK-FP-NEXT:    bx lr
;
; CHECK-NOFP-LABEL: fadd_v8f32:
; CHECK-NOFP:       @ %bb.0: @ %entry
; CHECK-NOFP-NEXT:    vadd.f32 s12, s0, s4
; CHECK-NOFP-NEXT:    vadd.f32 s10, s1, s5
; CHECK-NOFP-NEXT:    vadd.f32 s14, s2, s6
; CHECK-NOFP-NEXT:    vadd.f32 s0, s3, s7
; CHECK-NOFP-NEXT:    vadd.f32 s10, s12, s10
; CHECK-NOFP-NEXT:    vadd.f32 s2, s10, s14
; CHECK-NOFP-NEXT:    vadd.f32 s0, s2, s0
; CHECK-NOFP-NEXT:    vadd.f32 s0, s8, s0
; CHECK-NOFP-NEXT:    bx lr
entry:
  %z = call fast float @llvm.vector.reduce.fadd.f32.v8f32(float %y, <8 x float> %x)
  ret float %z
}

define arm_aapcs_vfpcc half @fadd_v2f16(<2 x half> %x, half %y) {
; CHECK-LABEL: fadd_v2f16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmovx.f16 s6, s0
; CHECK-NEXT:    vadd.f16 s0, s0, s6
; CHECK-NEXT:    vadd.f16 s0, s4, s0
; CHECK-NEXT:    bx lr
entry:
  %z = call fast half @llvm.vector.reduce.fadd.f16.v2f16(half %y, <2 x half> %x)
  ret half %z
}

define arm_aapcs_vfpcc half @fadd_v4f16(<4 x half> %x, half %y) {
; CHECK-FP-LABEL: fadd_v4f16:
; CHECK-FP:       @ %bb.0: @ %entry
; CHECK-FP-NEXT:    vmovx.f16 s6, s1
; CHECK-FP-NEXT:    vmovx.f16 s8, s0
; CHECK-FP-NEXT:    vadd.f16 s6, s1, s6
; CHECK-FP-NEXT:    vadd.f16 s0, s0, s8
; CHECK-FP-NEXT:    vadd.f16 s0, s0, s6
; CHECK-FP-NEXT:    vadd.f16 s0, s4, s0
; CHECK-FP-NEXT:    bx lr
;
; CHECK-NOFP-LABEL: fadd_v4f16:
; CHECK-NOFP:       @ %bb.0: @ %entry
; CHECK-NOFP-NEXT:    vmovx.f16 s6, s0
; CHECK-NOFP-NEXT:    vadd.f16 s6, s0, s6
; CHECK-NOFP-NEXT:    vmovx.f16 s0, s1
; CHECK-NOFP-NEXT:    vadd.f16 s6, s6, s1
; CHECK-NOFP-NEXT:    vadd.f16 s0, s6, s0
; CHECK-NOFP-NEXT:    vadd.f16 s0, s4, s0
; CHECK-NOFP-NEXT:    bx lr
entry:
  %z = call fast half @llvm.vector.reduce.fadd.f16.v4f16(half %y, <4 x half> %x)
  ret half %z
}

define arm_aapcs_vfpcc half @fadd_v8f16(<8 x half> %x, half %y) {
; CHECK-FP-LABEL: fadd_v8f16:
; CHECK-FP:       @ %bb.0: @ %entry
; CHECK-FP-NEXT:    vrev32.16 q2, q0
; CHECK-FP-NEXT:    vadd.f16 q0, q0, q2
; CHECK-FP-NEXT:    vadd.f16 s6, s2, s3
; CHECK-FP-NEXT:    vadd.f16 s0, s0, s1
; CHECK-FP-NEXT:    vadd.f16 s0, s0, s6
; CHECK-FP-NEXT:    vadd.f16 s0, s4, s0
; CHECK-FP-NEXT:    bx lr
;
; CHECK-NOFP-LABEL: fadd_v8f16:
; CHECK-NOFP:       @ %bb.0: @ %entry
; CHECK-NOFP-NEXT:    vmovx.f16 s6, s0
; CHECK-NOFP-NEXT:    vmovx.f16 s8, s1
; CHECK-NOFP-NEXT:    vadd.f16 s6, s0, s6
; CHECK-NOFP-NEXT:    vmovx.f16 s0, s3
; CHECK-NOFP-NEXT:    vadd.f16 s6, s6, s1
; CHECK-NOFP-NEXT:    vadd.f16 s6, s6, s8
; CHECK-NOFP-NEXT:    vmovx.f16 s8, s2
; CHECK-NOFP-NEXT:    vadd.f16 s6, s6, s2
; CHECK-NOFP-NEXT:    vadd.f16 s6, s6, s8
; CHECK-NOFP-NEXT:    vadd.f16 s6, s6, s3
; CHECK-NOFP-NEXT:    vadd.f16 s0, s6, s0
; CHECK-NOFP-NEXT:    vadd.f16 s0, s4, s0
; CHECK-NOFP-NEXT:    bx lr
entry:
  %z = call fast half @llvm.vector.reduce.fadd.f16.v8f16(half %y, <8 x half> %x)
  ret half %z
}

define arm_aapcs_vfpcc half @fadd_v16f16(<16 x half> %x, half %y) {
; CHECK-FP-LABEL: fadd_v16f16:
; CHECK-FP:       @ %bb.0: @ %entry
; CHECK-FP-NEXT:    vadd.f16 q0, q0, q1
; CHECK-FP-NEXT:    vrev32.16 q1, q0
; CHECK-FP-NEXT:    vadd.f16 q0, q0, q1
; CHECK-FP-NEXT:    vadd.f16 s4, s2, s3
; CHECK-FP-NEXT:    vadd.f16 s0, s0, s1
; CHECK-FP-NEXT:    vadd.f16 s0, s0, s4
; CHECK-FP-NEXT:    vadd.f16 s0, s8, s0
; CHECK-FP-NEXT:    bx lr
;
; CHECK-NOFP-LABEL: fadd_v16f16:
; CHECK-NOFP:       @ %bb.0: @ %entry
; CHECK-NOFP-NEXT:    vmovx.f16 s10, s4
; CHECK-NOFP-NEXT:    vmovx.f16 s12, s0
; CHECK-NOFP-NEXT:    vadd.f16 s10, s12, s10
; CHECK-NOFP-NEXT:    vadd.f16 s12, s0, s4
; CHECK-NOFP-NEXT:    vadd.f16 s10, s12, s10
; CHECK-NOFP-NEXT:    vadd.f16 s12, s1, s5
; CHECK-NOFP-NEXT:    vadd.f16 s10, s10, s12
; CHECK-NOFP-NEXT:    vmovx.f16 s12, s5
; CHECK-NOFP-NEXT:    vmovx.f16 s14, s1
; CHECK-NOFP-NEXT:    vmovx.f16 s4, s7
; CHECK-NOFP-NEXT:    vadd.f16 s12, s14, s12
; CHECK-NOFP-NEXT:    vmovx.f16 s14, s2
; CHECK-NOFP-NEXT:    vadd.f16 s10, s10, s12
; CHECK-NOFP-NEXT:    vadd.f16 s12, s2, s6
; CHECK-NOFP-NEXT:    vadd.f16 s10, s10, s12
; CHECK-NOFP-NEXT:    vmovx.f16 s12, s6
; CHECK-NOFP-NEXT:    vadd.f16 s12, s14, s12
; CHECK-NOFP-NEXT:    vmovx.f16 s0, s3
; CHECK-NOFP-NEXT:    vadd.f16 s10, s10, s12
; CHECK-NOFP-NEXT:    vadd.f16 s12, s3, s7
; CHECK-NOFP-NEXT:    vadd.f16 s10, s10, s12
; CHECK-NOFP-NEXT:    vadd.f16 s0, s0, s4
; CHECK-NOFP-NEXT:    vadd.f16 s0, s10, s0
; CHECK-NOFP-NEXT:    vadd.f16 s0, s8, s0
; CHECK-NOFP-NEXT:    bx lr
entry:
  %z = call fast half @llvm.vector.reduce.fadd.f16.v16f16(half %y, <16 x half> %x)
  ret half %z
}

define arm_aapcs_vfpcc double @fadd_v1f64(<1 x double> %x, double %y) {
; CHECK-LABEL: fadd_v1f64:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vadd.f64 d0, d1, d0
; CHECK-NEXT:    bx lr
entry:
  %z = call fast double @llvm.vector.reduce.fadd.f64.v1f64(double %y, <1 x double> %x)
  ret double %z
}

define arm_aapcs_vfpcc double @fadd_v2f64(<2 x double> %x, double %y) {
; CHECK-LABEL: fadd_v2f64:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vadd.f64 d0, d0, d1
; CHECK-NEXT:    vadd.f64 d0, d2, d0
; CHECK-NEXT:    bx lr
entry:
  %z = call fast double @llvm.vector.reduce.fadd.f64.v2f64(double %y, <2 x double> %x)
  ret double %z
}

define arm_aapcs_vfpcc double @fadd_v4f64(<4 x double> %x, double %y) {
; CHECK-LABEL: fadd_v4f64:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vadd.f64 d5, d1, d3
; CHECK-NEXT:    vadd.f64 d0, d0, d2
; CHECK-NEXT:    vadd.f64 d0, d0, d5
; CHECK-NEXT:    vadd.f64 d0, d4, d0
; CHECK-NEXT:    bx lr
entry:
  %z = call fast double @llvm.vector.reduce.fadd.f64.v4f64(double %y, <4 x double> %x)
  ret double %z
}

define arm_aapcs_vfpcc float @fadd_v2f32_nofast(<2 x float> %x, float %y) {
; CHECK-LABEL: fadd_v2f32_nofast:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vadd.f32 s4, s4, s0
; CHECK-NEXT:    vadd.f32 s0, s4, s1
; CHECK-NEXT:    bx lr
entry:
  %z = call float @llvm.vector.reduce.fadd.f32.v2f32(float %y, <2 x float> %x)
  ret float %z
}

define arm_aapcs_vfpcc float @fadd_v4f32_nofast(<4 x float> %x, float %y) {
; CHECK-LABEL: fadd_v4f32_nofast:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vadd.f32 s4, s4, s0
; CHECK-NEXT:    vadd.f32 s4, s4, s1
; CHECK-NEXT:    vadd.f32 s4, s4, s2
; CHECK-NEXT:    vadd.f32 s0, s4, s3
; CHECK-NEXT:    bx lr
entry:
  %z = call float @llvm.vector.reduce.fadd.f32.v4f32(float %y, <4 x float> %x)
  ret float %z
}

define arm_aapcs_vfpcc float @fadd_v8f32_nofast(<8 x float> %x, float %y) {
; CHECK-LABEL: fadd_v8f32_nofast:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vadd.f32 s8, s8, s0
; CHECK-NEXT:    vadd.f32 s8, s8, s1
; CHECK-NEXT:    vadd.f32 s8, s8, s2
; CHECK-NEXT:    vadd.f32 s0, s8, s3
; CHECK-NEXT:    vadd.f32 s0, s0, s4
; CHECK-NEXT:    vadd.f32 s0, s0, s5
; CHECK-NEXT:    vadd.f32 s0, s0, s6
; CHECK-NEXT:    vadd.f32 s0, s0, s7
; CHECK-NEXT:    bx lr
entry:
  %z = call float @llvm.vector.reduce.fadd.f32.v8f32(float %y, <8 x float> %x)
  ret float %z
}

define arm_aapcs_vfpcc half @fadd_v4f16_nofast(<4 x half> %x, half %y) {
; CHECK-LABEL: fadd_v4f16_nofast:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vadd.f16 s4, s4, s0
; CHECK-NEXT:    vmovx.f16 s6, s0
; CHECK-NEXT:    vadd.f16 s4, s4, s6
; CHECK-NEXT:    vmovx.f16 s0, s1
; CHECK-NEXT:    vadd.f16 s4, s4, s1
; CHECK-NEXT:    vadd.f16 s0, s4, s0
; CHECK-NEXT:    bx lr
entry:
  %z = call half @llvm.vector.reduce.fadd.f16.v4f16(half %y, <4 x half> %x)
  ret half %z
}

define arm_aapcs_vfpcc half @fadd_v8f16_nofast(<8 x half> %x, half %y) {
; CHECK-LABEL: fadd_v8f16_nofast:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vadd.f16 s4, s4, s0
; CHECK-NEXT:    vmovx.f16 s6, s0
; CHECK-NEXT:    vadd.f16 s4, s4, s6
; CHECK-NEXT:    vmovx.f16 s6, s1
; CHECK-NEXT:    vadd.f16 s4, s4, s1
; CHECK-NEXT:    vmovx.f16 s0, s3
; CHECK-NEXT:    vadd.f16 s4, s4, s6
; CHECK-NEXT:    vmovx.f16 s6, s2
; CHECK-NEXT:    vadd.f16 s4, s4, s2
; CHECK-NEXT:    vadd.f16 s4, s4, s6
; CHECK-NEXT:    vadd.f16 s4, s4, s3
; CHECK-NEXT:    vadd.f16 s0, s4, s0
; CHECK-NEXT:    bx lr
entry:
  %z = call half @llvm.vector.reduce.fadd.f16.v8f16(half %y, <8 x half> %x)
  ret half %z
}

define arm_aapcs_vfpcc half @fadd_v16f16_nofast(<16 x half> %x, half %y) {
; CHECK-LABEL: fadd_v16f16_nofast:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vadd.f16 s8, s8, s0
; CHECK-NEXT:    vmovx.f16 s10, s0
; CHECK-NEXT:    vadd.f16 s8, s8, s10
; CHECK-NEXT:    vmovx.f16 s10, s1
; CHECK-NEXT:    vadd.f16 s8, s8, s1
; CHECK-NEXT:    vmovx.f16 s0, s3
; CHECK-NEXT:    vadd.f16 s8, s8, s10
; CHECK-NEXT:    vmovx.f16 s10, s2
; CHECK-NEXT:    vadd.f16 s8, s8, s2
; CHECK-NEXT:    vmovx.f16 s2, s4
; CHECK-NEXT:    vadd.f16 s8, s8, s10
; CHECK-NEXT:    vadd.f16 s8, s8, s3
; CHECK-NEXT:    vadd.f16 s0, s8, s0
; CHECK-NEXT:    vadd.f16 s0, s0, s4
; CHECK-NEXT:    vadd.f16 s0, s0, s2
; CHECK-NEXT:    vmovx.f16 s2, s5
; CHECK-NEXT:    vadd.f16 s0, s0, s5
; CHECK-NEXT:    vadd.f16 s0, s0, s2
; CHECK-NEXT:    vmovx.f16 s2, s6
; CHECK-NEXT:    vadd.f16 s0, s0, s6
; CHECK-NEXT:    vadd.f16 s0, s0, s2
; CHECK-NEXT:    vmovx.f16 s2, s7
; CHECK-NEXT:    vadd.f16 s0, s0, s7
; CHECK-NEXT:    vadd.f16 s0, s0, s2
; CHECK-NEXT:    bx lr
entry:
  %z = call half @llvm.vector.reduce.fadd.f16.v16f16(half %y, <16 x half> %x)
  ret half %z
}

define arm_aapcs_vfpcc double @fadd_v1f64_nofast(<1 x double> %x, double %y) {
; CHECK-LABEL: fadd_v1f64_nofast:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vadd.f64 d0, d1, d0
; CHECK-NEXT:    bx lr
entry:
  %z = call double @llvm.vector.reduce.fadd.f64.v1f64(double %y, <1 x double> %x)
  ret double %z
}

define arm_aapcs_vfpcc double @fadd_v2f64_nofast(<2 x double> %x, double %y) {
; CHECK-LABEL: fadd_v2f64_nofast:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vadd.f64 d2, d2, d0
; CHECK-NEXT:    vadd.f64 d0, d2, d1
; CHECK-NEXT:    bx lr
entry:
  %z = call double @llvm.vector.reduce.fadd.f64.v2f64(double %y, <2 x double> %x)
  ret double %z
}

define arm_aapcs_vfpcc double @fadd_v4f64_nofast(<4 x double> %x, double %y) {
; CHECK-LABEL: fadd_v4f64_nofast:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vadd.f64 d4, d4, d0
; CHECK-NEXT:    vadd.f64 d0, d4, d1
; CHECK-NEXT:    vadd.f64 d0, d0, d2
; CHECK-NEXT:    vadd.f64 d0, d0, d3
; CHECK-NEXT:    bx lr
entry:
  %z = call double @llvm.vector.reduce.fadd.f64.v4f64(double %y, <4 x double> %x)
  ret double %z
}

declare double @llvm.vector.reduce.fadd.f64.v1f64(double, <1 x double>)
declare double @llvm.vector.reduce.fadd.f64.v2f64(double, <2 x double>)
declare double @llvm.vector.reduce.fadd.f64.v4f64(double, <4 x double>)
declare float @llvm.vector.reduce.fadd.f32.v2f32(float, <2 x float>)
declare float @llvm.vector.reduce.fadd.f32.v4f32(float, <4 x float>)
declare float @llvm.vector.reduce.fadd.f32.v8f32(float, <8 x float>)
declare half @llvm.vector.reduce.fadd.f16.v16f16(half, <16 x half>)
declare half @llvm.vector.reduce.fadd.f16.v2f16(half, <2 x half>)
declare half @llvm.vector.reduce.fadd.f16.v4f16(half, <4 x half>)
declare half @llvm.vector.reduce.fadd.f16.v8f16(half, <8 x half>)