; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp,+fp64 -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FP
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve,+fullfp16,+fp64 -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-NOFP

; Tests lowering of llvm.vector.reduce.fmul.* for f16/f32/f64 vectors on MVE,
; both with the `fast` flag (reassociable tree reduction) and without it
; (strictly ordered sequential reduction). The two RUN lines cover the
; MVE+FP and MVE-integer-only (scalar FP fallback for f32/f64) configurations.

define arm_aapcs_vfpcc float @fmul_v2f32(<2 x float> %x, float %y) {
; CHECK-LABEL: fmul_v2f32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmul.f32 s0, s0, s1
; CHECK-NEXT:    vmul.f32 s0, s4, s0
; CHECK-NEXT:    bx lr
entry:
  %z = call fast float @llvm.vector.reduce.fmul.f32.v2f32(float %y, <2 x float> %x)
  ret float %z
}

define arm_aapcs_vfpcc float @fmul_v4f32(<4 x float> %x, float %y) {
; CHECK-FP-LABEL: fmul_v4f32:
; CHECK-FP:       @ %bb.0: @ %entry
; CHECK-FP-NEXT:    vmul.f32 s6, s2, s3
; CHECK-FP-NEXT:    vmul.f32 s0, s0, s1
; CHECK-FP-NEXT:    vmul.f32 s0, s0, s6
; CHECK-FP-NEXT:    vmul.f32 s0, s4, s0
; CHECK-FP-NEXT:    bx lr
;
; CHECK-NOFP-LABEL: fmul_v4f32:
; CHECK-NOFP:       @ %bb.0: @ %entry
; CHECK-NOFP-NEXT:    vmul.f32 s6, s0, s1
; CHECK-NOFP-NEXT:    vmul.f32 s6, s6, s2
; CHECK-NOFP-NEXT:    vmul.f32 s0, s6, s3
; CHECK-NOFP-NEXT:    vmul.f32 s0, s4, s0
; CHECK-NOFP-NEXT:    bx lr
entry:
  %z = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float %y, <4 x float> %x)
  ret float %z
}

define arm_aapcs_vfpcc float @fmul_v8f32(<8 x float> %x, float %y) {
; CHECK-FP-LABEL: fmul_v8f32:
; CHECK-FP:       @ %bb.0: @ %entry
; CHECK-FP-NEXT:    vmul.f32 q0, q0, q1
; CHECK-FP-NEXT:    vmul.f32 s4, s2, s3
; CHECK-FP-NEXT:    vmul.f32 s0, s0, s1
; CHECK-FP-NEXT:    vmul.f32 s0, s0, s4
; CHECK-FP-NEXT:    vmul.f32 s0, s8, s0
; CHECK-FP-NEXT:    bx lr
;
; CHECK-NOFP-LABEL: fmul_v8f32:
; CHECK-NOFP:       @ %bb.0: @ %entry
; CHECK-NOFP-NEXT:    vmul.f32 s12, s0, s4
; CHECK-NOFP-NEXT:    vmul.f32 s10, s1, s5
; CHECK-NOFP-NEXT:    vmul.f32 s14, s2, s6
; CHECK-NOFP-NEXT:    vmul.f32 s0, s3, s7
; CHECK-NOFP-NEXT:    vmul.f32 s10, s12, s10
; CHECK-NOFP-NEXT:    vmul.f32 s2, s10, s14
; CHECK-NOFP-NEXT:    vmul.f32 s0, s2, s0
; CHECK-NOFP-NEXT:    vmul.f32 s0, s8, s0
; CHECK-NOFP-NEXT:    bx lr
entry:
  %z = call fast float @llvm.vector.reduce.fmul.f32.v8f32(float %y, <8 x float> %x)
  ret float %z
}

define arm_aapcs_vfpcc half @fmul_v2f16(<2 x half> %x, half %y) {
; CHECK-LABEL: fmul_v2f16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmovx.f16 s6, s0
; CHECK-NEXT:    vmul.f16 s0, s0, s6
; CHECK-NEXT:    vmul.f16 s0, s4, s0
; CHECK-NEXT:    bx lr
entry:
  %z = call fast half @llvm.vector.reduce.fmul.f16.v2f16(half %y, <2 x half> %x)
  ret half %z
}

define arm_aapcs_vfpcc half @fmul_v4f16(<4 x half> %x, half %y) {
; CHECK-FP-LABEL: fmul_v4f16:
; CHECK-FP:       @ %bb.0: @ %entry
; CHECK-FP-NEXT:    vmovx.f16 s6, s1
; CHECK-FP-NEXT:    vmovx.f16 s8, s0
; CHECK-FP-NEXT:    vmul.f16 s6, s1, s6
; CHECK-FP-NEXT:    vmul.f16 s0, s0, s8
; CHECK-FP-NEXT:    vmul.f16 s0, s0, s6
; CHECK-FP-NEXT:    vmul.f16 s0, s4, s0
; CHECK-FP-NEXT:    bx lr
;
; CHECK-NOFP-LABEL: fmul_v4f16:
; CHECK-NOFP:       @ %bb.0: @ %entry
; CHECK-NOFP-NEXT:    vmovx.f16 s6, s0
; CHECK-NOFP-NEXT:    vmul.f16 s6, s0, s6
; CHECK-NOFP-NEXT:    vmovx.f16 s0, s1
; CHECK-NOFP-NEXT:    vmul.f16 s6, s6, s1
; CHECK-NOFP-NEXT:    vmul.f16 s0, s6, s0
; CHECK-NOFP-NEXT:    vmul.f16 s0, s4, s0
; CHECK-NOFP-NEXT:    bx lr
entry:
  %z = call fast half @llvm.vector.reduce.fmul.f16.v4f16(half %y, <4 x half> %x)
  ret half %z
}

define arm_aapcs_vfpcc half @fmul_v8f16(<8 x half> %x, half %y) {
; CHECK-FP-LABEL: fmul_v8f16:
; CHECK-FP:       @ %bb.0: @ %entry
; CHECK-FP-NEXT:    vrev32.16 q2, q0
; CHECK-FP-NEXT:    vmul.f16 q0, q0, q2
; CHECK-FP-NEXT:    vmul.f16 s6, s2, s3
; CHECK-FP-NEXT:    vmul.f16 s0, s0, s1
; CHECK-FP-NEXT:    vmul.f16 s0, s0, s6
; CHECK-FP-NEXT:    vmul.f16 s0, s4, s0
; CHECK-FP-NEXT:    bx lr
;
; CHECK-NOFP-LABEL: fmul_v8f16:
; CHECK-NOFP:       @ %bb.0: @ %entry
; CHECK-NOFP-NEXT:    vmovx.f16 s6, s0
; CHECK-NOFP-NEXT:    vmovx.f16 s8, s1
; CHECK-NOFP-NEXT:    vmul.f16 s6, s0, s6
; CHECK-NOFP-NEXT:    vmovx.f16 s0, s3
; CHECK-NOFP-NEXT:    vmul.f16 s6, s6, s1
; CHECK-NOFP-NEXT:    vmul.f16 s6, s6, s8
; CHECK-NOFP-NEXT:    vmovx.f16 s8, s2
; CHECK-NOFP-NEXT:    vmul.f16 s6, s6, s2
; CHECK-NOFP-NEXT:    vmul.f16 s6, s6, s8
; CHECK-NOFP-NEXT:    vmul.f16 s6, s6, s3
; CHECK-NOFP-NEXT:    vmul.f16 s0, s6, s0
; CHECK-NOFP-NEXT:    vmul.f16 s0, s4, s0
; CHECK-NOFP-NEXT:    bx lr
entry:
  %z = call fast half @llvm.vector.reduce.fmul.f16.v8f16(half %y, <8 x half> %x)
  ret half %z
}

define arm_aapcs_vfpcc half @fmul_v16f16(<16 x half> %x, half %y) {
; CHECK-FP-LABEL: fmul_v16f16:
; CHECK-FP:       @ %bb.0: @ %entry
; CHECK-FP-NEXT:    vmul.f16 q0, q0, q1
; CHECK-FP-NEXT:    vrev32.16 q1, q0
; CHECK-FP-NEXT:    vmul.f16 q0, q0, q1
; CHECK-FP-NEXT:    vmul.f16 s4, s2, s3
; CHECK-FP-NEXT:    vmul.f16 s0, s0, s1
; CHECK-FP-NEXT:    vmul.f16 s0, s0, s4
; CHECK-FP-NEXT:    vmul.f16 s0, s8, s0
; CHECK-FP-NEXT:    bx lr
;
; CHECK-NOFP-LABEL: fmul_v16f16:
; CHECK-NOFP:       @ %bb.0: @ %entry
; CHECK-NOFP-NEXT:    vmovx.f16 s10, s4
; CHECK-NOFP-NEXT:    vmovx.f16 s12, s0
; CHECK-NOFP-NEXT:    vmul.f16 s10, s12, s10
; CHECK-NOFP-NEXT:    vmul.f16 s12, s0, s4
; CHECK-NOFP-NEXT:    vmul.f16 s10, s12, s10
; CHECK-NOFP-NEXT:    vmul.f16 s12, s1, s5
; CHECK-NOFP-NEXT:    vmul.f16 s10, s10, s12
; CHECK-NOFP-NEXT:    vmovx.f16 s12, s5
; CHECK-NOFP-NEXT:    vmovx.f16 s14, s1
; CHECK-NOFP-NEXT:    vmovx.f16 s4, s7
; CHECK-NOFP-NEXT:    vmul.f16 s12, s14, s12
; CHECK-NOFP-NEXT:    vmovx.f16 s14, s2
; CHECK-NOFP-NEXT:    vmul.f16 s10, s10, s12
; CHECK-NOFP-NEXT:    vmul.f16 s12, s2, s6
; CHECK-NOFP-NEXT:    vmul.f16 s10, s10, s12
; CHECK-NOFP-NEXT:    vmovx.f16 s12, s6
; CHECK-NOFP-NEXT:    vmul.f16 s12, s14, s12
; CHECK-NOFP-NEXT:    vmovx.f16 s0, s3
; CHECK-NOFP-NEXT:    vmul.f16 s10, s10, s12
; CHECK-NOFP-NEXT:    vmul.f16 s12, s3, s7
; CHECK-NOFP-NEXT:    vmul.f16 s10, s10, s12
; CHECK-NOFP-NEXT:    vmul.f16 s0, s0, s4
; CHECK-NOFP-NEXT:    vmul.f16 s0, s10, s0
; CHECK-NOFP-NEXT:    vmul.f16 s0, s8, s0
; CHECK-NOFP-NEXT:    bx lr
entry:
  %z = call fast half @llvm.vector.reduce.fmul.f16.v16f16(half %y, <16 x half> %x)
  ret half %z
}

define arm_aapcs_vfpcc double @fmul_v1f64(<1 x double> %x, double %y) {
; CHECK-LABEL: fmul_v1f64:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmul.f64 d0, d1, d0
; CHECK-NEXT:    bx lr
entry:
  %z = call fast double @llvm.vector.reduce.fmul.f64.v1f64(double %y, <1 x double> %x)
  ret double %z
}

define arm_aapcs_vfpcc double @fmul_v2f64(<2 x double> %x, double %y) {
; CHECK-LABEL: fmul_v2f64:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmul.f64 d0, d0, d1
; CHECK-NEXT:    vmul.f64 d0, d2, d0
; CHECK-NEXT:    bx lr
entry:
  %z = call fast double @llvm.vector.reduce.fmul.f64.v2f64(double %y, <2 x double> %x)
  ret double %z
}

define arm_aapcs_vfpcc double @fmul_v4f64(<4 x double> %x, double %y) {
; CHECK-LABEL: fmul_v4f64:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmul.f64 d5, d1, d3
; CHECK-NEXT:    vmul.f64 d0, d0, d2
; CHECK-NEXT:    vmul.f64 d0, d0, d5
; CHECK-NEXT:    vmul.f64 d0, d4, d0
; CHECK-NEXT:    bx lr
entry:
  %z = call fast double @llvm.vector.reduce.fmul.f64.v4f64(double %y, <4 x double> %x)
  ret double %z
}

define arm_aapcs_vfpcc float @fmul_v2f32_nofast(<2 x float> %x, float %y) {
; CHECK-LABEL: fmul_v2f32_nofast:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmul.f32 s4, s4, s0
; CHECK-NEXT:    vmul.f32 s0, s4, s1
; CHECK-NEXT:    bx lr
entry:
  %z = call float @llvm.vector.reduce.fmul.f32.v2f32(float %y, <2 x float> %x)
  ret float %z
}

define arm_aapcs_vfpcc float @fmul_v4f32_nofast(<4 x float> %x, float %y) {
; CHECK-LABEL: fmul_v4f32_nofast:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmul.f32 s4, s4, s0
; CHECK-NEXT:    vmul.f32 s4, s4, s1
; CHECK-NEXT:    vmul.f32 s4, s4, s2
; CHECK-NEXT:    vmul.f32 s0, s4, s3
; CHECK-NEXT:    bx lr
entry:
  %z = call float @llvm.vector.reduce.fmul.f32.v4f32(float %y, <4 x float> %x)
  ret float %z
}

define arm_aapcs_vfpcc float @fmul_v8f32_nofast(<8 x float> %x, float %y) {
; CHECK-LABEL: fmul_v8f32_nofast:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmul.f32 s8, s8, s0
; CHECK-NEXT:    vmul.f32 s8, s8, s1
; CHECK-NEXT:    vmul.f32 s8, s8, s2
; CHECK-NEXT:    vmul.f32 s0, s8, s3
; CHECK-NEXT:    vmul.f32 s0, s0, s4
; CHECK-NEXT:    vmul.f32 s0, s0, s5
; CHECK-NEXT:    vmul.f32 s0, s0, s6
; CHECK-NEXT:    vmul.f32 s0, s0, s7
; CHECK-NEXT:    bx lr
entry:
  %z = call float @llvm.vector.reduce.fmul.f32.v8f32(float %y, <8 x float> %x)
  ret float %z
}

define arm_aapcs_vfpcc half @fmul_v2f16_nofast(<2 x half> %x, half %y) {
; CHECK-LABEL: fmul_v2f16_nofast:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmul.f16 s4, s4, s0
; CHECK-NEXT:    vmovx.f16 s0, s0
; CHECK-NEXT:    vmul.f16 s0, s4, s0
; CHECK-NEXT:    bx lr
entry:
  %z = call half @llvm.vector.reduce.fmul.f16.v2f16(half %y, <2 x half> %x)
  ret half %z
}

define arm_aapcs_vfpcc half @fmul_v4f16_nofast(<4 x half> %x, half %y) {
; CHECK-LABEL: fmul_v4f16_nofast:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmul.f16 s4, s4, s0
; CHECK-NEXT:    vmovx.f16 s6, s0
; CHECK-NEXT:    vmul.f16 s4, s4, s6
; CHECK-NEXT:    vmovx.f16 s0, s1
; CHECK-NEXT:    vmul.f16 s4, s4, s1
; CHECK-NEXT:    vmul.f16 s0, s4, s0
; CHECK-NEXT:    bx lr
entry:
  %z = call half @llvm.vector.reduce.fmul.f16.v4f16(half %y, <4 x half> %x)
  ret half %z
}

define arm_aapcs_vfpcc half @fmul_v8f16_nofast(<8 x half> %x, half %y) {
; CHECK-LABEL: fmul_v8f16_nofast:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmul.f16 s4, s4, s0
; CHECK-NEXT:    vmovx.f16 s6, s0
; CHECK-NEXT:    vmul.f16 s4, s4, s6
; CHECK-NEXT:    vmovx.f16 s6, s1
; CHECK-NEXT:    vmul.f16 s4, s4, s1
; CHECK-NEXT:    vmovx.f16 s0, s3
; CHECK-NEXT:    vmul.f16 s4, s4, s6
; CHECK-NEXT:    vmovx.f16 s6, s2
; CHECK-NEXT:    vmul.f16 s4, s4, s2
; CHECK-NEXT:    vmul.f16 s4, s4, s6
; CHECK-NEXT:    vmul.f16 s4, s4, s3
; CHECK-NEXT:    vmul.f16 s0, s4, s0
; CHECK-NEXT:    bx lr
entry:
  %z = call half @llvm.vector.reduce.fmul.f16.v8f16(half %y, <8 x half> %x)
  ret half %z
}

define arm_aapcs_vfpcc half @fmul_v16f16_nofast(<16 x half> %x, half %y) {
; CHECK-LABEL: fmul_v16f16_nofast:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmul.f16 s8, s8, s0
; CHECK-NEXT:    vmovx.f16 s10, s0
; CHECK-NEXT:    vmul.f16 s8, s8, s10
; CHECK-NEXT:    vmovx.f16 s10, s1
; CHECK-NEXT:    vmul.f16 s8, s8, s1
; CHECK-NEXT:    vmovx.f16 s0, s3
; CHECK-NEXT:    vmul.f16 s8, s8, s10
; CHECK-NEXT:    vmovx.f16 s10, s2
; CHECK-NEXT:    vmul.f16 s8, s8, s2
; CHECK-NEXT:    vmovx.f16 s2, s4
; CHECK-NEXT:    vmul.f16 s8, s8, s10
; CHECK-NEXT:    vmul.f16 s8, s8, s3
; CHECK-NEXT:    vmul.f16 s0, s8, s0
; CHECK-NEXT:    vmul.f16 s0, s0, s4
; CHECK-NEXT:    vmul.f16 s0, s0, s2
; CHECK-NEXT:    vmovx.f16 s2, s5
; CHECK-NEXT:    vmul.f16 s0, s0, s5
; CHECK-NEXT:    vmul.f16 s0, s0, s2
; CHECK-NEXT:    vmovx.f16 s2, s6
; CHECK-NEXT:    vmul.f16 s0, s0, s6
; CHECK-NEXT:    vmul.f16 s0, s0, s2
; CHECK-NEXT:    vmovx.f16 s2, s7
; CHECK-NEXT:    vmul.f16 s0, s0, s7
; CHECK-NEXT:    vmul.f16 s0, s0, s2
; CHECK-NEXT:    bx lr
entry:
  %z = call half @llvm.vector.reduce.fmul.f16.v16f16(half %y, <16 x half> %x)
  ret half %z
}

define arm_aapcs_vfpcc double @fmul_v1f64_nofast(<1 x double> %x, double %y) {
; CHECK-LABEL: fmul_v1f64_nofast:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmul.f64 d0, d1, d0
; CHECK-NEXT:    bx lr
entry:
  %z = call double @llvm.vector.reduce.fmul.f64.v1f64(double %y, <1 x double> %x)
  ret double %z
}

define arm_aapcs_vfpcc double @fmul_v2f64_nofast(<2 x double> %x, double %y) {
; CHECK-LABEL: fmul_v2f64_nofast:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmul.f64 d2, d2, d0
; CHECK-NEXT:    vmul.f64 d0, d2, d1
; CHECK-NEXT:    bx lr
entry:
  %z = call double @llvm.vector.reduce.fmul.f64.v2f64(double %y, <2 x double> %x)
  ret double %z
}

define arm_aapcs_vfpcc double @fmul_v4f64_nofast(<4 x double> %x, double %y) {
; CHECK-LABEL: fmul_v4f64_nofast:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmul.f64 d4, d4, d0
; CHECK-NEXT:    vmul.f64 d0, d4, d1
; CHECK-NEXT:    vmul.f64 d0, d0, d2
; CHECK-NEXT:    vmul.f64 d0, d0, d3
; CHECK-NEXT:    bx lr
entry:
  %z = call double @llvm.vector.reduce.fmul.f64.v4f64(double %y, <4 x double> %x)
  ret double %z
}

declare double @llvm.vector.reduce.fmul.f64.v1f64(double, <1 x double>)
declare double @llvm.vector.reduce.fmul.f64.v2f64(double, <2 x double>)
declare double @llvm.vector.reduce.fmul.f64.v4f64(double, <4 x double>)
declare float @llvm.vector.reduce.fmul.f32.v2f32(float, <2 x float>)
declare float @llvm.vector.reduce.fmul.f32.v4f32(float, <4 x float>)
declare float @llvm.vector.reduce.fmul.f32.v8f32(float, <8 x float>)
declare half @llvm.vector.reduce.fmul.f16.v16f16(half, <16 x half>)
declare half @llvm.vector.reduce.fmul.f16.v2f16(half, <2 x half>)
declare half @llvm.vector.reduce.fmul.f16.v4f16(half, <4 x half>)
declare half @llvm.vector.reduce.fmul.f16.v8f16(half, <8 x half>)