// REQUIRES: aarch64-registered-target
// RUN: %clang_cc1 -triple arm64-apple-ios -target-feature +neon \
// RUN:        -target-feature +v8.3a \
// RUN:        -target-feature +fullfp16 \
// RUN:        -disable-O0-optnone -emit-llvm -o - %s | opt -S -O1 | FileCheck %s
#include <arm_neon.h>

// CHECK-LABEL: @test_vcmla_f16(
// CHECK: [[RES:%.*]] = call <4 x half> @llvm.aarch64.neon.vcmla.rot0.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> %rhs)
// CHECK: ret <4 x half> [[RES]]
float16x4_t test_vcmla_f16(float16x4_t acc, float16x4_t lhs, float16x4_t rhs) {
  return vcmla_f16(acc, lhs, rhs);
}

// CHECK-LABEL: @test_vcmla_f32(
// CHECK: [[RES:%.*]] = call <2 x float> @llvm.aarch64.neon.vcmla.rot0.v2f32(<2 x float> %acc, <2 x float> %lhs, <2 x float> %rhs)
// CHECK: ret <2 x float> [[RES]]
float32x2_t test_vcmla_f32(float32x2_t acc, float32x2_t lhs, float32x2_t rhs) {
  return vcmla_f32(acc, lhs, rhs);
}

// CHECK-LABEL: @test_vcmlaq_f16(
// CHECK: [[RES:%.*]] = call <8 x half> @llvm.aarch64.neon.vcmla.rot0.v8f16(<8 x half> %acc, <8 x half> %lhs, <8 x half> %rhs)
// CHECK: ret <8 x half> [[RES]]
float16x8_t test_vcmlaq_f16(float16x8_t acc, float16x8_t lhs, float16x8_t rhs) {
  return vcmlaq_f16(acc, lhs, rhs);
}

// CHECK-LABEL: @test_vcmlaq_f32(
// CHECK: [[RES:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot0.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> %rhs)
// CHECK: ret <4 x float> [[RES]]
float32x4_t test_vcmlaq_f32(float32x4_t acc, float32x4_t lhs, float32x4_t rhs) {
  return vcmlaq_f32(acc, lhs, rhs);
}

// CHECK-LABEL: @test_vcmlaq_f64(
// CHECK: [[RES:%.*]] = call <2 x double> @llvm.aarch64.neon.vcmla.rot0.v2f64(<2 x double> %acc, <2 x double> %lhs, <2 x double> %rhs)
// CHECK: ret <2 x double> [[RES]]
float64x2_t test_vcmlaq_f64(float64x2_t acc, float64x2_t lhs, float64x2_t rhs) {
  return vcmlaq_f64(acc, lhs, rhs);
}

// CHECK-LABEL: @test_vcmla_rot90_f16(
// CHECK: [[RES:%.*]] = call <4 x half> @llvm.aarch64.neon.vcmla.rot90.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> %rhs)
// CHECK: ret <4 x half> [[RES]]
float16x4_t test_vcmla_rot90_f16(float16x4_t acc, float16x4_t lhs, float16x4_t rhs) {
  return vcmla_rot90_f16(acc, lhs, rhs);
}

// CHECK-LABEL: @test_vcmla_rot90_f32(
// CHECK: [[RES:%.*]] = call <2 x float> @llvm.aarch64.neon.vcmla.rot90.v2f32(<2 x float> %acc, <2 x float> %lhs, <2 x float> %rhs)
// CHECK: ret <2 x float> [[RES]]
float32x2_t test_vcmla_rot90_f32(float32x2_t acc, float32x2_t lhs, float32x2_t rhs) {
  return vcmla_rot90_f32(acc, lhs, rhs);
}

// CHECK-LABEL: @test_vcmlaq_rot90_f16(
// CHECK: [[RES:%.*]] = call <8 x half> @llvm.aarch64.neon.vcmla.rot90.v8f16(<8 x half> %acc, <8 x half> %lhs, <8 x half> %rhs)
// CHECK: ret <8 x half> [[RES]]
float16x8_t test_vcmlaq_rot90_f16(float16x8_t acc, float16x8_t lhs, float16x8_t rhs) {
  return vcmlaq_rot90_f16(acc, lhs, rhs);
}

// CHECK-LABEL: @test_vcmlaq_rot90_f32(
// CHECK: [[RES:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot90.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> %rhs)
// CHECK: ret <4 x float> [[RES]]
float32x4_t test_vcmlaq_rot90_f32(float32x4_t acc, float32x4_t lhs, float32x4_t rhs) {
  return vcmlaq_rot90_f32(acc, lhs, rhs);
}

// CHECK-LABEL: @test_vcmlaq_rot90_f64(
// CHECK: [[RES:%.*]] = call <2 x double> @llvm.aarch64.neon.vcmla.rot90.v2f64(<2 x double> %acc, <2 x double> %lhs, <2 x double> %rhs)
// CHECK: ret <2 x double> [[RES]]
float64x2_t test_vcmlaq_rot90_f64(float64x2_t acc, float64x2_t lhs, float64x2_t rhs) {
  return vcmlaq_rot90_f64(acc, lhs, rhs);
}

// CHECK-LABEL: @test_vcmla_rot180_f16(
// CHECK: [[RES:%.*]] = call <4 x half> @llvm.aarch64.neon.vcmla.rot180.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> %rhs)
// CHECK: ret <4 x half> [[RES]]
float16x4_t test_vcmla_rot180_f16(float16x4_t acc, float16x4_t lhs, float16x4_t rhs) {
  return vcmla_rot180_f16(acc, lhs, rhs);
}

// CHECK-LABEL: @test_vcmla_rot180_f32(
// CHECK: [[RES:%.*]] = call <2 x float> @llvm.aarch64.neon.vcmla.rot180.v2f32(<2 x float> %acc, <2 x float> %lhs, <2 x float> %rhs)
// CHECK: ret <2 x float> [[RES]]
float32x2_t test_vcmla_rot180_f32(float32x2_t acc, float32x2_t lhs, float32x2_t rhs) {
  return vcmla_rot180_f32(acc, lhs, rhs);
}

// CHECK-LABEL: @test_vcmlaq_rot180_f16(
// CHECK: [[RES:%.*]] = call <8 x half> @llvm.aarch64.neon.vcmla.rot180.v8f16(<8 x half> %acc, <8 x half> %lhs, <8 x half> %rhs)
// CHECK: ret <8 x half> [[RES]]
float16x8_t test_vcmlaq_rot180_f16(float16x8_t acc, float16x8_t lhs, float16x8_t rhs) {
  return vcmlaq_rot180_f16(acc, lhs, rhs);
}

// CHECK-LABEL: @test_vcmlaq_rot180_f32(
// CHECK: [[RES:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot180.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> %rhs)
// CHECK: ret <4 x float> [[RES]]
float32x4_t test_vcmlaq_rot180_f32(float32x4_t acc, float32x4_t lhs, float32x4_t rhs) {
  return vcmlaq_rot180_f32(acc, lhs, rhs);
}

// CHECK-LABEL: @test_vcmlaq_rot180_f64(
// CHECK: [[RES:%.*]] = call <2 x double> @llvm.aarch64.neon.vcmla.rot180.v2f64(<2 x double> %acc, <2 x double> %lhs, <2 x double> %rhs)
// CHECK: ret <2 x double> [[RES]]
float64x2_t test_vcmlaq_rot180_f64(float64x2_t acc, float64x2_t lhs, float64x2_t rhs) {
  return vcmlaq_rot180_f64(acc, lhs, rhs);
}

// CHECK-LABEL: @test_vcmla_rot270_f16(
// CHECK: [[RES:%.*]] = call <4 x half> @llvm.aarch64.neon.vcmla.rot270.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> %rhs)
// CHECK: ret <4 x half> [[RES]]
float16x4_t test_vcmla_rot270_f16(float16x4_t acc, float16x4_t lhs, float16x4_t rhs) {
  return vcmla_rot270_f16(acc, lhs, rhs);
}

// CHECK-LABEL: @test_vcmla_rot270_f32(
// CHECK: [[RES:%.*]] = call <2 x float> @llvm.aarch64.neon.vcmla.rot270.v2f32(<2 x float> %acc, <2 x float> %lhs, <2 x float> %rhs)
// CHECK: ret <2 x float> [[RES]]
float32x2_t test_vcmla_rot270_f32(float32x2_t acc, float32x2_t lhs, float32x2_t rhs) {
  return vcmla_rot270_f32(acc, lhs, rhs);
}

// CHECK-LABEL: @test_vcmlaq_rot270_f16(
// CHECK: [[RES:%.*]] = call <8 x half> @llvm.aarch64.neon.vcmla.rot270.v8f16(<8 x half> %acc, <8 x half> %lhs, <8 x half> %rhs)
// CHECK: ret <8 x half> [[RES]]
float16x8_t test_vcmlaq_rot270_f16(float16x8_t acc, float16x8_t lhs, float16x8_t rhs) {
  return vcmlaq_rot270_f16(acc, lhs, rhs);
}

// CHECK-LABEL: @test_vcmlaq_rot270_f32(
// CHECK: [[RES:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot270.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> %rhs)
// CHECK: ret <4 x float> [[RES]]
float32x4_t test_vcmlaq_rot270_f32(float32x4_t acc, float32x4_t lhs, float32x4_t rhs) {
  return vcmlaq_rot270_f32(acc, lhs, rhs);
}

// CHECK-LABEL: @test_vcmlaq_rot270_f64(
// CHECK: [[RES:%.*]] = call <2 x double> @llvm.aarch64.neon.vcmla.rot270.v2f64(<2 x double> %acc, <2 x double> %lhs, <2 x double> %rhs)
// CHECK: ret <2 x double> [[RES]]
float64x2_t test_vcmlaq_rot270_f64(float64x2_t acc, float64x2_t lhs, float64x2_t rhs) {
  return vcmlaq_rot270_f64(acc, lhs, rhs);
}

// CHECK-LABEL: @test_vcmla_lane_f16(
// CHECK: [[CPLX:%.*]] = bitcast <4 x half> %rhs to <2 x i32>
// CHECK: [[DUP:%.*]] = shufflevector <2 x i32> [[CPLX]], <2 x i32> undef, <2 x i32> <i32 1, i32 1>
// CHECK: [[DUP_FLT:%.*]] = bitcast <2 x i32> [[DUP]] to <4 x half>
// CHECK: [[RES:%.*]] = call <4 x half> @llvm.aarch64.neon.vcmla.rot0.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> [[DUP_FLT]])
// CHECK: ret <4 x half> [[RES]]
float16x4_t test_vcmla_lane_f16(float16x4_t acc, float16x4_t lhs, float16x4_t rhs) {
  return vcmla_lane_f16(acc, lhs, rhs, 1);
}

// ACLE says this exists, but it won't map to a single instruction if lane > 1.
// CHECK-LABEL: @test_vcmla_laneq_f16(
// CHECK: [[CPLX:%.*]] = bitcast <8 x half> %rhs to <4 x i32>
// CHECK: [[DUP:%.*]] = shufflevector <4 x i32> [[CPLX]], <4 x i32> undef, <2 x i32> <i32 3, i32 3>
// CHECK: [[DUP_FLT:%.*]] = bitcast <2 x i32> [[DUP]] to <4 x half>
// CHECK: [[RES:%.*]] = call <4 x half> @llvm.aarch64.neon.vcmla.rot0.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> [[DUP_FLT]])
// CHECK: ret <4 x half> [[RES]]
float16x4_t test_vcmla_laneq_f16(float16x4_t acc, float16x4_t lhs, float16x8_t rhs) {
  return vcmla_laneq_f16(acc, lhs, rhs, 3);
}

// CHECK-LABEL: @test_vcmlaq_lane_f16(
// CHECK: [[CPLX:%.*]] = bitcast <4 x half> %rhs to <2 x i32>
// CHECK: [[DUP:%.*]] = shufflevector <2 x i32> [[CPLX]], <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK: [[DUP_FLT:%.*]] = bitcast <4 x i32> [[DUP]] to <8 x half>
// CHECK: [[RES:%.*]] = call <8 x half> @llvm.aarch64.neon.vcmla.rot0.v8f16(<8 x half> %acc, <8 x half> %lhs, <8 x half> [[DUP_FLT]])
// CHECK: ret <8 x half> [[RES]]
float16x8_t test_vcmlaq_lane_f16(float16x8_t acc, float16x8_t lhs, float16x4_t rhs) {
  return vcmlaq_lane_f16(acc, lhs, rhs, 1);
}

// CHECK-LABEL: @test_vcmlaq_laneq_f16(
// CHECK: [[CPLX:%.*]] = bitcast <8 x half> %rhs to <4 x i32>
// CHECK: [[DUP:%.*]] = shufflevector <4 x i32> [[CPLX]], <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK: [[DUP_FLT:%.*]] = bitcast <4 x i32> [[DUP]] to <8 x half>
// CHECK: [[RES:%.*]] = call <8 x half> @llvm.aarch64.neon.vcmla.rot0.v8f16(<8 x half> %acc, <8 x half> %lhs, <8 x half> [[DUP_FLT]])
// CHECK: ret <8 x half> [[RES]]
float16x8_t test_vcmlaq_laneq_f16(float16x8_t acc, float16x8_t lhs, float16x8_t rhs) {
  return vcmlaq_laneq_f16(acc, lhs, rhs, 3);
}

// CHECK-LABEL: @test_vcmla_lane_f32(
// CHECK: [[RES:%.*]] = call <2 x float> @llvm.aarch64.neon.vcmla.rot0.v2f32(<2 x float> %acc, <2 x float> %lhs, <2 x float> %rhs)
// CHECK: ret <2 x float> [[RES]]
float32x2_t test_vcmla_lane_f32(float32x2_t acc, float32x2_t lhs, float32x2_t rhs) {
  return vcmla_lane_f32(acc, lhs, rhs, 0);
}

// ACLE says this exists, but it won't map to a single instruction if lane > 1.
// CHECK-LABEL: @test_vcmla_laneq_f32(
// CHECK: [[CPLX:%.*]] = bitcast <4 x float> %rhs to <2 x i64>
// CHECK: [[DUP:%.*]] = shufflevector <2 x i64> [[CPLX]], <2 x i64> undef, <1 x i32> <i32 1>
// CHECK: [[DUP_FLT:%.*]] = bitcast <1 x i64> [[DUP]] to <2 x float>
// CHECK: [[RES:%.*]] = call <2 x float> @llvm.aarch64.neon.vcmla.rot0.v2f32(<2 x float> %acc, <2 x float> %lhs, <2 x float> [[DUP_FLT]])
// CHECK: ret <2 x float> [[RES]]
float32x2_t test_vcmla_laneq_f32(float32x2_t acc, float32x2_t lhs, float32x4_t rhs) {
  return vcmla_laneq_f32(acc, lhs, rhs, 1);
}

// CHECK-LABEL: @test_vcmlaq_lane_f32(
// CHECK: [[CPLX:%.*]] = bitcast <2 x float> %rhs to i64
// CHECK: [[CPLX_VEC:%.*]] = insertelement <2 x i64> undef, i64 [[CPLX]], i32 0
// CHECK: [[DUP:%.*]] = shufflevector <2 x i64> [[CPLX_VEC]], <2 x i64> undef, <2 x i32> zeroinitializer
// CHECK: [[DUP_FLT:%.*]] = bitcast <2 x i64> [[DUP]] to <4 x float>
// CHECK: [[RES:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot0.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> [[DUP_FLT]])
// CHECK: ret <4 x float> [[RES]]
float32x4_t test_vcmlaq_lane_f32(float32x4_t acc, float32x4_t lhs, float32x2_t rhs) {
  return vcmlaq_lane_f32(acc, lhs, rhs, 0);
}

// CHECK-LABEL: @test_vcmlaq_laneq_f32(
// CHECK: [[CPLX:%.*]] = bitcast <4 x float> %rhs to <2 x i64>
// CHECK: [[DUP:%.*]] = shufflevector <2 x i64> [[CPLX]], <2 x i64> undef, <2 x i32> <i32 1, i32 1>
// CHECK: [[DUP_FLT:%.*]] = bitcast <2 x i64> [[DUP]] to <4 x float>
// CHECK: [[RES:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot0.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> [[DUP_FLT]])
// CHECK: ret <4 x float> [[RES]]
float32x4_t test_vcmlaq_laneq_f32(float32x4_t acc, float32x4_t lhs, float32x4_t rhs) {
  return vcmlaq_laneq_f32(acc, lhs, rhs, 1);
}

// CHECK-LABEL: @test_vcmla_rot90_lane_f16(
// CHECK: [[CPLX:%.*]] = bitcast <4 x half> %rhs to <2 x i32>
// CHECK: [[DUP:%.*]] = shufflevector <2 x i32> [[CPLX]], <2 x i32> undef, <2 x i32> <i32 1, i32 1>
// CHECK: [[DUP_FLT:%.*]] = bitcast <2 x i32> [[DUP]] to <4 x half>
// CHECK: [[RES:%.*]] = call <4 x half> @llvm.aarch64.neon.vcmla.rot90.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> [[DUP_FLT]])
// CHECK: ret <4 x half> [[RES]]
float16x4_t test_vcmla_rot90_lane_f16(float16x4_t acc, float16x4_t lhs, float16x4_t rhs) {
  return vcmla_rot90_lane_f16(acc, lhs, rhs, 1);
}

// ACLE says this exists, but it won't map to a single instruction if lane > 1.
// CHECK-LABEL: @test_vcmla_rot90_laneq_f16(
// CHECK: [[CPLX:%.*]] = bitcast <8 x half> %rhs to <4 x i32>
// CHECK: [[DUP:%.*]] = shufflevector <4 x i32> [[CPLX]], <4 x i32> undef, <2 x i32> <i32 3, i32 3>
// CHECK: [[DUP_FLT:%.*]] = bitcast <2 x i32> [[DUP]] to <4 x half>
// CHECK: [[RES:%.*]] = call <4 x half> @llvm.aarch64.neon.vcmla.rot90.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> [[DUP_FLT]])
// CHECK: ret <4 x half> [[RES]]
float16x4_t test_vcmla_rot90_laneq_f16(float16x4_t acc, float16x4_t lhs, float16x8_t rhs) {
  return vcmla_rot90_laneq_f16(acc, lhs, rhs, 3);
}

// CHECK-LABEL: @test_vcmlaq_rot90_lane_f16(
// CHECK: [[CPLX:%.*]] = bitcast <4 x half> %rhs to <2 x i32>
// CHECK: [[DUP:%.*]] = shufflevector <2 x i32> [[CPLX]], <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK: [[DUP_FLT:%.*]] = bitcast <4 x i32> [[DUP]] to <8 x half>
// CHECK: [[RES:%.*]] = call <8 x half> @llvm.aarch64.neon.vcmla.rot90.v8f16(<8 x half> %acc, <8 x half> %lhs, <8 x half> [[DUP_FLT]])
// CHECK: ret <8 x half> [[RES]]
float16x8_t test_vcmlaq_rot90_lane_f16(float16x8_t acc, float16x8_t lhs, float16x4_t rhs) {
  return vcmlaq_rot90_lane_f16(acc, lhs, rhs, 1);
}

// CHECK-LABEL: @test_vcmlaq_rot90_laneq_f16(
// CHECK: [[CPLX:%.*]] = bitcast <8 x half> %rhs to <4 x i32>
// CHECK: [[DUP:%.*]] = shufflevector <4 x i32> [[CPLX]], <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK: [[DUP_FLT:%.*]] = bitcast <4 x i32> [[DUP]] to <8 x half>
// CHECK: [[RES:%.*]] = call <8 x half> @llvm.aarch64.neon.vcmla.rot90.v8f16(<8 x half> %acc, <8 x half> %lhs, <8 x half> [[DUP_FLT]])
// CHECK: ret <8 x half> [[RES]]
float16x8_t test_vcmlaq_rot90_laneq_f16(float16x8_t acc, float16x8_t lhs, float16x8_t rhs) {
  return vcmlaq_rot90_laneq_f16(acc, lhs, rhs, 3);
}

// CHECK-LABEL: @test_vcmla_rot90_lane_f32(
// CHECK: [[RES:%.*]] = call <2 x float> @llvm.aarch64.neon.vcmla.rot90.v2f32(<2 x float> %acc, <2 x float> %lhs, <2 x float> %rhs)
// CHECK: ret <2 x float> [[RES]]
float32x2_t test_vcmla_rot90_lane_f32(float32x2_t acc, float32x2_t lhs, float32x2_t rhs) {
  return vcmla_rot90_lane_f32(acc, lhs, rhs, 0);
}

// ACLE says this exists, but it won't map to a single instruction if lane > 1.
// CHECK-LABEL: @test_vcmla_rot90_laneq_f32(
// CHECK: [[CPLX:%.*]] = bitcast <4 x float> %rhs to <2 x i64>
// CHECK: [[DUP:%.*]] = shufflevector <2 x i64> [[CPLX]], <2 x i64> undef, <1 x i32> <i32 1>
// CHECK: [[DUP_FLT:%.*]] = bitcast <1 x i64> [[DUP]] to <2 x float>
// CHECK: [[RES:%.*]] = call <2 x float> @llvm.aarch64.neon.vcmla.rot90.v2f32(<2 x float> %acc, <2 x float> %lhs, <2 x float> [[DUP_FLT]])
// CHECK: ret <2 x float> [[RES]]
float32x2_t test_vcmla_rot90_laneq_f32(float32x2_t acc, float32x2_t lhs, float32x4_t rhs) {
  return vcmla_rot90_laneq_f32(acc, lhs, rhs, 1);
}

// CHECK-LABEL: @test_vcmlaq_rot90_lane_f32(
// CHECK: [[CPLX:%.*]] = bitcast <2 x float> %rhs to i64
// CHECK: [[CPLX_VEC:%.*]] = insertelement <2 x i64> undef, i64 [[CPLX]], i32 0
// CHECK: [[DUP:%.*]] = shufflevector <2 x i64> [[CPLX_VEC]], <2 x i64> undef, <2 x i32> zeroinitializer
// CHECK: [[DUP_FLT:%.*]] = bitcast <2 x i64> [[DUP]] to <4 x float>
// CHECK: [[RES:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot90.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> [[DUP_FLT]])
// CHECK: ret <4 x float> [[RES]]
float32x4_t test_vcmlaq_rot90_lane_f32(float32x4_t acc, float32x4_t lhs, float32x2_t rhs) {
  return vcmlaq_rot90_lane_f32(acc, lhs, rhs, 0);
}

// CHECK-LABEL: @test_vcmlaq_rot90_laneq_f32(
// CHECK: [[CPLX:%.*]] = bitcast <4 x float> %rhs to <2 x i64>
// CHECK: [[DUP:%.*]] = shufflevector <2 x i64> [[CPLX]], <2 x i64> undef, <2 x i32> <i32 1, i32 1>
// CHECK: [[DUP_FLT:%.*]] = bitcast <2 x i64> [[DUP]] to <4 x float>
// CHECK: [[RES:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot90.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> [[DUP_FLT]])
// CHECK: ret <4 x float> [[RES]]
float32x4_t test_vcmlaq_rot90_laneq_f32(float32x4_t acc, float32x4_t lhs, float32x4_t rhs) {
  return vcmlaq_rot90_laneq_f32(acc, lhs, rhs, 1);
}

// CHECK-LABEL: @test_vcmla_rot180_lane_f16(
// CHECK: [[CPLX:%.*]] = bitcast <4 x half> %rhs to <2 x i32>
// CHECK: [[DUP:%.*]] = shufflevector <2 x i32> [[CPLX]], <2 x i32> undef, <2 x i32> <i32 1, i32 1>
// CHECK: [[DUP_FLT:%.*]] = bitcast <2 x i32> [[DUP]] to <4 x half>
// CHECK: [[RES:%.*]] = call <4 x half> @llvm.aarch64.neon.vcmla.rot180.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> [[DUP_FLT]])
// CHECK: ret <4 x half> [[RES]]
float16x4_t test_vcmla_rot180_lane_f16(float16x4_t acc, float16x4_t lhs, float16x4_t rhs) {
  return vcmla_rot180_lane_f16(acc, lhs, rhs, 1);
}

// ACLE says this exists, but it won't map to a single instruction if lane > 1.
// CHECK-LABEL: @test_vcmla_rot180_laneq_f16(
// CHECK: [[CPLX:%.*]] = bitcast <8 x half> %rhs to <4 x i32>
// CHECK: [[DUP:%.*]] = shufflevector <4 x i32> [[CPLX]], <4 x i32> undef, <2 x i32> <i32 3, i32 3>
// CHECK: [[DUP_FLT:%.*]] = bitcast <2 x i32> [[DUP]] to <4 x half>
// CHECK: [[RES:%.*]] = call <4 x half> @llvm.aarch64.neon.vcmla.rot180.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> [[DUP_FLT]])
// CHECK: ret <4 x half> [[RES]]
float16x4_t test_vcmla_rot180_laneq_f16(float16x4_t acc, float16x4_t lhs, float16x8_t rhs) {
  return vcmla_rot180_laneq_f16(acc, lhs, rhs, 3);
}

// CHECK-LABEL: @test_vcmlaq_rot180_lane_f16(
// CHECK: [[CPLX:%.*]] = bitcast <4 x half> %rhs to <2 x i32>
// CHECK: [[DUP:%.*]] = shufflevector <2 x i32> [[CPLX]], <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK: [[DUP_FLT:%.*]] = bitcast <4 x i32> [[DUP]] to <8 x half>
// CHECK: [[RES:%.*]] = call <8 x half> @llvm.aarch64.neon.vcmla.rot180.v8f16(<8 x half> %acc, <8 x half> %lhs, <8 x half> [[DUP_FLT]])
// CHECK: ret <8 x half> [[RES]]
float16x8_t test_vcmlaq_rot180_lane_f16(float16x8_t acc, float16x8_t lhs, float16x4_t rhs) {
  return vcmlaq_rot180_lane_f16(acc, lhs, rhs, 1);
}

// CHECK-LABEL: @test_vcmlaq_rot180_laneq_f16(
// CHECK: [[CPLX:%.*]] = bitcast <8 x half> %rhs to <4 x i32>
// CHECK: [[DUP:%.*]] = shufflevector <4 x i32> [[CPLX]], <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK: [[DUP_FLT:%.*]] = bitcast <4 x i32> [[DUP]] to <8 x half>
// CHECK: [[RES:%.*]] = call <8 x half> @llvm.aarch64.neon.vcmla.rot180.v8f16(<8 x half> %acc, <8 x half> %lhs, <8 x half> [[DUP_FLT]])
// CHECK: ret <8 x half> [[RES]]
float16x8_t test_vcmlaq_rot180_laneq_f16(float16x8_t acc, float16x8_t lhs, float16x8_t rhs) {
  return vcmlaq_rot180_laneq_f16(acc, lhs, rhs, 3);
}

// CHECK-LABEL: @test_vcmla_rot180_lane_f32(
// CHECK: [[RES:%.*]] = call <2 x float> @llvm.aarch64.neon.vcmla.rot180.v2f32(<2 x float> %acc, <2 x float> %lhs, <2 x float> %rhs)
// CHECK: ret <2 x float> [[RES]]
float32x2_t test_vcmla_rot180_lane_f32(float32x2_t acc, float32x2_t lhs, float32x2_t rhs) {
  return vcmla_rot180_lane_f32(acc, lhs, rhs, 0);
}

// ACLE says this exists, but it won't map to a single instruction if lane > 1.
// CHECK-LABEL: @test_vcmla_rot180_laneq_f32(
// CHECK: [[CPLX:%.*]] = bitcast <4 x float> %rhs to <2 x i64>
// CHECK: [[DUP:%.*]] = shufflevector <2 x i64> [[CPLX]], <2 x i64> undef, <1 x i32> <i32 1>
// CHECK: [[DUP_FLT:%.*]] = bitcast <1 x i64> [[DUP]] to <2 x float>
// CHECK: [[RES:%.*]] = call <2 x float> @llvm.aarch64.neon.vcmla.rot180.v2f32(<2 x float> %acc, <2 x float> %lhs, <2 x float> [[DUP_FLT]])
// CHECK: ret <2 x float> [[RES]]
float32x2_t test_vcmla_rot180_laneq_f32(float32x2_t acc, float32x2_t lhs, float32x4_t rhs) {
  return vcmla_rot180_laneq_f32(acc, lhs, rhs, 1);
}

// CHECK-LABEL: @test_vcmlaq_rot180_lane_f32(
// CHECK: [[CPLX:%.*]] = bitcast <2 x float> %rhs to i64
// CHECK: [[CPLX_VEC:%.*]] = insertelement <2 x i64> undef, i64 [[CPLX]], i32 0
// CHECK: [[DUP:%.*]] = shufflevector <2 x i64> [[CPLX_VEC]], <2 x i64> undef, <2 x i32> zeroinitializer
// CHECK: [[DUP_FLT:%.*]] = bitcast <2 x i64> [[DUP]] to <4 x float>
// CHECK: [[RES:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot180.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> [[DUP_FLT]])
// CHECK: ret <4 x float> [[RES]]
float32x4_t test_vcmlaq_rot180_lane_f32(float32x4_t acc, float32x4_t lhs, float32x2_t rhs) {
  return vcmlaq_rot180_lane_f32(acc, lhs, rhs, 0);
}

// CHECK-LABEL: @test_vcmlaq_rot180_laneq_f32(
// CHECK: [[CPLX:%.*]] = bitcast <4 x float> %rhs to <2 x i64>
// CHECK: [[DUP:%.*]] = shufflevector <2 x i64> [[CPLX]], <2 x i64> undef, <2 x i32> <i32 1, i32 1>
// CHECK: [[DUP_FLT:%.*]] = bitcast <2 x i64> [[DUP]] to <4 x float>
// CHECK: [[RES:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot180.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> [[DUP_FLT]])
// CHECK: ret <4 x float> [[RES]]
float32x4_t test_vcmlaq_rot180_laneq_f32(float32x4_t acc, float32x4_t lhs, float32x4_t rhs) {
  return vcmlaq_rot180_laneq_f32(acc, lhs, rhs, 1);
}

// CHECK-LABEL: @test_vcmla_rot270_lane_f16(
// CHECK: [[CPLX:%.*]] = bitcast <4 x half> %rhs to <2 x i32>
// CHECK: [[DUP:%.*]] = shufflevector <2 x i32> [[CPLX]], <2 x i32> undef, <2 x i32> <i32 1, i32 1>
// CHECK: [[DUP_FLT:%.*]] = bitcast <2 x i32> [[DUP]] to <4 x half>
// CHECK: [[RES:%.*]] = call <4 x half> @llvm.aarch64.neon.vcmla.rot270.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> [[DUP_FLT]])
// CHECK: ret <4 x half> [[RES]]
float16x4_t test_vcmla_rot270_lane_f16(float16x4_t acc, float16x4_t lhs, float16x4_t rhs) {
  return vcmla_rot270_lane_f16(acc, lhs, rhs, 1);
}

// ACLE says this exists, but it won't map to a single instruction if lane > 1.
// CHECK-LABEL: @test_vcmla_rot270_laneq_f16(
// CHECK: [[CPLX:%.*]] = bitcast <8 x half> %rhs to <4 x i32>
// CHECK: [[DUP:%.*]] = shufflevector <4 x i32> [[CPLX]], <4 x i32> undef, <2 x i32> <i32 3, i32 3>
// CHECK: [[DUP_FLT:%.*]] = bitcast <2 x i32> [[DUP]] to <4 x half>
// CHECK: [[RES:%.*]] = call <4 x half> @llvm.aarch64.neon.vcmla.rot270.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> [[DUP_FLT]])
// CHECK: ret <4 x half> [[RES]]
float16x4_t test_vcmla_rot270_laneq_f16(float16x4_t acc, float16x4_t lhs, float16x8_t rhs) {
  return vcmla_rot270_laneq_f16(acc, lhs, rhs, 3);
}

// CHECK-LABEL: @test_vcmlaq_rot270_lane_f16(
// CHECK: [[CPLX:%.*]] = bitcast <4 x half> %rhs to <2 x i32>
// CHECK: [[DUP:%.*]] = shufflevector <2 x i32> [[CPLX]], <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK: [[DUP_FLT:%.*]] = bitcast <4 x i32> [[DUP]] to <8 x half>
// CHECK: [[RES:%.*]] = call <8 x half> @llvm.aarch64.neon.vcmla.rot270.v8f16(<8 x half> %acc, <8 x half> %lhs, <8 x half> [[DUP_FLT]])
// CHECK: ret <8 x half> [[RES]]
float16x8_t test_vcmlaq_rot270_lane_f16(float16x8_t acc, float16x8_t lhs, float16x4_t rhs) {
  return vcmlaq_rot270_lane_f16(acc, lhs, rhs, 1);
}

// CHECK-LABEL: @test_vcmlaq_rot270_laneq_f16(
// CHECK: [[CPLX:%.*]] = bitcast <8 x half> %rhs to <4 x i32>
// CHECK: [[DUP:%.*]] = shufflevector <4 x i32> [[CPLX]], <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK: [[DUP_FLT:%.*]] = bitcast <4 x i32> [[DUP]] to <8 x half>
// CHECK: [[RES:%.*]] = call <8 x half> @llvm.aarch64.neon.vcmla.rot270.v8f16(<8 x half> %acc, <8 x half> %lhs, <8 x half> [[DUP_FLT]])
// CHECK: ret <8 x half> [[RES]]
float16x8_t test_vcmlaq_rot270_laneq_f16(float16x8_t acc, float16x8_t lhs, float16x8_t rhs) {
  return vcmlaq_rot270_laneq_f16(acc, lhs, rhs, 3);
}

// CHECK-LABEL: @test_vcmla_rot270_lane_f32(
// CHECK: [[RES:%.*]] = call <2 x float> @llvm.aarch64.neon.vcmla.rot270.v2f32(<2 x float> %acc, <2 x float> %lhs, <2 x float> %rhs)
// CHECK: ret <2 x float> [[RES]]
float32x2_t test_vcmla_rot270_lane_f32(float32x2_t acc, float32x2_t lhs, float32x2_t rhs) {
  return vcmla_rot270_lane_f32(acc, lhs, rhs, 0);
}

// ACLE says this exists, but it won't map to a single instruction if lane > 1.
// CHECK-LABEL: @test_vcmla_rot270_laneq_f32(
// CHECK: [[CPLX:%.*]] = bitcast <4 x float> %rhs to <2 x i64>
// CHECK: [[DUP:%.*]] = shufflevector <2 x i64> [[CPLX]], <2 x i64> undef, <1 x i32> <i32 1>
// CHECK: [[DUP_FLT:%.*]] = bitcast <1 x i64> [[DUP]] to <2 x float>
// CHECK: [[RES:%.*]] = call <2 x float> @llvm.aarch64.neon.vcmla.rot270.v2f32(<2 x float> %acc, <2 x float> %lhs, <2 x float> [[DUP_FLT]])
// CHECK: ret <2 x float> [[RES]]
float32x2_t test_vcmla_rot270_laneq_f32(float32x2_t acc, float32x2_t lhs, float32x4_t rhs) {
  return vcmla_rot270_laneq_f32(acc, lhs, rhs, 1);
}

// CHECK-LABEL: @test_vcmlaq_rot270_lane_f32(
// CHECK: [[CPLX:%.*]] = bitcast <2 x float> %rhs to i64
// CHECK: [[CPLX_VEC:%.*]] = insertelement <2 x i64> undef, i64 [[CPLX]], i32 0
// CHECK: [[DUP:%.*]] = shufflevector <2 x i64> [[CPLX_VEC]], <2 x i64> undef, <2 x i32> zeroinitializer
// CHECK: [[DUP_FLT:%.*]] = bitcast <2 x i64> [[DUP]] to <4 x float>
// CHECK: [[RES:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot270.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> [[DUP_FLT]])
// CHECK: ret <4 x float> [[RES]]
float32x4_t test_vcmlaq_rot270_lane_f32(float32x4_t acc, float32x4_t lhs, float32x2_t rhs) {
  return vcmlaq_rot270_lane_f32(acc, lhs, rhs, 0);
}

// CHECK-LABEL: @test_vcmlaq_rot270_laneq_f32(
// CHECK: [[CPLX:%.*]] = bitcast <4 x float> %rhs to <2 x i64>
// CHECK: [[DUP:%.*]] = shufflevector <2 x i64> [[CPLX]], <2 x i64> undef, <2 x i32> <i32 1, i32 1>
// CHECK: [[DUP_FLT:%.*]] = bitcast <2 x i64> [[DUP]] to <4 x float>
// CHECK: [[RES:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot270.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> [[DUP_FLT]])
// CHECK: ret <4 x float> [[RES]]
float32x4_t test_vcmlaq_rot270_laneq_f32(float32x4_t acc, float32x4_t lhs, float32x4_t rhs) {
  return vcmlaq_rot270_laneq_f32(acc, lhs, rhs, 1);
}