// REQUIRES: aarch64-registered-target
// RUN: %clang_cc1 -triple arm64-apple-ios -target-feature +neon \
// RUN:        -target-feature +v8.3a \
// RUN:        -target-feature +fullfp16 \
// RUN:        -disable-O0-optnone -emit-llvm -o - %s | opt -S -O1 | FileCheck %s
#include <arm_neon.h>
7 
8 // CHECK-LABEL: @test_vcmla_f16(
9 // CHECK: [[RES:%.*]] = call <4 x half> @llvm.aarch64.neon.vcmla.rot0.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> %rhs)
10 // CHECK: ret <4 x half> [[RES]]
test_vcmla_f16(float16x4_t acc,float16x4_t lhs,float16x4_t rhs)11 float16x4_t test_vcmla_f16(float16x4_t acc, float16x4_t lhs, float16x4_t rhs) {
12   return vcmla_f16(acc, lhs, rhs);
13 }
14 
15 // CHECK-LABEL: @test_vcmla_f32(
16 // CHECK: [[RES:%.*]] = call <2 x float> @llvm.aarch64.neon.vcmla.rot0.v2f32(<2 x float> %acc, <2 x float> %lhs, <2 x float> %rhs)
17 // CHECK: ret <2 x float> [[RES]]
test_vcmla_f32(float32x2_t acc,float32x2_t lhs,float32x2_t rhs)18 float32x2_t test_vcmla_f32(float32x2_t acc, float32x2_t lhs, float32x2_t rhs) {
19   return vcmla_f32(acc, lhs, rhs);
20 }
21 
22 // CHECK-LABEL: @test_vcmlaq_f16(
23 // CHECK: [[RES:%.*]] = call <8 x half> @llvm.aarch64.neon.vcmla.rot0.v8f16(<8 x half> %acc, <8 x half> %lhs, <8 x half> %rhs)
24 // CHECK: ret <8 x half> [[RES]]
test_vcmlaq_f16(float16x8_t acc,float16x8_t lhs,float16x8_t rhs)25 float16x8_t test_vcmlaq_f16(float16x8_t acc, float16x8_t lhs, float16x8_t rhs) {
26   return vcmlaq_f16(acc, lhs, rhs);
27 }
28 
29 // CHECK-LABEL: @test_vcmlaq_f32(
30 // CHECK: [[RES:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot0.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> %rhs)
31 // CHECK: ret <4 x float> [[RES]]
test_vcmlaq_f32(float32x4_t acc,float32x4_t lhs,float32x4_t rhs)32 float32x4_t test_vcmlaq_f32(float32x4_t acc, float32x4_t lhs, float32x4_t rhs) {
33   return vcmlaq_f32(acc, lhs, rhs);
34 }
35 
36 // CHECK-LABEL: @test_vcmlaq_f64(
37 // CHECK: [[RES:%.*]] = call <2 x double> @llvm.aarch64.neon.vcmla.rot0.v2f64(<2 x double> %acc, <2 x double> %lhs, <2 x double> %rhs)
38 // CHECK: ret <2 x double> [[RES]]
test_vcmlaq_f64(float64x2_t acc,float64x2_t lhs,float64x2_t rhs)39 float64x2_t test_vcmlaq_f64(float64x2_t acc, float64x2_t lhs, float64x2_t rhs) {
40   return vcmlaq_f64(acc, lhs, rhs);
41 }
42 
43 // CHECK-LABEL: @test_vcmla_rot90_f16(
44 // CHECK: [[RES:%.*]] = call <4 x half> @llvm.aarch64.neon.vcmla.rot90.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> %rhs)
45 // CHECK: ret <4 x half> [[RES]]
test_vcmla_rot90_f16(float16x4_t acc,float16x4_t lhs,float16x4_t rhs)46 float16x4_t test_vcmla_rot90_f16(float16x4_t acc, float16x4_t lhs, float16x4_t rhs) {
47   return vcmla_rot90_f16(acc, lhs, rhs);
48 }
49 
50 // CHECK-LABEL: @test_vcmla_rot90_f32(
51 // CHECK: [[RES:%.*]] = call <2 x float> @llvm.aarch64.neon.vcmla.rot90.v2f32(<2 x float> %acc, <2 x float> %lhs, <2 x float> %rhs)
52 // CHECK: ret <2 x float> [[RES]]
test_vcmla_rot90_f32(float32x2_t acc,float32x2_t lhs,float32x2_t rhs)53 float32x2_t test_vcmla_rot90_f32(float32x2_t acc, float32x2_t lhs, float32x2_t rhs) {
54   return vcmla_rot90_f32(acc, lhs, rhs);
55 }
56 
57 // CHECK-LABEL: @test_vcmlaq_rot90_f16(
58 // CHECK: [[RES:%.*]] = call <8 x half> @llvm.aarch64.neon.vcmla.rot90.v8f16(<8 x half> %acc, <8 x half> %lhs, <8 x half> %rhs)
59 // CHECK: ret <8 x half> [[RES]]
test_vcmlaq_rot90_f16(float16x8_t acc,float16x8_t lhs,float16x8_t rhs)60 float16x8_t test_vcmlaq_rot90_f16(float16x8_t acc, float16x8_t lhs, float16x8_t rhs) {
61   return vcmlaq_rot90_f16(acc, lhs, rhs);
62 }
63 
64 // CHECK-LABEL: @test_vcmlaq_rot90_f32(
65 // CHECK: [[RES:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot90.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> %rhs)
66 // CHECK: ret <4 x float> [[RES]]
test_vcmlaq_rot90_f32(float32x4_t acc,float32x4_t lhs,float32x4_t rhs)67 float32x4_t test_vcmlaq_rot90_f32(float32x4_t acc, float32x4_t lhs, float32x4_t rhs) {
68   return vcmlaq_rot90_f32(acc, lhs, rhs);
69 }
70 
71 // CHECK-LABEL: @test_vcmlaq_rot90_f64(
72 // CHECK: [[RES:%.*]] = call <2 x double> @llvm.aarch64.neon.vcmla.rot90.v2f64(<2 x double> %acc, <2 x double> %lhs, <2 x double> %rhs)
73 // CHECK: ret <2 x double> [[RES]]
test_vcmlaq_rot90_f64(float64x2_t acc,float64x2_t lhs,float64x2_t rhs)74 float64x2_t test_vcmlaq_rot90_f64(float64x2_t acc, float64x2_t lhs, float64x2_t rhs) {
75   return vcmlaq_rot90_f64(acc, lhs, rhs);
76 }
77 
78 // CHECK-LABEL: @test_vcmla_rot180_f16(
79 // CHECK: [[RES:%.*]] = call <4 x half> @llvm.aarch64.neon.vcmla.rot180.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> %rhs)
80 // CHECK: ret <4 x half> [[RES]]
test_vcmla_rot180_f16(float16x4_t acc,float16x4_t lhs,float16x4_t rhs)81 float16x4_t test_vcmla_rot180_f16(float16x4_t acc, float16x4_t lhs, float16x4_t rhs) {
82   return vcmla_rot180_f16(acc, lhs, rhs);
83 }
84 
85 // CHECK-LABEL: @test_vcmla_rot180_f32(
86 // CHECK: [[RES:%.*]] = call <2 x float> @llvm.aarch64.neon.vcmla.rot180.v2f32(<2 x float> %acc, <2 x float> %lhs, <2 x float> %rhs)
87 // CHECK: ret <2 x float> [[RES]]
test_vcmla_rot180_f32(float32x2_t acc,float32x2_t lhs,float32x2_t rhs)88 float32x2_t test_vcmla_rot180_f32(float32x2_t acc, float32x2_t lhs, float32x2_t rhs) {
89   return vcmla_rot180_f32(acc, lhs, rhs);
90 }
91 
92 // CHECK-LABEL: @test_vcmlaq_rot180_f16(
93 // CHECK: [[RES:%.*]] = call <8 x half> @llvm.aarch64.neon.vcmla.rot180.v8f16(<8 x half> %acc, <8 x half> %lhs, <8 x half> %rhs)
94 // CHECK: ret <8 x half> [[RES]]
test_vcmlaq_rot180_f16(float16x8_t acc,float16x8_t lhs,float16x8_t rhs)95 float16x8_t test_vcmlaq_rot180_f16(float16x8_t acc, float16x8_t lhs, float16x8_t rhs) {
96   return vcmlaq_rot180_f16(acc, lhs, rhs);
97 }
98 
99 // CHECK-LABEL: @test_vcmlaq_rot180_f32(
100 // CHECK: [[RES:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot180.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> %rhs)
101 // CHECK: ret <4 x float> [[RES]]
test_vcmlaq_rot180_f32(float32x4_t acc,float32x4_t lhs,float32x4_t rhs)102 float32x4_t test_vcmlaq_rot180_f32(float32x4_t acc, float32x4_t lhs, float32x4_t rhs) {
103   return vcmlaq_rot180_f32(acc, lhs, rhs);
104 }
105 
106 // CHECK-LABEL: @test_vcmlaq_rot180_f64(
107 // CHECK: [[RES:%.*]] = call <2 x double> @llvm.aarch64.neon.vcmla.rot180.v2f64(<2 x double> %acc, <2 x double> %lhs, <2 x double> %rhs)
108 // CHECK: ret <2 x double> [[RES]]
test_vcmlaq_rot180_f64(float64x2_t acc,float64x2_t lhs,float64x2_t rhs)109 float64x2_t test_vcmlaq_rot180_f64(float64x2_t acc, float64x2_t lhs, float64x2_t rhs) {
110   return vcmlaq_rot180_f64(acc, lhs, rhs);
111 }
112 
113 // CHECK-LABEL: @test_vcmla_rot270_f16(
114 // CHECK: [[RES:%.*]] = call <4 x half> @llvm.aarch64.neon.vcmla.rot270.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> %rhs)
115 // CHECK: ret <4 x half> [[RES]]
test_vcmla_rot270_f16(float16x4_t acc,float16x4_t lhs,float16x4_t rhs)116 float16x4_t test_vcmla_rot270_f16(float16x4_t acc, float16x4_t lhs, float16x4_t rhs) {
117   return vcmla_rot270_f16(acc, lhs, rhs);
118 }
119 
120 // CHECK-LABEL: @test_vcmla_rot270_f32(
121 // CHECK: [[RES:%.*]] = call <2 x float> @llvm.aarch64.neon.vcmla.rot270.v2f32(<2 x float> %acc, <2 x float> %lhs, <2 x float> %rhs)
122 // CHECK: ret <2 x float> [[RES]]
test_vcmla_rot270_f32(float32x2_t acc,float32x2_t lhs,float32x2_t rhs)123 float32x2_t test_vcmla_rot270_f32(float32x2_t acc, float32x2_t lhs, float32x2_t rhs) {
124   return vcmla_rot270_f32(acc, lhs, rhs);
125 }
126 
127 // CHECK-LABEL: @test_vcmlaq_rot270_f16(
128 // CHECK: [[RES:%.*]] = call <8 x half> @llvm.aarch64.neon.vcmla.rot270.v8f16(<8 x half> %acc, <8 x half> %lhs, <8 x half> %rhs)
129 // CHECK: ret <8 x half> [[RES]]
test_vcmlaq_rot270_f16(float16x8_t acc,float16x8_t lhs,float16x8_t rhs)130 float16x8_t test_vcmlaq_rot270_f16(float16x8_t acc, float16x8_t lhs, float16x8_t rhs) {
131   return vcmlaq_rot270_f16(acc, lhs, rhs);
132 }
133 
134 // CHECK-LABEL: @test_vcmlaq_rot270_f32(
135 // CHECK: [[RES:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot270.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> %rhs)
136 // CHECK: ret <4 x float> [[RES]]
test_vcmlaq_rot270_f32(float32x4_t acc,float32x4_t lhs,float32x4_t rhs)137 float32x4_t test_vcmlaq_rot270_f32(float32x4_t acc, float32x4_t lhs, float32x4_t rhs) {
138   return vcmlaq_rot270_f32(acc, lhs, rhs);
139 }
140 
141 // CHECK-LABEL: @test_vcmlaq_rot270_f64(
142 // CHECK: [[RES:%.*]] = call <2 x double> @llvm.aarch64.neon.vcmla.rot270.v2f64(<2 x double> %acc, <2 x double> %lhs, <2 x double> %rhs)
143 // CHECK: ret <2 x double> [[RES]]
test_vcmlaq_rot270_f64(float64x2_t acc,float64x2_t lhs,float64x2_t rhs)144 float64x2_t test_vcmlaq_rot270_f64(float64x2_t acc, float64x2_t lhs, float64x2_t rhs) {
145   return vcmlaq_rot270_f64(acc, lhs, rhs);
146 }
147 
148 // CHECK-LABEL: @test_vcmla_lane_f16(
149 // CHECK: [[CPLX:%.*]] = bitcast <4 x half> %rhs to <2 x i32>
150 // CHECK: [[DUP:%.*]] = shufflevector <2 x i32> [[CPLX]], <2 x i32> undef, <2 x i32> <i32 1, i32 1>
151 // CHECK: [[DUP_FLT:%.*]] = bitcast <2 x i32> [[DUP]] to <4 x half>
152 // CHECK: [[RES:%.*]] = call <4 x half> @llvm.aarch64.neon.vcmla.rot0.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> [[DUP_FLT]])
153 // CHECK: ret <4 x half> [[RES]]
test_vcmla_lane_f16(float16x4_t acc,float16x4_t lhs,float16x4_t rhs)154 float16x4_t test_vcmla_lane_f16(float16x4_t acc, float16x4_t lhs, float16x4_t rhs) {
155   return vcmla_lane_f16(acc, lhs, rhs, 1);
156 }
157 
158 // ACLE says this exists, but it won't map to a single instruction if lane > 1.
159 // CHECK-LABEL: @test_vcmla_laneq_f16(
160 // CHECK: [[CPLX:%.*]] = bitcast <8 x half> %rhs to <4 x i32>
161 // CHECK: [[DUP:%.*]] = shufflevector <4 x i32> [[CPLX]], <4 x i32> undef, <2 x i32> <i32 3, i32 3>
162 // CHECK: [[DUP_FLT:%.*]] = bitcast <2 x i32> [[DUP]] to <4 x half>
163 // CHECK: [[RES:%.*]] = call <4 x half> @llvm.aarch64.neon.vcmla.rot0.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> [[DUP_FLT]])
164 // CHECK: ret <4 x half> [[RES]]
test_vcmla_laneq_f16(float16x4_t acc,float16x4_t lhs,float16x8_t rhs)165 float16x4_t test_vcmla_laneq_f16(float16x4_t acc, float16x4_t lhs, float16x8_t rhs) {
166   return vcmla_laneq_f16(acc, lhs, rhs, 3);
167 }
168 
169 // CHECK-LABEL: @test_vcmlaq_lane_f16(
170 // CHECK: [[CPLX:%.*]] = bitcast <4 x half> %rhs to <2 x i32>
171 // CHECK: [[DUP:%.*]] = shufflevector <2 x i32> [[CPLX]], <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
172 // CHECK: [[DUP_FLT:%.*]] = bitcast <4 x i32> [[DUP]] to <8 x half>
173 // CHECK: [[RES:%.*]] = call <8 x half> @llvm.aarch64.neon.vcmla.rot0.v8f16(<8 x half> %acc, <8 x half> %lhs, <8 x half> [[DUP_FLT]])
174 // CHECK: ret <8 x half> [[RES]]
test_vcmlaq_lane_f16(float16x8_t acc,float16x8_t lhs,float16x4_t rhs)175 float16x8_t test_vcmlaq_lane_f16(float16x8_t acc, float16x8_t lhs, float16x4_t rhs) {
176   return vcmlaq_lane_f16(acc, lhs, rhs, 1);
177 }
178 
179 // CHECK-LABEL: @test_vcmlaq_laneq_f16(
180 // CHECK: [[CPLX:%.*]] = bitcast <8 x half> %rhs to <4 x i32>
181 // CHECK: [[DUP:%.*]] = shufflevector <4 x i32> [[CPLX]], <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
182 // CHECK: [[DUP_FLT:%.*]] = bitcast <4 x i32> [[DUP]] to <8 x half>
183 // CHECK: [[RES:%.*]] = call <8 x half> @llvm.aarch64.neon.vcmla.rot0.v8f16(<8 x half> %acc, <8 x half> %lhs, <8 x half> [[DUP_FLT]])
184 // CHECK: ret <8 x half> [[RES]]
test_vcmlaq_laneq_f16(float16x8_t acc,float16x8_t lhs,float16x8_t rhs)185 float16x8_t test_vcmlaq_laneq_f16(float16x8_t acc, float16x8_t lhs, float16x8_t rhs) {
186   return vcmlaq_laneq_f16(acc, lhs, rhs, 3);
187 }
188 
189 // CHECK-LABEL: @test_vcmla_lane_f32(
190 // CHECK: [[RES:%.*]] = call <2 x float> @llvm.aarch64.neon.vcmla.rot0.v2f32(<2 x float> %acc, <2 x float> %lhs, <2 x float> %rhs)
191 // CHECK: ret <2 x float> [[RES]]
test_vcmla_lane_f32(float32x2_t acc,float32x2_t lhs,float32x2_t rhs)192 float32x2_t test_vcmla_lane_f32(float32x2_t acc, float32x2_t lhs, float32x2_t rhs) {
193   return vcmla_lane_f32(acc, lhs, rhs, 0);
194 }
195 
196 // ACLE says this exists, but it won't map to a single instruction if lane > 1.
197 // CHECK-LABEL: @test_vcmla_laneq_f32(
198 // CHECK: [[CPLX:%.*]] = bitcast <4 x float> %rhs to <2 x i64>
199 // CHECK: [[DUP:%.*]] = shufflevector <2 x i64> [[CPLX]], <2 x i64> undef, <1 x i32> <i32 1>
200 // CHECK: [[DUP_FLT:%.*]] = bitcast <1 x i64> [[DUP]] to <2 x float>
201 // CHECK: [[RES:%.*]] = call <2 x float> @llvm.aarch64.neon.vcmla.rot0.v2f32(<2 x float> %acc, <2 x float> %lhs, <2 x float> [[DUP_FLT]])
202 // CHECK: ret <2 x float> [[RES]]
test_vcmla_laneq_f32(float32x2_t acc,float32x2_t lhs,float32x4_t rhs)203 float32x2_t test_vcmla_laneq_f32(float32x2_t acc, float32x2_t lhs, float32x4_t rhs) {
204   return vcmla_laneq_f32(acc, lhs, rhs, 1);
205 }
206 
207 // CHECK-LABEL: @test_vcmlaq_lane_f32(
208 // CHECK: [[CPLX:%.*]] = bitcast <2 x float> %rhs to i64
209 // CHECK: [[CPLX_VEC:%.*]] = insertelement <2 x i64> undef, i64 [[CPLX]], i32 0
210 // CHECK: [[DUP:%.*]] = shufflevector <2 x i64> [[CPLX_VEC]], <2 x i64> poison, <2 x i32> zeroinitializer
211 // CHECK: [[DUP_FLT:%.*]] = bitcast <2 x i64> [[DUP]] to <4 x float>
212 // CHECK: [[RES:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot0.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> [[DUP_FLT]])
213 // CHECK: ret <4 x float> [[RES]]
test_vcmlaq_lane_f32(float32x4_t acc,float32x4_t lhs,float32x2_t rhs)214 float32x4_t test_vcmlaq_lane_f32(float32x4_t acc, float32x4_t lhs, float32x2_t rhs) {
215   return vcmlaq_lane_f32(acc, lhs, rhs, 0);
216 }
217 
218 // CHECK-LABEL: @test_vcmlaq_laneq_f32(
219 // CHECK: [[CPLX:%.*]] = bitcast <4 x float> %rhs to <2 x i64>
220 // CHECK: [[DUP:%.*]] = shufflevector <2 x i64> [[CPLX]], <2 x i64> undef, <2 x i32> <i32 1, i32 1>
221 // CHECK: [[DUP_FLT:%.*]] = bitcast <2 x i64> [[DUP]] to <4 x float>
222 // CHECK: [[RES:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot0.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> [[DUP_FLT]])
223 // CHECK: ret <4 x float> [[RES]]
test_vcmlaq_laneq_f32(float32x4_t acc,float32x4_t lhs,float32x4_t rhs)224 float32x4_t test_vcmlaq_laneq_f32(float32x4_t acc, float32x4_t lhs, float32x4_t rhs) {
225   return vcmlaq_laneq_f32(acc, lhs, rhs, 1);
226 }
227 
228 // CHECK-LABEL: @test_vcmla_rot90_lane_f16(
229 // CHECK: [[CPLX:%.*]] = bitcast <4 x half> %rhs to <2 x i32>
230 // CHECK: [[DUP:%.*]] = shufflevector <2 x i32> [[CPLX]], <2 x i32> undef, <2 x i32> <i32 1, i32 1>
231 // CHECK: [[DUP_FLT:%.*]] = bitcast <2 x i32> [[DUP]] to <4 x half>
232 // CHECK: [[RES:%.*]] = call <4 x half> @llvm.aarch64.neon.vcmla.rot90.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> [[DUP_FLT]])
233 // CHECK: ret <4 x half> [[RES]]
test_vcmla_rot90_lane_f16(float16x4_t acc,float16x4_t lhs,float16x4_t rhs)234 float16x4_t test_vcmla_rot90_lane_f16(float16x4_t acc, float16x4_t lhs, float16x4_t rhs) {
235   return vcmla_rot90_lane_f16(acc, lhs, rhs, 1);
236 }
237 
238 // ACLE says this exists, but it won't map to a single instruction if lane > 1.
239 // CHECK-LABEL: @test_vcmla_rot90_laneq_f16(
240 // CHECK: [[CPLX:%.*]] = bitcast <8 x half> %rhs to <4 x i32>
241 // CHECK: [[DUP:%.*]] = shufflevector <4 x i32> [[CPLX]], <4 x i32> undef, <2 x i32> <i32 3, i32 3>
242 // CHECK: [[DUP_FLT:%.*]] = bitcast <2 x i32> [[DUP]] to <4 x half>
243 // CHECK: [[RES:%.*]] = call <4 x half> @llvm.aarch64.neon.vcmla.rot90.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> [[DUP_FLT]])
244 // CHECK: ret <4 x half> [[RES]]
test_vcmla_rot90_laneq_f16(float16x4_t acc,float16x4_t lhs,float16x8_t rhs)245 float16x4_t test_vcmla_rot90_laneq_f16(float16x4_t acc, float16x4_t lhs, float16x8_t rhs) {
246   return vcmla_rot90_laneq_f16(acc, lhs, rhs, 3);
247 }
248 
249 // CHECK-LABEL: @test_vcmlaq_rot90_lane_f16(
250 // CHECK: [[CPLX:%.*]] = bitcast <4 x half> %rhs to <2 x i32>
251 // CHECK: [[DUP:%.*]] = shufflevector <2 x i32> [[CPLX]], <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
252 // CHECK: [[DUP_FLT:%.*]] = bitcast <4 x i32> [[DUP]] to <8 x half>
253 // CHECK: [[RES:%.*]] = call <8 x half> @llvm.aarch64.neon.vcmla.rot90.v8f16(<8 x half> %acc, <8 x half> %lhs, <8 x half> [[DUP_FLT]])
254 // CHECK: ret <8 x half> [[RES]]
test_vcmlaq_rot90_lane_f16(float16x8_t acc,float16x8_t lhs,float16x4_t rhs)255 float16x8_t test_vcmlaq_rot90_lane_f16(float16x8_t acc, float16x8_t lhs, float16x4_t rhs) {
256   return vcmlaq_rot90_lane_f16(acc, lhs, rhs, 1);
257 }
258 
259 // CHECK-LABEL: @test_vcmlaq_rot90_laneq_f16(
260 // CHECK: [[CPLX:%.*]] = bitcast <8 x half> %rhs to <4 x i32>
261 // CHECK: [[DUP:%.*]] = shufflevector <4 x i32> [[CPLX]], <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
262 // CHECK: [[DUP_FLT:%.*]] = bitcast <4 x i32> [[DUP]] to <8 x half>
263 // CHECK: [[RES:%.*]] = call <8 x half> @llvm.aarch64.neon.vcmla.rot90.v8f16(<8 x half> %acc, <8 x half> %lhs, <8 x half> [[DUP_FLT]])
264 // CHECK: ret <8 x half> [[RES]]
test_vcmlaq_rot90_laneq_f16(float16x8_t acc,float16x8_t lhs,float16x8_t rhs)265 float16x8_t test_vcmlaq_rot90_laneq_f16(float16x8_t acc, float16x8_t lhs, float16x8_t rhs) {
266   return vcmlaq_rot90_laneq_f16(acc, lhs, rhs, 3);
267 }
268 
269 // CHECK-LABEL: @test_vcmla_rot90_lane_f32(
270 // CHECK: [[RES:%.*]] = call <2 x float> @llvm.aarch64.neon.vcmla.rot90.v2f32(<2 x float> %acc, <2 x float> %lhs, <2 x float> %rhs)
271 // CHECK: ret <2 x float> [[RES]]
test_vcmla_rot90_lane_f32(float32x2_t acc,float32x2_t lhs,float32x2_t rhs)272 float32x2_t test_vcmla_rot90_lane_f32(float32x2_t acc, float32x2_t lhs, float32x2_t rhs) {
273   return vcmla_rot90_lane_f32(acc, lhs, rhs, 0);
274 }
275 
276 // ACLE says this exists, but it won't map to a single instruction if lane > 1.
277 // CHECK-LABEL: @test_vcmla_rot90_laneq_f32(
278 // CHECK: [[CPLX:%.*]] = bitcast <4 x float> %rhs to <2 x i64>
279 // CHECK: [[DUP:%.*]] = shufflevector <2 x i64> [[CPLX]], <2 x i64> undef, <1 x i32> <i32 1>
280 // CHECK: [[DUP_FLT:%.*]] = bitcast <1 x i64> [[DUP]] to <2 x float>
281 // CHECK: [[RES:%.*]] = call <2 x float> @llvm.aarch64.neon.vcmla.rot90.v2f32(<2 x float> %acc, <2 x float> %lhs, <2 x float> [[DUP_FLT]])
282 // CHECK: ret <2 x float> [[RES]]
test_vcmla_rot90_laneq_f32(float32x2_t acc,float32x2_t lhs,float32x4_t rhs)283 float32x2_t test_vcmla_rot90_laneq_f32(float32x2_t acc, float32x2_t lhs, float32x4_t rhs) {
284   return vcmla_rot90_laneq_f32(acc, lhs, rhs, 1);
285 }
286 
287 // CHECK-LABEL: @test_vcmlaq_rot90_lane_f32(
288 // CHECK: [[CPLX:%.*]] = bitcast <2 x float> %rhs to i64
289 // CHECK: [[CPLX_VEC:%.*]] = insertelement <2 x i64> undef, i64 [[CPLX]], i32 0
290 // CHECK: [[DUP:%.*]] = shufflevector <2 x i64> [[CPLX_VEC]], <2 x i64> poison, <2 x i32> zeroinitializer
291 // CHECK: [[DUP_FLT:%.*]] = bitcast <2 x i64> [[DUP]] to <4 x float>
292 // CHECK: [[RES:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot90.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> [[DUP_FLT]])
293 // CHECK: ret <4 x float> [[RES]]
test_vcmlaq_rot90_lane_f32(float32x4_t acc,float32x4_t lhs,float32x2_t rhs)294 float32x4_t test_vcmlaq_rot90_lane_f32(float32x4_t acc, float32x4_t lhs, float32x2_t rhs) {
295   return vcmlaq_rot90_lane_f32(acc, lhs, rhs, 0);
296 }
297 
298 // CHECK-LABEL: @test_vcmlaq_rot90_laneq_f32(
299 // CHECK: [[CPLX:%.*]] = bitcast <4 x float> %rhs to <2 x i64>
300 // CHECK: [[DUP:%.*]] = shufflevector <2 x i64> [[CPLX]], <2 x i64> undef, <2 x i32> <i32 1, i32 1>
301 // CHECK: [[DUP_FLT:%.*]] = bitcast <2 x i64> [[DUP]] to <4 x float>
302 // CHECK: [[RES:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot90.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> [[DUP_FLT]])
303 // CHECK: ret <4 x float> [[RES]]
test_vcmlaq_rot90_laneq_f32(float32x4_t acc,float32x4_t lhs,float32x4_t rhs)304 float32x4_t test_vcmlaq_rot90_laneq_f32(float32x4_t acc, float32x4_t lhs, float32x4_t rhs) {
305   return vcmlaq_rot90_laneq_f32(acc, lhs, rhs, 1);
306 }
307 
308 // CHECK-LABEL: @test_vcmla_rot180_lane_f16(
309 // CHECK: [[CPLX:%.*]] = bitcast <4 x half> %rhs to <2 x i32>
310 // CHECK: [[DUP:%.*]] = shufflevector <2 x i32> [[CPLX]], <2 x i32> undef, <2 x i32> <i32 1, i32 1>
311 // CHECK: [[DUP_FLT:%.*]] = bitcast <2 x i32> [[DUP]] to <4 x half>
312 // CHECK: [[RES:%.*]] = call <4 x half> @llvm.aarch64.neon.vcmla.rot180.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> [[DUP_FLT]])
313 // CHECK: ret <4 x half> [[RES]]
test_vcmla_rot180_lane_f16(float16x4_t acc,float16x4_t lhs,float16x4_t rhs)314 float16x4_t test_vcmla_rot180_lane_f16(float16x4_t acc, float16x4_t lhs, float16x4_t rhs) {
315   return vcmla_rot180_lane_f16(acc, lhs, rhs, 1);
316 }
317 
318 // ACLE says this exists, but it won't map to a single instruction if lane > 1.
319 // CHECK-LABEL: @test_vcmla_rot180_laneq_f16(
320 // CHECK: [[CPLX:%.*]] = bitcast <8 x half> %rhs to <4 x i32>
321 // CHECK: [[DUP:%.*]] = shufflevector <4 x i32> [[CPLX]], <4 x i32> undef, <2 x i32> <i32 3, i32 3>
322 // CHECK: [[DUP_FLT:%.*]] = bitcast <2 x i32> [[DUP]] to <4 x half>
323 // CHECK: [[RES:%.*]] = call <4 x half> @llvm.aarch64.neon.vcmla.rot180.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> [[DUP_FLT]])
324 // CHECK: ret <4 x half> [[RES]]
test_vcmla_rot180_laneq_f16(float16x4_t acc,float16x4_t lhs,float16x8_t rhs)325 float16x4_t test_vcmla_rot180_laneq_f16(float16x4_t acc, float16x4_t lhs, float16x8_t rhs) {
326   return vcmla_rot180_laneq_f16(acc, lhs, rhs, 3);
327 }
328 
329 // CHECK-LABEL: @test_vcmlaq_rot180_lane_f16(
330 // CHECK: [[CPLX:%.*]] = bitcast <4 x half> %rhs to <2 x i32>
331 // CHECK: [[DUP:%.*]] = shufflevector <2 x i32> [[CPLX]], <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
332 // CHECK: [[DUP_FLT:%.*]] = bitcast <4 x i32> [[DUP]] to <8 x half>
333 // CHECK: [[RES:%.*]] = call <8 x half> @llvm.aarch64.neon.vcmla.rot180.v8f16(<8 x half> %acc, <8 x half> %lhs, <8 x half> [[DUP_FLT]])
334 // CHECK: ret <8 x half> [[RES]]
test_vcmlaq_rot180_lane_f16(float16x8_t acc,float16x8_t lhs,float16x4_t rhs)335 float16x8_t test_vcmlaq_rot180_lane_f16(float16x8_t acc, float16x8_t lhs, float16x4_t rhs) {
336   return vcmlaq_rot180_lane_f16(acc, lhs, rhs, 1);
337 }
338 
339 // CHECK-LABEL: @test_vcmlaq_rot180_laneq_f16(
340 // CHECK: [[CPLX:%.*]] = bitcast <8 x half> %rhs to <4 x i32>
341 // CHECK: [[DUP:%.*]] = shufflevector <4 x i32> [[CPLX]], <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
342 // CHECK: [[DUP_FLT:%.*]] = bitcast <4 x i32> [[DUP]] to <8 x half>
343 // CHECK: [[RES:%.*]] = call <8 x half> @llvm.aarch64.neon.vcmla.rot180.v8f16(<8 x half> %acc, <8 x half> %lhs, <8 x half> [[DUP_FLT]])
344 // CHECK: ret <8 x half> [[RES]]
test_vcmlaq_rot180_laneq_f16(float16x8_t acc,float16x8_t lhs,float16x8_t rhs)345 float16x8_t test_vcmlaq_rot180_laneq_f16(float16x8_t acc, float16x8_t lhs, float16x8_t rhs) {
346   return vcmlaq_rot180_laneq_f16(acc, lhs, rhs, 3);
347 }
348 
349 // CHECK-LABEL: @test_vcmla_rot180_lane_f32(
350 // CHECK: [[RES:%.*]] = call <2 x float> @llvm.aarch64.neon.vcmla.rot180.v2f32(<2 x float> %acc, <2 x float> %lhs, <2 x float> %rhs)
351 // CHECK: ret <2 x float> [[RES]]
test_vcmla_rot180_lane_f32(float32x2_t acc,float32x2_t lhs,float32x2_t rhs)352 float32x2_t test_vcmla_rot180_lane_f32(float32x2_t acc, float32x2_t lhs, float32x2_t rhs) {
353   return vcmla_rot180_lane_f32(acc, lhs, rhs, 0);
354 }
355 
356 // ACLE says this exists, but it won't map to a single instruction if lane > 1.
357 // CHECK-LABEL: @test_vcmla_rot180_laneq_f32(
358 // CHECK: [[CPLX:%.*]] = bitcast <4 x float> %rhs to <2 x i64>
359 // CHECK: [[DUP:%.*]] = shufflevector <2 x i64> [[CPLX]], <2 x i64> undef, <1 x i32> <i32 1>
360 // CHECK: [[DUP_FLT:%.*]] = bitcast <1 x i64> [[DUP]] to <2 x float>
361 // CHECK: [[RES:%.*]] = call <2 x float> @llvm.aarch64.neon.vcmla.rot180.v2f32(<2 x float> %acc, <2 x float> %lhs, <2 x float> [[DUP_FLT]])
362 // CHECK: ret <2 x float> [[RES]]
test_vcmla_rot180_laneq_f32(float32x2_t acc,float32x2_t lhs,float32x4_t rhs)363 float32x2_t test_vcmla_rot180_laneq_f32(float32x2_t acc, float32x2_t lhs, float32x4_t rhs) {
364   return vcmla_rot180_laneq_f32(acc, lhs, rhs, 1);
365 }
366 
367 // CHECK-LABEL: @test_vcmlaq_rot180_lane_f32(
368 // CHECK: [[CPLX:%.*]] = bitcast <2 x float> %rhs to i64
369 // CHECK: [[CPLX_VEC:%.*]] = insertelement <2 x i64> undef, i64 [[CPLX]], i32 0
370 // CHECK: [[DUP:%.*]] = shufflevector <2 x i64> [[CPLX_VEC]], <2 x i64> poison, <2 x i32> zeroinitializer
371 // CHECK: [[DUP_FLT:%.*]] = bitcast <2 x i64> [[DUP]] to <4 x float>
372 // CHECK: [[RES:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot180.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> [[DUP_FLT]])
373 // CHECK: ret <4 x float> [[RES]]
test_vcmlaq_rot180_lane_f32(float32x4_t acc,float32x4_t lhs,float32x2_t rhs)374 float32x4_t test_vcmlaq_rot180_lane_f32(float32x4_t acc, float32x4_t lhs, float32x2_t rhs) {
375   return vcmlaq_rot180_lane_f32(acc, lhs, rhs, 0);
376 }
377 
378 // CHECK-LABEL: @test_vcmlaq_rot180_laneq_f32(
379 // CHECK: [[CPLX:%.*]] = bitcast <4 x float> %rhs to <2 x i64>
380 // CHECK: [[DUP:%.*]] = shufflevector <2 x i64> [[CPLX]], <2 x i64> undef, <2 x i32> <i32 1, i32 1>
381 // CHECK: [[DUP_FLT:%.*]] = bitcast <2 x i64> [[DUP]] to <4 x float>
382 // CHECK: [[RES:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot180.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> [[DUP_FLT]])
383 // CHECK: ret <4 x float> [[RES]]
test_vcmlaq_rot180_laneq_f32(float32x4_t acc,float32x4_t lhs,float32x4_t rhs)384 float32x4_t test_vcmlaq_rot180_laneq_f32(float32x4_t acc, float32x4_t lhs, float32x4_t rhs) {
385   return vcmlaq_rot180_laneq_f32(acc, lhs, rhs, 1);
386 }
387 
388 // CHECK-LABEL: @test_vcmla_rot270_lane_f16(
389 // CHECK: [[CPLX:%.*]] = bitcast <4 x half> %rhs to <2 x i32>
390 // CHECK: [[DUP:%.*]] = shufflevector <2 x i32> [[CPLX]], <2 x i32> undef, <2 x i32> <i32 1, i32 1>
391 // CHECK: [[DUP_FLT:%.*]] = bitcast <2 x i32> [[DUP]] to <4 x half>
392 // CHECK: [[RES:%.*]] = call <4 x half> @llvm.aarch64.neon.vcmla.rot270.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> [[DUP_FLT]])
393 // CHECK: ret <4 x half> [[RES]]
test_vcmla_rot270_lane_f16(float16x4_t acc,float16x4_t lhs,float16x4_t rhs)394 float16x4_t test_vcmla_rot270_lane_f16(float16x4_t acc, float16x4_t lhs, float16x4_t rhs) {
395   return vcmla_rot270_lane_f16(acc, lhs, rhs, 1);
396 }
397 
398 // ACLE says this exists, but it won't map to a single instruction if lane > 1.
399 // CHECK-LABEL: @test_vcmla_rot270_laneq_f16(
400 // CHECK: [[CPLX:%.*]] = bitcast <8 x half> %rhs to <4 x i32>
401 // CHECK: [[DUP:%.*]] = shufflevector <4 x i32> [[CPLX]], <4 x i32> undef, <2 x i32> <i32 3, i32 3>
402 // CHECK: [[DUP_FLT:%.*]] = bitcast <2 x i32> [[DUP]] to <4 x half>
403 // CHECK: [[RES:%.*]] = call <4 x half> @llvm.aarch64.neon.vcmla.rot270.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> [[DUP_FLT]])
404 // CHECK: ret <4 x half> [[RES]]
test_vcmla_rot270_laneq_f16(float16x4_t acc,float16x4_t lhs,float16x8_t rhs)405 float16x4_t test_vcmla_rot270_laneq_f16(float16x4_t acc, float16x4_t lhs, float16x8_t rhs) {
406   return vcmla_rot270_laneq_f16(acc, lhs, rhs, 3);
407 }
408 
409 // CHECK-LABEL: @test_vcmlaq_rot270_lane_f16(
410 // CHECK: [[CPLX:%.*]] = bitcast <4 x half> %rhs to <2 x i32>
411 // CHECK: [[DUP:%.*]] = shufflevector <2 x i32> [[CPLX]], <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
412 // CHECK: [[DUP_FLT:%.*]] = bitcast <4 x i32> [[DUP]] to <8 x half>
413 // CHECK: [[RES:%.*]] = call <8 x half> @llvm.aarch64.neon.vcmla.rot270.v8f16(<8 x half> %acc, <8 x half> %lhs, <8 x half> [[DUP_FLT]])
414 // CHECK: ret <8 x half> [[RES]]
test_vcmlaq_rot270_lane_f16(float16x8_t acc,float16x8_t lhs,float16x4_t rhs)415 float16x8_t test_vcmlaq_rot270_lane_f16(float16x8_t acc, float16x8_t lhs, float16x4_t rhs) {
416   return vcmlaq_rot270_lane_f16(acc, lhs, rhs, 1);
417 }
418 
419 // CHECK-LABEL: @test_vcmlaq_rot270_laneq_f16(
420 // CHECK: [[CPLX:%.*]] = bitcast <8 x half> %rhs to <4 x i32>
421 // CHECK: [[DUP:%.*]] = shufflevector <4 x i32> [[CPLX]], <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
422 // CHECK: [[DUP_FLT:%.*]] = bitcast <4 x i32> [[DUP]] to <8 x half>
423 // CHECK: [[RES:%.*]] = call <8 x half> @llvm.aarch64.neon.vcmla.rot270.v8f16(<8 x half> %acc, <8 x half> %lhs, <8 x half> [[DUP_FLT]])
424 // CHECK: ret <8 x half> [[RES]]
test_vcmlaq_rot270_laneq_f16(float16x8_t acc,float16x8_t lhs,float16x8_t rhs)425 float16x8_t test_vcmlaq_rot270_laneq_f16(float16x8_t acc, float16x8_t lhs, float16x8_t rhs) {
426   return vcmlaq_rot270_laneq_f16(acc, lhs, rhs, 3);
427 }
428 
429 // CHECK-LABEL: @test_vcmla_rot270_lane_f32(
430 // CHECK: [[RES:%.*]] = call <2 x float> @llvm.aarch64.neon.vcmla.rot270.v2f32(<2 x float> %acc, <2 x float> %lhs, <2 x float> %rhs)
431 // CHECK: ret <2 x float> [[RES]]
test_vcmla_rot270_lane_f32(float32x2_t acc,float32x2_t lhs,float32x2_t rhs)432 float32x2_t test_vcmla_rot270_lane_f32(float32x2_t acc, float32x2_t lhs, float32x2_t rhs) {
433   return vcmla_rot270_lane_f32(acc, lhs, rhs, 0);
434 }
435 
436 // ACLE says this exists, but it won't map to a single instruction if lane > 1.
437 // CHECK-LABEL: @test_vcmla_rot270_laneq_f32(
438 // CHECK: [[CPLX:%.*]] = bitcast <4 x float> %rhs to <2 x i64>
439 // CHECK: [[DUP:%.*]] = shufflevector <2 x i64> [[CPLX]], <2 x i64> undef, <1 x i32> <i32 1>
440 // CHECK: [[DUP_FLT:%.*]] = bitcast <1 x i64> [[DUP]] to <2 x float>
441 // CHECK: [[RES:%.*]] = call <2 x float> @llvm.aarch64.neon.vcmla.rot270.v2f32(<2 x float> %acc, <2 x float> %lhs, <2 x float> [[DUP_FLT]])
442 // CHECK: ret <2 x float> [[RES]]
test_vcmla_rot270_laneq_f32(float32x2_t acc,float32x2_t lhs,float32x4_t rhs)443 float32x2_t test_vcmla_rot270_laneq_f32(float32x2_t acc, float32x2_t lhs, float32x4_t rhs) {
444   return vcmla_rot270_laneq_f32(acc, lhs, rhs, 1);
445 }
446 
447 // CHECK-LABEL: @test_vcmlaq_rot270_lane_f32(
448 // CHECK: [[CPLX:%.*]] = bitcast <2 x float> %rhs to i64
449 // CHECK: [[CPLX_VEC:%.*]] = insertelement <2 x i64> undef, i64 [[CPLX]], i32 0
450 // CHECK: [[DUP:%.*]] = shufflevector <2 x i64> [[CPLX_VEC]], <2 x i64> poison, <2 x i32> zeroinitializer
451 // CHECK: [[DUP_FLT:%.*]] = bitcast <2 x i64> [[DUP]] to <4 x float>
452 // CHECK: [[RES:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot270.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> [[DUP_FLT]])
453 // CHECK: ret <4 x float> [[RES]]
test_vcmlaq_rot270_lane_f32(float32x4_t acc,float32x4_t lhs,float32x2_t rhs)454 float32x4_t test_vcmlaq_rot270_lane_f32(float32x4_t acc, float32x4_t lhs, float32x2_t rhs) {
455   return vcmlaq_rot270_lane_f32(acc, lhs, rhs, 0);
456 }
457 
458 // CHECK-LABEL: @test_vcmlaq_rot270_laneq_f32(
459 // CHECK: [[CPLX:%.*]] = bitcast <4 x float> %rhs to <2 x i64>
460 // CHECK: [[DUP:%.*]] = shufflevector <2 x i64> [[CPLX]], <2 x i64> undef, <2 x i32> <i32 1, i32 1>
461 // CHECK: [[DUP_FLT:%.*]] = bitcast <2 x i64> [[DUP]] to <4 x float>
462 // CHECK: [[RES:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot270.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> [[DUP_FLT]])
463 // CHECK: ret <4 x float> [[RES]]
test_vcmlaq_rot270_laneq_f32(float32x4_t acc,float32x4_t lhs,float32x4_t rhs)464 float32x4_t test_vcmlaq_rot270_laneq_f32(float32x4_t acc, float32x4_t lhs, float32x4_t rhs) {
465   return vcmlaq_rot270_laneq_f32(acc, lhs, rhs, 1);
466 }
467