// REQUIRES: aarch64-registered-target
// RUN: %clang_cc1 -triple arm64-apple-ios -target-feature +neon \
// RUN: -target-feature +v8.3a \
// RUN: -target-feature +fullfp16 \
// RUN: -disable-O0-optnone -emit-llvm -o - %s | opt -S -O1 | FileCheck %s
#include <arm_neon.h>

// Basic (rot0) vcmla tests: each intrinsic must lower to a single call of the
// matching @llvm.aarch64.neon.vcmla.rot0.* intrinsic on the argument vectors.
// CHECK-LABEL: @test_vcmla_f16(
// CHECK: [[RES:%.*]] = call <4 x half> @llvm.aarch64.neon.vcmla.rot0.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> %rhs)
// CHECK: ret <4 x half> [[RES]]
float16x4_t test_vcmla_f16(float16x4_t acc, float16x4_t lhs, float16x4_t rhs) {
  return vcmla_f16(acc, lhs, rhs);
}

// CHECK-LABEL: @test_vcmla_f32(
// CHECK: [[RES:%.*]] = call <2 x float> @llvm.aarch64.neon.vcmla.rot0.v2f32(<2 x float> %acc, <2 x float> %lhs, <2 x float> %rhs)
// CHECK: ret <2 x float> [[RES]]
float32x2_t test_vcmla_f32(float32x2_t acc, float32x2_t lhs, float32x2_t rhs) {
  return vcmla_f32(acc, lhs, rhs);
}

// CHECK-LABEL: @test_vcmlaq_f16(
// CHECK: [[RES:%.*]] = call <8 x half> @llvm.aarch64.neon.vcmla.rot0.v8f16(<8 x half> %acc, <8 x half> %lhs, <8 x half> %rhs)
// CHECK: ret <8 x half> [[RES]]
float16x8_t test_vcmlaq_f16(float16x8_t acc, float16x8_t lhs, float16x8_t rhs) {
  return vcmlaq_f16(acc, lhs, rhs);
}

// CHECK-LABEL: @test_vcmlaq_f32(
// CHECK: [[RES:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot0.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> %rhs)
// CHECK: ret <4 x float> [[RES]]
float32x4_t test_vcmlaq_f32(float32x4_t acc, float32x4_t lhs, float32x4_t rhs) {
  return vcmlaq_f32(acc, lhs, rhs);
}

// CHECK-LABEL: @test_vcmlaq_f64(
// CHECK: [[RES:%.*]] = call <2 x double> @llvm.aarch64.neon.vcmla.rot0.v2f64(<2 x double> %acc, <2 x double> %lhs, <2 x double> %rhs)
// CHECK: ret <2 x double> [[RES]]
float64x2_t test_vcmlaq_f64(float64x2_t acc, float64x2_t lhs, float64x2_t rhs) {
  return vcmlaq_f64(acc, lhs, rhs);
}

// rot90 variants: lower to @llvm.aarch64.neon.vcmla.rot90.* on the argument
// vectors, one call per intrinsic.
// CHECK-LABEL: @test_vcmla_rot90_f16(
// CHECK: [[RES:%.*]] = call <4 x half> @llvm.aarch64.neon.vcmla.rot90.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> %rhs)
// CHECK: ret <4 x half> [[RES]]
float16x4_t test_vcmla_rot90_f16(float16x4_t acc, float16x4_t lhs, float16x4_t rhs) {
  return vcmla_rot90_f16(acc, lhs, rhs);
}

// CHECK-LABEL: @test_vcmla_rot90_f32(
// CHECK: [[RES:%.*]] = call <2 x float> @llvm.aarch64.neon.vcmla.rot90.v2f32(<2 x float> %acc, <2 x float> %lhs, <2 x float> %rhs)
// CHECK: ret <2 x float> [[RES]]
float32x2_t test_vcmla_rot90_f32(float32x2_t acc, float32x2_t lhs, float32x2_t rhs) {
  return vcmla_rot90_f32(acc, lhs, rhs);
}

// CHECK-LABEL: @test_vcmlaq_rot90_f16(
// CHECK: [[RES:%.*]] = call <8 x half> @llvm.aarch64.neon.vcmla.rot90.v8f16(<8 x half> %acc, <8 x half> %lhs, <8 x half> %rhs)
// CHECK: ret <8 x half> [[RES]]
float16x8_t test_vcmlaq_rot90_f16(float16x8_t acc, float16x8_t lhs, float16x8_t rhs) {
  return vcmlaq_rot90_f16(acc, lhs, rhs);
}

// CHECK-LABEL: @test_vcmlaq_rot90_f32(
// CHECK: [[RES:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot90.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> %rhs)
// CHECK: ret <4 x float> [[RES]]
float32x4_t test_vcmlaq_rot90_f32(float32x4_t acc, float32x4_t lhs, float32x4_t rhs) {
  return vcmlaq_rot90_f32(acc, lhs, rhs);
}

// CHECK-LABEL: @test_vcmlaq_rot90_f64(
// CHECK: [[RES:%.*]] = call <2 x double> @llvm.aarch64.neon.vcmla.rot90.v2f64(<2 x double> %acc, <2 x double> %lhs, <2 x double> %rhs)
// CHECK: ret <2 x double> [[RES]]
float64x2_t test_vcmlaq_rot90_f64(float64x2_t acc, float64x2_t lhs, float64x2_t rhs) {
  return vcmlaq_rot90_f64(acc, lhs, rhs);
}

// rot180 variants: lower to @llvm.aarch64.neon.vcmla.rot180.* on the argument
// vectors, one call per intrinsic.
// CHECK-LABEL: @test_vcmla_rot180_f16(
// CHECK: [[RES:%.*]] = call <4 x half> @llvm.aarch64.neon.vcmla.rot180.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> %rhs)
// CHECK: ret <4 x half> [[RES]]
float16x4_t test_vcmla_rot180_f16(float16x4_t acc, float16x4_t lhs, float16x4_t rhs) {
  return vcmla_rot180_f16(acc, lhs, rhs);
}

// CHECK-LABEL: @test_vcmla_rot180_f32(
// CHECK: [[RES:%.*]] = call <2 x float> @llvm.aarch64.neon.vcmla.rot180.v2f32(<2 x float> %acc, <2 x float> %lhs, <2 x float> %rhs)
// CHECK: ret <2 x float> [[RES]]
float32x2_t test_vcmla_rot180_f32(float32x2_t acc, float32x2_t lhs, float32x2_t rhs) {
  return vcmla_rot180_f32(acc, lhs, rhs);
}

// CHECK-LABEL: @test_vcmlaq_rot180_f16(
// CHECK: [[RES:%.*]] = call <8 x half> @llvm.aarch64.neon.vcmla.rot180.v8f16(<8 x half> %acc, <8 x half> %lhs, <8 x half> %rhs)
// CHECK: ret <8 x half> [[RES]]
float16x8_t test_vcmlaq_rot180_f16(float16x8_t acc, float16x8_t lhs, float16x8_t rhs) {
  return vcmlaq_rot180_f16(acc, lhs, rhs);
}

// CHECK-LABEL: @test_vcmlaq_rot180_f32(
// CHECK: [[RES:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot180.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> %rhs)
// CHECK: ret <4 x float> [[RES]]
float32x4_t test_vcmlaq_rot180_f32(float32x4_t acc, float32x4_t lhs, float32x4_t rhs) {
  return vcmlaq_rot180_f32(acc, lhs, rhs);
}

// CHECK-LABEL: @test_vcmlaq_rot180_f64(
// CHECK: [[RES:%.*]] = call <2 x double> @llvm.aarch64.neon.vcmla.rot180.v2f64(<2 x double> %acc, <2 x double> %lhs, <2 x double> %rhs)
// CHECK: ret <2 x double> [[RES]]
float64x2_t test_vcmlaq_rot180_f64(float64x2_t acc, float64x2_t lhs, float64x2_t rhs) {
  return vcmlaq_rot180_f64(acc, lhs, rhs);
}

// rot270 variants: lower to @llvm.aarch64.neon.vcmla.rot270.* on the argument
// vectors, one call per intrinsic.
// CHECK-LABEL: @test_vcmla_rot270_f16(
// CHECK: [[RES:%.*]] = call <4 x half> @llvm.aarch64.neon.vcmla.rot270.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> %rhs)
// CHECK: ret <4 x half> [[RES]]
float16x4_t test_vcmla_rot270_f16(float16x4_t acc, float16x4_t lhs, float16x4_t rhs) {
  return vcmla_rot270_f16(acc, lhs, rhs);
}

// CHECK-LABEL: @test_vcmla_rot270_f32(
// CHECK: [[RES:%.*]] = call <2 x float> @llvm.aarch64.neon.vcmla.rot270.v2f32(<2 x float> %acc, <2 x float> %lhs, <2 x float> %rhs)
// CHECK: ret <2 x float> [[RES]]
float32x2_t test_vcmla_rot270_f32(float32x2_t acc, float32x2_t lhs, float32x2_t rhs) {
  return vcmla_rot270_f32(acc, lhs, rhs);
}

// CHECK-LABEL: @test_vcmlaq_rot270_f16(
// CHECK: [[RES:%.*]] = call <8 x half> @llvm.aarch64.neon.vcmla.rot270.v8f16(<8 x half> %acc, <8 x half> %lhs, <8 x half> %rhs)
// CHECK: ret <8 x half> [[RES]]
float16x8_t test_vcmlaq_rot270_f16(float16x8_t acc, float16x8_t lhs, float16x8_t rhs) {
  return vcmlaq_rot270_f16(acc, lhs, rhs);
}

// CHECK-LABEL: @test_vcmlaq_rot270_f32(
// CHECK: [[RES:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot270.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> %rhs)
// CHECK: ret <4 x float> [[RES]]
float32x4_t test_vcmlaq_rot270_f32(float32x4_t acc, float32x4_t lhs, float32x4_t rhs) {
  return vcmlaq_rot270_f32(acc, lhs, rhs);
}

// CHECK-LABEL: @test_vcmlaq_rot270_f64(
// CHECK: [[RES:%.*]] = call <2 x double> @llvm.aarch64.neon.vcmla.rot270.v2f64(<2 x double> %acc, <2 x double> %lhs, <2 x double> %rhs)
// CHECK: ret <2 x double> [[RES]]
float64x2_t test_vcmlaq_rot270_f64(float64x2_t acc, float64x2_t lhs, float64x2_t rhs) {
  return vcmlaq_rot270_f64(acc, lhs, rhs);
}

// Lane (rot0) variants: the selected complex lane of %rhs is broadcast via a
// bitcast + shufflevector pair, then fed to the rot0 vcmla intrinsic.
// CHECK-LABEL: @test_vcmla_lane_f16(
// CHECK: [[CPLX:%.*]] = bitcast <4 x half> %rhs to <2 x i32>
// CHECK: [[DUP:%.*]] = shufflevector <2 x i32> [[CPLX]], <2 x i32> undef, <2 x i32> <i32 1, i32 1>
// CHECK: [[DUP_FLT:%.*]] = bitcast <2 x i32> [[DUP]] to <4 x half>
// CHECK: [[RES:%.*]] = call <4 x half> @llvm.aarch64.neon.vcmla.rot0.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> [[DUP_FLT]])
// CHECK: ret <4 x half> [[RES]]
float16x4_t test_vcmla_lane_f16(float16x4_t acc, float16x4_t lhs, float16x4_t rhs) {
  return vcmla_lane_f16(acc, lhs, rhs, 1);
}

// ACLE says this exists, but it won't map to a single instruction if lane > 1.
// CHECK-LABEL: @test_vcmla_laneq_f16(
// CHECK: [[CPLX:%.*]] = bitcast <8 x half> %rhs to <4 x i32>
// CHECK: [[DUP:%.*]] = shufflevector <4 x i32> [[CPLX]], <4 x i32> undef, <2 x i32> <i32 3, i32 3>
// CHECK: [[DUP_FLT:%.*]] = bitcast <2 x i32> [[DUP]] to <4 x half>
// CHECK: [[RES:%.*]] = call <4 x half> @llvm.aarch64.neon.vcmla.rot0.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> [[DUP_FLT]])
// CHECK: ret <4 x half> [[RES]]
float16x4_t test_vcmla_laneq_f16(float16x4_t acc, float16x4_t lhs, float16x8_t rhs) {
  return vcmla_laneq_f16(acc, lhs, rhs, 3);
}

// CHECK-LABEL: @test_vcmlaq_lane_f16(
// CHECK: [[CPLX:%.*]] = bitcast <4 x half> %rhs to <2 x i32>
// CHECK: [[DUP:%.*]] = shufflevector <2 x i32> [[CPLX]], <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK: [[DUP_FLT:%.*]] = bitcast <4 x i32> [[DUP]] to <8 x half>
// CHECK: [[RES:%.*]] = call <8 x half> @llvm.aarch64.neon.vcmla.rot0.v8f16(<8 x half> %acc, <8 x half> %lhs, <8 x half> [[DUP_FLT]])
// CHECK: ret <8 x half> [[RES]]
float16x8_t test_vcmlaq_lane_f16(float16x8_t acc, float16x8_t lhs, float16x4_t rhs) {
  return vcmlaq_lane_f16(acc, lhs, rhs, 1);
}

// CHECK-LABEL: @test_vcmlaq_laneq_f16(
// CHECK: [[CPLX:%.*]] = bitcast <8 x half> %rhs to <4 x i32>
// CHECK: [[DUP:%.*]] = shufflevector <4 x i32> [[CPLX]], <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK: [[DUP_FLT:%.*]] = bitcast <4 x i32> [[DUP]] to <8 x half>
// CHECK: [[RES:%.*]] = call <8 x half> @llvm.aarch64.neon.vcmla.rot0.v8f16(<8 x half> %acc, <8 x half> %lhs, <8 x half> [[DUP_FLT]])
// CHECK: ret <8 x half> [[RES]]
float16x8_t test_vcmlaq_laneq_f16(float16x8_t acc, float16x8_t lhs, float16x8_t rhs) {
  return vcmlaq_laneq_f16(acc, lhs, rhs, 3);
}

// CHECK-LABEL: @test_vcmla_lane_f32(
// CHECK: [[RES:%.*]] = call <2 x float> @llvm.aarch64.neon.vcmla.rot0.v2f32(<2 x float> %acc, <2 x float> %lhs, <2 x float> %rhs)
// CHECK: ret <2 x float> [[RES]]
float32x2_t test_vcmla_lane_f32(float32x2_t acc, float32x2_t lhs, float32x2_t rhs) {
  return vcmla_lane_f32(acc, lhs, rhs, 0);
}

// ACLE says this exists, but it won't map to a single instruction if lane > 1.
// CHECK-LABEL: @test_vcmla_laneq_f32(
// CHECK: [[CPLX:%.*]] = bitcast <4 x float> %rhs to <2 x i64>
// CHECK: [[DUP:%.*]] = shufflevector <2 x i64> [[CPLX]], <2 x i64> undef, <1 x i32> <i32 1>
// CHECK: [[DUP_FLT:%.*]] = bitcast <1 x i64> [[DUP]] to <2 x float>
// CHECK: [[RES:%.*]] = call <2 x float> @llvm.aarch64.neon.vcmla.rot0.v2f32(<2 x float> %acc, <2 x float> %lhs, <2 x float> [[DUP_FLT]])
// CHECK: ret <2 x float> [[RES]]
float32x2_t test_vcmla_laneq_f32(float32x2_t acc, float32x2_t lhs, float32x4_t rhs) {
  return vcmla_laneq_f32(acc, lhs, rhs, 1);
}

// CHECK-LABEL: @test_vcmlaq_lane_f32(
// CHECK: [[CPLX:%.*]] = bitcast <2 x float> %rhs to i64
// CHECK: [[CPLX_VEC:%.*]] = insertelement <2 x i64> undef, i64 [[CPLX]], i32 0
// CHECK: [[DUP:%.*]] = shufflevector <2 x i64> [[CPLX_VEC]], <2 x i64> poison, <2 x i32> zeroinitializer
// CHECK: [[DUP_FLT:%.*]] = bitcast <2 x i64> [[DUP]] to <4 x float>
// CHECK: [[RES:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot0.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> [[DUP_FLT]])
// CHECK: ret <4 x float> [[RES]]
float32x4_t test_vcmlaq_lane_f32(float32x4_t acc, float32x4_t lhs, float32x2_t rhs) {
  return vcmlaq_lane_f32(acc, lhs, rhs, 0);
}

// CHECK-LABEL: @test_vcmlaq_laneq_f32(
// CHECK: [[CPLX:%.*]] = bitcast <4 x float> %rhs to <2 x i64>
// CHECK: [[DUP:%.*]] = shufflevector <2 x i64> [[CPLX]], <2 x i64> undef, <2 x i32> <i32 1, i32 1>
// CHECK: [[DUP_FLT:%.*]] = bitcast <2 x i64> [[DUP]] to <4 x float>
// CHECK: [[RES:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot0.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> [[DUP_FLT]])
// CHECK: ret <4 x float> [[RES]]
float32x4_t test_vcmlaq_laneq_f32(float32x4_t acc, float32x4_t lhs, float32x4_t rhs) {
  return vcmlaq_laneq_f32(acc, lhs, rhs, 1);
}

// Lane (rot90) variants: same lane-broadcast pattern as rot0, feeding the
// rot90 vcmla intrinsic.
// CHECK-LABEL: @test_vcmla_rot90_lane_f16(
// CHECK: [[CPLX:%.*]] = bitcast <4 x half> %rhs to <2 x i32>
// CHECK: [[DUP:%.*]] = shufflevector <2 x i32> [[CPLX]], <2 x i32> undef, <2 x i32> <i32 1, i32 1>
// CHECK: [[DUP_FLT:%.*]] = bitcast <2 x i32> [[DUP]] to <4 x half>
// CHECK: [[RES:%.*]] = call <4 x half> @llvm.aarch64.neon.vcmla.rot90.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> [[DUP_FLT]])
// CHECK: ret <4 x half> [[RES]]
float16x4_t test_vcmla_rot90_lane_f16(float16x4_t acc, float16x4_t lhs, float16x4_t rhs) {
  return vcmla_rot90_lane_f16(acc, lhs, rhs, 1);
}

// ACLE says this exists, but it won't map to a single instruction if lane > 1.
// CHECK-LABEL: @test_vcmla_rot90_laneq_f16(
// CHECK: [[CPLX:%.*]] = bitcast <8 x half> %rhs to <4 x i32>
// CHECK: [[DUP:%.*]] = shufflevector <4 x i32> [[CPLX]], <4 x i32> undef, <2 x i32> <i32 3, i32 3>
// CHECK: [[DUP_FLT:%.*]] = bitcast <2 x i32> [[DUP]] to <4 x half>
// CHECK: [[RES:%.*]] = call <4 x half> @llvm.aarch64.neon.vcmla.rot90.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> [[DUP_FLT]])
// CHECK: ret <4 x half> [[RES]]
float16x4_t test_vcmla_rot90_laneq_f16(float16x4_t acc, float16x4_t lhs, float16x8_t rhs) {
  return vcmla_rot90_laneq_f16(acc, lhs, rhs, 3);
}

// CHECK-LABEL: @test_vcmlaq_rot90_lane_f16(
// CHECK: [[CPLX:%.*]] = bitcast <4 x half> %rhs to <2 x i32>
// CHECK: [[DUP:%.*]] = shufflevector <2 x i32> [[CPLX]], <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK: [[DUP_FLT:%.*]] = bitcast <4 x i32> [[DUP]] to <8 x half>
// CHECK: [[RES:%.*]] = call <8 x half> @llvm.aarch64.neon.vcmla.rot90.v8f16(<8 x half> %acc, <8 x half> %lhs, <8 x half> [[DUP_FLT]])
// CHECK: ret <8 x half> [[RES]]
float16x8_t test_vcmlaq_rot90_lane_f16(float16x8_t acc, float16x8_t lhs, float16x4_t rhs) {
  return vcmlaq_rot90_lane_f16(acc, lhs, rhs, 1);
}

// CHECK-LABEL: @test_vcmlaq_rot90_laneq_f16(
// CHECK: [[CPLX:%.*]] = bitcast <8 x half> %rhs to <4 x i32>
// CHECK: [[DUP:%.*]] = shufflevector <4 x i32> [[CPLX]], <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK: [[DUP_FLT:%.*]] = bitcast <4 x i32> [[DUP]] to <8 x half>
// CHECK: [[RES:%.*]] = call <8 x half> @llvm.aarch64.neon.vcmla.rot90.v8f16(<8 x half> %acc, <8 x half> %lhs, <8 x half> [[DUP_FLT]])
// CHECK: ret <8 x half> [[RES]]
float16x8_t test_vcmlaq_rot90_laneq_f16(float16x8_t acc, float16x8_t lhs, float16x8_t rhs) {
  return vcmlaq_rot90_laneq_f16(acc, lhs, rhs, 3);
}

// CHECK-LABEL: @test_vcmla_rot90_lane_f32(
// CHECK: [[RES:%.*]] = call <2 x float> @llvm.aarch64.neon.vcmla.rot90.v2f32(<2 x float> %acc, <2 x float> %lhs, <2 x float> %rhs)
// CHECK: ret <2 x float> [[RES]]
float32x2_t test_vcmla_rot90_lane_f32(float32x2_t acc, float32x2_t lhs, float32x2_t rhs) {
  return vcmla_rot90_lane_f32(acc, lhs, rhs, 0);
}

// ACLE says this exists, but it won't map to a single instruction if lane > 1.
// CHECK-LABEL: @test_vcmla_rot90_laneq_f32(
// CHECK: [[CPLX:%.*]] = bitcast <4 x float> %rhs to <2 x i64>
// CHECK: [[DUP:%.*]] = shufflevector <2 x i64> [[CPLX]], <2 x i64> undef, <1 x i32> <i32 1>
// CHECK: [[DUP_FLT:%.*]] = bitcast <1 x i64> [[DUP]] to <2 x float>
// CHECK: [[RES:%.*]] = call <2 x float> @llvm.aarch64.neon.vcmla.rot90.v2f32(<2 x float> %acc, <2 x float> %lhs, <2 x float> [[DUP_FLT]])
// CHECK: ret <2 x float> [[RES]]
float32x2_t test_vcmla_rot90_laneq_f32(float32x2_t acc, float32x2_t lhs, float32x4_t rhs) {
  return vcmla_rot90_laneq_f32(acc, lhs, rhs, 1);
}

// CHECK-LABEL: @test_vcmlaq_rot90_lane_f32(
// CHECK: [[CPLX:%.*]] = bitcast <2 x float> %rhs to i64
// CHECK: [[CPLX_VEC:%.*]] = insertelement <2 x i64> undef, i64 [[CPLX]], i32 0
// CHECK: [[DUP:%.*]] = shufflevector <2 x i64> [[CPLX_VEC]], <2 x i64> poison, <2 x i32> zeroinitializer
// CHECK: [[DUP_FLT:%.*]] = bitcast <2 x i64> [[DUP]] to <4 x float>
// CHECK: [[RES:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot90.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> [[DUP_FLT]])
// CHECK: ret <4 x float> [[RES]]
float32x4_t test_vcmlaq_rot90_lane_f32(float32x4_t acc, float32x4_t lhs, float32x2_t rhs) {
  return vcmlaq_rot90_lane_f32(acc, lhs, rhs, 0);
}

// CHECK-LABEL: @test_vcmlaq_rot90_laneq_f32(
// CHECK: [[CPLX:%.*]] = bitcast <4 x float> %rhs to <2 x i64>
// CHECK: [[DUP:%.*]] = shufflevector <2 x i64> [[CPLX]], <2 x i64> undef, <2 x i32> <i32 1, i32 1>
// CHECK: [[DUP_FLT:%.*]] = bitcast <2 x i64> [[DUP]] to <4 x float>
// CHECK: [[RES:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot90.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> [[DUP_FLT]])
// CHECK: ret <4 x float> [[RES]]
float32x4_t test_vcmlaq_rot90_laneq_f32(float32x4_t acc, float32x4_t lhs, float32x4_t rhs) {
  return vcmlaq_rot90_laneq_f32(acc, lhs, rhs, 1);
}

// Lane (rot180) variants: same lane-broadcast pattern, feeding the rot180
// vcmla intrinsic.
// CHECK-LABEL: @test_vcmla_rot180_lane_f16(
// CHECK: [[CPLX:%.*]] = bitcast <4 x half> %rhs to <2 x i32>
// CHECK: [[DUP:%.*]] = shufflevector <2 x i32> [[CPLX]], <2 x i32> undef, <2 x i32> <i32 1, i32 1>
// CHECK: [[DUP_FLT:%.*]] = bitcast <2 x i32> [[DUP]] to <4 x half>
// CHECK: [[RES:%.*]] = call <4 x half> @llvm.aarch64.neon.vcmla.rot180.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> [[DUP_FLT]])
// CHECK: ret <4 x half> [[RES]]
float16x4_t test_vcmla_rot180_lane_f16(float16x4_t acc, float16x4_t lhs, float16x4_t rhs) {
  return vcmla_rot180_lane_f16(acc, lhs, rhs, 1);
}

// ACLE says this exists, but it won't map to a single instruction if lane > 1.
// CHECK-LABEL: @test_vcmla_rot180_laneq_f16(
// CHECK: [[CPLX:%.*]] = bitcast <8 x half> %rhs to <4 x i32>
// CHECK: [[DUP:%.*]] = shufflevector <4 x i32> [[CPLX]], <4 x i32> undef, <2 x i32> <i32 3, i32 3>
// CHECK: [[DUP_FLT:%.*]] = bitcast <2 x i32> [[DUP]] to <4 x half>
// CHECK: [[RES:%.*]] = call <4 x half> @llvm.aarch64.neon.vcmla.rot180.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> [[DUP_FLT]])
// CHECK: ret <4 x half> [[RES]]
float16x4_t test_vcmla_rot180_laneq_f16(float16x4_t acc, float16x4_t lhs, float16x8_t rhs) {
  return vcmla_rot180_laneq_f16(acc, lhs, rhs, 3);
}

// CHECK-LABEL: @test_vcmlaq_rot180_lane_f16(
// CHECK: [[CPLX:%.*]] = bitcast <4 x half> %rhs to <2 x i32>
// CHECK: [[DUP:%.*]] = shufflevector <2 x i32> [[CPLX]], <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK: [[DUP_FLT:%.*]] = bitcast <4 x i32> [[DUP]] to <8 x half>
// CHECK: [[RES:%.*]] = call <8 x half> @llvm.aarch64.neon.vcmla.rot180.v8f16(<8 x half> %acc, <8 x half> %lhs, <8 x half> [[DUP_FLT]])
// CHECK: ret <8 x half> [[RES]]
float16x8_t test_vcmlaq_rot180_lane_f16(float16x8_t acc, float16x8_t lhs, float16x4_t rhs) {
  return vcmlaq_rot180_lane_f16(acc, lhs, rhs, 1);
}

// CHECK-LABEL: @test_vcmlaq_rot180_laneq_f16(
// CHECK: [[CPLX:%.*]] = bitcast <8 x half> %rhs to <4 x i32>
// CHECK: [[DUP:%.*]] = shufflevector <4 x i32> [[CPLX]], <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK: [[DUP_FLT:%.*]] = bitcast <4 x i32> [[DUP]] to <8 x half>
// CHECK: [[RES:%.*]] = call <8 x half> @llvm.aarch64.neon.vcmla.rot180.v8f16(<8 x half> %acc, <8 x half> %lhs, <8 x half> [[DUP_FLT]])
// CHECK: ret <8 x half> [[RES]]
float16x8_t test_vcmlaq_rot180_laneq_f16(float16x8_t acc, float16x8_t lhs, float16x8_t rhs) {
  return vcmlaq_rot180_laneq_f16(acc, lhs, rhs, 3);
}

// CHECK-LABEL: @test_vcmla_rot180_lane_f32(
// CHECK: [[RES:%.*]] = call <2 x float> @llvm.aarch64.neon.vcmla.rot180.v2f32(<2 x float> %acc, <2 x float> %lhs, <2 x float> %rhs)
// CHECK: ret <2 x float> [[RES]]
float32x2_t test_vcmla_rot180_lane_f32(float32x2_t acc, float32x2_t lhs, float32x2_t rhs) {
  return vcmla_rot180_lane_f32(acc, lhs, rhs, 0);
}

// ACLE says this exists, but it won't map to a single instruction if lane > 1.
// CHECK-LABEL: @test_vcmla_rot180_laneq_f32(
// CHECK: [[CPLX:%.*]] = bitcast <4 x float> %rhs to <2 x i64>
// CHECK: [[DUP:%.*]] = shufflevector <2 x i64> [[CPLX]], <2 x i64> undef, <1 x i32> <i32 1>
// CHECK: [[DUP_FLT:%.*]] = bitcast <1 x i64> [[DUP]] to <2 x float>
// CHECK: [[RES:%.*]] = call <2 x float> @llvm.aarch64.neon.vcmla.rot180.v2f32(<2 x float> %acc, <2 x float> %lhs, <2 x float> [[DUP_FLT]])
// CHECK: ret <2 x float> [[RES]]
float32x2_t test_vcmla_rot180_laneq_f32(float32x2_t acc, float32x2_t lhs, float32x4_t rhs) {
  return vcmla_rot180_laneq_f32(acc, lhs, rhs, 1);
}

// CHECK-LABEL: @test_vcmlaq_rot180_lane_f32(
// CHECK: [[CPLX:%.*]] = bitcast <2 x float> %rhs to i64
// CHECK: [[CPLX_VEC:%.*]] = insertelement <2 x i64> undef, i64 [[CPLX]], i32 0
// CHECK: [[DUP:%.*]] = shufflevector <2 x i64> [[CPLX_VEC]], <2 x i64> poison, <2 x i32> zeroinitializer
// CHECK: [[DUP_FLT:%.*]] = bitcast <2 x i64> [[DUP]] to <4 x float>
// CHECK: [[RES:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot180.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> [[DUP_FLT]])
// CHECK: ret <4 x float> [[RES]]
float32x4_t test_vcmlaq_rot180_lane_f32(float32x4_t acc, float32x4_t lhs, float32x2_t rhs) {
  return vcmlaq_rot180_lane_f32(acc, lhs, rhs, 0);
}

// CHECK-LABEL: @test_vcmlaq_rot180_laneq_f32(
// CHECK: [[CPLX:%.*]] = bitcast <4 x float> %rhs to <2 x i64>
// CHECK: [[DUP:%.*]] = shufflevector <2 x i64> [[CPLX]], <2 x i64> undef, <2 x i32> <i32 1, i32 1>
// CHECK: [[DUP_FLT:%.*]] = bitcast <2 x i64> [[DUP]] to <4 x float>
// CHECK: [[RES:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot180.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> [[DUP_FLT]])
// CHECK: ret <4 x float> [[RES]]
float32x4_t test_vcmlaq_rot180_laneq_f32(float32x4_t acc, float32x4_t lhs, float32x4_t rhs) {
  return vcmlaq_rot180_laneq_f32(acc, lhs, rhs, 1);
}

// Lane (rot270) variants: same lane-broadcast pattern, feeding the rot270
// vcmla intrinsic.
// CHECK-LABEL: @test_vcmla_rot270_lane_f16(
// CHECK: [[CPLX:%.*]] = bitcast <4 x half> %rhs to <2 x i32>
// CHECK: [[DUP:%.*]] = shufflevector <2 x i32> [[CPLX]], <2 x i32> undef, <2 x i32> <i32 1, i32 1>
// CHECK: [[DUP_FLT:%.*]] = bitcast <2 x i32> [[DUP]] to <4 x half>
// CHECK: [[RES:%.*]] = call <4 x half> @llvm.aarch64.neon.vcmla.rot270.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> [[DUP_FLT]])
// CHECK: ret <4 x half> [[RES]]
float16x4_t test_vcmla_rot270_lane_f16(float16x4_t acc, float16x4_t lhs, float16x4_t rhs) {
  return vcmla_rot270_lane_f16(acc, lhs, rhs, 1);
}

// ACLE says this exists, but it won't map to a single instruction if lane > 1.
// CHECK-LABEL: @test_vcmla_rot270_laneq_f16(
// CHECK: [[CPLX:%.*]] = bitcast <8 x half> %rhs to <4 x i32>
// CHECK: [[DUP:%.*]] = shufflevector <4 x i32> [[CPLX]], <4 x i32> undef, <2 x i32> <i32 3, i32 3>
// CHECK: [[DUP_FLT:%.*]] = bitcast <2 x i32> [[DUP]] to <4 x half>
// CHECK: [[RES:%.*]] = call <4 x half> @llvm.aarch64.neon.vcmla.rot270.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> [[DUP_FLT]])
// CHECK: ret <4 x half> [[RES]]
float16x4_t test_vcmla_rot270_laneq_f16(float16x4_t acc, float16x4_t lhs, float16x8_t rhs) {
  return vcmla_rot270_laneq_f16(acc, lhs, rhs, 3);
}

// CHECK-LABEL: @test_vcmlaq_rot270_lane_f16(
// CHECK: [[CPLX:%.*]] = bitcast <4 x half> %rhs to <2 x i32>
// CHECK: [[DUP:%.*]] = shufflevector <2 x i32> [[CPLX]], <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK: [[DUP_FLT:%.*]] = bitcast <4 x i32> [[DUP]] to <8 x half>
// CHECK: [[RES:%.*]] = call <8 x half> @llvm.aarch64.neon.vcmla.rot270.v8f16(<8 x half> %acc, <8 x half> %lhs, <8 x half> [[DUP_FLT]])
// CHECK: ret <8 x half> [[RES]]
float16x8_t test_vcmlaq_rot270_lane_f16(float16x8_t acc, float16x8_t lhs, float16x4_t rhs) {
  return vcmlaq_rot270_lane_f16(acc, lhs, rhs, 1);
}

// CHECK-LABEL: @test_vcmlaq_rot270_laneq_f16(
// CHECK: [[CPLX:%.*]] = bitcast <8 x half> %rhs to <4 x i32>
// CHECK: [[DUP:%.*]] = shufflevector <4 x i32> [[CPLX]], <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK: [[DUP_FLT:%.*]] = bitcast <4 x i32> [[DUP]] to <8 x half>
// CHECK: [[RES:%.*]] = call <8 x half> @llvm.aarch64.neon.vcmla.rot270.v8f16(<8 x half> %acc, <8 x half> %lhs, <8 x half> [[DUP_FLT]])
// CHECK: ret <8 x half> [[RES]]
float16x8_t test_vcmlaq_rot270_laneq_f16(float16x8_t acc, float16x8_t lhs, float16x8_t rhs) {
  return vcmlaq_rot270_laneq_f16(acc, lhs, rhs, 3);
}

// CHECK-LABEL: @test_vcmla_rot270_lane_f32(
// CHECK: [[RES:%.*]] = call <2 x float> @llvm.aarch64.neon.vcmla.rot270.v2f32(<2 x float> %acc, <2 x float> %lhs, <2 x float> %rhs)
// CHECK: ret <2 x float> [[RES]]
float32x2_t test_vcmla_rot270_lane_f32(float32x2_t acc, float32x2_t lhs, float32x2_t rhs) {
  return vcmla_rot270_lane_f32(acc, lhs, rhs, 0);
}

// ACLE says this exists, but it won't map to a single instruction if lane > 1.
// CHECK-LABEL: @test_vcmla_rot270_laneq_f32(
// CHECK: [[CPLX:%.*]] = bitcast <4 x float> %rhs to <2 x i64>
// CHECK: [[DUP:%.*]] = shufflevector <2 x i64> [[CPLX]], <2 x i64> undef, <1 x i32> <i32 1>
// CHECK: [[DUP_FLT:%.*]] = bitcast <1 x i64> [[DUP]] to <2 x float>
// CHECK: [[RES:%.*]] = call <2 x float> @llvm.aarch64.neon.vcmla.rot270.v2f32(<2 x float> %acc, <2 x float> %lhs, <2 x float> [[DUP_FLT]])
// CHECK: ret <2 x float> [[RES]]
float32x2_t test_vcmla_rot270_laneq_f32(float32x2_t acc, float32x2_t lhs, float32x4_t rhs) {
  return vcmla_rot270_laneq_f32(acc, lhs, rhs, 1);
}

// CHECK-LABEL: @test_vcmlaq_rot270_lane_f32(
// CHECK: [[CPLX:%.*]] = bitcast <2 x float> %rhs to i64
// CHECK: [[CPLX_VEC:%.*]] = insertelement <2 x i64> undef, i64 [[CPLX]], i32 0
// CHECK: [[DUP:%.*]] = shufflevector <2 x i64> [[CPLX_VEC]], <2 x i64> poison, <2 x i32> zeroinitializer
// CHECK: [[DUP_FLT:%.*]] = bitcast <2 x i64> [[DUP]] to <4 x float>
// CHECK: [[RES:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot270.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> [[DUP_FLT]])
// CHECK: ret <4 x float> [[RES]]
float32x4_t test_vcmlaq_rot270_lane_f32(float32x4_t acc, float32x4_t lhs, float32x2_t rhs) {
  return vcmlaq_rot270_lane_f32(acc, lhs, rhs, 0);
}

// CHECK-LABEL: @test_vcmlaq_rot270_laneq_f32(
// CHECK: [[CPLX:%.*]] = bitcast <4 x float> %rhs to <2 x i64>
// CHECK: [[DUP:%.*]] = shufflevector <2 x i64> [[CPLX]], <2 x i64> undef, <2 x i32> <i32 1, i32 1>
// CHECK: [[DUP_FLT:%.*]] = bitcast <2 x i64> [[DUP]] to <4 x float>
// CHECK: [[RES:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot270.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> [[DUP_FLT]])
// CHECK: ret <4 x float> [[RES]]
float32x4_t test_vcmlaq_rot270_laneq_f32(float32x4_t acc, float32x4_t lhs, float32x4_t rhs) {
  return vcmlaq_rot270_laneq_f32(acc, lhs, rhs, 1);
}