// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -target-cpu cyclone \
// RUN: -disable-O0-optnone -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s

// Test new AArch64 intrinsics and types

#include <arm_neon.h>
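
// Each scalar-by-lane test below checks that the intrinsic lowers to an
// extractelement of the selected lane followed by the corresponding scalar
// operation (the RUN line pipes through mem2reg so only the essential IR
// remains after the -O0 allocas are promoted).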


// CHECK-LABEL: define float @test_vmuls_lane_f32(float %a, <2 x float> %b) #0 {
// CHECK:   [[VGET_LANE:%.*]] = extractelement <2 x float> %b, i32 1
// CHECK:   [[MUL:%.*]] = fmul float %a, [[VGET_LANE]]
// CHECK:   ret float [[MUL]]
float32_t test_vmuls_lane_f32(float32_t a, float32x2_t b) {
  return vmuls_lane_f32(a, b, 1);
}

// CHECK-LABEL: define double @test_vmuld_lane_f64(double %a, <1 x double> %b) #0 {
// CHECK:   [[VGET_LANE:%.*]] = extractelement <1 x double> %b, i32 0
// CHECK:   [[MUL:%.*]] = fmul double %a, [[VGET_LANE]]
// CHECK:   ret double [[MUL]]
float64_t test_vmuld_lane_f64(float64_t a, float64x1_t b) {
  return vmuld_lane_f64(a, b, 0);
}

// CHECK-LABEL: define float @test_vmuls_laneq_f32(float %a, <4 x float> %b) #1 {
// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <4 x float> %b, i32 3
// CHECK:   [[MUL:%.*]] = fmul float %a, [[VGETQ_LANE]]
// CHECK:   ret float [[MUL]]
float32_t test_vmuls_laneq_f32(float32_t a, float32x4_t b) {
  return vmuls_laneq_f32(a, b, 3);
}

// CHECK-LABEL: define double @test_vmuld_laneq_f64(double %a, <2 x double> %b) #1 {
// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <2 x double> %b, i32 1
// CHECK:   [[MUL:%.*]] = fmul double %a, [[VGETQ_LANE]]
// CHECK:   ret double [[MUL]]
float64_t test_vmuld_laneq_f64(float64_t a, float64x2_t b) {
  return vmuld_laneq_f64(a, b, 1);
}

// CHECK-LABEL: define <1 x double> @test_vmul_n_f64(<1 x double> %a, double %b) #0 {
// CHECK:   [[TMP2:%.*]] = bitcast <1 x double> %a to double
// CHECK:   [[TMP3:%.*]] = fmul double [[TMP2]], %b
// CHECK:   [[TMP4:%.*]] = bitcast double [[TMP3]] to <1 x double>
// CHECK:   ret <1 x double> [[TMP4]]
float64x1_t test_vmul_n_f64(float64x1_t a, float64_t b) {
  return vmul_n_f64(a, b);
}

// CHECK-LABEL: define float @test_vmulxs_lane_f32(float %a, <2 x float> %b) #0 {
// CHECK:   [[VGET_LANE:%.*]] = extractelement <2 x float> %b, i32 1
// CHECK:   [[VMULXS_F32_I:%.*]] = call float @llvm.aarch64.neon.fmulx.f32(float %a, float [[VGET_LANE]])
// CHECK:   ret float [[VMULXS_F32_I]]
float32_t test_vmulxs_lane_f32(float32_t a, float32x2_t b) {
  return vmulxs_lane_f32(a, b, 1);
}

// CHECK-LABEL: define float @test_vmulxs_laneq_f32(float %a, <4 x float> %b) #1 {
// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <4 x float> %b, i32 3
// CHECK:   [[VMULXS_F32_I:%.*]] = call float @llvm.aarch64.neon.fmulx.f32(float %a, float [[VGETQ_LANE]])
// CHECK:   ret float [[VMULXS_F32_I]]
float32_t test_vmulxs_laneq_f32(float32_t a, float32x4_t b) {
  return vmulxs_laneq_f32(a, b, 3);
}

// CHECK-LABEL: define double @test_vmulxd_lane_f64(double %a, <1 x double> %b) #0 {
// CHECK:   [[VGET_LANE:%.*]] = extractelement <1 x double> %b, i32 0
// CHECK:   [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double %a, double [[VGET_LANE]])
// CHECK:   ret double [[VMULXD_F64_I]]
float64_t test_vmulxd_lane_f64(float64_t a, float64x1_t b) {
  return vmulxd_lane_f64(a, b, 0);
}

// CHECK-LABEL: define double @test_vmulxd_laneq_f64(double %a, <2 x double> %b) #1 {
// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <2 x double> %b, i32 1
// CHECK:   [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double %a, double [[VGETQ_LANE]])
// CHECK:   ret double [[VMULXD_F64_I]]
float64_t test_vmulxd_laneq_f64(float64_t a, float64x2_t b) {
  return vmulxd_laneq_f64(a, b, 1);
}

// CHECK-LABEL: define <1 x double> @test_vmulx_lane_f64(<1 x double> %a, <1 x double> %b) #0 {
// CHECK:   [[VGET_LANE:%.*]] = extractelement <1 x double> %a, i32 0
// CHECK:   [[VGET_LANE6:%.*]] = extractelement <1 x double> %b, i32 0
// CHECK:   [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGET_LANE6]])
// CHECK:   [[VSET_LANE:%.*]] = insertelement <1 x double> %a, double [[VMULXD_F64_I]], i32 0
// CHECK:   ret <1 x double> [[VSET_LANE]]
float64x1_t test_vmulx_lane_f64(float64x1_t a, float64x1_t b) {
  return vmulx_lane_f64(a, b, 0);
}


// CHECK-LABEL: define <1 x double> @test_vmulx_laneq_f64_0(<1 x double> %a, <2 x double> %b) #1 {
// CHECK:   [[VGET_LANE:%.*]] = extractelement <1 x double> %a, i32 0
// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <2 x double> %b, i32 0
// CHECK:   [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGETQ_LANE]])
// CHECK:   [[VSET_LANE:%.*]] = insertelement <1 x double> %a, double [[VMULXD_F64_I]], i32 0
// CHECK:   ret <1 x double> [[VSET_LANE]]
float64x1_t test_vmulx_laneq_f64_0(float64x1_t a, float64x2_t b) {
  return vmulx_laneq_f64(a, b, 0);
}

// CHECK-LABEL: define <1 x double> @test_vmulx_laneq_f64_1(<1 x double> %a, <2 x double> %b) #1 {
// CHECK:   [[VGET_LANE:%.*]] = extractelement <1 x double> %a, i32 0
// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <2 x double> %b, i32 1
// CHECK:   [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGETQ_LANE]])
// CHECK:   [[VSET_LANE:%.*]] = insertelement <1 x double> %a, double [[VMULXD_F64_I]], i32 0
// CHECK:   ret <1 x double> [[VSET_LANE]]
float64x1_t test_vmulx_laneq_f64_1(float64x1_t a, float64x2_t b) {
  return vmulx_laneq_f64(a, b, 1);
}


// CHECK-LABEL: define float @test_vfmas_lane_f32(float %a, float %b, <2 x float> %c) #0 {
// CHECK:   [[EXTRACT:%.*]] = extractelement <2 x float> %c, i32 1
// CHECK:   [[TMP2:%.*]] = call float @llvm.fma.f32(float %b, float [[EXTRACT]], float %a)
// CHECK:   ret float [[TMP2]]
float32_t test_vfmas_lane_f32(float32_t a, float32_t b, float32x2_t c) {
  return vfmas_lane_f32(a, b, c, 1);
}

// CHECK-LABEL: define double @test_vfmad_lane_f64(double %a, double %b, <1 x double> %c) #0 {
// CHECK:   [[EXTRACT:%.*]] = extractelement <1 x double> %c, i32 0
// CHECK:   [[TMP2:%.*]] = call double @llvm.fma.f64(double %b, double [[EXTRACT]], double %a)
// CHECK:   ret double [[TMP2]]
float64_t test_vfmad_lane_f64(float64_t a, float64_t b, float64x1_t c) {
  return vfmad_lane_f64(a, b, c, 0);
}

// CHECK-LABEL: define double @test_vfmad_laneq_f64(double %a, double %b, <2 x double> %c) #1 {
// CHECK:   [[EXTRACT:%.*]] = extractelement <2 x double> %c, i32 1
// CHECK:   [[TMP2:%.*]] = call double @llvm.fma.f64(double %b, double [[EXTRACT]], double %a)
// CHECK:   ret double [[TMP2]]
float64_t test_vfmad_laneq_f64(float64_t a, float64_t b, float64x2_t c) {
  return vfmad_laneq_f64(a, b, c, 1);
}

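// The fused multiply-subtract forms negate the multiplicand first, so the
// expected IR is an fneg feeding the same llvm.fma call as the vfma tests.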
// CHECK-LABEL: define float @test_vfmss_lane_f32(float %a, float %b, <2 x float> %c) #0 {
// CHECK:   [[SUB:%.*]] = fneg float %b
// CHECK:   [[EXTRACT:%.*]] = extractelement <2 x float> %c, i32 1
// CHECK:   [[TMP2:%.*]] = call float @llvm.fma.f32(float [[SUB]], float [[EXTRACT]], float %a)
// CHECK:   ret float [[TMP2]]
float32_t test_vfmss_lane_f32(float32_t a, float32_t b, float32x2_t c) {
  return vfmss_lane_f32(a, b, c, 1);
}

// CHECK-LABEL: define <1 x double> @test_vfma_lane_f64(<1 x double> %a, <1 x double> %b, <1 x double> %v) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <1 x double> %v to <8 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double>
// CHECK:   [[LANE:%.*]] = shufflevector <1 x double> [[TMP3]], <1 x double> [[TMP3]], <1 x i32> zeroinitializer
// CHECK:   [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
// CHECK:   [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
// CHECK:   [[FMLA2:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[FMLA]], <1 x double> [[LANE]], <1 x double> [[FMLA1]])
// CHECK:   ret <1 x double> [[FMLA2]]
float64x1_t test_vfma_lane_f64(float64x1_t a, float64x1_t b, float64x1_t v) {
  return vfma_lane_f64(a, b, v, 0);
}

// CHECK-LABEL: define <1 x double> @test_vfms_lane_f64(<1 x double> %a, <1 x double> %b, <1 x double> %v) #0 {
// CHECK:   [[SUB:%.*]] = fneg <1 x double> %b
// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <1 x double> [[SUB]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <1 x double> %v to <8 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double>
// CHECK:   [[LANE:%.*]] = shufflevector <1 x double> [[TMP3]], <1 x double> [[TMP3]], <1 x i32> zeroinitializer
// CHECK:   [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
// CHECK:   [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
// CHECK:   [[FMLA2:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[FMLA]], <1 x double> [[LANE]], <1 x double> [[FMLA1]])
// CHECK:   ret <1 x double> [[FMLA2]]
float64x1_t test_vfms_lane_f64(float64x1_t a, float64x1_t b, float64x1_t v) {
  return vfms_lane_f64(a, b, v, 0);
}

// CHECK-LABEL: define <1 x double> @test_vfma_laneq_f64(<1 x double> %a, <1 x double> %b, <2 x double> %v) #1 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <2 x double> %v to <16 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to double
// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to double
// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
// CHECK:   [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP5]], i32 0
// CHECK:   [[TMP6:%.*]] = call double @llvm.fma.f64(double [[TMP4]], double [[EXTRACT]], double [[TMP3]])
// CHECK:   [[TMP7:%.*]] = bitcast double [[TMP6]] to <1 x double>
// CHECK:   ret <1 x double> [[TMP7]]
float64x1_t test_vfma_laneq_f64(float64x1_t a, float64x1_t b, float64x2_t v) {
  return vfma_laneq_f64(a, b, v, 0);
}

// CHECK-LABEL: define <1 x double> @test_vfms_laneq_f64(<1 x double> %a, <1 x double> %b, <2 x double> %v) #1 {
// CHECK:   [[SUB:%.*]] = fneg <1 x double> %b
// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <1 x double> [[SUB]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <2 x double> %v to <16 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to double
// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to double
// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
// CHECK:   [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP5]], i32 0
// CHECK:   [[TMP6:%.*]] = call double @llvm.fma.f64(double [[TMP4]], double [[EXTRACT]], double [[TMP3]])
// CHECK:   [[TMP7:%.*]] = bitcast double [[TMP6]] to <1 x double>
// CHECK:   ret <1 x double> [[TMP7]]
float64x1_t test_vfms_laneq_f64(float64x1_t a, float64x1_t b, float64x2_t v) {
  return vfms_laneq_f64(a, b, v, 0);
}

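// The 16-bit saturating-doubling intrinsics have no scalar IR form here, so
// the operands are inserted into <4 x i16> vectors, the vector intrinsic is
// called, and lane 0 of the result is extracted; the 32-bit variants use the
// scalar llvm.aarch64.neon.sqdmulls.scalar / sqdmulh.i32 intrinsics directly.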
// CHECK-LABEL: define i32 @test_vqdmullh_lane_s16(i16 %a, <4 x i16> %b) #0 {
// CHECK:   [[VGET_LANE:%.*]] = extractelement <4 x i16> %b, i32 3
// CHECK:   [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
// CHECK:   [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[VGET_LANE]], i64 0
// CHECK:   [[VQDMULLH_S16_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]])
// CHECK:   [[TMP4:%.*]] = extractelement <4 x i32> [[VQDMULLH_S16_I]], i64 0
// CHECK:   ret i32 [[TMP4]]
int32_t test_vqdmullh_lane_s16(int16_t a, int16x4_t b) {
  return vqdmullh_lane_s16(a, b, 3);
}

// CHECK-LABEL: define i64 @test_vqdmulls_lane_s32(i32 %a, <2 x i32> %b) #0 {
// CHECK:   [[VGET_LANE:%.*]] = extractelement <2 x i32> %b, i32 1
// CHECK:   [[VQDMULLS_S32_I:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %a, i32 [[VGET_LANE]])
// CHECK:   ret i64 [[VQDMULLS_S32_I]]
int64_t test_vqdmulls_lane_s32(int32_t a, int32x2_t b) {
  return vqdmulls_lane_s32(a, b, 1);
}

// CHECK-LABEL: define i32 @test_vqdmullh_laneq_s16(i16 %a, <8 x i16> %b) #1 {
// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <8 x i16> %b, i32 7
// CHECK:   [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
// CHECK:   [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[VGETQ_LANE]], i64 0
// CHECK:   [[VQDMULLH_S16_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]])
// CHECK:   [[TMP4:%.*]] = extractelement <4 x i32> [[VQDMULLH_S16_I]], i64 0
// CHECK:   ret i32 [[TMP4]]
int32_t test_vqdmullh_laneq_s16(int16_t a, int16x8_t b) {
  return vqdmullh_laneq_s16(a, b, 7);
}

// CHECK-LABEL: define i64 @test_vqdmulls_laneq_s32(i32 %a, <4 x i32> %b) #1 {
// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <4 x i32> %b, i32 3
// CHECK:   [[VQDMULLS_S32_I:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %a, i32 [[VGETQ_LANE]])
// CHECK:   ret i64 [[VQDMULLS_S32_I]]
int64_t test_vqdmulls_laneq_s32(int32_t a, int32x4_t b) {
  return vqdmulls_laneq_s32(a, b, 3);
}

// CHECK-LABEL: define i16 @test_vqdmulhh_lane_s16(i16 %a, <4 x i16> %b) #0 {
// CHECK:   [[VGET_LANE:%.*]] = extractelement <4 x i16> %b, i32 3
// CHECK:   [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
// CHECK:   [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[VGET_LANE]], i64 0
// CHECK:   [[VQDMULHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]])
// CHECK:   [[TMP4:%.*]] = extractelement <4 x i16> [[VQDMULHH_S16_I]], i64 0
// CHECK:   ret i16 [[TMP4]]
int16_t test_vqdmulhh_lane_s16(int16_t a, int16x4_t b) {
  return vqdmulhh_lane_s16(a, b, 3);
}

// CHECK-LABEL: define i32 @test_vqdmulhs_lane_s32(i32 %a, <2 x i32> %b) #0 {
// CHECK:   [[VGET_LANE:%.*]] = extractelement <2 x i32> %b, i32 1
// CHECK:   [[VQDMULHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqdmulh.i32(i32 %a, i32 [[VGET_LANE]])
// CHECK:   ret i32 [[VQDMULHS_S32_I]]
int32_t test_vqdmulhs_lane_s32(int32_t a, int32x2_t b) {
  return vqdmulhs_lane_s32(a, b, 1);
}


// CHECK-LABEL: define i16 @test_vqdmulhh_laneq_s16(i16 %a, <8 x i16> %b) #1 {
// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <8 x i16> %b, i32 7
// CHECK:   [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
// CHECK:   [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[VGETQ_LANE]], i64 0
// CHECK:   [[VQDMULHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]])
// CHECK:   [[TMP4:%.*]] = extractelement <4 x i16> [[VQDMULHH_S16_I]], i64 0
// CHECK:   ret i16 [[TMP4]]
int16_t test_vqdmulhh_laneq_s16(int16_t a, int16x8_t b) {
  return vqdmulhh_laneq_s16(a, b, 7);
}


// CHECK-LABEL: define i32 @test_vqdmulhs_laneq_s32(i32 %a, <4 x i32> %b) #1 {
// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <4 x i32> %b, i32 3
// CHECK:   [[VQDMULHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqdmulh.i32(i32 %a, i32 [[VGETQ_LANE]])
// CHECK:   ret i32 [[VQDMULHS_S32_I]]
int32_t test_vqdmulhs_laneq_s32(int32_t a, int32x4_t b) {
  return vqdmulhs_laneq_s32(a, b, 3);
}

// CHECK-LABEL: define i16 @test_vqrdmulhh_lane_s16(i16 %a, <4 x i16> %b) #0 {
// CHECK:   [[VGET_LANE:%.*]] = extractelement <4 x i16> %b, i32 3
// CHECK:   [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
// CHECK:   [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[VGET_LANE]], i64 0
// CHECK:   [[VQRDMULHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]])
// CHECK:   [[TMP4:%.*]] = extractelement <4 x i16> [[VQRDMULHH_S16_I]], i64 0
// CHECK:   ret i16 [[TMP4]]
int16_t test_vqrdmulhh_lane_s16(int16_t a, int16x4_t b) {
  return vqrdmulhh_lane_s16(a, b, 3);
}

// CHECK-LABEL: define i32 @test_vqrdmulhs_lane_s32(i32 %a, <2 x i32> %b) #0 {
// CHECK:   [[VGET_LANE:%.*]] = extractelement <2 x i32> %b, i32 1
// CHECK:   [[VQRDMULHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %a, i32 [[VGET_LANE]])
// CHECK:   ret i32 [[VQRDMULHS_S32_I]]
int32_t test_vqrdmulhs_lane_s32(int32_t a, int32x2_t b) {
  return vqrdmulhs_lane_s32(a, b, 1);
}


// CHECK-LABEL: define i16 @test_vqrdmulhh_laneq_s16(i16 %a, <8 x i16> %b) #1 {
// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <8 x i16> %b, i32 7
// CHECK:   [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
// CHECK:   [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[VGETQ_LANE]], i64 0
// CHECK:   [[VQRDMULHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]])
// CHECK:   [[TMP4:%.*]] = extractelement <4 x i16> [[VQRDMULHH_S16_I]], i64 0
// CHECK:   ret i16 [[TMP4]]
int16_t test_vqrdmulhh_laneq_s16(int16_t a, int16x8_t b) {
  return vqrdmulhh_laneq_s16(a, b, 7);
}


// CHECK-LABEL: define i32 @test_vqrdmulhs_laneq_s32(i32 %a, <4 x i32> %b) #1 {
// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <4 x i32> %b, i32 3
// CHECK:   [[VQRDMULHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %a, i32 [[VGETQ_LANE]])
// CHECK:   ret i32 [[VQRDMULHS_S32_I]]
int32_t test_vqrdmulhs_laneq_s32(int32_t a, int32x4_t b) {
  return vqrdmulhs_laneq_s32(a, b, 3);
}

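// The saturating multiply-accumulate forms compute the doubling multiply as
// above and then combine it with the accumulator through a scalar
// llvm.aarch64.neon.sqadd (vqdmlal) or sqsub (vqdmlsl) call.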
// CHECK-LABEL: define i32 @test_vqdmlalh_lane_s16(i32 %a, i16 %b, <4 x i16> %c) #0 {
// CHECK:   [[LANE:%.*]] = extractelement <4 x i16> %c, i32 3
// CHECK:   [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0
// CHECK:   [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[LANE]], i64 0
// CHECK:   [[VQDMLXL:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]])
// CHECK:   [[LANE0:%.*]] = extractelement <4 x i32> [[VQDMLXL]], i64 0
// CHECK:   [[VQDMLXL1:%.*]] = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %a, i32 [[LANE0]])
// CHECK:   ret i32 [[VQDMLXL1]]
int32_t test_vqdmlalh_lane_s16(int32_t a, int16_t b, int16x4_t c) {
  return vqdmlalh_lane_s16(a, b, c, 3);
}

// CHECK-LABEL: define i64 @test_vqdmlals_lane_s32(i64 %a, i32 %b, <2 x i32> %c) #0 {
// CHECK:   [[LANE:%.*]] = extractelement <2 x i32> %c, i32 1
// CHECK:   [[VQDMLXL:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %b, i32 [[LANE]])
// CHECK:   [[VQDMLXL1:%.*]] = call i64 @llvm.aarch64.neon.sqadd.i64(i64 %a, i64 [[VQDMLXL]])
// CHECK:   ret i64 [[VQDMLXL1]]
int64_t test_vqdmlals_lane_s32(int64_t a, int32_t b, int32x2_t c) {
  return vqdmlals_lane_s32(a, b, c, 1);
}

// CHECK-LABEL: define i32 @test_vqdmlalh_laneq_s16(i32 %a, i16 %b, <8 x i16> %c) #1 {
// CHECK:   [[LANE:%.*]] = extractelement <8 x i16> %c, i32 7
// CHECK:   [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0
// CHECK:   [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[LANE]], i64 0
// CHECK:   [[VQDMLXL:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]])
// CHECK:   [[LANE0:%.*]] = extractelement <4 x i32> [[VQDMLXL]], i64 0
// CHECK:   [[VQDMLXL1:%.*]] = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %a, i32 [[LANE0]])
// CHECK:   ret i32 [[VQDMLXL1]]
int32_t test_vqdmlalh_laneq_s16(int32_t a, int16_t b, int16x8_t c) {
  return vqdmlalh_laneq_s16(a, b, c, 7);
}

// CHECK-LABEL: define i64 @test_vqdmlals_laneq_s32(i64 %a, i32 %b, <4 x i32> %c) #1 {
// CHECK:   [[LANE:%.*]] = extractelement <4 x i32> %c, i32 3
// CHECK:   [[VQDMLXL:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %b, i32 [[LANE]])
// CHECK:   [[VQDMLXL1:%.*]] = call i64 @llvm.aarch64.neon.sqadd.i64(i64 %a, i64 [[VQDMLXL]])
// CHECK:   ret i64 [[VQDMLXL1]]
int64_t test_vqdmlals_laneq_s32(int64_t a, int32_t b, int32x4_t c) {
  return vqdmlals_laneq_s32(a, b, c, 3);
}

// CHECK-LABEL: define i32 @test_vqdmlslh_lane_s16(i32 %a, i16 %b, <4 x i16> %c) #0 {
// CHECK:   [[LANE:%.*]] = extractelement <4 x i16> %c, i32 3
// CHECK:   [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0
// CHECK:   [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[LANE]], i64 0
// CHECK:   [[VQDMLXL:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]])
// CHECK:   [[LANE0:%.*]] = extractelement <4 x i32> [[VQDMLXL]], i64 0
// CHECK:   [[VQDMLXL1:%.*]] = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %a, i32 [[LANE0]])
// CHECK:   ret i32 [[VQDMLXL1]]
int32_t test_vqdmlslh_lane_s16(int32_t a, int16_t b, int16x4_t c) {
  return vqdmlslh_lane_s16(a, b, c, 3);
}

// CHECK-LABEL: define i64 @test_vqdmlsls_lane_s32(i64 %a, i32 %b, <2 x i32> %c) #0 {
// CHECK:   [[LANE:%.*]] = extractelement <2 x i32> %c, i32 1
// CHECK:   [[VQDMLXL:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %b, i32 [[LANE]])
// CHECK:   [[VQDMLXL1:%.*]] = call i64 @llvm.aarch64.neon.sqsub.i64(i64 %a, i64 [[VQDMLXL]])
// CHECK:   ret i64 [[VQDMLXL1]]
int64_t test_vqdmlsls_lane_s32(int64_t a, int32_t b, int32x2_t c) {
  return vqdmlsls_lane_s32(a, b, c, 1);
}

// CHECK-LABEL: define i32 @test_vqdmlslh_laneq_s16(i32 %a, i16 %b, <8 x i16> %c) #1 {
// CHECK:   [[LANE:%.*]] = extractelement <8 x i16> %c, i32 7
// CHECK:   [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0
// CHECK:   [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[LANE]], i64 0
// CHECK:   [[VQDMLXL:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]])
// CHECK:   [[LANE0:%.*]] = extractelement <4 x i32> [[VQDMLXL]], i64 0
// CHECK:   [[VQDMLXL1:%.*]] = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %a, i32 [[LANE0]])
// CHECK:   ret i32 [[VQDMLXL1]]
int32_t test_vqdmlslh_laneq_s16(int32_t a, int16_t b, int16x8_t c) {
  return vqdmlslh_laneq_s16(a, b, c, 7);
}

// CHECK-LABEL: define i64 @test_vqdmlsls_laneq_s32(i64 %a, i32 %b, <4 x i32> %c) #1 {
// CHECK:   [[LANE:%.*]] = extractelement <4 x i32> %c, i32 3
// CHECK:   [[VQDMLXL:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %b, i32 [[LANE]])
// CHECK:   [[VQDMLXL1:%.*]] = call i64 @llvm.aarch64.neon.sqsub.i64(i64 %a, i64 [[VQDMLXL]])
// CHECK:   ret i64 [[VQDMLXL1]]
int64_t test_vqdmlsls_laneq_s32(int64_t a, int32_t b, int32x4_t c) {
  return vqdmlsls_laneq_s32(a, b, c, 3);
}

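// With constant inputs the vcreate_f64/vcombine_f64 setup folds away after
// mem2reg, so the operands appear in the IR as bitcasts of the raw i64 bit
// patterns passed to vcreate_f64.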
// CHECK-LABEL: define <1 x double> @test_vmulx_lane_f64_0() #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i64 4599917171378402754 to <1 x double>
// CHECK:   [[TMP1:%.*]] = bitcast i64 4606655882138939123 to <1 x double>
// CHECK:   [[VGET_LANE:%.*]] = extractelement <1 x double> [[TMP0]], i32 0
// CHECK:   [[VGET_LANE7:%.*]] = extractelement <1 x double> [[TMP1]], i32 0
// CHECK:   [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGET_LANE7]])
// CHECK:   [[VSET_LANE:%.*]] = insertelement <1 x double> [[TMP0]], double [[VMULXD_F64_I]], i32 0
// CHECK:   ret <1 x double> [[VSET_LANE]]
float64x1_t test_vmulx_lane_f64_0() {
  float64x1_t arg1;
  float64x1_t arg2;
  float64x1_t result;
  arg1 = vcreate_f64(UINT64_C(0x3fd6304bc43ab5c2));
  arg2 = vcreate_f64(UINT64_C(0x3fee211e215aeef3));
  result = vmulx_lane_f64(arg1, arg2, 0);
  return result;
}

// CHECK-LABEL: define <1 x double> @test_vmulx_laneq_f64_2() #1 {
// CHECK:   [[TMP0:%.*]] = bitcast i64 4599917171378402754 to <1 x double>
// CHECK:   [[TMP1:%.*]] = bitcast i64 4606655882138939123 to <1 x double>
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <1 x double> [[TMP0]], <1 x double> [[TMP1]], <2 x i32> <i32 0, i32 1>
// CHECK:   [[VGET_LANE:%.*]] = extractelement <1 x double> [[TMP0]], i32 0
// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <2 x double> [[SHUFFLE_I]], i32 1
// CHECK:   [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGETQ_LANE]])
// CHECK:   [[VSET_LANE:%.*]] = insertelement <1 x double> [[TMP0]], double [[VMULXD_F64_I]], i32 0
// CHECK:   ret <1 x double> [[VSET_LANE]]
float64x1_t test_vmulx_laneq_f64_2() {
  float64x1_t arg1;
  float64x1_t arg2;
  float64x2_t arg3;
  float64x1_t result;
  arg1 = vcreate_f64(UINT64_C(0x3fd6304bc43ab5c2));
  arg2 = vcreate_f64(UINT64_C(0x3fee211e215aeef3));
  arg3 = vcombine_f64(arg1, arg2);
  result = vmulx_laneq_f64(arg1, arg3, 1);
  return result;
}

// CHECK: attributes #0 ={{.*}}"min-legal-vector-width"="64"
// CHECK: attributes #1 ={{.*}}"min-legal-vector-width"="128"