// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -disable-O0-optnone -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s

// Test new aarch64 intrinsics and types

#include <arm_neon.h>

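// vmla_lane/vmls_lane compute result[i] = a[i] +/- b[i] * v[lane]. The
// checks below verify that the selected lane is splatted with a
// shufflevector and the accumulate step is emitted as plain vector
// mul followed by add (or sub for the vmls forms).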
// CHECK-LABEL: @test_vmla_lane_s16(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT:    [[ADD:%.*]] = add <4 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT:    ret <4 x i16> [[ADD]]
//
int16x4_t test_vmla_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v) {
  return vmla_lane_s16(a, b, v, 3);
}

// CHECK-LABEL: @test_vmlaq_lane_s16(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT:    [[ADD:%.*]] = add <8 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT:    ret <8 x i16> [[ADD]]
//
int16x8_t test_vmlaq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v) {
  return vmlaq_lane_s16(a, b, v, 3);
}

// CHECK-LABEL: @test_vmla_lane_s32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
// CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT:    [[ADD:%.*]] = add <2 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT:    ret <2 x i32> [[ADD]]
//
int32x2_t test_vmla_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v) {
  return vmla_lane_s32(a, b, v, 1);
}

// CHECK-LABEL: @test_vmlaq_lane_s32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT:    ret <4 x i32> [[ADD]]
//
int32x4_t test_vmlaq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v) {
  return vmlaq_lane_s32(a, b, v, 1);
}

// CHECK-LABEL: @test_vmla_laneq_s16(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT:    [[ADD:%.*]] = add <4 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT:    ret <4 x i16> [[ADD]]
//
int16x4_t test_vmla_laneq_s16(int16x4_t a, int16x4_t b, int16x8_t v) {
  return vmla_laneq_s16(a, b, v, 7);
}

// CHECK-LABEL: @test_vmlaq_laneq_s16(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT:    [[ADD:%.*]] = add <8 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT:    ret <8 x i16> [[ADD]]
//
int16x8_t test_vmlaq_laneq_s16(int16x8_t a, int16x8_t b, int16x8_t v) {
  return vmlaq_laneq_s16(a, b, v, 7);
}

// CHECK-LABEL: @test_vmla_laneq_s32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
// CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT:    [[ADD:%.*]] = add <2 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT:    ret <2 x i32> [[ADD]]
//
int32x2_t test_vmla_laneq_s32(int32x2_t a, int32x2_t b, int32x4_t v) {
  return vmla_laneq_s32(a, b, v, 3);
}

// CHECK-LABEL: @test_vmlaq_laneq_s32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT:    ret <4 x i32> [[ADD]]
//
int32x4_t test_vmlaq_laneq_s32(int32x4_t a, int32x4_t b, int32x4_t v) {
  return vmlaq_laneq_s32(a, b, v, 3);
}

// CHECK-LABEL: @test_vmls_lane_s16(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT:    ret <4 x i16> [[SUB]]
//
int16x4_t test_vmls_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v) {
  return vmls_lane_s16(a, b, v, 3);
}

// CHECK-LABEL: @test_vmlsq_lane_s16(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT:    [[SUB:%.*]] = sub <8 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT:    ret <8 x i16> [[SUB]]
//
int16x8_t test_vmlsq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v) {
  return vmlsq_lane_s16(a, b, v, 3);
}

// CHECK-LABEL: @test_vmls_lane_s32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
// CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT:    ret <2 x i32> [[SUB]]
//
int32x2_t test_vmls_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v) {
  return vmls_lane_s32(a, b, v, 1);
}

// CHECK-LABEL: @test_vmlsq_lane_s32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT:    ret <4 x i32> [[SUB]]
//
int32x4_t test_vmlsq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v) {
  return vmlsq_lane_s32(a, b, v, 1);
}

// CHECK-LABEL: @test_vmls_laneq_s16(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT:    ret <4 x i16> [[SUB]]
//
int16x4_t test_vmls_laneq_s16(int16x4_t a, int16x4_t b, int16x8_t v) {
  return vmls_laneq_s16(a, b, v, 7);
}

// CHECK-LABEL: @test_vmlsq_laneq_s16(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT:    [[SUB:%.*]] = sub <8 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT:    ret <8 x i16> [[SUB]]
//
int16x8_t test_vmlsq_laneq_s16(int16x8_t a, int16x8_t b, int16x8_t v) {
  return vmlsq_laneq_s16(a, b, v, 7);
}

// CHECK-LABEL: @test_vmls_laneq_s32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
// CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT:    ret <2 x i32> [[SUB]]
//
int32x2_t test_vmls_laneq_s32(int32x2_t a, int32x2_t b, int32x4_t v) {
  return vmls_laneq_s32(a, b, v, 3);
}

// CHECK-LABEL: @test_vmlsq_laneq_s32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT:    ret <4 x i32> [[SUB]]
//
int32x4_t test_vmlsq_laneq_s32(int32x4_t a, int32x4_t b, int32x4_t v) {
  return vmlsq_laneq_s32(a, b, v, 3);
}

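// vmul_lane/vmul_laneq multiply every element of a by one lane of v. The
// _laneq forms index into a 128-bit vector, so the lane immediate ranges up
// to 7 (i16) or 3 (i32) instead of 3/1 for the 64-bit _lane forms.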
// CHECK-LABEL: @test_vmul_lane_s16(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[LANE]]
// CHECK-NEXT:    ret <4 x i16> [[MUL]]
//
int16x4_t test_vmul_lane_s16(int16x4_t a, int16x4_t v) {
  return vmul_lane_s16(a, v, 3);
}

// CHECK-LABEL: @test_vmulq_lane_s16(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[LANE]]
// CHECK-NEXT:    ret <8 x i16> [[MUL]]
//
int16x8_t test_vmulq_lane_s16(int16x8_t a, int16x4_t v) {
  return vmulq_lane_s16(a, v, 3);
}

// CHECK-LABEL: @test_vmul_lane_s32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
// CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[LANE]]
// CHECK-NEXT:    ret <2 x i32> [[MUL]]
//
int32x2_t test_vmul_lane_s32(int32x2_t a, int32x2_t v) {
  return vmul_lane_s32(a, v, 1);
}

// CHECK-LABEL: @test_vmulq_lane_s32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[LANE]]
// CHECK-NEXT:    ret <4 x i32> [[MUL]]
//
int32x4_t test_vmulq_lane_s32(int32x4_t a, int32x2_t v) {
  return vmulq_lane_s32(a, v, 1);
}

// CHECK-LABEL: @test_vmul_lane_u16(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[LANE]]
// CHECK-NEXT:    ret <4 x i16> [[MUL]]
//
uint16x4_t test_vmul_lane_u16(uint16x4_t a, uint16x4_t v) {
  return vmul_lane_u16(a, v, 3);
}

// CHECK-LABEL: @test_vmulq_lane_u16(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[LANE]]
// CHECK-NEXT:    ret <8 x i16> [[MUL]]
//
uint16x8_t test_vmulq_lane_u16(uint16x8_t a, uint16x4_t v) {
  return vmulq_lane_u16(a, v, 3);
}

// CHECK-LABEL: @test_vmul_lane_u32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
// CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[LANE]]
// CHECK-NEXT:    ret <2 x i32> [[MUL]]
//
uint32x2_t test_vmul_lane_u32(uint32x2_t a, uint32x2_t v) {
  return vmul_lane_u32(a, v, 1);
}

// CHECK-LABEL: @test_vmulq_lane_u32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[LANE]]
// CHECK-NEXT:    ret <4 x i32> [[MUL]]
//
uint32x4_t test_vmulq_lane_u32(uint32x4_t a, uint32x2_t v) {
  return vmulq_lane_u32(a, v, 1);
}

// CHECK-LABEL: @test_vmul_laneq_s16(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[LANE]]
// CHECK-NEXT:    ret <4 x i16> [[MUL]]
//
int16x4_t test_vmul_laneq_s16(int16x4_t a, int16x8_t v) {
  return vmul_laneq_s16(a, v, 7);
}

// CHECK-LABEL: @test_vmulq_laneq_s16(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[LANE]]
// CHECK-NEXT:    ret <8 x i16> [[MUL]]
//
int16x8_t test_vmulq_laneq_s16(int16x8_t a, int16x8_t v) {
  return vmulq_laneq_s16(a, v, 7);
}

// CHECK-LABEL: @test_vmul_laneq_s32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
// CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[LANE]]
// CHECK-NEXT:    ret <2 x i32> [[MUL]]
//
int32x2_t test_vmul_laneq_s32(int32x2_t a, int32x4_t v) {
  return vmul_laneq_s32(a, v, 3);
}

// CHECK-LABEL: @test_vmulq_laneq_s32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[LANE]]
// CHECK-NEXT:    ret <4 x i32> [[MUL]]
//
int32x4_t test_vmulq_laneq_s32(int32x4_t a, int32x4_t v) {
  return vmulq_laneq_s32(a, v, 3);
}

// CHECK-LABEL: @test_vmul_laneq_u16(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[LANE]]
// CHECK-NEXT:    ret <4 x i16> [[MUL]]
//
uint16x4_t test_vmul_laneq_u16(uint16x4_t a, uint16x8_t v) {
  return vmul_laneq_u16(a, v, 7);
}

// CHECK-LABEL: @test_vmulq_laneq_u16(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[LANE]]
// CHECK-NEXT:    ret <8 x i16> [[MUL]]
//
uint16x8_t test_vmulq_laneq_u16(uint16x8_t a, uint16x8_t v) {
  return vmulq_laneq_u16(a, v, 7);
}

// CHECK-LABEL: @test_vmul_laneq_u32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
// CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[LANE]]
// CHECK-NEXT:    ret <2 x i32> [[MUL]]
//
uint32x2_t test_vmul_laneq_u32(uint32x2_t a, uint32x4_t v) {
  return vmul_laneq_u32(a, v, 3);
}

// CHECK-LABEL: @test_vmulq_laneq_u32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[LANE]]
// CHECK-NEXT:    ret <4 x i32> [[MUL]]
//
uint32x4_t test_vmulq_laneq_u32(uint32x4_t a, uint32x4_t v) {
  return vmulq_laneq_u32(a, v, 3);
}

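// vfma_lane/vfmaq_lane lower to the llvm.fma intrinsic (fused multiply-add,
// a single rounding). vfms has no separate lowering: the multiplicand b is
// negated with fneg first and the same fma path is reused.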
// CHECK-LABEL: @test_vfma_lane_f32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x float> [[B:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <2 x i32> <i32 1, i32 1>
// CHECK-NEXT:    [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
// CHECK-NEXT:    [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK-NEXT:    [[FMLA2:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[FMLA]], <2 x float> [[LANE]], <2 x float> [[FMLA1]])
// CHECK-NEXT:    ret <2 x float> [[FMLA2]]
//
float32x2_t test_vfma_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v) {
  return vfma_lane_f32(a, b, v, 1);
}

// CHECK-LABEL: @test_vfmaq_lane_f32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x float> [[B:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK-NEXT:    [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
// CHECK-NEXT:    [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK-NEXT:    [[FMLA2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[FMLA]], <4 x float> [[LANE]], <4 x float> [[FMLA1]])
// CHECK-NEXT:    ret <4 x float> [[FMLA2]]
//
float32x4_t test_vfmaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v) {
  return vfmaq_lane_f32(a, b, v, 1);
}

// CHECK-LABEL: @test_vfma_laneq_f32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x float> [[B:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <2 x i32> <i32 3, i32 3>
// CHECK-NEXT:    [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[LANE]], <2 x float> [[TMP4]], <2 x float> [[TMP3]])
// CHECK-NEXT:    ret <2 x float> [[TMP6]]
//
float32x2_t test_vfma_laneq_f32(float32x2_t a, float32x2_t b, float32x4_t v) {
  return vfma_laneq_f32(a, b, v, 3);
}

// CHECK-LABEL: @test_vfmaq_laneq_f32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x float> [[B:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT:    [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LANE]], <4 x float> [[TMP4]], <4 x float> [[TMP3]])
// CHECK-NEXT:    ret <4 x float> [[TMP6]]
//
float32x4_t test_vfmaq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t v) {
  return vfmaq_laneq_f32(a, b, v, 3);
}

// CHECK-LABEL: @test_vfms_lane_f32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[FNEG:%.*]] = fneg <2 x float> [[B:%.*]]
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x float> [[FNEG]] to <8 x i8>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <2 x i32> <i32 1, i32 1>
// CHECK-NEXT:    [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
// CHECK-NEXT:    [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK-NEXT:    [[FMLA2:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[FMLA]], <2 x float> [[LANE]], <2 x float> [[FMLA1]])
// CHECK-NEXT:    ret <2 x float> [[FMLA2]]
//
float32x2_t test_vfms_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v) {
  return vfms_lane_f32(a, b, v, 1);
}

// CHECK-LABEL: @test_vfmsq_lane_f32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[FNEG:%.*]] = fneg <4 x float> [[B:%.*]]
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x float> [[FNEG]] to <16 x i8>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK-NEXT:    [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
// CHECK-NEXT:    [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK-NEXT:    [[FMLA2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[FMLA]], <4 x float> [[LANE]], <4 x float> [[FMLA1]])
// CHECK-NEXT:    ret <4 x float> [[FMLA2]]
//
float32x4_t test_vfmsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v) {
  return vfmsq_lane_f32(a, b, v, 1);
}

// CHECK-LABEL: @test_vfms_laneq_f32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[FNEG:%.*]] = fneg <2 x float> [[B:%.*]]
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x float> [[FNEG]] to <8 x i8>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <2 x i32> <i32 3, i32 3>
// CHECK-NEXT:    [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[LANE]], <2 x float> [[TMP4]], <2 x float> [[TMP3]])
// CHECK-NEXT:    ret <2 x float> [[TMP6]]
//
float32x2_t test_vfms_laneq_f32(float32x2_t a, float32x2_t b, float32x4_t v) {
  return vfms_laneq_f32(a, b, v, 3);
}

// CHECK-LABEL: @test_vfmsq_laneq_f32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[FNEG:%.*]] = fneg <4 x float> [[B:%.*]]
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x float> [[FNEG]] to <16 x i8>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT:    [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LANE]], <4 x float> [[TMP4]], <4 x float> [[TMP3]])
// CHECK-NEXT:    ret <4 x float> [[TMP6]]
//
float32x4_t test_vfmsq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t v) {
  return vfmsq_laneq_f32(a, b, v, 3);
}

// CHECK-LABEL: @test_vfmaq_lane_f64(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x double> [[B:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <1 x double> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <1 x double> [[TMP3]], <1 x double> [[TMP3]], <2 x i32> zeroinitializer
// CHECK-NEXT:    [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
// CHECK-NEXT:    [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
// CHECK-NEXT:    [[FMLA2:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[FMLA]], <2 x double> [[LANE]], <2 x double> [[FMLA1]])
// CHECK-NEXT:    ret <2 x double> [[FMLA2]]
//
float64x2_t test_vfmaq_lane_f64(float64x2_t a, float64x2_t b, float64x1_t v) {
  return vfmaq_lane_f64(a, b, v, 0);
}

// CHECK-LABEL: @test_vfmaq_laneq_f64(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x double> [[B:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x double> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP5]], <2 x i32> <i32 1, i32 1>
// CHECK-NEXT:    [[TMP6:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[LANE]], <2 x double> [[TMP4]], <2 x double> [[TMP3]])
// CHECK-NEXT:    ret <2 x double> [[TMP6]]
//
float64x2_t test_vfmaq_laneq_f64(float64x2_t a, float64x2_t b, float64x2_t v) {
  return vfmaq_laneq_f64(a, b, v, 1);
}

// CHECK-LABEL: @test_vfmsq_lane_f64(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[FNEG:%.*]] = fneg <2 x double> [[B:%.*]]
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x double> [[FNEG]] to <16 x i8>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <1 x double> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <1 x double> [[TMP3]], <1 x double> [[TMP3]], <2 x i32> zeroinitializer
// CHECK-NEXT:    [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
// CHECK-NEXT:    [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
// CHECK-NEXT:    [[FMLA2:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[FMLA]], <2 x double> [[LANE]], <2 x double> [[FMLA1]])
// CHECK-NEXT:    ret <2 x double> [[FMLA2]]
//
float64x2_t test_vfmsq_lane_f64(float64x2_t a, float64x2_t b, float64x1_t v) {
  return vfmsq_lane_f64(a, b, v, 0);
}

// CHECK-LABEL: @test_vfmsq_laneq_f64(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[FNEG:%.*]] = fneg <2 x double> [[B:%.*]]
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x double> [[FNEG]] to <16 x i8>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x double> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP5]], <2 x i32> <i32 1, i32 1>
// CHECK-NEXT:    [[TMP6:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[LANE]], <2 x double> [[TMP4]], <2 x double> [[TMP3]])
// CHECK-NEXT:    ret <2 x double> [[TMP6]]
//
float64x2_t test_vfmsq_laneq_f64(float64x2_t a, float64x2_t b, float64x2_t v) {
  return vfmsq_laneq_f64(a, b, v, 1);
}

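// The scalar (vfmas_/vfmss_/vfmsd_) variants extract the requested lane
// with extractelement and call the scalar llvm.fma.f32/f64 directly.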
// CHECK-LABEL: @test_vfmas_laneq_f32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[EXTRACT:%.*]] = extractelement <4 x float> [[V:%.*]], i32 3
// CHECK-NEXT:    [[TMP0:%.*]] = call float @llvm.fma.f32(float [[B:%.*]], float [[EXTRACT]], float [[A:%.*]])
// CHECK-NEXT:    ret float [[TMP0]]
//
float32_t test_vfmas_laneq_f32(float32_t a, float32_t b, float32x4_t v) {
  return vfmas_laneq_f32(a, b, v, 3);
}

// CHECK-LABEL: @test_vfmsd_lane_f64(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[FNEG:%.*]] = fneg double [[B:%.*]]
// CHECK-NEXT:    [[EXTRACT:%.*]] = extractelement <1 x double> [[V:%.*]], i32 0
// CHECK-NEXT:    [[TMP0:%.*]] = call double @llvm.fma.f64(double [[FNEG]], double [[EXTRACT]], double [[A:%.*]])
// CHECK-NEXT:    ret double [[TMP0]]
//
float64_t test_vfmsd_lane_f64(float64_t a, float64_t b, float64x1_t v) {
  return vfmsd_lane_f64(a, b, v, 0);
}

// CHECK-LABEL: @test_vfmss_laneq_f32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[FNEG:%.*]] = fneg float [[B:%.*]]
// CHECK-NEXT:    [[EXTRACT:%.*]] = extractelement <4 x float> [[V:%.*]], i32 3
// CHECK-NEXT:    [[TMP0:%.*]] = call float @llvm.fma.f32(float [[FNEG]], float [[EXTRACT]], float [[A:%.*]])
// CHECK-NEXT:    ret float [[TMP0]]
//
float32_t test_vfmss_laneq_f32(float32_t a, float32_t b, float32x4_t v) {
  return vfmss_laneq_f32(a, b, v, 3);
}

// CHECK-LABEL: @test_vfmsd_laneq_f64(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[FNEG:%.*]] = fneg double [[B:%.*]]
// CHECK-NEXT:    [[EXTRACT:%.*]] = extractelement <2 x double> [[V:%.*]], i32 1
// CHECK-NEXT:    [[TMP0:%.*]] = call double @llvm.fma.f64(double [[FNEG]], double [[EXTRACT]], double [[A:%.*]])
// CHECK-NEXT:    ret double [[TMP0]]
//
float64_t test_vfmsd_laneq_f64(float64_t a, float64_t b, float64x2_t v) {
  return vfmsd_laneq_f64(a, b, v, 1);
}

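// vmlal/vmlsl are widening multiply-accumulates: the double-width product
// comes from @llvm.aarch64.neon.smull (umull for the _u variants) and is
// then added to or subtracted from the accumulator a. The _high forms
// first take the upper half of b with a shufflevector.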
649 // CHECK-LABEL: @test_vmlal_lane_s16(
650 // CHECK-NEXT:  entry:
651 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
652 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
653 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
654 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
655 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
656 // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4
657 // CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
658 // CHECK-NEXT:    ret <4 x i32> [[ADD]]
659 //
test_vmlal_lane_s16(int32x4_t a,int16x4_t b,int16x4_t v)660 int32x4_t test_vmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) {
661   return vmlal_lane_s16(a, b, v, 3);
662 }
663 
664 // CHECK-LABEL: @test_vmlal_lane_s32(
665 // CHECK-NEXT:  entry:
666 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
667 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
668 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
669 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
670 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
671 // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4
672 // CHECK-NEXT:    [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
673 // CHECK-NEXT:    ret <2 x i64> [[ADD]]
674 //
test_vmlal_lane_s32(int64x2_t a,int32x2_t b,int32x2_t v)675 int64x2_t test_vmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) {
676   return vmlal_lane_s32(a, b, v, 1);
677 }
678 
679 // CHECK-LABEL: @test_vmlal_laneq_s16(
680 // CHECK-NEXT:  entry:
681 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
682 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
683 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
684 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
685 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
686 // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4
687 // CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
688 // CHECK-NEXT:    ret <4 x i32> [[ADD]]
689 //
test_vmlal_laneq_s16(int32x4_t a,int16x4_t b,int16x8_t v)690 int32x4_t test_vmlal_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) {
691   return vmlal_laneq_s16(a, b, v, 7);
692 }
693 
694 // CHECK-LABEL: @test_vmlal_laneq_s32(
695 // CHECK-NEXT:  entry:
696 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
697 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
698 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
699 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
700 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
701 // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4
702 // CHECK-NEXT:    [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
703 // CHECK-NEXT:    ret <2 x i64> [[ADD]]
704 //
test_vmlal_laneq_s32(int64x2_t a,int32x2_t b,int32x4_t v)705 int64x2_t test_vmlal_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) {
706   return vmlal_laneq_s32(a, b, v, 3);
707 }
708 
709 // CHECK-LABEL: @test_vmlal_high_lane_s16(
710 // CHECK-NEXT:  entry:
711 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
712 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
713 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
714 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
715 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
716 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
717 // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
718 // CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
719 // CHECK-NEXT:    ret <4 x i32> [[ADD]]
720 //
test_vmlal_high_lane_s16(int32x4_t a,int16x8_t b,int16x4_t v)721 int32x4_t test_vmlal_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) {
722   return vmlal_high_lane_s16(a, b, v, 3);
723 }
724 
725 // CHECK-LABEL: @test_vmlal_high_lane_s32(
726 // CHECK-NEXT:  entry:
727 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
728 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
729 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
730 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
731 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
732 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
733 // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
734 // CHECK-NEXT:    [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
735 // CHECK-NEXT:    ret <2 x i64> [[ADD]]
736 //
test_vmlal_high_lane_s32(int64x2_t a,int32x4_t b,int32x2_t v)737 int64x2_t test_vmlal_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) {
738   return vmlal_high_lane_s32(a, b, v, 1);
739 }
740 
741 // CHECK-LABEL: @test_vmlal_high_laneq_s16(
742 // CHECK-NEXT:  entry:
743 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
744 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
745 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
746 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
747 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
748 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
749 // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
750 // CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
751 // CHECK-NEXT:    ret <4 x i32> [[ADD]]
752 //
test_vmlal_high_laneq_s16(int32x4_t a,int16x8_t b,int16x8_t v)753 int32x4_t test_vmlal_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) {
754   return vmlal_high_laneq_s16(a, b, v, 7);
755 }
756 
757 // CHECK-LABEL: @test_vmlal_high_laneq_s32(
758 // CHECK-NEXT:  entry:
759 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
760 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
761 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
762 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
763 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
764 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
765 // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
766 // CHECK-NEXT:    [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
767 // CHECK-NEXT:    ret <2 x i64> [[ADD]]
768 //
test_vmlal_high_laneq_s32(int64x2_t a,int32x4_t b,int32x4_t v)769 int64x2_t test_vmlal_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t v) {
770   return vmlal_high_laneq_s32(a, b, v, 3);
771 }
772 
773 // CHECK-LABEL: @test_vmlsl_lane_s16(
774 // CHECK-NEXT:  entry:
775 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
776 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
777 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
778 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
779 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
780 // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4
781 // CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
782 // CHECK-NEXT:    ret <4 x i32> [[SUB]]
783 //
test_vmlsl_lane_s16(int32x4_t a,int16x4_t b,int16x4_t v)784 int32x4_t test_vmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) {
785   return vmlsl_lane_s16(a, b, v, 3);
786 }
787 
788 // CHECK-LABEL: @test_vmlsl_lane_s32(
789 // CHECK-NEXT:  entry:
790 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
791 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
792 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
793 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
794 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
795 // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4
796 // CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
797 // CHECK-NEXT:    ret <2 x i64> [[SUB]]
798 //
test_vmlsl_lane_s32(int64x2_t a,int32x2_t b,int32x2_t v)799 int64x2_t test_vmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) {
800   return vmlsl_lane_s32(a, b, v, 1);
801 }
802 
803 // CHECK-LABEL: @test_vmlsl_laneq_s16(
804 // CHECK-NEXT:  entry:
805 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
806 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
807 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
808 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
809 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
810 // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4
811 // CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
812 // CHECK-NEXT:    ret <4 x i32> [[SUB]]
813 //
test_vmlsl_laneq_s16(int32x4_t a,int16x4_t b,int16x8_t v)814 int32x4_t test_vmlsl_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) {
815   return vmlsl_laneq_s16(a, b, v, 7);
816 }
817 
818 // CHECK-LABEL: @test_vmlsl_laneq_s32(
819 // CHECK-NEXT:  entry:
820 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
821 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
822 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
823 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
824 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
825 // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4
826 // CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
827 // CHECK-NEXT:    ret <2 x i64> [[SUB]]
828 //
test_vmlsl_laneq_s32(int64x2_t a,int32x2_t b,int32x4_t v)829 int64x2_t test_vmlsl_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) {
830   return vmlsl_laneq_s32(a, b, v, 3);
831 }
832 
833 // CHECK-LABEL: @test_vmlsl_high_lane_s16(
834 // CHECK-NEXT:  entry:
835 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
836 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
837 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
838 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
839 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
840 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
841 // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
842 // CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
843 // CHECK-NEXT:    ret <4 x i32> [[SUB]]
844 //
test_vmlsl_high_lane_s16(int32x4_t a,int16x8_t b,int16x4_t v)845 int32x4_t test_vmlsl_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) {
846   return vmlsl_high_lane_s16(a, b, v, 3);
847 }
848 
849 // CHECK-LABEL: @test_vmlsl_high_lane_s32(
850 // CHECK-NEXT:  entry:
851 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
852 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
853 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
854 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
855 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
856 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
857 // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
858 // CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
859 // CHECK-NEXT:    ret <2 x i64> [[SUB]]
860 //
test_vmlsl_high_lane_s32(int64x2_t a,int32x4_t b,int32x2_t v)861 int64x2_t test_vmlsl_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) {
862   return vmlsl_high_lane_s32(a, b, v, 1);
863 }
864 
865 // CHECK-LABEL: @test_vmlsl_high_laneq_s16(
866 // CHECK-NEXT:  entry:
867 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
868 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
869 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
870 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
871 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
872 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
873 // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
874 // CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
875 // CHECK-NEXT:    ret <4 x i32> [[SUB]]
876 //
test_vmlsl_high_laneq_s16(int32x4_t a,int16x8_t b,int16x8_t v)877 int32x4_t test_vmlsl_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) {
878   return vmlsl_high_laneq_s16(a, b, v, 7);
879 }
880 
881 // CHECK-LABEL: @test_vmlsl_high_laneq_s32(
882 // CHECK-NEXT:  entry:
883 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
884 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
885 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
886 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
887 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
888 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
889 // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
890 // CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT:    ret <2 x i64> [[SUB]]
//
int64x2_t test_vmlsl_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t v) {
  return vmlsl_high_laneq_s32(a, b, v, 3);
}

// CHECK-LABEL: @test_vmlal_lane_u16(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT:    ret <4 x i32> [[ADD]]
//
int32x4_t test_vmlal_lane_u16(int32x4_t a, int16x4_t b, int16x4_t v) {
  return vmlal_lane_u16(a, b, v, 3);
}

// CHECK-LABEL: @test_vmlal_lane_u32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT:    [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT:    ret <2 x i64> [[ADD]]
//
int64x2_t test_vmlal_lane_u32(int64x2_t a, int32x2_t b, int32x2_t v) {
  return vmlal_lane_u32(a, b, v, 1);
}

// CHECK-LABEL: @test_vmlal_laneq_u16(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT:    ret <4 x i32> [[ADD]]
//
int32x4_t test_vmlal_laneq_u16(int32x4_t a, int16x4_t b, int16x8_t v) {
  return vmlal_laneq_u16(a, b, v, 7);
}

// CHECK-LABEL: @test_vmlal_laneq_u32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT:    [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT:    ret <2 x i64> [[ADD]]
//
int64x2_t test_vmlal_laneq_u32(int64x2_t a, int32x2_t b, int32x4_t v) {
  return vmlal_laneq_u32(a, b, v, 3);
}

// CHECK-LABEL: @test_vmlal_high_lane_u16(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT:    ret <4 x i32> [[ADD]]
//
int32x4_t test_vmlal_high_lane_u16(int32x4_t a, int16x8_t b, int16x4_t v) {
  return vmlal_high_lane_u16(a, b, v, 3);
}

// CHECK-LABEL: @test_vmlal_high_lane_u32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT:    [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT:    ret <2 x i64> [[ADD]]
//
int64x2_t test_vmlal_high_lane_u32(int64x2_t a, int32x4_t b, int32x2_t v) {
  return vmlal_high_lane_u32(a, b, v, 1);
}

// CHECK-LABEL: @test_vmlal_high_laneq_u16(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT:    ret <4 x i32> [[ADD]]
//
int32x4_t test_vmlal_high_laneq_u16(int32x4_t a, int16x8_t b, int16x8_t v) {
  return vmlal_high_laneq_u16(a, b, v, 7);
}

// CHECK-LABEL: @test_vmlal_high_laneq_u32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT:    [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT:    ret <2 x i64> [[ADD]]
//
int64x2_t test_vmlal_high_laneq_u32(int64x2_t a, int32x4_t b, int32x4_t v) {
  return vmlal_high_laneq_u32(a, b, v, 3);
}

// CHECK-LABEL: @test_vmlsl_lane_u16(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT:    ret <4 x i32> [[SUB]]
//
int32x4_t test_vmlsl_lane_u16(int32x4_t a, int16x4_t b, int16x4_t v) {
  return vmlsl_lane_u16(a, b, v, 3);
}

// CHECK-LABEL: @test_vmlsl_lane_u32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT:    ret <2 x i64> [[SUB]]
//
int64x2_t test_vmlsl_lane_u32(int64x2_t a, int32x2_t b, int32x2_t v) {
  return vmlsl_lane_u32(a, b, v, 1);
}

// CHECK-LABEL: @test_vmlsl_laneq_u16(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT:    ret <4 x i32> [[SUB]]
//
int32x4_t test_vmlsl_laneq_u16(int32x4_t a, int16x4_t b, int16x8_t v) {
  return vmlsl_laneq_u16(a, b, v, 7);
}

// CHECK-LABEL: @test_vmlsl_laneq_u32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT:    ret <2 x i64> [[SUB]]
//
int64x2_t test_vmlsl_laneq_u32(int64x2_t a, int32x2_t b, int32x4_t v) {
  return vmlsl_laneq_u32(a, b, v, 3);
}

// CHECK-LABEL: @test_vmlsl_high_lane_u16(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT:    ret <4 x i32> [[SUB]]
//
int32x4_t test_vmlsl_high_lane_u16(int32x4_t a, int16x8_t b, int16x4_t v) {
  return vmlsl_high_lane_u16(a, b, v, 3);
}

// CHECK-LABEL: @test_vmlsl_high_lane_u32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT:    ret <2 x i64> [[SUB]]
//
int64x2_t test_vmlsl_high_lane_u32(int64x2_t a, int32x4_t b, int32x2_t v) {
  return vmlsl_high_lane_u32(a, b, v, 1);
}

// CHECK-LABEL: @test_vmlsl_high_laneq_u16(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT:    ret <4 x i32> [[SUB]]
//
int32x4_t test_vmlsl_high_laneq_u16(int32x4_t a, int16x8_t b, int16x8_t v) {
  return vmlsl_high_laneq_u16(a, b, v, 7);
}

// CHECK-LABEL: @test_vmlsl_high_laneq_u32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT:    ret <2 x i64> [[SUB]]
//
int64x2_t test_vmlsl_high_laneq_u32(int64x2_t a, int32x4_t b, int32x4_t v) {
  return vmlsl_high_laneq_u32(a, b, v, 3);
}

// CHECK-LABEL: @test_vmull_lane_s16(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT:    ret <4 x i32> [[VMULL2_I]]
//
int32x4_t test_vmull_lane_s16(int16x4_t a, int16x4_t v) {
  return vmull_lane_s16(a, v, 3);
}

// CHECK-LABEL: @test_vmull_lane_s32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT:    ret <2 x i64> [[VMULL2_I]]
//
int64x2_t test_vmull_lane_s32(int32x2_t a, int32x2_t v) {
  return vmull_lane_s32(a, v, 1);
}

// CHECK-LABEL: @test_vmull_lane_u16(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT:    ret <4 x i32> [[VMULL2_I]]
//
uint32x4_t test_vmull_lane_u16(uint16x4_t a, uint16x4_t v) {
  return vmull_lane_u16(a, v, 3);
}

// CHECK-LABEL: @test_vmull_lane_u32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT:    ret <2 x i64> [[VMULL2_I]]
//
uint64x2_t test_vmull_lane_u32(uint32x2_t a, uint32x2_t v) {
  return vmull_lane_u32(a, v, 1);
}

// CHECK-LABEL: @test_vmull_high_lane_s16(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT:    ret <4 x i32> [[VMULL2_I]]
//
int32x4_t test_vmull_high_lane_s16(int16x8_t a, int16x4_t v) {
  return vmull_high_lane_s16(a, v, 3);
}

// CHECK-LABEL: @test_vmull_high_lane_s32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT:    ret <2 x i64> [[VMULL2_I]]
//
int64x2_t test_vmull_high_lane_s32(int32x4_t a, int32x2_t v) {
  return vmull_high_lane_s32(a, v, 1);
}

// CHECK-LABEL: @test_vmull_high_lane_u16(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT:    ret <4 x i32> [[VMULL2_I]]
//
uint32x4_t test_vmull_high_lane_u16(uint16x8_t a, uint16x4_t v) {
  return vmull_high_lane_u16(a, v, 3);
}

// CHECK-LABEL: @test_vmull_high_lane_u32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT:    ret <2 x i64> [[VMULL2_I]]
//
uint64x2_t test_vmull_high_lane_u32(uint32x4_t a, uint32x2_t v) {
  return vmull_high_lane_u32(a, v, 1);
}

// CHECK-LABEL: @test_vmull_laneq_s16(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT:    ret <4 x i32> [[VMULL2_I]]
//
int32x4_t test_vmull_laneq_s16(int16x4_t a, int16x8_t v) {
  return vmull_laneq_s16(a, v, 7);
}

// CHECK-LABEL: @test_vmull_laneq_s32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT:    ret <2 x i64> [[VMULL2_I]]
//
int64x2_t test_vmull_laneq_s32(int32x2_t a, int32x4_t v) {
  return vmull_laneq_s32(a, v, 3);
}

// CHECK-LABEL: @test_vmull_laneq_u16(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT:    ret <4 x i32> [[VMULL2_I]]
//
uint32x4_t test_vmull_laneq_u16(uint16x4_t a, uint16x8_t v) {
  return vmull_laneq_u16(a, v, 7);
}

// CHECK-LABEL: @test_vmull_laneq_u32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT:    ret <2 x i64> [[VMULL2_I]]
//
uint64x2_t test_vmull_laneq_u32(uint32x2_t a, uint32x4_t v) {
  return vmull_laneq_u32(a, v, 3);
}

// CHECK-LABEL: @test_vmull_high_laneq_s16(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT:    ret <4 x i32> [[VMULL2_I]]
//
int32x4_t test_vmull_high_laneq_s16(int16x8_t a, int16x8_t v) {
  return vmull_high_laneq_s16(a, v, 7);
}

// CHECK-LABEL: @test_vmull_high_laneq_s32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT:    ret <2 x i64> [[VMULL2_I]]
//
int64x2_t test_vmull_high_laneq_s32(int32x4_t a, int32x4_t v) {
  return vmull_high_laneq_s32(a, v, 3);
}

// CHECK-LABEL: @test_vmull_high_laneq_u16(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT:    ret <4 x i32> [[VMULL2_I]]
//
uint32x4_t test_vmull_high_laneq_u16(uint16x8_t a, uint16x8_t v) {
  return vmull_high_laneq_u16(a, v, 7);
}

// CHECK-LABEL: @test_vmull_high_laneq_u32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT:    ret <2 x i64> [[VMULL2_I]]
//
uint64x2_t test_vmull_high_laneq_u32(uint32x4_t a, uint32x4_t v) {
  return vmull_high_laneq_u32(a, v, 3);
}

// CHECK-LABEL: @test_vqdmlal_lane_s16(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT:    [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #4
// CHECK-NEXT:    ret <4 x i32> [[VQDMLAL_V3_I]]
//
int32x4_t test_vqdmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) {
  return vqdmlal_lane_s16(a, b, v, 3);
}

// CHECK-LABEL: @test_vqdmlal_lane_s32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT:    [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #4
// CHECK-NEXT:    ret <2 x i64> [[VQDMLAL_V3_I]]
//
int64x2_t test_vqdmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) {
  return vqdmlal_lane_s32(a, b, v, 1);
}

// CHECK-LABEL: @test_vqdmlal_high_lane_s16(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT:    [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #4
// CHECK-NEXT:    ret <4 x i32> [[VQDMLAL_V3_I]]
//
int32x4_t test_vqdmlal_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) {
  return vqdmlal_high_lane_s16(a, b, v, 3);
}

// CHECK-LABEL: @test_vqdmlal_high_lane_s32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT:    [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #4
// CHECK-NEXT:    ret <2 x i64> [[VQDMLAL_V3_I]]
//
int64x2_t test_vqdmlal_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) {
  return vqdmlal_high_lane_s32(a, b, v, 1);
}

// CHECK-LABEL: @test_vqdmlsl_lane_s16(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT:    [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #4
// CHECK-NEXT:    ret <4 x i32> [[VQDMLSL_V3_I]]
//
int32x4_t test_vqdmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) {
  return vqdmlsl_lane_s16(a, b, v, 3);
}

// CHECK-LABEL: @test_vqdmlsl_lane_s32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT:    [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #4
// CHECK-NEXT:    ret <2 x i64> [[VQDMLSL_V3_I]]
//
int64x2_t test_vqdmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) {
  return vqdmlsl_lane_s32(a, b, v, 1);
}

// CHECK-LABEL: @test_vqdmlsl_high_lane_s16(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT:    [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #4
// CHECK-NEXT:    ret <4 x i32> [[VQDMLSL_V3_I]]
//
int32x4_t test_vqdmlsl_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) {
  return vqdmlsl_high_lane_s16(a, b, v, 3);
}

// CHECK-LABEL: @test_vqdmlsl_high_lane_s32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT:    [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #4
// CHECK-NEXT:    ret <2 x i64> [[VQDMLSL_V3_I]]
//
int64x2_t test_vqdmlsl_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) {
  return vqdmlsl_high_lane_s32(a, b, v, 1);
}

// CHECK-LABEL: @test_vqdmull_lane_s16(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT:    [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
// CHECK-NEXT:    ret <4 x i32> [[VQDMULL_V2_I]]
//
int32x4_t test_vqdmull_lane_s16(int16x4_t a, int16x4_t v) {
  return vqdmull_lane_s16(a, v, 3);
}

// CHECK-LABEL: @test_vqdmull_lane_s32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT:    [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
// CHECK-NEXT:    ret <2 x i64> [[VQDMULL_V2_I]]
//
int64x2_t test_vqdmull_lane_s32(int32x2_t a, int32x2_t v) {
  return vqdmull_lane_s32(a, v, 1);
}

// CHECK-LABEL: @test_vqdmull_laneq_s16(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT:    [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
// CHECK-NEXT:    ret <4 x i32> [[VQDMULL_V2_I]]
//
int32x4_t test_vqdmull_laneq_s16(int16x4_t a, int16x8_t v) {
  return vqdmull_laneq_s16(a, v, 3);
}

// CHECK-LABEL: @test_vqdmull_laneq_s32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT:    [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
// CHECK-NEXT:    ret <2 x i64> [[VQDMULL_V2_I]]
//
int64x2_t test_vqdmull_laneq_s32(int32x2_t a, int32x4_t v) {
  return vqdmull_laneq_s32(a, v, 3);
}

// CHECK-LABEL: @test_vqdmull_high_lane_s16(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT:    [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
// CHECK-NEXT:    ret <4 x i32> [[VQDMULL_V2_I]]
//
int32x4_t test_vqdmull_high_lane_s16(int16x8_t a, int16x4_t v) {
  return vqdmull_high_lane_s16(a, v, 3);
}

// CHECK-LABEL: @test_vqdmull_high_lane_s32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT:    [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
// CHECK-NEXT:    ret <2 x i64> [[VQDMULL_V2_I]]
//
int64x2_t test_vqdmull_high_lane_s32(int32x4_t a, int32x2_t v) {
  return vqdmull_high_lane_s32(a, v, 1);
}

// CHECK-LABEL: @test_vqdmull_high_laneq_s16(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT:    [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
// CHECK-NEXT:    ret <4 x i32> [[VQDMULL_V2_I]]
//
int32x4_t test_vqdmull_high_laneq_s16(int16x8_t a, int16x8_t v) {
  return vqdmull_high_laneq_s16(a, v, 7);
}

// CHECK-LABEL: @test_vqdmull_high_laneq_s32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT:    [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
// CHECK-NEXT:    ret <2 x i64> [[VQDMULL_V2_I]]
//
int64x2_t test_vqdmull_high_laneq_s32(int32x4_t a, int32x4_t v) {
  return vqdmull_high_laneq_s32(a, v, 3);
}

// CHECK-LABEL: @test_vqdmulh_lane_s16(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[VQDMULH_LANE_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT:    [[VQDMULH_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK-NEXT:    [[VQDMULH_LANE_V2:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.lane.v4i16.v4i16(<4 x i16> [[VQDMULH_LANE_V]], <4 x i16> [[VQDMULH_LANE_V1]], i32 3)
// CHECK-NEXT:    ret <4 x i16> [[VQDMULH_LANE_V2]]
//
int16x4_t test_vqdmulh_lane_s16(int16x4_t a, int16x4_t v) {
  return vqdmulh_lane_s16(a, v, 3);
}

// CHECK-LABEL: @test_vqdmulhq_lane_s16(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[VQDMULHQ_LANE_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT:    [[VQDMULHQ_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK-NEXT:    [[VQDMULHQ_LANE_V2:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.lane.v8i16.v4i16(<8 x i16> [[VQDMULHQ_LANE_V]], <4 x i16> [[VQDMULHQ_LANE_V1]], i32 3)
// CHECK-NEXT:    ret <8 x i16> [[VQDMULHQ_LANE_V2]]
//
int16x8_t test_vqdmulhq_lane_s16(int16x8_t a, int16x4_t v) {
  return vqdmulhq_lane_s16(a, v, 3);
}

// CHECK-LABEL: @test_vqdmulh_lane_s32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[VQDMULH_LANE_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT:    [[VQDMULH_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK-NEXT:    [[VQDMULH_LANE_V2:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.lane.v2i32.v2i32(<2 x i32> [[VQDMULH_LANE_V]], <2 x i32> [[VQDMULH_LANE_V1]], i32 1)
// CHECK-NEXT:    ret <2 x i32> [[VQDMULH_LANE_V2]]
//
int32x2_t test_vqdmulh_lane_s32(int32x2_t a, int32x2_t v) {
  return vqdmulh_lane_s32(a, v, 1);
}

// CHECK-LABEL: @test_vqdmulhq_lane_s32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[VQDMULHQ_LANE_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT:    [[VQDMULHQ_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK-NEXT:    [[VQDMULHQ_LANE_V2:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.lane.v4i32.v2i32(<4 x i32> [[VQDMULHQ_LANE_V]], <2 x i32> [[VQDMULHQ_LANE_V1]], i32 1)
// CHECK-NEXT:    ret <4 x i32> [[VQDMULHQ_LANE_V2]]
//
int32x4_t test_vqdmulhq_lane_s32(int32x4_t a, int32x2_t v) {
  return vqdmulhq_lane_s32(a, v, 1);
}

// CHECK-LABEL: @test_vqrdmulh_lane_s16(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[VQRDMULH_LANE_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT:    [[VQRDMULH_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK-NEXT:    [[VQRDMULH_LANE_V2:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.lane.v4i16.v4i16(<4 x i16> [[VQRDMULH_LANE_V]], <4 x i16> [[VQRDMULH_LANE_V1]], i32 3)
// CHECK-NEXT:    ret <4 x i16> [[VQRDMULH_LANE_V2]]
//
int16x4_t test_vqrdmulh_lane_s16(int16x4_t a, int16x4_t v) {
  return vqrdmulh_lane_s16(a, v, 3);
}

// CHECK-LABEL: @test_vqrdmulhq_lane_s16(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[VQRDMULHQ_LANE_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT:    [[VQRDMULHQ_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK-NEXT:    [[VQRDMULHQ_LANE_V2:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.lane.v8i16.v4i16(<8 x i16> [[VQRDMULHQ_LANE_V]], <4 x i16> [[VQRDMULHQ_LANE_V1]], i32 3)
// CHECK-NEXT:    ret <8 x i16> [[VQRDMULHQ_LANE_V2]]
//
int16x8_t test_vqrdmulhq_lane_s16(int16x8_t a, int16x4_t v) {
  return vqrdmulhq_lane_s16(a, v, 3);
}

// CHECK-LABEL: @test_vqrdmulh_lane_s32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[VQRDMULH_LANE_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT:    [[VQRDMULH_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK-NEXT:    [[VQRDMULH_LANE_V2:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.lane.v2i32.v2i32(<2 x i32> [[VQRDMULH_LANE_V]], <2 x i32> [[VQRDMULH_LANE_V1]], i32 1)
// CHECK-NEXT:    ret <2 x i32> [[VQRDMULH_LANE_V2]]
//
int32x2_t test_vqrdmulh_lane_s32(int32x2_t a, int32x2_t v) {
  return vqrdmulh_lane_s32(a, v, 1);
}

// CHECK-LABEL: @test_vqrdmulhq_lane_s32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[VQRDMULHQ_LANE_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT:    [[VQRDMULHQ_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK-NEXT:    [[VQRDMULHQ_LANE_V2:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.lane.v4i32.v2i32(<4 x i32> [[VQRDMULHQ_LANE_V]], <2 x i32> [[VQRDMULHQ_LANE_V1]], i32 1)
// CHECK-NEXT:    ret <4 x i32> [[VQRDMULHQ_LANE_V2]]
//
int32x4_t test_vqrdmulhq_lane_s32(int32x4_t a, int32x2_t v) {
  return vqrdmulhq_lane_s32(a, v, 1);
}

// CHECK-LABEL: @test_vmul_lane_f32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> <i32 1, i32 1>
// CHECK-NEXT:    [[MUL:%.*]] = fmul <2 x float> [[A:%.*]], [[LANE]]
// CHECK-NEXT:    ret <2 x float> [[MUL]]
//
float32x2_t test_vmul_lane_f32(float32x2_t a, float32x2_t v) {
  return vmul_lane_f32(a, v, 1);
}


// CHECK-LABEL: @test_vmul_lane_f64(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x double> [[A:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x double> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to double
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
// CHECK-NEXT:    [[EXTRACT:%.*]] = extractelement <1 x double> [[TMP3]], i32 0
// CHECK-NEXT:    [[TMP4:%.*]] = fmul double [[TMP2]], [[EXTRACT]]
// CHECK-NEXT:    [[TMP5:%.*]] = bitcast double [[TMP4]] to <1 x double>
// CHECK-NEXT:    ret <1 x double> [[TMP5]]
//
float64x1_t test_vmul_lane_f64(float64x1_t a, float64x1_t v) {
  return vmul_lane_f64(a, v, 0);
}

// CHECK-LABEL: @test_vmulq_lane_f32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK-NEXT:    [[MUL:%.*]] = fmul <4 x float> [[A:%.*]], [[LANE]]
// CHECK-NEXT:    ret <4 x float> [[MUL]]
//
float32x4_t test_vmulq_lane_f32(float32x4_t a, float32x2_t v) {
  return vmulq_lane_f32(a, v, 1);
}

// CHECK-LABEL: @test_vmulq_lane_f64(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x double> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <1 x double> [[TMP1]], <1 x double> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT:    [[MUL:%.*]] = fmul <2 x double> [[A:%.*]], [[LANE]]
// CHECK-NEXT:    ret <2 x double> [[MUL]]
//
float64x2_t test_vmulq_lane_f64(float64x2_t a, float64x1_t v) {
  return vmulq_lane_f64(a, v, 0);
}

// CHECK-LABEL: @test_vmul_laneq_f32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <2 x i32> <i32 3, i32 3>
// CHECK-NEXT:    [[MUL:%.*]] = fmul <2 x float> [[A:%.*]], [[LANE]]
// CHECK-NEXT:    ret <2 x float> [[MUL]]
//
float32x2_t test_vmul_laneq_f32(float32x2_t a, float32x4_t v) {
  return vmul_laneq_f32(a, v, 3);
}

// CHECK-LABEL: @test_vmul_laneq_f64(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x double> [[A:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x double> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to double
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
// CHECK-NEXT:    [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP3]], i32 1
// CHECK-NEXT:    [[TMP4:%.*]] = fmul double [[TMP2]], [[EXTRACT]]
// CHECK-NEXT:    [[TMP5:%.*]] = bitcast double [[TMP4]] to <1 x double>
// CHECK-NEXT:    ret <1 x double> [[TMP5]]
//
float64x1_t test_vmul_laneq_f64(float64x1_t a, float64x2_t v) {
  return vmul_laneq_f64(a, v, 1);
}

// CHECK-LABEL: @test_vmulq_laneq_f32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT:    [[MUL:%.*]] = fmul <4 x float> [[A:%.*]], [[LANE]]
// CHECK-NEXT:    ret <4 x float> [[MUL]]
//
float32x4_t test_vmulq_laneq_f32(float32x4_t a, float32x4_t v) {
  return vmulq_laneq_f32(a, v, 3);
}

// CHECK-LABEL: @test_vmulq_laneq_f64(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x double> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP1]], <2 x i32> <i32 1, i32 1>
// CHECK-NEXT:    [[MUL:%.*]] = fmul <2 x double> [[A:%.*]], [[LANE]]
// CHECK-NEXT:    ret <2 x double> [[MUL]]
//
float64x2_t test_vmulq_laneq_f64(float64x2_t a, float64x2_t v) {
  return vmulq_laneq_f64(a, v, 1);
}
1839 
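// vmulx has no plain IR equivalent; each vmulx lane intrinsic should lower to
// a splat of the selected lane followed by a call to the
// llvm.aarch64.neon.fmulx intrinsic.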
// CHECK-LABEL: @test_vmulx_lane_f32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> <i32 1, i32 1>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x float> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[A]], <2 x float> [[LANE]]) #4
// CHECK-NEXT:    ret <2 x float> [[VMULX2_I]]
//
float32x2_t test_vmulx_lane_f32(float32x2_t a, float32x2_t v) {
  return vmulx_lane_f32(a, v, 1);
}

// CHECK-LABEL: @test_vmulxq_lane_f32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x float> [[LANE]] to <16 x i8>
// CHECK-NEXT:    [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[A]], <4 x float> [[LANE]]) #4
// CHECK-NEXT:    ret <4 x float> [[VMULX2_I]]
//
float32x4_t test_vmulxq_lane_f32(float32x4_t a, float32x2_t v) {
  return vmulxq_lane_f32(a, v, 1);
}

// CHECK-LABEL: @test_vmulxq_lane_f64(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x double> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <1 x double> [[TMP1]], <1 x double> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x double> [[LANE]] to <16 x i8>
// CHECK-NEXT:    [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[A]], <2 x double> [[LANE]]) #4
// CHECK-NEXT:    ret <2 x double> [[VMULX2_I]]
//
float64x2_t test_vmulxq_lane_f64(float64x2_t a, float64x1_t v) {
  return vmulxq_lane_f64(a, v, 0);
}

// CHECK-LABEL: @test_vmulx_laneq_f32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <2 x i32> <i32 3, i32 3>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x float> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[A]], <2 x float> [[LANE]]) #4
// CHECK-NEXT:    ret <2 x float> [[VMULX2_I]]
//
float32x2_t test_vmulx_laneq_f32(float32x2_t a, float32x4_t v) {
  return vmulx_laneq_f32(a, v, 3);
}

// CHECK-LABEL: @test_vmulxq_laneq_f32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x float> [[LANE]] to <16 x i8>
// CHECK-NEXT:    [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[A]], <4 x float> [[LANE]]) #4
// CHECK-NEXT:    ret <4 x float> [[VMULX2_I]]
//
float32x4_t test_vmulxq_laneq_f32(float32x4_t a, float32x4_t v) {
  return vmulxq_laneq_f32(a, v, 3);
}

// CHECK-LABEL: @test_vmulxq_laneq_f64(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x double> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP1]], <2 x i32> <i32 1, i32 1>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x double> [[LANE]] to <16 x i8>
// CHECK-NEXT:    [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[A]], <2 x double> [[LANE]]) #4
// CHECK-NEXT:    ret <2 x double> [[VMULX2_I]]
//
float64x2_t test_vmulxq_laneq_f64(float64x2_t a, float64x2_t v) {
  return vmulxq_laneq_f64(a, v, 1);
}

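// The remaining tests repeat the lane intrinsics with a lane index of 0, where
// the splat's shuffle mask should become zeroinitializer.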
// CHECK-LABEL: @test_vmla_lane_s16_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT:    [[ADD:%.*]] = add <4 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT:    ret <4 x i16> [[ADD]]
//
int16x4_t test_vmla_lane_s16_0(int16x4_t a, int16x4_t b, int16x4_t v) {
  return vmla_lane_s16(a, b, v, 0);
}

// CHECK-LABEL: @test_vmlaq_lane_s16_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> zeroinitializer
// CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT:    [[ADD:%.*]] = add <8 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT:    ret <8 x i16> [[ADD]]
//
int16x8_t test_vmlaq_lane_s16_0(int16x8_t a, int16x8_t b, int16x4_t v) {
  return vmlaq_lane_s16(a, b, v, 0);
}

// CHECK-LABEL: @test_vmla_lane_s32_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT:    [[ADD:%.*]] = add <2 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT:    ret <2 x i32> [[ADD]]
//
int32x2_t test_vmla_lane_s32_0(int32x2_t a, int32x2_t b, int32x2_t v) {
  return vmla_lane_s32(a, b, v, 0);
}

// CHECK-LABEL: @test_vmlaq_lane_s32_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT:    ret <4 x i32> [[ADD]]
//
int32x4_t test_vmlaq_lane_s32_0(int32x4_t a, int32x4_t b, int32x2_t v) {
  return vmlaq_lane_s32(a, b, v, 0);
}

// CHECK-LABEL: @test_vmla_laneq_s16_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT:    [[ADD:%.*]] = add <4 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT:    ret <4 x i16> [[ADD]]
//
int16x4_t test_vmla_laneq_s16_0(int16x4_t a, int16x4_t b, int16x8_t v) {
  return vmla_laneq_s16(a, b, v, 0);
}

// CHECK-LABEL: @test_vmlaq_laneq_s16_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> zeroinitializer
// CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT:    [[ADD:%.*]] = add <8 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT:    ret <8 x i16> [[ADD]]
//
int16x8_t test_vmlaq_laneq_s16_0(int16x8_t a, int16x8_t b, int16x8_t v) {
  return vmlaq_laneq_s16(a, b, v, 0);
}

// CHECK-LABEL: @test_vmla_laneq_s32_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT:    [[ADD:%.*]] = add <2 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT:    ret <2 x i32> [[ADD]]
//
int32x2_t test_vmla_laneq_s32_0(int32x2_t a, int32x2_t b, int32x4_t v) {
  return vmla_laneq_s32(a, b, v, 0);
}

// CHECK-LABEL: @test_vmlaq_laneq_s32_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT:    ret <4 x i32> [[ADD]]
//
int32x4_t test_vmlaq_laneq_s32_0(int32x4_t a, int32x4_t b, int32x4_t v) {
  return vmlaq_laneq_s32(a, b, v, 0);
}

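// vmls should match vmla except that the final add becomes a sub.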
// CHECK-LABEL: @test_vmls_lane_s16_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT:    ret <4 x i16> [[SUB]]
//
int16x4_t test_vmls_lane_s16_0(int16x4_t a, int16x4_t b, int16x4_t v) {
  return vmls_lane_s16(a, b, v, 0);
}

// CHECK-LABEL: @test_vmlsq_lane_s16_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> zeroinitializer
// CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT:    [[SUB:%.*]] = sub <8 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT:    ret <8 x i16> [[SUB]]
//
int16x8_t test_vmlsq_lane_s16_0(int16x8_t a, int16x8_t b, int16x4_t v) {
  return vmlsq_lane_s16(a, b, v, 0);
}

// CHECK-LABEL: @test_vmls_lane_s32_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT:    ret <2 x i32> [[SUB]]
//
int32x2_t test_vmls_lane_s32_0(int32x2_t a, int32x2_t b, int32x2_t v) {
  return vmls_lane_s32(a, b, v, 0);
}

// CHECK-LABEL: @test_vmlsq_lane_s32_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT:    ret <4 x i32> [[SUB]]
//
int32x4_t test_vmlsq_lane_s32_0(int32x4_t a, int32x4_t b, int32x2_t v) {
  return vmlsq_lane_s32(a, b, v, 0);
}

// CHECK-LABEL: @test_vmls_laneq_s16_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT:    ret <4 x i16> [[SUB]]
//
int16x4_t test_vmls_laneq_s16_0(int16x4_t a, int16x4_t b, int16x8_t v) {
  return vmls_laneq_s16(a, b, v, 0);
}

// CHECK-LABEL: @test_vmlsq_laneq_s16_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> zeroinitializer
// CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT:    [[SUB:%.*]] = sub <8 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT:    ret <8 x i16> [[SUB]]
//
int16x8_t test_vmlsq_laneq_s16_0(int16x8_t a, int16x8_t b, int16x8_t v) {
  return vmlsq_laneq_s16(a, b, v, 0);
}

// CHECK-LABEL: @test_vmls_laneq_s32_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT:    ret <2 x i32> [[SUB]]
//
int32x2_t test_vmls_laneq_s32_0(int32x2_t a, int32x2_t b, int32x4_t v) {
  return vmls_laneq_s32(a, b, v, 0);
}

// CHECK-LABEL: @test_vmlsq_laneq_s32_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT:    ret <4 x i32> [[SUB]]
//
int32x4_t test_vmlsq_laneq_s32_0(int32x4_t a, int32x4_t b, int32x4_t v) {
  return vmlsq_laneq_s32(a, b, v, 0);
}

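// Plain vmul lane/laneq tests with index 0. The signed and unsigned variants
// should produce identical IR, since mul carries no signedness.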
// CHECK-LABEL: @test_vmul_lane_s16_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[LANE]]
// CHECK-NEXT:    ret <4 x i16> [[MUL]]
//
int16x4_t test_vmul_lane_s16_0(int16x4_t a, int16x4_t v) {
  return vmul_lane_s16(a, v, 0);
}

// CHECK-LABEL: @test_vmulq_lane_s16_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> zeroinitializer
// CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[LANE]]
// CHECK-NEXT:    ret <8 x i16> [[MUL]]
//
int16x8_t test_vmulq_lane_s16_0(int16x8_t a, int16x4_t v) {
  return vmulq_lane_s16(a, v, 0);
}

// CHECK-LABEL: @test_vmul_lane_s32_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[LANE]]
// CHECK-NEXT:    ret <2 x i32> [[MUL]]
//
int32x2_t test_vmul_lane_s32_0(int32x2_t a, int32x2_t v) {
  return vmul_lane_s32(a, v, 0);
}

// CHECK-LABEL: @test_vmulq_lane_s32_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[LANE]]
// CHECK-NEXT:    ret <4 x i32> [[MUL]]
//
int32x4_t test_vmulq_lane_s32_0(int32x4_t a, int32x2_t v) {
  return vmulq_lane_s32(a, v, 0);
}

// CHECK-LABEL: @test_vmul_lane_u16_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[LANE]]
// CHECK-NEXT:    ret <4 x i16> [[MUL]]
//
uint16x4_t test_vmul_lane_u16_0(uint16x4_t a, uint16x4_t v) {
  return vmul_lane_u16(a, v, 0);
}

// CHECK-LABEL: @test_vmulq_lane_u16_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> zeroinitializer
// CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[LANE]]
// CHECK-NEXT:    ret <8 x i16> [[MUL]]
//
uint16x8_t test_vmulq_lane_u16_0(uint16x8_t a, uint16x4_t v) {
  return vmulq_lane_u16(a, v, 0);
}

// CHECK-LABEL: @test_vmul_lane_u32_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[LANE]]
// CHECK-NEXT:    ret <2 x i32> [[MUL]]
//
uint32x2_t test_vmul_lane_u32_0(uint32x2_t a, uint32x2_t v) {
  return vmul_lane_u32(a, v, 0);
}

// CHECK-LABEL: @test_vmulq_lane_u32_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[LANE]]
// CHECK-NEXT:    ret <4 x i32> [[MUL]]
//
uint32x4_t test_vmulq_lane_u32_0(uint32x4_t a, uint32x2_t v) {
  return vmulq_lane_u32(a, v, 0);
}

// CHECK-LABEL: @test_vmul_laneq_s16_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[LANE]]
// CHECK-NEXT:    ret <4 x i16> [[MUL]]
//
int16x4_t test_vmul_laneq_s16_0(int16x4_t a, int16x8_t v) {
  return vmul_laneq_s16(a, v, 0);
}

// CHECK-LABEL: @test_vmulq_laneq_s16_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> zeroinitializer
// CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[LANE]]
// CHECK-NEXT:    ret <8 x i16> [[MUL]]
//
int16x8_t test_vmulq_laneq_s16_0(int16x8_t a, int16x8_t v) {
  return vmulq_laneq_s16(a, v, 0);
}

// CHECK-LABEL: @test_vmul_laneq_s32_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[LANE]]
// CHECK-NEXT:    ret <2 x i32> [[MUL]]
//
int32x2_t test_vmul_laneq_s32_0(int32x2_t a, int32x4_t v) {
  return vmul_laneq_s32(a, v, 0);
}

// CHECK-LABEL: @test_vmulq_laneq_s32_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[LANE]]
// CHECK-NEXT:    ret <4 x i32> [[MUL]]
//
int32x4_t test_vmulq_laneq_s32_0(int32x4_t a, int32x4_t v) {
  return vmulq_laneq_s32(a, v, 0);
}

// CHECK-LABEL: @test_vmul_laneq_u16_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[LANE]]
// CHECK-NEXT:    ret <4 x i16> [[MUL]]
//
uint16x4_t test_vmul_laneq_u16_0(uint16x4_t a, uint16x8_t v) {
  return vmul_laneq_u16(a, v, 0);
}

// CHECK-LABEL: @test_vmulq_laneq_u16_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> zeroinitializer
// CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[LANE]]
// CHECK-NEXT:    ret <8 x i16> [[MUL]]
//
uint16x8_t test_vmulq_laneq_u16_0(uint16x8_t a, uint16x8_t v) {
  return vmulq_laneq_u16(a, v, 0);
}

// CHECK-LABEL: @test_vmul_laneq_u32_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[LANE]]
// CHECK-NEXT:    ret <2 x i32> [[MUL]]
//
uint32x2_t test_vmul_laneq_u32_0(uint32x2_t a, uint32x4_t v) {
  return vmul_laneq_u32(a, v, 0);
}

// CHECK-LABEL: @test_vmulq_laneq_u32_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[LANE]]
// CHECK-NEXT:    ret <4 x i32> [[MUL]]
//
uint32x4_t test_vmulq_laneq_u32_0(uint32x4_t a, uint32x4_t v) {
  return vmulq_laneq_u32(a, v, 0);
}

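// vfma/vfms lane tests: the lane splat should feed llvm.fma directly, and the
// vfms variants should first negate the b operand with an fneg.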
// CHECK-LABEL: @test_vfma_lane_f32_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x float> [[B:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <2 x i32> zeroinitializer
// CHECK-NEXT:    [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
// CHECK-NEXT:    [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK-NEXT:    [[FMLA2:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[FMLA]], <2 x float> [[LANE]], <2 x float> [[FMLA1]])
// CHECK-NEXT:    ret <2 x float> [[FMLA2]]
//
float32x2_t test_vfma_lane_f32_0(float32x2_t a, float32x2_t b, float32x2_t v) {
  return vfma_lane_f32(a, b, v, 0);
}

// CHECK-LABEL: @test_vfmaq_lane_f32_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x float> [[B:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <4 x i32> zeroinitializer
// CHECK-NEXT:    [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
// CHECK-NEXT:    [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK-NEXT:    [[FMLA2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[FMLA]], <4 x float> [[LANE]], <4 x float> [[FMLA1]])
// CHECK-NEXT:    ret <4 x float> [[FMLA2]]
//
float32x4_t test_vfmaq_lane_f32_0(float32x4_t a, float32x4_t b, float32x2_t v) {
  return vfmaq_lane_f32(a, b, v, 0);
}

// CHECK-LABEL: @test_vfma_laneq_f32_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x float> [[B:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <2 x i32> zeroinitializer
// CHECK-NEXT:    [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[LANE]], <2 x float> [[TMP4]], <2 x float> [[TMP3]])
// CHECK-NEXT:    ret <2 x float> [[TMP6]]
//
float32x2_t test_vfma_laneq_f32_0(float32x2_t a, float32x2_t b, float32x4_t v) {
  return vfma_laneq_f32(a, b, v, 0);
}

// CHECK-LABEL: @test_vfmaq_laneq_f32_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x float> [[B:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <4 x i32> zeroinitializer
// CHECK-NEXT:    [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LANE]], <4 x float> [[TMP4]], <4 x float> [[TMP3]])
// CHECK-NEXT:    ret <4 x float> [[TMP6]]
//
float32x4_t test_vfmaq_laneq_f32_0(float32x4_t a, float32x4_t b, float32x4_t v) {
  return vfmaq_laneq_f32(a, b, v, 0);
}

// CHECK-LABEL: @test_vfms_lane_f32_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[FNEG:%.*]] = fneg <2 x float> [[B:%.*]]
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x float> [[FNEG]] to <8 x i8>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <2 x i32> zeroinitializer
// CHECK-NEXT:    [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
// CHECK-NEXT:    [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK-NEXT:    [[FMLA2:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[FMLA]], <2 x float> [[LANE]], <2 x float> [[FMLA1]])
// CHECK-NEXT:    ret <2 x float> [[FMLA2]]
//
float32x2_t test_vfms_lane_f32_0(float32x2_t a, float32x2_t b, float32x2_t v) {
  return vfms_lane_f32(a, b, v, 0);
}

// CHECK-LABEL: @test_vfmsq_lane_f32_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[FNEG:%.*]] = fneg <4 x float> [[B:%.*]]
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x float> [[FNEG]] to <16 x i8>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <4 x i32> zeroinitializer
// CHECK-NEXT:    [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
// CHECK-NEXT:    [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK-NEXT:    [[FMLA2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[FMLA]], <4 x float> [[LANE]], <4 x float> [[FMLA1]])
// CHECK-NEXT:    ret <4 x float> [[FMLA2]]
//
float32x4_t test_vfmsq_lane_f32_0(float32x4_t a, float32x4_t b, float32x2_t v) {
  return vfmsq_lane_f32(a, b, v, 0);
}

// CHECK-LABEL: @test_vfms_laneq_f32_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[FNEG:%.*]] = fneg <2 x float> [[B:%.*]]
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x float> [[FNEG]] to <8 x i8>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <2 x i32> zeroinitializer
// CHECK-NEXT:    [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[LANE]], <2 x float> [[TMP4]], <2 x float> [[TMP3]])
// CHECK-NEXT:    ret <2 x float> [[TMP6]]
//
float32x2_t test_vfms_laneq_f32_0(float32x2_t a, float32x2_t b, float32x4_t v) {
  return vfms_laneq_f32(a, b, v, 0);
}

// CHECK-LABEL: @test_vfmsq_laneq_f32_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[FNEG:%.*]] = fneg <4 x float> [[B:%.*]]
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x float> [[FNEG]] to <16 x i8>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <4 x i32> zeroinitializer
// CHECK-NEXT:    [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LANE]], <4 x float> [[TMP4]], <4 x float> [[TMP3]])
// CHECK-NEXT:    ret <4 x float> [[TMP6]]
//
float32x4_t test_vfmsq_laneq_f32_0(float32x4_t a, float32x4_t b, float32x4_t v) {
  return vfmsq_laneq_f32(a, b, v, 0);
}

// CHECK-LABEL: @test_vfmaq_laneq_f64_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x double> [[B:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x double> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP5]], <2 x i32> zeroinitializer
// CHECK-NEXT:    [[TMP6:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[LANE]], <2 x double> [[TMP4]], <2 x double> [[TMP3]])
// CHECK-NEXT:    ret <2 x double> [[TMP6]]
//
float64x2_t test_vfmaq_laneq_f64_0(float64x2_t a, float64x2_t b, float64x2_t v) {
  return vfmaq_laneq_f64(a, b, v, 0);
}

// CHECK-LABEL: @test_vfmsq_laneq_f64_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[FNEG:%.*]] = fneg <2 x double> [[B:%.*]]
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x double> [[FNEG]] to <16 x i8>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x double> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP5]], <2 x i32> zeroinitializer
// CHECK-NEXT:    [[TMP6:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[LANE]], <2 x double> [[TMP4]], <2 x double> [[TMP3]])
// CHECK-NEXT:    ret <2 x double> [[TMP6]]
//
float64x2_t test_vfmsq_laneq_f64_0(float64x2_t a, float64x2_t b, float64x2_t v) {
  return vfmsq_laneq_f64(a, b, v, 0);
}

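// Widening multiply-accumulate: the vmlal lane intrinsics should lower to
// llvm.aarch64.neon.smull on the splatted lane followed by an add; the _high
// variants first take the upper half of the multiplicand with a shufflevector.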
// CHECK-LABEL: @test_vmlal_lane_s16_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT:    ret <4 x i32> [[ADD]]
//
int32x4_t test_vmlal_lane_s16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
  return vmlal_lane_s16(a, b, v, 0);
}

// CHECK-LABEL: @test_vmlal_lane_s32_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT:    [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT:    ret <2 x i64> [[ADD]]
//
int64x2_t test_vmlal_lane_s32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
  return vmlal_lane_s32(a, b, v, 0);
}

// CHECK-LABEL: @test_vmlal_laneq_s16_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT:    ret <4 x i32> [[ADD]]
//
int32x4_t test_vmlal_laneq_s16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
  return vmlal_laneq_s16(a, b, v, 0);
}

// CHECK-LABEL: @test_vmlal_laneq_s32_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT:    [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT:    ret <2 x i64> [[ADD]]
//
int64x2_t test_vmlal_laneq_s32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
  return vmlal_laneq_s32(a, b, v, 0);
}

// CHECK-LABEL: @test_vmlal_high_lane_s16_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT:    ret <4 x i32> [[ADD]]
//
int32x4_t test_vmlal_high_lane_s16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
  return vmlal_high_lane_s16(a, b, v, 0);
}

// CHECK-LABEL: @test_vmlal_high_lane_s32_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT:    [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT:    ret <2 x i64> [[ADD]]
//
int64x2_t test_vmlal_high_lane_s32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
  return vmlal_high_lane_s32(a, b, v, 0);
}

// CHECK-LABEL: @test_vmlal_high_laneq_s16_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT:    ret <4 x i32> [[ADD]]
//
int32x4_t test_vmlal_high_laneq_s16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
  return vmlal_high_laneq_s16(a, b, v, 0);
}

// CHECK-LABEL: @test_vmlal_high_laneq_s32_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT:    [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT:    ret <2 x i64> [[ADD]]
//
int64x2_t test_vmlal_high_laneq_s32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
  return vmlal_high_laneq_s32(a, b, v, 0);
}

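// vmlsl mirrors vmlal but subtracts the widened product from the accumulator.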
// CHECK-LABEL: @test_vmlsl_lane_s16_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT:    ret <4 x i32> [[SUB]]
//
int32x4_t test_vmlsl_lane_s16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
  return vmlsl_lane_s16(a, b, v, 0);
}

// CHECK-LABEL: @test_vmlsl_lane_s32_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT:    ret <2 x i64> [[SUB]]
//
int64x2_t test_vmlsl_lane_s32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
  return vmlsl_lane_s32(a, b, v, 0);
}

// CHECK-LABEL: @test_vmlsl_laneq_s16_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT:    ret <4 x i32> [[SUB]]
//
int32x4_t test_vmlsl_laneq_s16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
  return vmlsl_laneq_s16(a, b, v, 0);
}

// CHECK-LABEL: @test_vmlsl_laneq_s32_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT:    ret <2 x i64> [[SUB]]
//
int64x2_t test_vmlsl_laneq_s32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
  return vmlsl_laneq_s32(a, b, v, 0);
}

// CHECK-LABEL: @test_vmlsl_high_lane_s16_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT:    ret <4 x i32> [[SUB]]
//
int32x4_t test_vmlsl_high_lane_s16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
  return vmlsl_high_lane_s16(a, b, v, 0);
}

// CHECK-LABEL: @test_vmlsl_high_lane_s32_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT:    ret <2 x i64> [[SUB]]
//
int64x2_t test_vmlsl_high_lane_s32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
  return vmlsl_high_lane_s32(a, b, v, 0);
}

// CHECK-LABEL: @test_vmlsl_high_laneq_s16_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT:    ret <4 x i32> [[SUB]]
//
int32x4_t test_vmlsl_high_laneq_s16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
  return vmlsl_high_laneq_s16(a, b, v, 0);
}

// CHECK-LABEL: @test_vmlsl_high_laneq_s32_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT:    ret <2 x i64> [[SUB]]
//
int64x2_t test_vmlsl_high_laneq_s32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
  return vmlsl_high_laneq_s32(a, b, v, 0);
}

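// The unsigned variants should use llvm.aarch64.neon.umull in place of smull.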
2737 // CHECK-LABEL: @test_vmlal_lane_u16_0(
2738 // CHECK-NEXT:  entry:
2739 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
2740 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
2741 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
2742 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
2743 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
2744 // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4
2745 // CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
2746 // CHECK-NEXT:    ret <4 x i32> [[ADD]]
2747 //
test_vmlal_lane_u16_0(int32x4_t a,int16x4_t b,int16x4_t v)2748 int32x4_t test_vmlal_lane_u16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
2749   return vmlal_lane_u16(a, b, v, 0);
2750 }
2751 
2752 // CHECK-LABEL: @test_vmlal_lane_u32_0(
2753 // CHECK-NEXT:  entry:
2754 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
2755 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
2756 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
2757 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
2758 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
2759 // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4
2760 // CHECK-NEXT:    [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
2761 // CHECK-NEXT:    ret <2 x i64> [[ADD]]
2762 //
test_vmlal_lane_u32_0(int64x2_t a,int32x2_t b,int32x2_t v)2763 int64x2_t test_vmlal_lane_u32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
2764   return vmlal_lane_u32(a, b, v, 0);
2765 }
2766 
2767 // CHECK-LABEL: @test_vmlal_laneq_u16_0(
2768 // CHECK-NEXT:  entry:
2769 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
2770 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
2771 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
2772 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
2773 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
2774 // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4
2775 // CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
2776 // CHECK-NEXT:    ret <4 x i32> [[ADD]]
2777 //
test_vmlal_laneq_u16_0(int32x4_t a,int16x4_t b,int16x8_t v)2778 int32x4_t test_vmlal_laneq_u16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
2779   return vmlal_laneq_u16(a, b, v, 0);
2780 }
2781 
2782 // CHECK-LABEL: @test_vmlal_laneq_u32_0(
2783 // CHECK-NEXT:  entry:
2784 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
2785 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
2786 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
2787 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
2788 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
2789 // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4
2790 // CHECK-NEXT:    [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
2791 // CHECK-NEXT:    ret <2 x i64> [[ADD]]
2792 //
test_vmlal_laneq_u32_0(int64x2_t a,int32x2_t b,int32x4_t v)2793 int64x2_t test_vmlal_laneq_u32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
2794   return vmlal_laneq_u32(a, b, v, 0);
}

// CHECK-LABEL: @test_vmlal_high_lane_u16_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT:    ret <4 x i32> [[ADD]]
//
int32x4_t test_vmlal_high_lane_u16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
  return vmlal_high_lane_u16(a, b, v, 0);
}

// CHECK-LABEL: @test_vmlal_high_lane_u32_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT:    [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT:    ret <2 x i64> [[ADD]]
//
int64x2_t test_vmlal_high_lane_u32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
  return vmlal_high_lane_u32(a, b, v, 0);
}

// CHECK-LABEL: @test_vmlal_high_laneq_u16_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT:    ret <4 x i32> [[ADD]]
//
int32x4_t test_vmlal_high_laneq_u16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
  return vmlal_high_laneq_u16(a, b, v, 0);
}

// CHECK-LABEL: @test_vmlal_high_laneq_u32_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT:    [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT:    ret <2 x i64> [[ADD]]
//
int64x2_t test_vmlal_high_laneq_u32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
  return vmlal_high_laneq_u32(a, b, v, 0);
}

// CHECK-LABEL: @test_vmlsl_lane_u16_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT:    ret <4 x i32> [[SUB]]
//
int32x4_t test_vmlsl_lane_u16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
  return vmlsl_lane_u16(a, b, v, 0);
}

// CHECK-LABEL: @test_vmlsl_lane_u32_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT:    ret <2 x i64> [[SUB]]
//
int64x2_t test_vmlsl_lane_u32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
  return vmlsl_lane_u32(a, b, v, 0);
}

// CHECK-LABEL: @test_vmlsl_laneq_u16_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT:    ret <4 x i32> [[SUB]]
//
int32x4_t test_vmlsl_laneq_u16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
  return vmlsl_laneq_u16(a, b, v, 0);
}

// CHECK-LABEL: @test_vmlsl_laneq_u32_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT:    ret <2 x i64> [[SUB]]
//
int64x2_t test_vmlsl_laneq_u32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
  return vmlsl_laneq_u32(a, b, v, 0);
}

// CHECK-LABEL: @test_vmlsl_high_lane_u16_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT:    ret <4 x i32> [[SUB]]
//
int32x4_t test_vmlsl_high_lane_u16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
  return vmlsl_high_lane_u16(a, b, v, 0);
}

// CHECK-LABEL: @test_vmlsl_high_lane_u32_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT:    ret <2 x i64> [[SUB]]
//
int64x2_t test_vmlsl_high_lane_u32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
  return vmlsl_high_lane_u32(a, b, v, 0);
}

// CHECK-LABEL: @test_vmlsl_high_laneq_u16_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT:    ret <4 x i32> [[SUB]]
//
int32x4_t test_vmlsl_high_laneq_u16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
  return vmlsl_high_laneq_u16(a, b, v, 0);
}

// CHECK-LABEL: @test_vmlsl_high_laneq_u32_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT:    ret <2 x i64> [[SUB]]
//
int64x2_t test_vmlsl_high_laneq_u32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
  return vmlsl_high_laneq_u32(a, b, v, 0);
}

// CHECK-LABEL: @test_vmull_lane_s16_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT:    ret <4 x i32> [[VMULL2_I]]
//
int32x4_t test_vmull_lane_s16_0(int16x4_t a, int16x4_t v) {
  return vmull_lane_s16(a, v, 0);
}

// CHECK-LABEL: @test_vmull_lane_s32_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT:    ret <2 x i64> [[VMULL2_I]]
//
int64x2_t test_vmull_lane_s32_0(int32x2_t a, int32x2_t v) {
  return vmull_lane_s32(a, v, 0);
}

// CHECK-LABEL: @test_vmull_lane_u16_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT:    ret <4 x i32> [[VMULL2_I]]
//
uint32x4_t test_vmull_lane_u16_0(uint16x4_t a, uint16x4_t v) {
  return vmull_lane_u16(a, v, 0);
}

// CHECK-LABEL: @test_vmull_lane_u32_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT:    ret <2 x i64> [[VMULL2_I]]
//
uint64x2_t test_vmull_lane_u32_0(uint32x2_t a, uint32x2_t v) {
  return vmull_lane_u32(a, v, 0);
}

// CHECK-LABEL: @test_vmull_high_lane_s16_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT:    ret <4 x i32> [[VMULL2_I]]
//
int32x4_t test_vmull_high_lane_s16_0(int16x8_t a, int16x4_t v) {
  return vmull_high_lane_s16(a, v, 0);
}

// CHECK-LABEL: @test_vmull_high_lane_s32_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT:    ret <2 x i64> [[VMULL2_I]]
//
int64x2_t test_vmull_high_lane_s32_0(int32x4_t a, int32x2_t v) {
  return vmull_high_lane_s32(a, v, 0);
}

// CHECK-LABEL: @test_vmull_high_lane_u16_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT:    ret <4 x i32> [[VMULL2_I]]
//
uint32x4_t test_vmull_high_lane_u16_0(uint16x8_t a, uint16x4_t v) {
  return vmull_high_lane_u16(a, v, 0);
}

// CHECK-LABEL: @test_vmull_high_lane_u32_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT:    ret <2 x i64> [[VMULL2_I]]
//
uint64x2_t test_vmull_high_lane_u32_0(uint32x4_t a, uint32x2_t v) {
  return vmull_high_lane_u32(a, v, 0);
}

// CHECK-LABEL: @test_vmull_laneq_s16_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT:    ret <4 x i32> [[VMULL2_I]]
//
int32x4_t test_vmull_laneq_s16_0(int16x4_t a, int16x8_t v) {
  return vmull_laneq_s16(a, v, 0);
}

// CHECK-LABEL: @test_vmull_laneq_s32_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT:    ret <2 x i64> [[VMULL2_I]]
//
int64x2_t test_vmull_laneq_s32_0(int32x2_t a, int32x4_t v) {
  return vmull_laneq_s32(a, v, 0);
}

// CHECK-LABEL: @test_vmull_laneq_u16_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT:    ret <4 x i32> [[VMULL2_I]]
//
uint32x4_t test_vmull_laneq_u16_0(uint16x4_t a, uint16x8_t v) {
  return vmull_laneq_u16(a, v, 0);
}

// CHECK-LABEL: @test_vmull_laneq_u32_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT:    ret <2 x i64> [[VMULL2_I]]
//
uint64x2_t test_vmull_laneq_u32_0(uint32x2_t a, uint32x4_t v) {
  return vmull_laneq_u32(a, v, 0);
}

// CHECK-LABEL: @test_vmull_high_laneq_s16_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT:    ret <4 x i32> [[VMULL2_I]]
//
int32x4_t test_vmull_high_laneq_s16_0(int16x8_t a, int16x8_t v) {
  return vmull_high_laneq_s16(a, v, 0);
}

// CHECK-LABEL: @test_vmull_high_laneq_s32_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT:    ret <2 x i64> [[VMULL2_I]]
//
int64x2_t test_vmull_high_laneq_s32_0(int32x4_t a, int32x4_t v) {
  return vmull_high_laneq_s32(a, v, 0);
}

// CHECK-LABEL: @test_vmull_high_laneq_u16_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT:    ret <4 x i32> [[VMULL2_I]]
//
uint32x4_t test_vmull_high_laneq_u16_0(uint16x8_t a, uint16x8_t v) {
  return vmull_high_laneq_u16(a, v, 0);
}

// CHECK-LABEL: @test_vmull_high_laneq_u32_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT:    ret <2 x i64> [[VMULL2_I]]
//
uint64x2_t test_vmull_high_laneq_u32_0(uint32x4_t a, uint32x4_t v) {
  return vmull_high_laneq_u32(a, v, 0);
}

// CHECK-LABEL: @test_vqdmlal_lane_s16_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT:    [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #4
// CHECK-NEXT:    ret <4 x i32> [[VQDMLAL_V3_I]]
//
int32x4_t test_vqdmlal_lane_s16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
  return vqdmlal_lane_s16(a, b, v, 0);
}

// CHECK-LABEL: @test_vqdmlal_lane_s32_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT:    [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #4
// CHECK-NEXT:    ret <2 x i64> [[VQDMLAL_V3_I]]
//
int64x2_t test_vqdmlal_lane_s32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
  return vqdmlal_lane_s32(a, b, v, 0);
}

// CHECK-LABEL: @test_vqdmlal_high_lane_s16_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT:    [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #4
// CHECK-NEXT:    ret <4 x i32> [[VQDMLAL_V3_I]]
//
int32x4_t test_vqdmlal_high_lane_s16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
  return vqdmlal_high_lane_s16(a, b, v, 0);
}

// CHECK-LABEL: @test_vqdmlal_high_lane_s32_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT:    [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #4
// CHECK-NEXT:    ret <2 x i64> [[VQDMLAL_V3_I]]
//
int64x2_t test_vqdmlal_high_lane_s32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
  return vqdmlal_high_lane_s32(a, b, v, 0);
}

// CHECK-LABEL: @test_vqdmlsl_lane_s16_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT:    [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #4
// CHECK-NEXT:    ret <4 x i32> [[VQDMLSL_V3_I]]
//
int32x4_t test_vqdmlsl_lane_s16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
  return vqdmlsl_lane_s16(a, b, v, 0);
}

// CHECK-LABEL: @test_vqdmlsl_lane_s32_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT:    [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #4
// CHECK-NEXT:    ret <2 x i64> [[VQDMLSL_V3_I]]
//
int64x2_t test_vqdmlsl_lane_s32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
  return vqdmlsl_lane_s32(a, b, v, 0);
}

// CHECK-LABEL: @test_vqdmlsl_high_lane_s16_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT:    [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #4
// CHECK-NEXT:    ret <4 x i32> [[VQDMLSL_V3_I]]
//
int32x4_t test_vqdmlsl_high_lane_s16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
  return vqdmlsl_high_lane_s16(a, b, v, 0);
}

// CHECK-LABEL: @test_vqdmlsl_high_lane_s32_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT:    [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #4
// CHECK-NEXT:    ret <2 x i64> [[VQDMLSL_V3_I]]
//
int64x2_t test_vqdmlsl_high_lane_s32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
  return vqdmlsl_high_lane_s32(a, b, v, 0);
}

// CHECK-LABEL: @test_vqdmull_lane_s16_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT:    [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
// CHECK-NEXT:    ret <4 x i32> [[VQDMULL_V2_I]]
//
int32x4_t test_vqdmull_lane_s16_0(int16x4_t a, int16x4_t v) {
  return vqdmull_lane_s16(a, v, 0);
}

// CHECK-LABEL: @test_vqdmull_lane_s32_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT:    [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
// CHECK-NEXT:    ret <2 x i64> [[VQDMULL_V2_I]]
//
int64x2_t test_vqdmull_lane_s32_0(int32x2_t a, int32x2_t v) {
  return vqdmull_lane_s32(a, v, 0);
}

// CHECK-LABEL: @test_vqdmull_laneq_s16_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT:    [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
// CHECK-NEXT:    ret <4 x i32> [[VQDMULL_V2_I]]
//
int32x4_t test_vqdmull_laneq_s16_0(int16x4_t a, int16x8_t v) {
  return vqdmull_laneq_s16(a, v, 0);
}

// CHECK-LABEL: @test_vqdmull_laneq_s32_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT:    [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
// CHECK-NEXT:    ret <2 x i64> [[VQDMULL_V2_I]]
//
int64x2_t test_vqdmull_laneq_s32_0(int32x2_t a, int32x4_t v) {
  return vqdmull_laneq_s32(a, v, 0);
}

// CHECK-LABEL: @test_vqdmull_high_lane_s16_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT:    [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
// CHECK-NEXT:    ret <4 x i32> [[VQDMULL_V2_I]]
//
int32x4_t test_vqdmull_high_lane_s16_0(int16x8_t a, int16x4_t v) {
  return vqdmull_high_lane_s16(a, v, 0);
}

// CHECK-LABEL: @test_vqdmull_high_lane_s32_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT:    [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
// CHECK-NEXT:    ret <2 x i64> [[VQDMULL_V2_I]]
//
int64x2_t test_vqdmull_high_lane_s32_0(int32x4_t a, int32x2_t v) {
  return vqdmull_high_lane_s32(a, v, 0);
}

// CHECK-LABEL: @test_vqdmull_high_laneq_s16_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT:    [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
// CHECK-NEXT:    ret <4 x i32> [[VQDMULL_V2_I]]
//
int32x4_t test_vqdmull_high_laneq_s16_0(int16x8_t a, int16x8_t v) {
  return vqdmull_high_laneq_s16(a, v, 0);
}

// CHECK-LABEL: @test_vqdmull_high_laneq_s32_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT:    [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
// CHECK-NEXT:    ret <2 x i64> [[VQDMULL_V2_I]]
//
int64x2_t test_vqdmull_high_laneq_s32_0(int32x4_t a, int32x4_t v) {
  return vqdmull_high_laneq_s32(a, v, 0);
}

// CHECK-LABEL: @test_vqdmulh_lane_s16_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[VQDMULH_LANE_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT:    [[VQDMULH_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK-NEXT:    [[VQDMULH_LANE_V2:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.lane.v4i16.v4i16(<4 x i16> [[VQDMULH_LANE_V]], <4 x i16> [[VQDMULH_LANE_V1]], i32 0)
// CHECK-NEXT:    ret <4 x i16> [[VQDMULH_LANE_V2]]
//
int16x4_t test_vqdmulh_lane_s16_0(int16x4_t a, int16x4_t v) {
  return vqdmulh_lane_s16(a, v, 0);
}

// CHECK-LABEL: @test_vqdmulhq_lane_s16_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[VQDMULHQ_LANE_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT:    [[VQDMULHQ_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK-NEXT:    [[VQDMULHQ_LANE_V2:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.lane.v8i16.v4i16(<8 x i16> [[VQDMULHQ_LANE_V]], <4 x i16> [[VQDMULHQ_LANE_V1]], i32 0)
// CHECK-NEXT:    ret <8 x i16> [[VQDMULHQ_LANE_V2]]
//
int16x8_t test_vqdmulhq_lane_s16_0(int16x8_t a, int16x4_t v) {
  return vqdmulhq_lane_s16(a, v, 0);
}

// CHECK-LABEL: @test_vqdmulh_lane_s32_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[VQDMULH_LANE_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT:    [[VQDMULH_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK-NEXT:    [[VQDMULH_LANE_V2:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.lane.v2i32.v2i32(<2 x i32> [[VQDMULH_LANE_V]], <2 x i32> [[VQDMULH_LANE_V1]], i32 0)
// CHECK-NEXT:    ret <2 x i32> [[VQDMULH_LANE_V2]]
//
int32x2_t test_vqdmulh_lane_s32_0(int32x2_t a, int32x2_t v) {
  return vqdmulh_lane_s32(a, v, 0);
}

// CHECK-LABEL: @test_vqdmulhq_lane_s32_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[VQDMULHQ_LANE_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT:    [[VQDMULHQ_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK-NEXT:    [[VQDMULHQ_LANE_V2:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.lane.v4i32.v2i32(<4 x i32> [[VQDMULHQ_LANE_V]], <2 x i32> [[VQDMULHQ_LANE_V1]], i32 0)
// CHECK-NEXT:    ret <4 x i32> [[VQDMULHQ_LANE_V2]]
//
int32x4_t test_vqdmulhq_lane_s32_0(int32x4_t a, int32x2_t v) {
  return vqdmulhq_lane_s32(a, v, 0);
}

// CHECK-LABEL: @test_vqrdmulh_lane_s16_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[VQRDMULH_LANE_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT:    [[VQRDMULH_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK-NEXT:    [[VQRDMULH_LANE_V2:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.lane.v4i16.v4i16(<4 x i16> [[VQRDMULH_LANE_V]], <4 x i16> [[VQRDMULH_LANE_V1]], i32 0)
// CHECK-NEXT:    ret <4 x i16> [[VQRDMULH_LANE_V2]]
//
int16x4_t test_vqrdmulh_lane_s16_0(int16x4_t a, int16x4_t v) {
  return vqrdmulh_lane_s16(a, v, 0);
}

// CHECK-LABEL: @test_vqrdmulhq_lane_s16_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[VQRDMULHQ_LANE_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT:    [[VQRDMULHQ_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK-NEXT:    [[VQRDMULHQ_LANE_V2:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.lane.v8i16.v4i16(<8 x i16> [[VQRDMULHQ_LANE_V]], <4 x i16> [[VQRDMULHQ_LANE_V1]], i32 0)
// CHECK-NEXT:    ret <8 x i16> [[VQRDMULHQ_LANE_V2]]
//
int16x8_t test_vqrdmulhq_lane_s16_0(int16x8_t a, int16x4_t v) {
  return vqrdmulhq_lane_s16(a, v, 0);
}

// CHECK-LABEL: @test_vqrdmulh_lane_s32_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[VQRDMULH_LANE_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT:    [[VQRDMULH_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK-NEXT:    [[VQRDMULH_LANE_V2:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.lane.v2i32.v2i32(<2 x i32> [[VQRDMULH_LANE_V]], <2 x i32> [[VQRDMULH_LANE_V1]], i32 0)
// CHECK-NEXT:    ret <2 x i32> [[VQRDMULH_LANE_V2]]
//
int32x2_t test_vqrdmulh_lane_s32_0(int32x2_t a, int32x2_t v) {
  return vqrdmulh_lane_s32(a, v, 0);
}

// CHECK-LABEL: @test_vqrdmulhq_lane_s32_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[VQRDMULHQ_LANE_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT:    [[VQRDMULHQ_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK-NEXT:    [[VQRDMULHQ_LANE_V2:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.lane.v4i32.v2i32(<4 x i32> [[VQRDMULHQ_LANE_V]], <2 x i32> [[VQRDMULHQ_LANE_V1]], i32 0)
// CHECK-NEXT:    ret <4 x i32> [[VQRDMULHQ_LANE_V2]]
//
int32x4_t test_vqrdmulhq_lane_s32_0(int32x4_t a, int32x2_t v) {
  return vqrdmulhq_lane_s32(a, v, 0);
3575 }
3576 
3577 // CHECK-LABEL: @test_vmul_lane_f32_0(
3578 // CHECK-NEXT:  entry:
3579 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
3580 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
3581 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> zeroinitializer
3582 // CHECK-NEXT:    [[MUL:%.*]] = fmul <2 x float> [[A:%.*]], [[LANE]]
3583 // CHECK-NEXT:    ret <2 x float> [[MUL]]
3584 //
test_vmul_lane_f32_0(float32x2_t a,float32x2_t v)3585 float32x2_t test_vmul_lane_f32_0(float32x2_t a, float32x2_t v) {
3586   return vmul_lane_f32(a, v, 0);
3587 }
3588 
3589 // CHECK-LABEL: @test_vmulq_lane_f32_0(
3590 // CHECK-NEXT:  entry:
3591 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
3592 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
3593 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> zeroinitializer
3594 // CHECK-NEXT:    [[MUL:%.*]] = fmul <4 x float> [[A:%.*]], [[LANE]]
3595 // CHECK-NEXT:    ret <4 x float> [[MUL]]
3596 //
test_vmulq_lane_f32_0(float32x4_t a,float32x2_t v)3597 float32x4_t test_vmulq_lane_f32_0(float32x4_t a, float32x2_t v) {
3598   return vmulq_lane_f32(a, v, 0);
3599 }
3600 
3601 // CHECK-LABEL: @test_vmul_laneq_f32_0(
3602 // CHECK-NEXT:  entry:
3603 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
3604 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
3605 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <2 x i32> zeroinitializer
3606 // CHECK-NEXT:    [[MUL:%.*]] = fmul <2 x float> [[A:%.*]], [[LANE]]
3607 // CHECK-NEXT:    ret <2 x float> [[MUL]]
3608 //
test_vmul_laneq_f32_0(float32x2_t a,float32x4_t v)3609 float32x2_t test_vmul_laneq_f32_0(float32x2_t a, float32x4_t v) {
3610   return vmul_laneq_f32(a, v, 0);
3611 }
3612 
3613 // CHECK-LABEL: @test_vmul_laneq_f64_0(
3614 // CHECK-NEXT:  entry:
3615 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x double> [[A:%.*]] to <8 x i8>
3616 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x double> [[V:%.*]] to <16 x i8>
3617 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to double
3618 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
3619 // CHECK-NEXT:    [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP3]], i32 0
3620 // CHECK-NEXT:    [[TMP4:%.*]] = fmul double [[TMP2]], [[EXTRACT]]
3621 // CHECK-NEXT:    [[TMP5:%.*]] = bitcast double [[TMP4]] to <1 x double>
3622 // CHECK-NEXT:    ret <1 x double> [[TMP5]]
3623 //
test_vmul_laneq_f64_0(float64x1_t a,float64x2_t v)3624 float64x1_t test_vmul_laneq_f64_0(float64x1_t a, float64x2_t v) {
3625   return vmul_laneq_f64(a, v, 0);
3626 }
3627 
3628 // CHECK-LABEL: @test_vmulq_laneq_f32_0(
3629 // CHECK-NEXT:  entry:
3630 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
3631 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
3632 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <4 x i32> zeroinitializer
3633 // CHECK-NEXT:    [[MUL:%.*]] = fmul <4 x float> [[A:%.*]], [[LANE]]
3634 // CHECK-NEXT:    ret <4 x float> [[MUL]]
3635 //
test_vmulq_laneq_f32_0(float32x4_t a,float32x4_t v)3636 float32x4_t test_vmulq_laneq_f32_0(float32x4_t a, float32x4_t v) {
3637   return vmulq_laneq_f32(a, v, 0);
3638 }
3639 
3640 // CHECK-LABEL: @test_vmulq_laneq_f64_0(
3641 // CHECK-NEXT:  entry:
3642 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x double> [[V:%.*]] to <16 x i8>
3643 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
3644 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP1]], <2 x i32> zeroinitializer
3645 // CHECK-NEXT:    [[MUL:%.*]] = fmul <2 x double> [[A:%.*]], [[LANE]]
3646 // CHECK-NEXT:    ret <2 x double> [[MUL]]
3647 //
test_vmulq_laneq_f64_0(float64x2_t a,float64x2_t v)3648 float64x2_t test_vmulq_laneq_f64_0(float64x2_t a, float64x2_t v) {
3649   return vmulq_laneq_f64(a, v, 0);
3650 }
3651 
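// vmulx lowers to the aarch64.neon.fmulx intrinsic (the FMULX instruction),
// which behaves like an ordinary floating-point multiply except that
// (+/-0) * (+/-Inf) returns +/-2.0 instead of NaN.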
// CHECK-LABEL: @test_vmulx_lane_f32_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x float> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[A]], <2 x float> [[LANE]]) #4
// CHECK-NEXT:    ret <2 x float> [[VMULX2_I]]
//
float32x2_t test_vmulx_lane_f32_0(float32x2_t a, float32x2_t v) {
  return vmulx_lane_f32(a, v, 0);
}

// CHECK-LABEL: @test_vmulxq_lane_f32_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x float> [[LANE]] to <16 x i8>
// CHECK-NEXT:    [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[A]], <4 x float> [[LANE]]) #4
// CHECK-NEXT:    ret <4 x float> [[VMULX2_I]]
//
float32x4_t test_vmulxq_lane_f32_0(float32x4_t a, float32x2_t v) {
  return vmulxq_lane_f32(a, v, 0);
}

// CHECK-LABEL: @test_vmulxq_lane_f64_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x double> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <1 x double> [[TMP1]], <1 x double> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x double> [[LANE]] to <16 x i8>
// CHECK-NEXT:    [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[A]], <2 x double> [[LANE]]) #4
// CHECK-NEXT:    ret <2 x double> [[VMULX2_I]]
//
float64x2_t test_vmulxq_lane_f64_0(float64x2_t a, float64x1_t v) {
  return vmulxq_lane_f64(a, v, 0);
}

// CHECK-LABEL: @test_vmulx_laneq_f32_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x float> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[A]], <2 x float> [[LANE]]) #4
// CHECK-NEXT:    ret <2 x float> [[VMULX2_I]]
//
float32x2_t test_vmulx_laneq_f32_0(float32x2_t a, float32x4_t v) {
  return vmulx_laneq_f32(a, v, 0);
}

// CHECK-LABEL: @test_vmulxq_laneq_f32_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x float> [[LANE]] to <16 x i8>
// CHECK-NEXT:    [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[A]], <4 x float> [[LANE]]) #4
// CHECK-NEXT:    ret <4 x float> [[VMULX2_I]]
//
float32x4_t test_vmulxq_laneq_f32_0(float32x4_t a, float32x4_t v) {
  return vmulxq_laneq_f32(a, v, 0);
}

// CHECK-LABEL: @test_vmulxq_laneq_f64_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x double> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x double> [[LANE]] to <16 x i8>
// CHECK-NEXT:    [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[A]], <2 x double> [[LANE]]) #4
// CHECK-NEXT:    ret <2 x double> [[VMULX2_I]]
//
float64x2_t test_vmulxq_laneq_f64_0(float64x2_t a, float64x2_t v) {
  return vmulxq_laneq_f64(a, v, 0);
}

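// The *_high_n tests below check that the high half of the wide operand is
// taken with a shufflevector, the scalar operand is splatted via a chain of
// insertelement instructions, and the result feeds the corresponding
// widening multiply intrinsic.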
// CHECK-LABEL: @test_vmull_high_n_s16(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT:    [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 [[B:%.*]], i32 0
// CHECK-NEXT:    [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 [[B]], i32 1
// CHECK-NEXT:    [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 [[B]], i32 2
// CHECK-NEXT:    [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 [[B]], i32 3
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I_I]]) #4
// CHECK-NEXT:    ret <4 x i32> [[VMULL2_I_I_I]]
//
int32x4_t test_vmull_high_n_s16(int16x8_t a, int16_t b) {
  return vmull_high_n_s16(a, b);
}

// CHECK-LABEL: @test_vmull_high_n_s32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
// CHECK-NEXT:    [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 [[B:%.*]], i32 0
// CHECK-NEXT:    [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 [[B]], i32 1
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I_I]]) #4
// CHECK-NEXT:    ret <2 x i64> [[VMULL2_I_I_I]]
//
int64x2_t test_vmull_high_n_s32(int32x4_t a, int32_t b) {
  return vmull_high_n_s32(a, b);
}

// CHECK-LABEL: @test_vmull_high_n_u16(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT:    [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 [[B:%.*]], i32 0
// CHECK-NEXT:    [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 [[B]], i32 1
// CHECK-NEXT:    [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 [[B]], i32 2
// CHECK-NEXT:    [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 [[B]], i32 3
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I_I]]) #4
// CHECK-NEXT:    ret <4 x i32> [[VMULL2_I_I_I]]
//
uint32x4_t test_vmull_high_n_u16(uint16x8_t a, uint16_t b) {
  return vmull_high_n_u16(a, b);
}

// CHECK-LABEL: @test_vmull_high_n_u32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
// CHECK-NEXT:    [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 [[B:%.*]], i32 0
// CHECK-NEXT:    [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 [[B]], i32 1
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I_I]]) #4
// CHECK-NEXT:    ret <2 x i64> [[VMULL2_I_I_I]]
//
uint64x2_t test_vmull_high_n_u32(uint32x4_t a, uint32_t b) {
  return vmull_high_n_u32(a, b);
}

// CHECK-LABEL: @test_vqdmull_high_n_s16(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT:    [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 [[B:%.*]], i32 0
// CHECK-NEXT:    [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 [[B]], i32 1
// CHECK-NEXT:    [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 [[B]], i32 2
// CHECK-NEXT:    [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 [[B]], i32 3
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8>
// CHECK-NEXT:    [[VQDMULL_V2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I_I]]) #4
// CHECK-NEXT:    [[VQDMULL_V3_I_I_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I_I_I]] to <16 x i8>
// CHECK-NEXT:    ret <4 x i32> [[VQDMULL_V2_I_I_I]]
//
int32x4_t test_vqdmull_high_n_s16(int16x8_t a, int16_t b) {
  return vqdmull_high_n_s16(a, b);
}

// CHECK-LABEL: @test_vqdmull_high_n_s32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
// CHECK-NEXT:    [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 [[B:%.*]], i32 0
// CHECK-NEXT:    [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 [[B]], i32 1
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8>
// CHECK-NEXT:    [[VQDMULL_V2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I_I]]) #4
// CHECK-NEXT:    [[VQDMULL_V3_I_I_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I_I_I]] to <16 x i8>
// CHECK-NEXT:    ret <2 x i64> [[VQDMULL_V2_I_I_I]]
//
int64x2_t test_vqdmull_high_n_s32(int32x4_t a, int32_t b) {
  return vqdmull_high_n_s32(a, b);
}

// CHECK-LABEL: @test_vmlal_high_n_s16(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT:    [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 [[C:%.*]], i32 0
// CHECK-NEXT:    [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 [[C]], i32 1
// CHECK-NEXT:    [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 [[C]], i32 2
// CHECK-NEXT:    [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 [[C]], i32 3
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I_I]]) #4
// CHECK-NEXT:    [[ADD_I_I:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I_I_I]]
// CHECK-NEXT:    ret <4 x i32> [[ADD_I_I]]
//
int32x4_t test_vmlal_high_n_s16(int32x4_t a, int16x8_t b, int16_t c) {
  return vmlal_high_n_s16(a, b, c);
}

// CHECK-LABEL: @test_vmlal_high_n_s32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
// CHECK-NEXT:    [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 [[C:%.*]], i32 0
// CHECK-NEXT:    [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 [[C]], i32 1
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I_I]]) #4
// CHECK-NEXT:    [[ADD_I_I:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I_I_I]]
// CHECK-NEXT:    ret <2 x i64> [[ADD_I_I]]
//
int64x2_t test_vmlal_high_n_s32(int64x2_t a, int32x4_t b, int32_t c) {
  return vmlal_high_n_s32(a, b, c);
}

// CHECK-LABEL: @test_vmlal_high_n_u16(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT:    [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 [[C:%.*]], i32 0
// CHECK-NEXT:    [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 [[C]], i32 1
// CHECK-NEXT:    [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 [[C]], i32 2
// CHECK-NEXT:    [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 [[C]], i32 3
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I_I]]) #4
// CHECK-NEXT:    [[ADD_I_I:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I_I_I]]
// CHECK-NEXT:    ret <4 x i32> [[ADD_I_I]]
//
uint32x4_t test_vmlal_high_n_u16(uint32x4_t a, uint16x8_t b, uint16_t c) {
  return vmlal_high_n_u16(a, b, c);
}

// CHECK-LABEL: @test_vmlal_high_n_u32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
// CHECK-NEXT:    [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 [[C:%.*]], i32 0
// CHECK-NEXT:    [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 [[C]], i32 1
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I_I]]) #4
// CHECK-NEXT:    [[ADD_I_I:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I_I_I]]
// CHECK-NEXT:    ret <2 x i64> [[ADD_I_I]]
//
uint64x2_t test_vmlal_high_n_u32(uint64x2_t a, uint32x4_t b, uint32_t c) {
  return vmlal_high_n_u32(a, b, c);
}

// CHECK-LABEL: @test_vqdmlal_high_n_s16(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT:    [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 [[C:%.*]], i32 0
// CHECK-NEXT:    [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 [[C]], i32 1
// CHECK-NEXT:    [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 [[C]], i32 2
// CHECK-NEXT:    [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 [[C]], i32 3
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8>
// CHECK-NEXT:    [[VQDMLAL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I_I]]) #4
// CHECK-NEXT:    [[VQDMLAL_V3_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I_I_I]]) #4
// CHECK-NEXT:    ret <4 x i32> [[VQDMLAL_V3_I_I_I]]
//
int32x4_t test_vqdmlal_high_n_s16(int32x4_t a, int16x8_t b, int16_t c) {
  return vqdmlal_high_n_s16(a, b, c);
}

// CHECK-LABEL: @test_vqdmlal_high_n_s32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
// CHECK-NEXT:    [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 [[C:%.*]], i32 0
// CHECK-NEXT:    [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 [[C]], i32 1
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8>
// CHECK-NEXT:    [[VQDMLAL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I_I]]) #4
// CHECK-NEXT:    [[VQDMLAL_V3_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I_I_I]]) #4
// CHECK-NEXT:    ret <2 x i64> [[VQDMLAL_V3_I_I_I]]
//
int64x2_t test_vqdmlal_high_n_s32(int64x2_t a, int32x4_t b, int32_t c) {
  return vqdmlal_high_n_s32(a, b, c);
}

// CHECK-LABEL: @test_vmlsl_high_n_s16(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT:    [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 [[C:%.*]], i32 0
// CHECK-NEXT:    [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 [[C]], i32 1
// CHECK-NEXT:    [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 [[C]], i32 2
// CHECK-NEXT:    [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 [[C]], i32 3
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I_I]]) #4
// CHECK-NEXT:    [[SUB_I_I:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I_I_I]]
// CHECK-NEXT:    ret <4 x i32> [[SUB_I_I]]
//
int32x4_t test_vmlsl_high_n_s16(int32x4_t a, int16x8_t b, int16_t c) {
  return vmlsl_high_n_s16(a, b, c);
}

// CHECK-LABEL: @test_vmlsl_high_n_s32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
// CHECK-NEXT:    [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 [[C:%.*]], i32 0
// CHECK-NEXT:    [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 [[C]], i32 1
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I_I]]) #4
// CHECK-NEXT:    [[SUB_I_I:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I_I_I]]
// CHECK-NEXT:    ret <2 x i64> [[SUB_I_I]]
//
int64x2_t test_vmlsl_high_n_s32(int64x2_t a, int32x4_t b, int32_t c) {
  return vmlsl_high_n_s32(a, b, c);
}

// CHECK-LABEL: @test_vmlsl_high_n_u16(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT:    [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 [[C:%.*]], i32 0
// CHECK-NEXT:    [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 [[C]], i32 1
// CHECK-NEXT:    [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 [[C]], i32 2
// CHECK-NEXT:    [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 [[C]], i32 3
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I_I]]) #4
// CHECK-NEXT:    [[SUB_I_I:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I_I_I]]
// CHECK-NEXT:    ret <4 x i32> [[SUB_I_I]]
//
uint32x4_t test_vmlsl_high_n_u16(uint32x4_t a, uint16x8_t b, uint16_t c) {
  return vmlsl_high_n_u16(a, b, c);
}

// CHECK-LABEL: @test_vmlsl_high_n_u32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
// CHECK-NEXT:    [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 [[C:%.*]], i32 0
// CHECK-NEXT:    [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 [[C]], i32 1
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I_I]]) #4
// CHECK-NEXT:    [[SUB_I_I:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I_I_I]]
// CHECK-NEXT:    ret <2 x i64> [[SUB_I_I]]
//
uint64x2_t test_vmlsl_high_n_u32(uint64x2_t a, uint32x4_t b, uint32_t c) {
  return vmlsl_high_n_u32(a, b, c);
}

// CHECK-LABEL: @test_vqdmlsl_high_n_s16(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT:    [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 [[C:%.*]], i32 0
// CHECK-NEXT:    [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 [[C]], i32 1
// CHECK-NEXT:    [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 [[C]], i32 2
// CHECK-NEXT:    [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 [[C]], i32 3
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8>
// CHECK-NEXT:    [[VQDMLAL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I_I]]) #4
// CHECK-NEXT:    [[VQDMLSL_V3_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I_I_I]]) #4
// CHECK-NEXT:    ret <4 x i32> [[VQDMLSL_V3_I_I_I]]
//
int32x4_t test_vqdmlsl_high_n_s16(int32x4_t a, int16x8_t b, int16_t c) {
  return vqdmlsl_high_n_s16(a, b, c);
}

// CHECK-LABEL: @test_vqdmlsl_high_n_s32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
// CHECK-NEXT:    [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 [[C:%.*]], i32 0
// CHECK-NEXT:    [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 [[C]], i32 1
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8>
// CHECK-NEXT:    [[VQDMLAL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I_I]]) #4
// CHECK-NEXT:    [[VQDMLSL_V3_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I_I_I]]) #4
// CHECK-NEXT:    ret <2 x i64> [[VQDMLSL_V3_I_I_I]]
//
int64x2_t test_vqdmlsl_high_n_s32(int64x2_t a, int32x4_t b, int32_t c) {
  return vqdmlsl_high_n_s32(a, b, c);
}

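// The _n forms below take a scalar second operand; it is duplicated into
// every lane with a chain of insertelement instructions before the vector
// arithmetic is emitted.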
// CHECK-LABEL: @test_vmul_n_f32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x float> undef, float [[B:%.*]], i32 0
// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float [[B]], i32 1
// CHECK-NEXT:    [[MUL_I:%.*]] = fmul <2 x float> [[A:%.*]], [[VECINIT1_I]]
// CHECK-NEXT:    ret <2 x float> [[MUL_I]]
//
float32x2_t test_vmul_n_f32(float32x2_t a, float32_t b) {
  return vmul_n_f32(a, b);
}

// CHECK-LABEL: @test_vmulq_n_f32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float [[B:%.*]], i32 0
// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float [[B]], i32 1
// CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float [[B]], i32 2
// CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float [[B]], i32 3
// CHECK-NEXT:    [[MUL_I:%.*]] = fmul <4 x float> [[A:%.*]], [[VECINIT3_I]]
// CHECK-NEXT:    ret <4 x float> [[MUL_I]]
//
float32x4_t test_vmulq_n_f32(float32x4_t a, float32_t b) {
  return vmulq_n_f32(a, b);
}

// CHECK-LABEL: @test_vmulq_n_f64(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x double> undef, double [[B:%.*]], i32 0
// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x double> [[VECINIT_I]], double [[B]], i32 1
// CHECK-NEXT:    [[MUL_I:%.*]] = fmul <2 x double> [[A:%.*]], [[VECINIT1_I]]
// CHECK-NEXT:    ret <2 x double> [[MUL_I]]
//
float64x2_t test_vmulq_n_f64(float64x2_t a, float64_t b) {
  return vmulq_n_f64(a, b);
}

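// vfma_n and vfms_n splat the scalar and lower to the generic llvm.fma
// intrinsic; the vfms variants first negate the b multiplicand with an fneg,
// so a - b * n is computed as a single fused operation.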
// CHECK-LABEL: @test_vfma_n_f32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x float> undef, float [[N:%.*]], i32 0
// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float [[N]], i32 1
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x float> [[B:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x float> [[VECINIT1_I]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[B]], <2 x float> [[VECINIT1_I]], <2 x float> [[A]]) #4
// CHECK-NEXT:    ret <2 x float> [[TMP3]]
//
float32x2_t test_vfma_n_f32(float32x2_t a, float32x2_t b, float32_t n) {
  return vfma_n_f32(a, b, n);
}

// CHECK-LABEL: @test_vfma_n_f64(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <1 x double> undef, double [[N:%.*]], i32 0
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x double> [[A:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x double> [[B:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <1 x double> [[VECINIT_I]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[B]], <1 x double> [[VECINIT_I]], <1 x double> [[A]]) #4
// CHECK-NEXT:    ret <1 x double> [[TMP3]]
//
float64x1_t test_vfma_n_f64(float64x1_t a, float64x1_t b, float64_t n) {
  return vfma_n_f64(a, b, n);
}

// CHECK-LABEL: @test_vfmaq_n_f32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float [[N:%.*]], i32 0
// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float [[N]], i32 1
// CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float [[N]], i32 2
// CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float [[N]], i32 3
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x float> [[B:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x float> [[VECINIT3_I]] to <16 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[B]], <4 x float> [[VECINIT3_I]], <4 x float> [[A]]) #4
// CHECK-NEXT:    ret <4 x float> [[TMP3]]
//
float32x4_t test_vfmaq_n_f32(float32x4_t a, float32x4_t b, float32_t n) {
  return vfmaq_n_f32(a, b, n);
}

// CHECK-LABEL: @test_vfms_n_f32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[FNEG_I:%.*]] = fneg <2 x float> [[B:%.*]]
// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x float> undef, float [[N:%.*]], i32 0
// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float [[N]], i32 1
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x float> [[FNEG_I]] to <8 x i8>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x float> [[VECINIT1_I]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[FNEG_I]], <2 x float> [[VECINIT1_I]], <2 x float> [[A]]) #4
// CHECK-NEXT:    ret <2 x float> [[TMP3]]
//
float32x2_t test_vfms_n_f32(float32x2_t a, float32x2_t b, float32_t n) {
  return vfms_n_f32(a, b, n);
}

// CHECK-LABEL: @test_vfms_n_f64(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[FNEG_I:%.*]] = fneg <1 x double> [[B:%.*]]
// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <1 x double> undef, double [[N:%.*]], i32 0
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x double> [[A:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x double> [[FNEG_I]] to <8 x i8>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <1 x double> [[VECINIT_I]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[FNEG_I]], <1 x double> [[VECINIT_I]], <1 x double> [[A]]) #4
// CHECK-NEXT:    ret <1 x double> [[TMP3]]
//
float64x1_t test_vfms_n_f64(float64x1_t a, float64x1_t b, float64_t n) {
  return vfms_n_f64(a, b, n);
}

// CHECK-LABEL: @test_vfmsq_n_f32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[FNEG_I:%.*]] = fneg <4 x float> [[B:%.*]]
// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float [[N:%.*]], i32 0
// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float [[N]], i32 1
// CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float [[N]], i32 2
// CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float [[N]], i32 3
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x float> [[FNEG_I]] to <16 x i8>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x float> [[VECINIT3_I]] to <16 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[FNEG_I]], <4 x float> [[VECINIT3_I]], <4 x float> [[A]]) #4
// CHECK-NEXT:    ret <4 x float> [[TMP3]]
//
float32x4_t test_vfmsq_n_f32(float32x4_t a, float32x4_t b, float32_t n) {
  return vfmsq_n_f32(a, b, n);
}

// CHECK-LABEL: @test_vmul_n_s16(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 [[B:%.*]], i32 0
// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[B]], i32 1
// CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
// CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
// CHECK-NEXT:    [[MUL_I:%.*]] = mul <4 x i16> [[A:%.*]], [[VECINIT3_I]]
// CHECK-NEXT:    ret <4 x i16> [[MUL_I]]
//
int16x4_t test_vmul_n_s16(int16x4_t a, int16_t b) {
  return vmul_n_s16(a, b);
}

// CHECK-LABEL: @test_vmulq_n_s16(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 [[B:%.*]], i32 0
// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 [[B]], i32 1
// CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
// CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
// CHECK-NEXT:    [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 [[B]], i32 4
// CHECK-NEXT:    [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 [[B]], i32 5
// CHECK-NEXT:    [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 [[B]], i32 6
// CHECK-NEXT:    [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[B]], i32 7
// CHECK-NEXT:    [[MUL_I:%.*]] = mul <8 x i16> [[A:%.*]], [[VECINIT7_I]]
// CHECK-NEXT:    ret <8 x i16> [[MUL_I]]
//
int16x8_t test_vmulq_n_s16(int16x8_t a, int16_t b) {
  return vmulq_n_s16(a, b);
}

// CHECK-LABEL: @test_vmul_n_s32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 [[B:%.*]], i32 0
// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[B]], i32 1
// CHECK-NEXT:    [[MUL_I:%.*]] = mul <2 x i32> [[A:%.*]], [[VECINIT1_I]]
// CHECK-NEXT:    ret <2 x i32> [[MUL_I]]
//
int32x2_t test_vmul_n_s32(int32x2_t a, int32_t b) {
  return vmul_n_s32(a, b);
}

// CHECK-LABEL: @test_vmulq_n_s32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 [[B:%.*]], i32 0
// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 [[B]], i32 1
// CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 [[B]], i32 2
// CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 [[B]], i32 3
// CHECK-NEXT:    [[MUL_I:%.*]] = mul <4 x i32> [[A:%.*]], [[VECINIT3_I]]
// CHECK-NEXT:    ret <4 x i32> [[MUL_I]]
//
int32x4_t test_vmulq_n_s32(int32x4_t a, int32_t b) {
  return vmulq_n_s32(a, b);
}

// CHECK-LABEL: @test_vmul_n_u16(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 [[B:%.*]], i32 0
// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[B]], i32 1
// CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
// CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
// CHECK-NEXT:    [[MUL_I:%.*]] = mul <4 x i16> [[A:%.*]], [[VECINIT3_I]]
// CHECK-NEXT:    ret <4 x i16> [[MUL_I]]
//
uint16x4_t test_vmul_n_u16(uint16x4_t a, uint16_t b) {
  return vmul_n_u16(a, b);
}

// CHECK-LABEL: @test_vmulq_n_u16(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 [[B:%.*]], i32 0
// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 [[B]], i32 1
// CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
// CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
// CHECK-NEXT:    [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 [[B]], i32 4
// CHECK-NEXT:    [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 [[B]], i32 5
// CHECK-NEXT:    [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 [[B]], i32 6
// CHECK-NEXT:    [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[B]], i32 7
// CHECK-NEXT:    [[MUL_I:%.*]] = mul <8 x i16> [[A:%.*]], [[VECINIT7_I]]
// CHECK-NEXT:    ret <8 x i16> [[MUL_I]]
//
uint16x8_t test_vmulq_n_u16(uint16x8_t a, uint16_t b) {
  return vmulq_n_u16(a, b);
}

// CHECK-LABEL: @test_vmul_n_u32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 [[B:%.*]], i32 0
// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[B]], i32 1
// CHECK-NEXT:    [[MUL_I:%.*]] = mul <2 x i32> [[A:%.*]], [[VECINIT1_I]]
// CHECK-NEXT:    ret <2 x i32> [[MUL_I]]
//
uint32x2_t test_vmul_n_u32(uint32x2_t a, uint32_t b) {
  return vmul_n_u32(a, b);
}

// CHECK-LABEL: @test_vmulq_n_u32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 [[B:%.*]], i32 0
// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 [[B]], i32 1
// CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 [[B]], i32 2
// CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 [[B]], i32 3
// CHECK-NEXT:    [[MUL_I:%.*]] = mul <4 x i32> [[A:%.*]], [[VECINIT3_I]]
// CHECK-NEXT:    ret <4 x i32> [[MUL_I]]
//
uint32x4_t test_vmulq_n_u32(uint32x4_t a, uint32_t b) {
  return vmulq_n_u32(a, b);
}

// CHECK-LABEL: @test_vmull_n_s16(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 [[B:%.*]], i32 0
// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[B]], i32 1
// CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
// CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[A]], <4 x i16> [[VECINIT3_I]]) #4
// CHECK-NEXT:    ret <4 x i32> [[VMULL2_I_I]]
//
int32x4_t test_vmull_n_s16(int16x4_t a, int16_t b) {
  return vmull_n_s16(a, b);
}

// CHECK-LABEL: @test_vmull_n_s32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 [[B:%.*]], i32 0
// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[B]], i32 1
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[A]], <2 x i32> [[VECINIT1_I]]) #4
// CHECK-NEXT:    ret <2 x i64> [[VMULL2_I_I]]
//
int64x2_t test_vmull_n_s32(int32x2_t a, int32_t b) {
  return vmull_n_s32(a, b);
}

// CHECK-LABEL: @test_vmull_n_u16(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 [[B:%.*]], i32 0
// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[B]], i32 1
// CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
// CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[A]], <4 x i16> [[VECINIT3_I]]) #4
// CHECK-NEXT:    ret <4 x i32> [[VMULL2_I_I]]
//
uint32x4_t test_vmull_n_u16(uint16x4_t a, uint16_t b) {
  return vmull_n_u16(a, b);
}

// CHECK-LABEL: @test_vmull_n_u32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 [[B:%.*]], i32 0
// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[B]], i32 1
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[A]], <2 x i32> [[VECINIT1_I]]) #4
// CHECK-NEXT:    ret <2 x i64> [[VMULL2_I_I]]
//
uint64x2_t test_vmull_n_u32(uint32x2_t a, uint32_t b) {
  return vmull_n_u32(a, b);
}

// CHECK-LABEL: @test_vqdmull_n_s16(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 [[B:%.*]], i32 0
// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[B]], i32 1
// CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
// CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
// CHECK-NEXT:    [[VQDMULL_V2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[VECINIT3_I]]) #4
// CHECK-NEXT:    [[VQDMULL_V3_I_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I_I]] to <16 x i8>
// CHECK-NEXT:    ret <4 x i32> [[VQDMULL_V2_I_I]]
//
int32x4_t test_vqdmull_n_s16(int16x4_t a, int16_t b) {
  return vqdmull_n_s16(a, b);
}

// CHECK-LABEL: @test_vqdmull_n_s32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 [[B:%.*]], i32 0
// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[B]], i32 1
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
// CHECK-NEXT:    [[VQDMULL_V2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[VECINIT1_I]]) #4
// CHECK-NEXT:    [[VQDMULL_V3_I_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I_I]] to <16 x i8>
// CHECK-NEXT:    ret <2 x i64> [[VQDMULL_V2_I_I]]
//
int64x2_t test_vqdmull_n_s32(int32x2_t a, int32_t b) {
  return vqdmull_n_s32(a, b);
}

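// vqdmulh returns the high half of the saturating doubled product, and
// vqrdmulh is its rounding counterpart; both lower to the corresponding
// aarch64.neon.sqdmulh / sqrdmulh intrinsic on the splatted operand.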
// CHECK-LABEL: @test_vqdmulh_n_s16(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 [[B:%.*]], i32 0
// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[B]], i32 1
// CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
// CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
// CHECK-NEXT:    [[VQDMULH_V2_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[A]], <4 x i16> [[VECINIT3_I]]) #4
// CHECK-NEXT:    [[VQDMULH_V3_I_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I_I]] to <8 x i8>
// CHECK-NEXT:    ret <4 x i16> [[VQDMULH_V2_I_I]]
//
int16x4_t test_vqdmulh_n_s16(int16x4_t a, int16_t b) {
  return vqdmulh_n_s16(a, b);
}

// CHECK-LABEL: @test_vqdmulhq_n_s16(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 [[B:%.*]], i32 0
// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 [[B]], i32 1
// CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
// CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
// CHECK-NEXT:    [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 [[B]], i32 4
// CHECK-NEXT:    [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 [[B]], i32 5
// CHECK-NEXT:    [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 [[B]], i32 6
// CHECK-NEXT:    [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[B]], i32 7
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i16> [[VECINIT7_I]] to <16 x i8>
// CHECK-NEXT:    [[VQDMULHQ_V2_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> [[A]], <8 x i16> [[VECINIT7_I]]) #4
// CHECK-NEXT:    [[VQDMULHQ_V3_I_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I_I]] to <16 x i8>
// CHECK-NEXT:    ret <8 x i16> [[VQDMULHQ_V2_I_I]]
//
int16x8_t test_vqdmulhq_n_s16(int16x8_t a, int16_t b) {
  return vqdmulhq_n_s16(a, b);
}

// CHECK-LABEL: @test_vqdmulh_n_s32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 [[B:%.*]], i32 0
// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[B]], i32 1
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
// CHECK-NEXT:    [[VQDMULH_V2_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> [[A]], <2 x i32> [[VECINIT1_I]]) #4
// CHECK-NEXT:    [[VQDMULH_V3_I_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I_I]] to <8 x i8>
// CHECK-NEXT:    ret <2 x i32> [[VQDMULH_V2_I_I]]
//
int32x2_t test_vqdmulh_n_s32(int32x2_t a, int32_t b) {
  return vqdmulh_n_s32(a, b);
}

// CHECK-LABEL: @test_vqdmulhq_n_s32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 [[B:%.*]], i32 0
// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 [[B]], i32 1
// CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 [[B]], i32 2
// CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 [[B]], i32 3
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[VECINIT3_I]] to <16 x i8>
// CHECK-NEXT:    [[VQDMULHQ_V2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> [[A]], <4 x i32> [[VECINIT3_I]]) #4
// CHECK-NEXT:    [[VQDMULHQ_V3_I_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I_I]] to <16 x i8>
// CHECK-NEXT:    ret <4 x i32> [[VQDMULHQ_V2_I_I]]
//
int32x4_t test_vqdmulhq_n_s32(int32x4_t a, int32_t b) {
  return vqdmulhq_n_s32(a, b);
}

// CHECK-LABEL: @test_vqrdmulh_n_s16(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 [[B:%.*]], i32 0
// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[B]], i32 1
// CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
// CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
// CHECK-NEXT:    [[VQRDMULH_V2_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[A]], <4 x i16> [[VECINIT3_I]]) #4
// CHECK-NEXT:    [[VQRDMULH_V3_I_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I_I]] to <8 x i8>
// CHECK-NEXT:    ret <4 x i16> [[VQRDMULH_V2_I_I]]
//
int16x4_t test_vqrdmulh_n_s16(int16x4_t a, int16_t b) {
  return vqrdmulh_n_s16(a, b);
}

// CHECK-LABEL: @test_vqrdmulhq_n_s16(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 [[B:%.*]], i32 0
// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 [[B]], i32 1
// CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
// CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
// CHECK-NEXT:    [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 [[B]], i32 4
// CHECK-NEXT:    [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 [[B]], i32 5
// CHECK-NEXT:    [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 [[B]], i32 6
// CHECK-NEXT:    [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[B]], i32 7
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i16> [[VECINIT7_I]] to <16 x i8>
// CHECK-NEXT:    [[VQRDMULHQ_V2_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> [[A]], <8 x i16> [[VECINIT7_I]]) #4
// CHECK-NEXT:    [[VQRDMULHQ_V3_I_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I_I]] to <16 x i8>
// CHECK-NEXT:    ret <8 x i16> [[VQRDMULHQ_V2_I_I]]
//
int16x8_t test_vqrdmulhq_n_s16(int16x8_t a, int16_t b) {
  return vqrdmulhq_n_s16(a, b);
}

// CHECK-LABEL: @test_vqrdmulh_n_s32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 [[B:%.*]], i32 0
// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[B]], i32 1
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
// CHECK-NEXT:    [[VQRDMULH_V2_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> [[A]], <2 x i32> [[VECINIT1_I]]) #4
// CHECK-NEXT:    [[VQRDMULH_V3_I_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I_I]] to <8 x i8>
// CHECK-NEXT:    ret <2 x i32> [[VQRDMULH_V2_I_I]]
//
int32x2_t test_vqrdmulh_n_s32(int32x2_t a, int32_t b) {
  return vqrdmulh_n_s32(a, b);
}

// CHECK-LABEL: @test_vqrdmulhq_n_s32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 [[B:%.*]], i32 0
// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 [[B]], i32 1
// CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 [[B]], i32 2
// CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 [[B]], i32 3
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[VECINIT3_I]] to <16 x i8>
// CHECK-NEXT:    [[VQRDMULHQ_V2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> [[A]], <4 x i32> [[VECINIT3_I]]) #4
// CHECK-NEXT:    [[VQRDMULHQ_V3_I_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I_I]] to <16 x i8>
// CHECK-NEXT:    ret <4 x i32> [[VQRDMULHQ_V2_I_I]]
//
int32x4_t test_vqrdmulhq_n_s32(int32x4_t a, int32_t b) {
  return vqrdmulhq_n_s32(a, b);
}

4474 // CHECK-LABEL: @test_vmla_n_s16(
4475 // CHECK-NEXT:  entry:
4476 // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 [[C:%.*]], i32 0
4477 // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
4478 // CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
4479 // CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
4480 // CHECK-NEXT:    [[MUL_I:%.*]] = mul <4 x i16> [[B:%.*]], [[VECINIT3_I]]
4481 // CHECK-NEXT:    [[ADD_I:%.*]] = add <4 x i16> [[A:%.*]], [[MUL_I]]
4482 // CHECK-NEXT:    ret <4 x i16> [[ADD_I]]
4483 //
test_vmla_n_s16(int16x4_t a,int16x4_t b,int16_t c)4484 int16x4_t test_vmla_n_s16(int16x4_t a, int16x4_t b, int16_t c) {
4485   return vmla_n_s16(a, b, c);
4486 }
4487 
4488 // CHECK-LABEL: @test_vmlaq_n_s16(
4489 // CHECK-NEXT:  entry:
4490 // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 [[C:%.*]], i32 0
4491 // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 [[C]], i32 1
4492 // CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
4493 // CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
4494 // CHECK-NEXT:    [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 [[C]], i32 4
4495 // CHECK-NEXT:    [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 [[C]], i32 5
4496 // CHECK-NEXT:    [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 [[C]], i32 6
4497 // CHECK-NEXT:    [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[C]], i32 7
4498 // CHECK-NEXT:    [[MUL_I:%.*]] = mul <8 x i16> [[B:%.*]], [[VECINIT7_I]]
4499 // CHECK-NEXT:    [[ADD_I:%.*]] = add <8 x i16> [[A:%.*]], [[MUL_I]]
4500 // CHECK-NEXT:    ret <8 x i16> [[ADD_I]]
4501 //
test_vmlaq_n_s16(int16x8_t a,int16x8_t b,int16_t c)4502 int16x8_t test_vmlaq_n_s16(int16x8_t a, int16x8_t b, int16_t c) {
4503   return vmlaq_n_s16(a, b, c);
4504 }
4505 
4506 // CHECK-LABEL: @test_vmla_n_s32(
4507 // CHECK-NEXT:  entry:
4508 // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 [[C:%.*]], i32 0
4509 // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
4510 // CHECK-NEXT:    [[MUL_I:%.*]] = mul <2 x i32> [[B:%.*]], [[VECINIT1_I]]
4511 // CHECK-NEXT:    [[ADD_I:%.*]] = add <2 x i32> [[A:%.*]], [[MUL_I]]
4512 // CHECK-NEXT:    ret <2 x i32> [[ADD_I]]
4513 //
test_vmla_n_s32(int32x2_t a,int32x2_t b,int32_t c)4514 int32x2_t test_vmla_n_s32(int32x2_t a, int32x2_t b, int32_t c) {
4515   return vmla_n_s32(a, b, c);
4516 }
4517 
4518 // CHECK-LABEL: @test_vmlaq_n_s32(
4519 // CHECK-NEXT:  entry:
4520 // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 [[C:%.*]], i32 0
4521 // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 [[C]], i32 1
4522 // CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 [[C]], i32 2
4523 // CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 [[C]], i32 3
4524 // CHECK-NEXT:    [[MUL_I:%.*]] = mul <4 x i32> [[B:%.*]], [[VECINIT3_I]]
4525 // CHECK-NEXT:    [[ADD_I:%.*]] = add <4 x i32> [[A:%.*]], [[MUL_I]]
4526 // CHECK-NEXT:    ret <4 x i32> [[ADD_I]]
4527 //
test_vmlaq_n_s32(int32x4_t a,int32x4_t b,int32_t c)4528 int32x4_t test_vmlaq_n_s32(int32x4_t a, int32x4_t b, int32_t c) {
4529   return vmlaq_n_s32(a, b, c);
4530 }

// CHECK-LABEL: @test_vmla_n_u16(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 [[C:%.*]], i32 0
// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
// CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
// CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
// CHECK-NEXT:    [[MUL_I:%.*]] = mul <4 x i16> [[B:%.*]], [[VECINIT3_I]]
// CHECK-NEXT:    [[ADD_I:%.*]] = add <4 x i16> [[A:%.*]], [[MUL_I]]
// CHECK-NEXT:    ret <4 x i16> [[ADD_I]]
//
uint16x4_t test_vmla_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c) {
  return vmla_n_u16(a, b, c);
}

// CHECK-LABEL: @test_vmlaq_n_u16(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 [[C:%.*]], i32 0
// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 [[C]], i32 1
// CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
// CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
// CHECK-NEXT:    [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 [[C]], i32 4
// CHECK-NEXT:    [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 [[C]], i32 5
// CHECK-NEXT:    [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 [[C]], i32 6
// CHECK-NEXT:    [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[C]], i32 7
// CHECK-NEXT:    [[MUL_I:%.*]] = mul <8 x i16> [[B:%.*]], [[VECINIT7_I]]
// CHECK-NEXT:    [[ADD_I:%.*]] = add <8 x i16> [[A:%.*]], [[MUL_I]]
// CHECK-NEXT:    ret <8 x i16> [[ADD_I]]
//
uint16x8_t test_vmlaq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c) {
  return vmlaq_n_u16(a, b, c);
}

// CHECK-LABEL: @test_vmla_n_u32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 [[C:%.*]], i32 0
// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
// CHECK-NEXT:    [[MUL_I:%.*]] = mul <2 x i32> [[B:%.*]], [[VECINIT1_I]]
// CHECK-NEXT:    [[ADD_I:%.*]] = add <2 x i32> [[A:%.*]], [[MUL_I]]
// CHECK-NEXT:    ret <2 x i32> [[ADD_I]]
//
uint32x2_t test_vmla_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c) {
  return vmla_n_u32(a, b, c);
}

// CHECK-LABEL: @test_vmlaq_n_u32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 [[C:%.*]], i32 0
// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 [[C]], i32 1
// CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 [[C]], i32 2
// CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 [[C]], i32 3
// CHECK-NEXT:    [[MUL_I:%.*]] = mul <4 x i32> [[B:%.*]], [[VECINIT3_I]]
// CHECK-NEXT:    [[ADD_I:%.*]] = add <4 x i32> [[A:%.*]], [[MUL_I]]
// CHECK-NEXT:    ret <4 x i32> [[ADD_I]]
//
uint32x4_t test_vmlaq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c) {
  return vmlaq_n_u32(a, b, c);
}

// CHECK-LABEL: @test_vmlal_n_s16(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 [[C:%.*]], i32 0
// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
// CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
// CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[VECINIT3_I]]) #4
// CHECK-NEXT:    [[ADD_I:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I_I]]
// CHECK-NEXT:    ret <4 x i32> [[ADD_I]]
//
int32x4_t test_vmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
  return vmlal_n_s16(a, b, c);
}

// CHECK-LABEL: @test_vmlal_n_s32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 [[C:%.*]], i32 0
// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[VECINIT1_I]]) #4
// CHECK-NEXT:    [[ADD_I:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I_I]]
// CHECK-NEXT:    ret <2 x i64> [[ADD_I]]
//
int64x2_t test_vmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c) {
  return vmlal_n_s32(a, b, c);
}

// CHECK-LABEL: @test_vmlal_n_u16(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 [[C:%.*]], i32 0
// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
// CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
// CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[VECINIT3_I]]) #4
// CHECK-NEXT:    [[ADD_I:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I_I]]
// CHECK-NEXT:    ret <4 x i32> [[ADD_I]]
//
uint32x4_t test_vmlal_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c) {
  return vmlal_n_u16(a, b, c);
}

// CHECK-LABEL: @test_vmlal_n_u32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 [[C:%.*]], i32 0
// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[VECINIT1_I]]) #4
// CHECK-NEXT:    [[ADD_I:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I_I]]
// CHECK-NEXT:    ret <2 x i64> [[ADD_I]]
//
uint64x2_t test_vmlal_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) {
  return vmlal_n_u32(a, b, c);
}

// CHECK-LABEL: @test_vqdmlal_n_s16(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 [[C:%.*]], i32 0
// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
// CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
// CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
// CHECK-NEXT:    [[VQDMLAL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[VECINIT3_I]]) #4
// CHECK-NEXT:    [[VQDMLAL_V3_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I_I]]) #4
// CHECK-NEXT:    ret <4 x i32> [[VQDMLAL_V3_I_I]]
//
int32x4_t test_vqdmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
  return vqdmlal_n_s16(a, b, c);
}

// CHECK-LABEL: @test_vqdmlal_n_s32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 [[C:%.*]], i32 0
// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
// CHECK-NEXT:    [[VQDMLAL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[VECINIT1_I]]) #4
// CHECK-NEXT:    [[VQDMLAL_V3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I_I]]) #4
// CHECK-NEXT:    ret <2 x i64> [[VQDMLAL_V3_I_I]]
//
int64x2_t test_vqdmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c) {
  return vqdmlal_n_s32(a, b, c);
}

// CHECK-LABEL: @test_vmls_n_s16(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 [[C:%.*]], i32 0
// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
// CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
// CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
// CHECK-NEXT:    [[MUL_I:%.*]] = mul <4 x i16> [[B:%.*]], [[VECINIT3_I]]
// CHECK-NEXT:    [[SUB_I:%.*]] = sub <4 x i16> [[A:%.*]], [[MUL_I]]
// CHECK-NEXT:    ret <4 x i16> [[SUB_I]]
//
int16x4_t test_vmls_n_s16(int16x4_t a, int16x4_t b, int16_t c) {
  return vmls_n_s16(a, b, c);
}

// CHECK-LABEL: @test_vmlsq_n_s16(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 [[C:%.*]], i32 0
// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 [[C]], i32 1
// CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
// CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
// CHECK-NEXT:    [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 [[C]], i32 4
// CHECK-NEXT:    [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 [[C]], i32 5
// CHECK-NEXT:    [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 [[C]], i32 6
// CHECK-NEXT:    [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[C]], i32 7
// CHECK-NEXT:    [[MUL_I:%.*]] = mul <8 x i16> [[B:%.*]], [[VECINIT7_I]]
// CHECK-NEXT:    [[SUB_I:%.*]] = sub <8 x i16> [[A:%.*]], [[MUL_I]]
// CHECK-NEXT:    ret <8 x i16> [[SUB_I]]
//
int16x8_t test_vmlsq_n_s16(int16x8_t a, int16x8_t b, int16_t c) {
  return vmlsq_n_s16(a, b, c);
}

// CHECK-LABEL: @test_vmls_n_s32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 [[C:%.*]], i32 0
// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
// CHECK-NEXT:    [[MUL_I:%.*]] = mul <2 x i32> [[B:%.*]], [[VECINIT1_I]]
// CHECK-NEXT:    [[SUB_I:%.*]] = sub <2 x i32> [[A:%.*]], [[MUL_I]]
// CHECK-NEXT:    ret <2 x i32> [[SUB_I]]
//
int32x2_t test_vmls_n_s32(int32x2_t a, int32x2_t b, int32_t c) {
  return vmls_n_s32(a, b, c);
}

// CHECK-LABEL: @test_vmlsq_n_s32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 [[C:%.*]], i32 0
// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 [[C]], i32 1
// CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 [[C]], i32 2
// CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 [[C]], i32 3
// CHECK-NEXT:    [[MUL_I:%.*]] = mul <4 x i32> [[B:%.*]], [[VECINIT3_I]]
// CHECK-NEXT:    [[SUB_I:%.*]] = sub <4 x i32> [[A:%.*]], [[MUL_I]]
// CHECK-NEXT:    ret <4 x i32> [[SUB_I]]
//
int32x4_t test_vmlsq_n_s32(int32x4_t a, int32x4_t b, int32_t c) {
  return vmlsq_n_s32(a, b, c);
}

// CHECK-LABEL: @test_vmls_n_u16(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 [[C:%.*]], i32 0
// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
// CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
// CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
// CHECK-NEXT:    [[MUL_I:%.*]] = mul <4 x i16> [[B:%.*]], [[VECINIT3_I]]
// CHECK-NEXT:    [[SUB_I:%.*]] = sub <4 x i16> [[A:%.*]], [[MUL_I]]
// CHECK-NEXT:    ret <4 x i16> [[SUB_I]]
//
uint16x4_t test_vmls_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c) {
  return vmls_n_u16(a, b, c);
}

// CHECK-LABEL: @test_vmlsq_n_u16(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 [[C:%.*]], i32 0
// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 [[C]], i32 1
// CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
// CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
// CHECK-NEXT:    [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 [[C]], i32 4
// CHECK-NEXT:    [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 [[C]], i32 5
// CHECK-NEXT:    [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 [[C]], i32 6
// CHECK-NEXT:    [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[C]], i32 7
// CHECK-NEXT:    [[MUL_I:%.*]] = mul <8 x i16> [[B:%.*]], [[VECINIT7_I]]
// CHECK-NEXT:    [[SUB_I:%.*]] = sub <8 x i16> [[A:%.*]], [[MUL_I]]
// CHECK-NEXT:    ret <8 x i16> [[SUB_I]]
//
uint16x8_t test_vmlsq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c) {
  return vmlsq_n_u16(a, b, c);
}

// CHECK-LABEL: @test_vmls_n_u32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 [[C:%.*]], i32 0
// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
// CHECK-NEXT:    [[MUL_I:%.*]] = mul <2 x i32> [[B:%.*]], [[VECINIT1_I]]
// CHECK-NEXT:    [[SUB_I:%.*]] = sub <2 x i32> [[A:%.*]], [[MUL_I]]
// CHECK-NEXT:    ret <2 x i32> [[SUB_I]]
//
uint32x2_t test_vmls_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c) {
  return vmls_n_u32(a, b, c);
}

// CHECK-LABEL: @test_vmlsq_n_u32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 [[C:%.*]], i32 0
// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 [[C]], i32 1
// CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 [[C]], i32 2
// CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 [[C]], i32 3
// CHECK-NEXT:    [[MUL_I:%.*]] = mul <4 x i32> [[B:%.*]], [[VECINIT3_I]]
// CHECK-NEXT:    [[SUB_I:%.*]] = sub <4 x i32> [[A:%.*]], [[MUL_I]]
// CHECK-NEXT:    ret <4 x i32> [[SUB_I]]
//
uint32x4_t test_vmlsq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c) {
  return vmlsq_n_u32(a, b, c);
}

// CHECK-LABEL: @test_vmlsl_n_s16(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 [[C:%.*]], i32 0
// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
// CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
// CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[VECINIT3_I]]) #4
// CHECK-NEXT:    [[SUB_I:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I_I]]
// CHECK-NEXT:    ret <4 x i32> [[SUB_I]]
//
int32x4_t test_vmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
  return vmlsl_n_s16(a, b, c);
}

// CHECK-LABEL: @test_vmlsl_n_s32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 [[C:%.*]], i32 0
// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[VECINIT1_I]]) #4
// CHECK-NEXT:    [[SUB_I:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I_I]]
// CHECK-NEXT:    ret <2 x i64> [[SUB_I]]
//
int64x2_t test_vmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c) {
  return vmlsl_n_s32(a, b, c);
}

// CHECK-LABEL: @test_vmlsl_n_u16(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 [[C:%.*]], i32 0
// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
// CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
// CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[VECINIT3_I]]) #4
// CHECK-NEXT:    [[SUB_I:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I_I]]
// CHECK-NEXT:    ret <4 x i32> [[SUB_I]]
//
uint32x4_t test_vmlsl_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c) {
  return vmlsl_n_u16(a, b, c);
}

// CHECK-LABEL: @test_vmlsl_n_u32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 [[C:%.*]], i32 0
// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[VECINIT1_I]]) #4
// CHECK-NEXT:    [[SUB_I:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I_I]]
// CHECK-NEXT:    ret <2 x i64> [[SUB_I]]
//
uint64x2_t test_vmlsl_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) {
  return vmlsl_n_u32(a, b, c);
}

// CHECK-LABEL: @test_vqdmlsl_n_s16(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 [[C:%.*]], i32 0
// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
// CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
// CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
// CHECK-NEXT:    [[VQDMLAL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[VECINIT3_I]]) #4
// CHECK-NEXT:    [[VQDMLSL_V3_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I_I]]) #4
// CHECK-NEXT:    ret <4 x i32> [[VQDMLSL_V3_I_I]]
//
int32x4_t test_vqdmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
  return vqdmlsl_n_s16(a, b, c);
}

// CHECK-LABEL: @test_vqdmlsl_n_s32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 [[C:%.*]], i32 0
// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
// CHECK-NEXT:    [[VQDMLAL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[VECINIT1_I]]) #4
// CHECK-NEXT:    [[VQDMLSL_V3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I_I]]) #4
// CHECK-NEXT:    ret <2 x i64> [[VQDMLSL_V3_I_I]]
//
int64x2_t test_vqdmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c) {
  return vqdmlsl_n_s32(a, b, c);
}

// CHECK-LABEL: @test_vmla_lane_u16_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT:    [[ADD:%.*]] = add <4 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT:    ret <4 x i16> [[ADD]]
//
uint16x4_t test_vmla_lane_u16_0(uint16x4_t a, uint16x4_t b, uint16x4_t v) {
  return vmla_lane_u16(a, b, v, 0);
}

// CHECK-LABEL: @test_vmlaq_lane_u16_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> zeroinitializer
// CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT:    [[ADD:%.*]] = add <8 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT:    ret <8 x i16> [[ADD]]
//
uint16x8_t test_vmlaq_lane_u16_0(uint16x8_t a, uint16x8_t b, uint16x4_t v) {
  return vmlaq_lane_u16(a, b, v, 0);
}

// CHECK-LABEL: @test_vmla_lane_u32_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT:    [[ADD:%.*]] = add <2 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT:    ret <2 x i32> [[ADD]]
//
uint32x2_t test_vmla_lane_u32_0(uint32x2_t a, uint32x2_t b, uint32x2_t v) {
  return vmla_lane_u32(a, b, v, 0);
}

// CHECK-LABEL: @test_vmlaq_lane_u32_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT:    ret <4 x i32> [[ADD]]
//
uint32x4_t test_vmlaq_lane_u32_0(uint32x4_t a, uint32x4_t b, uint32x2_t v) {
  return vmlaq_lane_u32(a, b, v, 0);
}

// CHECK-LABEL: @test_vmla_laneq_u16_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT:    [[ADD:%.*]] = add <4 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT:    ret <4 x i16> [[ADD]]
//
uint16x4_t test_vmla_laneq_u16_0(uint16x4_t a, uint16x4_t b, uint16x8_t v) {
  return vmla_laneq_u16(a, b, v, 0);
}

// CHECK-LABEL: @test_vmlaq_laneq_u16_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> zeroinitializer
// CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT:    [[ADD:%.*]] = add <8 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT:    ret <8 x i16> [[ADD]]
//
uint16x8_t test_vmlaq_laneq_u16_0(uint16x8_t a, uint16x8_t b, uint16x8_t v) {
  return vmlaq_laneq_u16(a, b, v, 0);
}

// CHECK-LABEL: @test_vmla_laneq_u32_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT:    [[ADD:%.*]] = add <2 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT:    ret <2 x i32> [[ADD]]
//
uint32x2_t test_vmla_laneq_u32_0(uint32x2_t a, uint32x2_t b, uint32x4_t v) {
  return vmla_laneq_u32(a, b, v, 0);
}

// CHECK-LABEL: @test_vmlaq_laneq_u32_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT:    ret <4 x i32> [[ADD]]
//
uint32x4_t test_vmlaq_laneq_u32_0(uint32x4_t a, uint32x4_t b, uint32x4_t v) {
  return vmlaq_laneq_u32(a, b, v, 0);
}

// CHECK-LABEL: @test_vqdmlal_laneq_s16_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT:    [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #4
// CHECK-NEXT:    ret <4 x i32> [[VQDMLAL_V3_I]]
//
int32x4_t test_vqdmlal_laneq_s16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
  return vqdmlal_laneq_s16(a, b, v, 0);
}

// CHECK-LABEL: @test_vqdmlal_laneq_s32_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT:    [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #4
// CHECK-NEXT:    ret <2 x i64> [[VQDMLAL_V3_I]]
//
int64x2_t test_vqdmlal_laneq_s32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
  return vqdmlal_laneq_s32(a, b, v, 0);
}

// CHECK-LABEL: @test_vqdmlal_high_laneq_s16_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT:    [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #4
// CHECK-NEXT:    ret <4 x i32> [[VQDMLAL_V3_I]]
//
int32x4_t test_vqdmlal_high_laneq_s16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
  return vqdmlal_high_laneq_s16(a, b, v, 0);
}

// CHECK-LABEL: @test_vqdmlal_high_laneq_s32_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT:    [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #4
// CHECK-NEXT:    ret <2 x i64> [[VQDMLAL_V3_I]]
//
int64x2_t test_vqdmlal_high_laneq_s32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
  return vqdmlal_high_laneq_s32(a, b, v, 0);
}

// CHECK-LABEL: @test_vmls_lane_u16_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT:    ret <4 x i16> [[SUB]]
//
uint16x4_t test_vmls_lane_u16_0(uint16x4_t a, uint16x4_t b, uint16x4_t v) {
  return vmls_lane_u16(a, b, v, 0);
}

// CHECK-LABEL: @test_vmlsq_lane_u16_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> zeroinitializer
// CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT:    [[SUB:%.*]] = sub <8 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT:    ret <8 x i16> [[SUB]]
//
uint16x8_t test_vmlsq_lane_u16_0(uint16x8_t a, uint16x8_t b, uint16x4_t v) {
  return vmlsq_lane_u16(a, b, v, 0);
}

// CHECK-LABEL: @test_vmls_lane_u32_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT:    ret <2 x i32> [[SUB]]
//
uint32x2_t test_vmls_lane_u32_0(uint32x2_t a, uint32x2_t b, uint32x2_t v) {
  return vmls_lane_u32(a, b, v, 0);
}

// CHECK-LABEL: @test_vmlsq_lane_u32_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT:    ret <4 x i32> [[SUB]]
//
uint32x4_t test_vmlsq_lane_u32_0(uint32x4_t a, uint32x4_t b, uint32x2_t v) {
  return vmlsq_lane_u32(a, b, v, 0);
}

// CHECK-LABEL: @test_vmls_laneq_u16_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT:    ret <4 x i16> [[SUB]]
//
uint16x4_t test_vmls_laneq_u16_0(uint16x4_t a, uint16x4_t b, uint16x8_t v) {
  return vmls_laneq_u16(a, b, v, 0);
}

// CHECK-LABEL: @test_vmlsq_laneq_u16_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> zeroinitializer
// CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT:    [[SUB:%.*]] = sub <8 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT:    ret <8 x i16> [[SUB]]
//
uint16x8_t test_vmlsq_laneq_u16_0(uint16x8_t a, uint16x8_t b, uint16x8_t v) {
  return vmlsq_laneq_u16(a, b, v, 0);
}

// CHECK-LABEL: @test_vmls_laneq_u32_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT:    ret <2 x i32> [[SUB]]
//
uint32x2_t test_vmls_laneq_u32_0(uint32x2_t a, uint32x2_t b, uint32x4_t v) {
  return vmls_laneq_u32(a, b, v, 0);
}

// CHECK-LABEL: @test_vmlsq_laneq_u32_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT:    ret <4 x i32> [[SUB]]
//
uint32x4_t test_vmlsq_laneq_u32_0(uint32x4_t a, uint32x4_t b, uint32x4_t v) {
  return vmlsq_laneq_u32(a, b, v, 0);
}

// CHECK-LABEL: @test_vqdmlsl_laneq_s16_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT:    [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #4
// CHECK-NEXT:    ret <4 x i32> [[VQDMLSL_V3_I]]
//
int32x4_t test_vqdmlsl_laneq_s16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
  return vqdmlsl_laneq_s16(a, b, v, 0);
}

// CHECK-LABEL: @test_vqdmlsl_laneq_s32_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT:    [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #4
// CHECK-NEXT:    ret <2 x i64> [[VQDMLSL_V3_I]]
//
int64x2_t test_vqdmlsl_laneq_s32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
  return vqdmlsl_laneq_s32(a, b, v, 0);
}

// CHECK-LABEL: @test_vqdmlsl_high_laneq_s16_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT:    [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #4
// CHECK-NEXT:    ret <4 x i32> [[VQDMLSL_V3_I]]
//
int32x4_t test_vqdmlsl_high_laneq_s16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
  return vqdmlsl_high_laneq_s16(a, b, v, 0);
}

// CHECK-LABEL: @test_vqdmlsl_high_laneq_s32_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT:    [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #4
// CHECK-NEXT:    ret <2 x i64> [[VQDMLSL_V3_I]]
//
int64x2_t test_vqdmlsl_high_laneq_s32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
  return vqdmlsl_high_laneq_s32(a, b, v, 0);
}

// CHECK-LABEL: @test_vqdmulh_laneq_s16_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[VQDMULH_LANEQ_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT:    [[VQDMULH_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK-NEXT:    [[VQDMULH_LANEQ_V2:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.laneq.v4i16.v8i16(<4 x i16> [[VQDMULH_LANEQ_V]], <8 x i16> [[VQDMULH_LANEQ_V1]], i32 0)
// CHECK-NEXT:    ret <4 x i16> [[VQDMULH_LANEQ_V2]]
//
int16x4_t test_vqdmulh_laneq_s16_0(int16x4_t a, int16x8_t v) {
  return vqdmulh_laneq_s16(a, v, 0);
}

// CHECK-LABEL: @test_vqdmulhq_laneq_s16_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[VQDMULHQ_LANEQ_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT:    [[VQDMULHQ_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK-NEXT:    [[VQDMULHQ_LANEQ_V2:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.laneq.v8i16.v8i16(<8 x i16> [[VQDMULHQ_LANEQ_V]], <8 x i16> [[VQDMULHQ_LANEQ_V1]], i32 0)
// CHECK-NEXT:    ret <8 x i16> [[VQDMULHQ_LANEQ_V2]]
//
int16x8_t test_vqdmulhq_laneq_s16_0(int16x8_t a, int16x8_t v) {
  return vqdmulhq_laneq_s16(a, v, 0);
}

// CHECK-LABEL: @test_vqdmulh_laneq_s32_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[VQDMULH_LANEQ_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT:    [[VQDMULH_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK-NEXT:    [[VQDMULH_LANEQ_V2:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.laneq.v2i32.v4i32(<2 x i32> [[VQDMULH_LANEQ_V]], <4 x i32> [[VQDMULH_LANEQ_V1]], i32 0)
// CHECK-NEXT:    ret <2 x i32> [[VQDMULH_LANEQ_V2]]
//
int32x2_t test_vqdmulh_laneq_s32_0(int32x2_t a, int32x4_t v) {
  return vqdmulh_laneq_s32(a, v, 0);
}

// CHECK-LABEL: @test_vqdmulhq_laneq_s32_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[VQDMULHQ_LANEQ_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT:    [[VQDMULHQ_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK-NEXT:    [[VQDMULHQ_LANEQ_V2:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.laneq.v4i32.v4i32(<4 x i32> [[VQDMULHQ_LANEQ_V]], <4 x i32> [[VQDMULHQ_LANEQ_V1]], i32 0)
// CHECK-NEXT:    ret <4 x i32> [[VQDMULHQ_LANEQ_V2]]
//
int32x4_t test_vqdmulhq_laneq_s32_0(int32x4_t a, int32x4_t v) {
  return vqdmulhq_laneq_s32(a, v, 0);
}

// CHECK-LABEL: @test_vqrdmulh_laneq_s16_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[VQRDMULH_LANEQ_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT:    [[VQRDMULH_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK-NEXT:    [[VQRDMULH_LANEQ_V2:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.laneq.v4i16.v8i16(<4 x i16> [[VQRDMULH_LANEQ_V]], <8 x i16> [[VQRDMULH_LANEQ_V1]], i32 0)
// CHECK-NEXT:    ret <4 x i16> [[VQRDMULH_LANEQ_V2]]
//
int16x4_t test_vqrdmulh_laneq_s16_0(int16x4_t a, int16x8_t v) {
  return vqrdmulh_laneq_s16(a, v, 0);
}

// CHECK-LABEL: @test_vqrdmulhq_laneq_s16_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[VQRDMULHQ_LANEQ_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT:    [[VQRDMULHQ_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK-NEXT:    [[VQRDMULHQ_LANEQ_V2:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.laneq.v8i16.v8i16(<8 x i16> [[VQRDMULHQ_LANEQ_V]], <8 x i16> [[VQRDMULHQ_LANEQ_V1]], i32 0)
// CHECK-NEXT:    ret <8 x i16> [[VQRDMULHQ_LANEQ_V2]]
//
int16x8_t test_vqrdmulhq_laneq_s16_0(int16x8_t a, int16x8_t v) {
  return vqrdmulhq_laneq_s16(a, v, 0);
}

// CHECK-LABEL: @test_vqrdmulh_laneq_s32_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[VQRDMULH_LANEQ_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT:    [[VQRDMULH_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK-NEXT:    [[VQRDMULH_LANEQ_V2:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.laneq.v2i32.v4i32(<2 x i32> [[VQRDMULH_LANEQ_V]], <4 x i32> [[VQRDMULH_LANEQ_V1]], i32 0)
// CHECK-NEXT:    ret <2 x i32> [[VQRDMULH_LANEQ_V2]]
//
int32x2_t test_vqrdmulh_laneq_s32_0(int32x2_t a, int32x4_t v) {
  return vqrdmulh_laneq_s32(a, v, 0);
}

// CHECK-LABEL: @test_vqrdmulhq_laneq_s32_0(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[VQRDMULHQ_LANEQ_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT:    [[VQRDMULHQ_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK-NEXT:    [[VQRDMULHQ_LANEQ_V2:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.laneq.v4i32.v4i32(<4 x i32> [[VQRDMULHQ_LANEQ_V]], <4 x i32> [[VQRDMULHQ_LANEQ_V1]], i32 0)
// CHECK-NEXT:    ret <4 x i32> [[VQRDMULHQ_LANEQ_V2]]
//
int32x4_t test_vqrdmulhq_laneq_s32_0(int32x4_t a, int32x4_t v) {
  return vqrdmulhq_laneq_s32(a, v, 0);
}

// CHECK-LABEL: @test_vmla_lane_u16(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT:    [[ADD:%.*]] = add <4 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT:    ret <4 x i16> [[ADD]]
//
uint16x4_t test_vmla_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v) {
  return vmla_lane_u16(a, b, v, 3);
}

// CHECK-LABEL: @test_vmlaq_lane_u16(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT:    [[ADD:%.*]] = add <8 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT:    ret <8 x i16> [[ADD]]
//
uint16x8_t test_vmlaq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v) {
  return vmlaq_lane_u16(a, b, v, 3);
}

// CHECK-LABEL: @test_vmla_lane_u32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
// CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT:    [[ADD:%.*]] = add <2 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT:    ret <2 x i32> [[ADD]]
//
uint32x2_t test_vmla_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v) {
  return vmla_lane_u32(a, b, v, 1);
}

// CHECK-LABEL: @test_vmlaq_lane_u32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT:    ret <4 x i32> [[ADD]]
//
uint32x4_t test_vmlaq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v) {
  return vmlaq_lane_u32(a, b, v, 1);
}
5385 
5386 // CHECK-LABEL: @test_vmla_laneq_u16(
5387 // CHECK-NEXT:  entry:
5388 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
5389 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
5390 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
5391 // CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
5392 // CHECK-NEXT:    [[ADD:%.*]] = add <4 x i16> [[A:%.*]], [[MUL]]
5393 // CHECK-NEXT:    ret <4 x i16> [[ADD]]
5394 //
test_vmla_laneq_u16(uint16x4_t a,uint16x4_t b,uint16x8_t v)5395 uint16x4_t test_vmla_laneq_u16(uint16x4_t a, uint16x4_t b, uint16x8_t v) {
5396   return vmla_laneq_u16(a, b, v, 7);
5397 }
5398 
5399 // CHECK-LABEL: @test_vmlaq_laneq_u16(
5400 // CHECK-NEXT:  entry:
5401 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
5402 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
5403 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
5404 // CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
5405 // CHECK-NEXT:    [[ADD:%.*]] = add <8 x i16> [[A:%.*]], [[MUL]]
5406 // CHECK-NEXT:    ret <8 x i16> [[ADD]]
5407 //
test_vmlaq_laneq_u16(uint16x8_t a,uint16x8_t b,uint16x8_t v)5408 uint16x8_t test_vmlaq_laneq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t v) {
5409   return vmlaq_laneq_u16(a, b, v, 7);
5410 }
5411 
5412 // CHECK-LABEL: @test_vmla_laneq_u32(
5413 // CHECK-NEXT:  entry:
5414 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
5415 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
5416 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
5417 // CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
5418 // CHECK-NEXT:    [[ADD:%.*]] = add <2 x i32> [[A:%.*]], [[MUL]]
5419 // CHECK-NEXT:    ret <2 x i32> [[ADD]]
5420 //
test_vmla_laneq_u32(uint32x2_t a,uint32x2_t b,uint32x4_t v)5421 uint32x2_t test_vmla_laneq_u32(uint32x2_t a, uint32x2_t b, uint32x4_t v) {
5422   return vmla_laneq_u32(a, b, v, 3);
5423 }
5424 
5425 // CHECK-LABEL: @test_vmlaq_laneq_u32(
5426 // CHECK-NEXT:  entry:
5427 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
5428 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
5429 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
5430 // CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
5431 // CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[MUL]]
5432 // CHECK-NEXT:    ret <4 x i32> [[ADD]]
5433 //
test_vmlaq_laneq_u32(uint32x4_t a,uint32x4_t b,uint32x4_t v)5434 uint32x4_t test_vmlaq_laneq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t v) {
5435   return vmlaq_laneq_u32(a, b, v, 3);
5436 }
5437 
5438 // CHECK-LABEL: @test_vqdmlal_laneq_s16(
5439 // CHECK-NEXT:  entry:
5440 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
5441 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
5442 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
5443 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
5444 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
5445 // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
5446 // CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4
5447 // CHECK-NEXT:    [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #4
5448 // CHECK-NEXT:    ret <4 x i32> [[VQDMLAL_V3_I]]
5449 //
test_vqdmlal_laneq_s16(int32x4_t a,int16x4_t b,int16x8_t v)5450 int32x4_t test_vqdmlal_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) {
5451   return vqdmlal_laneq_s16(a, b, v, 7);
5452 }
5453 
5454 // CHECK-LABEL: @test_vqdmlal_laneq_s32(
5455 // CHECK-NEXT:  entry:
5456 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
5457 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
5458 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
5459 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
5460 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
5461 // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
5462 // CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4
5463 // CHECK-NEXT:    [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #4
5464 // CHECK-NEXT:    ret <2 x i64> [[VQDMLAL_V3_I]]
5465 //
test_vqdmlal_laneq_s32(int64x2_t a,int32x2_t b,int32x4_t v)5466 int64x2_t test_vqdmlal_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) {
5467   return vqdmlal_laneq_s32(a, b, v, 3);
5468 }
5469 
5470 // CHECK-LABEL: @test_vqdmlal_high_laneq_s16(
5471 // CHECK-NEXT:  entry:
5472 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
5473 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
5474 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
5475 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
5476 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
5477 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
5478 // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
5479 // CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
5480 // CHECK-NEXT:    [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #4
5481 // CHECK-NEXT:    ret <4 x i32> [[VQDMLAL_V3_I]]
5482 //
test_vqdmlal_high_laneq_s16(int32x4_t a,int16x8_t b,int16x8_t v)5483 int32x4_t test_vqdmlal_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) {
5484   return vqdmlal_high_laneq_s16(a, b, v, 7);
5485 }
5486 
5487 // CHECK-LABEL: @test_vqdmlal_high_laneq_s32(
5488 // CHECK-NEXT:  entry:
5489 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
5490 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
5491 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
5492 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
5493 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
5494 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
5495 // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
5496 // CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
5497 // CHECK-NEXT:    [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #4
5498 // CHECK-NEXT:    ret <2 x i64> [[VQDMLAL_V3_I]]
5499 //
test_vqdmlal_high_laneq_s32(int64x2_t a,int32x4_t b,int32x4_t v)5500 int64x2_t test_vqdmlal_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t v) {
5501   return vqdmlal_high_laneq_s32(a, b, v, 3);
5502 }
5503 
5504 // CHECK-LABEL: @test_vmls_lane_u16(
5505 // CHECK-NEXT:  entry:
5506 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
5507 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
5508 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
5509 // CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
5510 // CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i16> [[A:%.*]], [[MUL]]
5511 // CHECK-NEXT:    ret <4 x i16> [[SUB]]
5512 //
test_vmls_lane_u16(uint16x4_t a,uint16x4_t b,uint16x4_t v)5513 uint16x4_t test_vmls_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v) {
5514   return vmls_lane_u16(a, b, v, 3);
5515 }
5516 
5517 // CHECK-LABEL: @test_vmlsq_lane_u16(
5518 // CHECK-NEXT:  entry:
5519 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
5520 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
5521 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
5522 // CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
5523 // CHECK-NEXT:    [[SUB:%.*]] = sub <8 x i16> [[A:%.*]], [[MUL]]
5524 // CHECK-NEXT:    ret <8 x i16> [[SUB]]
5525 //
test_vmlsq_lane_u16(uint16x8_t a,uint16x8_t b,uint16x4_t v)5526 uint16x8_t test_vmlsq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v) {
5527   return vmlsq_lane_u16(a, b, v, 3);
5528 }
5529 
5530 // CHECK-LABEL: @test_vmls_lane_u32(
5531 // CHECK-NEXT:  entry:
5532 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
5533 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
5534 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
5535 // CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
5536 // CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i32> [[A:%.*]], [[MUL]]
5537 // CHECK-NEXT:    ret <2 x i32> [[SUB]]
5538 //
test_vmls_lane_u32(uint32x2_t a,uint32x2_t b,uint32x2_t v)5539 uint32x2_t test_vmls_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v) {
5540   return vmls_lane_u32(a, b, v, 1);
5541 }
5542 
5543 // CHECK-LABEL: @test_vmlsq_lane_u32(
5544 // CHECK-NEXT:  entry:
5545 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
5546 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
5547 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
5548 // CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
5549 // CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[MUL]]
5550 // CHECK-NEXT:    ret <4 x i32> [[SUB]]
5551 //
test_vmlsq_lane_u32(uint32x4_t a,uint32x4_t b,uint32x2_t v)5552 uint32x4_t test_vmlsq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v) {
5553   return vmlsq_lane_u32(a, b, v, 1);
5554 }
5555 
5556 // CHECK-LABEL: @test_vmls_laneq_u16(
5557 // CHECK-NEXT:  entry:
5558 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
5559 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
5560 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
5561 // CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
5562 // CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i16> [[A:%.*]], [[MUL]]
5563 // CHECK-NEXT:    ret <4 x i16> [[SUB]]
5564 //
test_vmls_laneq_u16(uint16x4_t a,uint16x4_t b,uint16x8_t v)5565 uint16x4_t test_vmls_laneq_u16(uint16x4_t a, uint16x4_t b, uint16x8_t v) {
5566   return vmls_laneq_u16(a, b, v, 7);
5567 }
5568 
5569 // CHECK-LABEL: @test_vmlsq_laneq_u16(
5570 // CHECK-NEXT:  entry:
5571 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
5572 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
5573 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
5574 // CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
5575 // CHECK-NEXT:    [[SUB:%.*]] = sub <8 x i16> [[A:%.*]], [[MUL]]
5576 // CHECK-NEXT:    ret <8 x i16> [[SUB]]
5577 //
test_vmlsq_laneq_u16(uint16x8_t a,uint16x8_t b,uint16x8_t v)5578 uint16x8_t test_vmlsq_laneq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t v) {
5579   return vmlsq_laneq_u16(a, b, v, 7);
5580 }
5581 
5582 // CHECK-LABEL: @test_vmls_laneq_u32(
5583 // CHECK-NEXT:  entry:
5584 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
5585 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
5586 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
5587 // CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
5588 // CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i32> [[A:%.*]], [[MUL]]
5589 // CHECK-NEXT:    ret <2 x i32> [[SUB]]
5590 //
test_vmls_laneq_u32(uint32x2_t a,uint32x2_t b,uint32x4_t v)5591 uint32x2_t test_vmls_laneq_u32(uint32x2_t a, uint32x2_t b, uint32x4_t v) {
5592   return vmls_laneq_u32(a, b, v, 3);
5593 }
5594 
5595 // CHECK-LABEL: @test_vmlsq_laneq_u32(
5596 // CHECK-NEXT:  entry:
5597 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
5598 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
5599 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
5600 // CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
5601 // CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[MUL]]
5602 // CHECK-NEXT:    ret <4 x i32> [[SUB]]
5603 //
test_vmlsq_laneq_u32(uint32x4_t a,uint32x4_t b,uint32x4_t v)5604 uint32x4_t test_vmlsq_laneq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t v) {
5605   return vmlsq_laneq_u32(a, b, v, 3);
5606 }
5607 
5608 // CHECK-LABEL: @test_vqdmlsl_laneq_s16(
5609 // CHECK-NEXT:  entry:
5610 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
5611 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
5612 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
5613 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
5614 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
5615 // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
5616 // CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4
5617 // CHECK-NEXT:    [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #4
5618 // CHECK-NEXT:    ret <4 x i32> [[VQDMLSL_V3_I]]
5619 //
test_vqdmlsl_laneq_s16(int32x4_t a,int16x4_t b,int16x8_t v)5620 int32x4_t test_vqdmlsl_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) {
5621   return vqdmlsl_laneq_s16(a, b, v, 7);
5622 }
5623 
5624 // CHECK-LABEL: @test_vqdmlsl_laneq_s32(
5625 // CHECK-NEXT:  entry:
5626 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
5627 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
5628 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
5629 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
5630 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
5631 // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
5632 // CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4
5633 // CHECK-NEXT:    [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #4
5634 // CHECK-NEXT:    ret <2 x i64> [[VQDMLSL_V3_I]]
5635 //
test_vqdmlsl_laneq_s32(int64x2_t a,int32x2_t b,int32x4_t v)5636 int64x2_t test_vqdmlsl_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) {
5637   return vqdmlsl_laneq_s32(a, b, v, 3);
5638 }
5639 
5640 // CHECK-LABEL: @test_vqdmlsl_high_laneq_s16(
5641 // CHECK-NEXT:  entry:
5642 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
5643 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
5644 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
5645 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
5646 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
5647 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
5648 // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
5649 // CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
5650 // CHECK-NEXT:    [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #4
5651 // CHECK-NEXT:    ret <4 x i32> [[VQDMLSL_V3_I]]
5652 //
test_vqdmlsl_high_laneq_s16(int32x4_t a,int16x8_t b,int16x8_t v)5653 int32x4_t test_vqdmlsl_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) {
5654   return vqdmlsl_high_laneq_s16(a, b, v, 7);
5655 }
5656 
5657 // CHECK-LABEL: @test_vqdmlsl_high_laneq_s32(
5658 // CHECK-NEXT:  entry:
5659 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
5660 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
5661 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
5662 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
5663 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
5664 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
5665 // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
5666 // CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
5667 // CHECK-NEXT:    [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #4
5668 // CHECK-NEXT:    ret <2 x i64> [[VQDMLSL_V3_I]]
5669 //
test_vqdmlsl_high_laneq_s32(int64x2_t a,int32x4_t b,int32x4_t v)5670 int64x2_t test_vqdmlsl_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t v) {
5671   return vqdmlsl_high_laneq_s32(a, b, v, 3);
5672 }
5673 
5674 // CHECK-LABEL: @test_vqdmulh_laneq_s16(
5675 // CHECK-NEXT:  entry:
5676 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
5677 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
5678 // CHECK-NEXT:    [[VQDMULH_LANEQ_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
5679 // CHECK-NEXT:    [[VQDMULH_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
5680 // CHECK-NEXT:    [[VQDMULH_LANEQ_V2:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.laneq.v4i16.v8i16(<4 x i16> [[VQDMULH_LANEQ_V]], <8 x i16> [[VQDMULH_LANEQ_V1]], i32 7)
5681 // CHECK-NEXT:    ret <4 x i16> [[VQDMULH_LANEQ_V2]]
5682 //
test_vqdmulh_laneq_s16(int16x4_t a,int16x8_t v)5683 int16x4_t test_vqdmulh_laneq_s16(int16x4_t a, int16x8_t v) {
5684   return vqdmulh_laneq_s16(a, v, 7);
5685 }
5686 
5687 // CHECK-LABEL: @test_vqdmulhq_laneq_s16(
5688 // CHECK-NEXT:  entry:
5689 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
5690 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
5691 // CHECK-NEXT:    [[VQDMULHQ_LANEQ_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
5692 // CHECK-NEXT:    [[VQDMULHQ_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
5693 // CHECK-NEXT:    [[VQDMULHQ_LANEQ_V2:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.laneq.v8i16.v8i16(<8 x i16> [[VQDMULHQ_LANEQ_V]], <8 x i16> [[VQDMULHQ_LANEQ_V1]], i32 7)
5694 // CHECK-NEXT:    ret <8 x i16> [[VQDMULHQ_LANEQ_V2]]
5695 //
test_vqdmulhq_laneq_s16(int16x8_t a,int16x8_t v)5696 int16x8_t test_vqdmulhq_laneq_s16(int16x8_t a, int16x8_t v) {
5697   return vqdmulhq_laneq_s16(a, v, 7);
5698 }
5699 
5700 // CHECK-LABEL: @test_vqdmulh_laneq_s32(
5701 // CHECK-NEXT:  entry:
5702 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
5703 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
5704 // CHECK-NEXT:    [[VQDMULH_LANEQ_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
5705 // CHECK-NEXT:    [[VQDMULH_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
5706 // CHECK-NEXT:    [[VQDMULH_LANEQ_V2:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.laneq.v2i32.v4i32(<2 x i32> [[VQDMULH_LANEQ_V]], <4 x i32> [[VQDMULH_LANEQ_V1]], i32 3)
5707 // CHECK-NEXT:    ret <2 x i32> [[VQDMULH_LANEQ_V2]]
5708 //
test_vqdmulh_laneq_s32(int32x2_t a,int32x4_t v)5709 int32x2_t test_vqdmulh_laneq_s32(int32x2_t a, int32x4_t v) {
5710   return vqdmulh_laneq_s32(a, v, 3);
5711 }
5712 
5713 // CHECK-LABEL: @test_vqdmulhq_laneq_s32(
5714 // CHECK-NEXT:  entry:
5715 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
5716 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
5717 // CHECK-NEXT:    [[VQDMULHQ_LANEQ_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
5718 // CHECK-NEXT:    [[VQDMULHQ_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
5719 // CHECK-NEXT:    [[VQDMULHQ_LANEQ_V2:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.laneq.v4i32.v4i32(<4 x i32> [[VQDMULHQ_LANEQ_V]], <4 x i32> [[VQDMULHQ_LANEQ_V1]], i32 3)
5720 // CHECK-NEXT:    ret <4 x i32> [[VQDMULHQ_LANEQ_V2]]
5721 //
test_vqdmulhq_laneq_s32(int32x4_t a,int32x4_t v)5722 int32x4_t test_vqdmulhq_laneq_s32(int32x4_t a, int32x4_t v) {
5723   return vqdmulhq_laneq_s32(a, v, 3);
5724 }
5725 
5726 // CHECK-LABEL: @test_vqrdmulh_laneq_s16(
5727 // CHECK-NEXT:  entry:
5728 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
5729 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
5730 // CHECK-NEXT:    [[VQRDMULH_LANEQ_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
5731 // CHECK-NEXT:    [[VQRDMULH_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
5732 // CHECK-NEXT:    [[VQRDMULH_LANEQ_V2:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.laneq.v4i16.v8i16(<4 x i16> [[VQRDMULH_LANEQ_V]], <8 x i16> [[VQRDMULH_LANEQ_V1]], i32 7)
5733 // CHECK-NEXT:    ret <4 x i16> [[VQRDMULH_LANEQ_V2]]
5734 //
test_vqrdmulh_laneq_s16(int16x4_t a,int16x8_t v)5735 int16x4_t test_vqrdmulh_laneq_s16(int16x4_t a, int16x8_t v) {
5736   return vqrdmulh_laneq_s16(a, v, 7);
5737 }
5738 
5739 // CHECK-LABEL: @test_vqrdmulhq_laneq_s16(
5740 // CHECK-NEXT:  entry:
5741 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
5742 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
5743 // CHECK-NEXT:    [[VQRDMULHQ_LANEQ_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
5744 // CHECK-NEXT:    [[VQRDMULHQ_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
5745 // CHECK-NEXT:    [[VQRDMULHQ_LANEQ_V2:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.laneq.v8i16.v8i16(<8 x i16> [[VQRDMULHQ_LANEQ_V]], <8 x i16> [[VQRDMULHQ_LANEQ_V1]], i32 7)
5746 // CHECK-NEXT:    ret <8 x i16> [[VQRDMULHQ_LANEQ_V2]]
5747 //
test_vqrdmulhq_laneq_s16(int16x8_t a,int16x8_t v)5748 int16x8_t test_vqrdmulhq_laneq_s16(int16x8_t a, int16x8_t v) {
5749   return vqrdmulhq_laneq_s16(a, v, 7);
5750 }
5751 
5752 // CHECK-LABEL: @test_vqrdmulh_laneq_s32(
5753 // CHECK-NEXT:  entry:
5754 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
5755 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
5756 // CHECK-NEXT:    [[VQRDMULH_LANEQ_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
5757 // CHECK-NEXT:    [[VQRDMULH_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
5758 // CHECK-NEXT:    [[VQRDMULH_LANEQ_V2:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.laneq.v2i32.v4i32(<2 x i32> [[VQRDMULH_LANEQ_V]], <4 x i32> [[VQRDMULH_LANEQ_V1]], i32 3)
5759 // CHECK-NEXT:    ret <2 x i32> [[VQRDMULH_LANEQ_V2]]
5760 //
test_vqrdmulh_laneq_s32(int32x2_t a,int32x4_t v)5761 int32x2_t test_vqrdmulh_laneq_s32(int32x2_t a, int32x4_t v) {
5762   return vqrdmulh_laneq_s32(a, v, 3);
5763 }
5764 
5765 // CHECK-LABEL: @test_vqrdmulhq_laneq_s32(
5766 // CHECK-NEXT:  entry:
5767 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
5768 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
5769 // CHECK-NEXT:    [[VQRDMULHQ_LANEQ_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
5770 // CHECK-NEXT:    [[VQRDMULHQ_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
5771 // CHECK-NEXT:    [[VQRDMULHQ_LANEQ_V2:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.laneq.v4i32.v4i32(<4 x i32> [[VQRDMULHQ_LANEQ_V]], <4 x i32> [[VQRDMULHQ_LANEQ_V1]], i32 3)
5772 // CHECK-NEXT:    ret <4 x i32> [[VQRDMULHQ_LANEQ_V2]]
5773 //
test_vqrdmulhq_laneq_s32(int32x4_t a,int32x4_t v)5774 int32x4_t test_vqrdmulhq_laneq_s32(int32x4_t a, int32x4_t v) {
5775   return vqrdmulhq_laneq_s32(a, v, 3);
5776 }