1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon -fp-contract=fast                 | FileCheck %s --check-prefixes=CHECK,GENERIC
3; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon -fp-contract=fast -mcpu=exynos-m3 | FileCheck %s --check-prefixes=CHECK,EXYNOSM3
4
5declare <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double>, <2 x double>)
6
7declare <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float>, <4 x float>)
8
9declare <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float>, <2 x float>)
10
11declare <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32>, <4 x i32>)
12
13declare <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32>, <2 x i32>)
14
15declare <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16>, <8 x i16>)
16
17declare <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16>, <4 x i16>)
18
19declare <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32>, <4 x i32>)
20
21declare <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32>, <2 x i32>)
22
23declare <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16>, <8 x i16>)
24
25declare <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16>, <4 x i16>)
26
27declare <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32>, <2 x i32>)
28
29declare <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16>, <4 x i16>)
30
31declare <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64>, <2 x i64>)
32
33declare <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32>, <4 x i32>)
34
35declare <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64>, <2 x i64>)
36
37declare <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32>, <4 x i32>)
38
39declare <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32>, <2 x i32>)
40
41declare <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16>, <4 x i16>)
42
43declare <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32>, <2 x i32>)
44
45declare <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16>, <4 x i16>)
46
47define <4 x i16> @test_vmla_lane_s16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) {
48; CHECK-LABEL: test_vmla_lane_s16:
49; CHECK:       // %bb.0: // %entry
50; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
51; CHECK-NEXT:    mla v0.4h, v1.4h, v2.h[3]
52; CHECK-NEXT:    ret
53entry:
54  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
55  %mul = mul <4 x i16> %shuffle, %b
56  %add = add <4 x i16> %mul, %a
57  ret <4 x i16> %add
58}
59
60define <8 x i16> @test_vmlaq_lane_s16(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) {
61; CHECK-LABEL: test_vmlaq_lane_s16:
62; CHECK:       // %bb.0: // %entry
63; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
64; CHECK-NEXT:    mla v0.8h, v1.8h, v2.h[3]
65; CHECK-NEXT:    ret
66entry:
67  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
68  %mul = mul <8 x i16> %shuffle, %b
69  %add = add <8 x i16> %mul, %a
70  ret <8 x i16> %add
71}
72
73define <2 x i32> @test_vmla_lane_s32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) {
74; CHECK-LABEL: test_vmla_lane_s32:
75; CHECK:       // %bb.0: // %entry
76; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
77; CHECK-NEXT:    mla v0.2s, v1.2s, v2.s[1]
78; CHECK-NEXT:    ret
79entry:
80  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
81  %mul = mul <2 x i32> %shuffle, %b
82  %add = add <2 x i32> %mul, %a
83  ret <2 x i32> %add
84}
85
86define <4 x i32> @test_vmlaq_lane_s32(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) {
87; CHECK-LABEL: test_vmlaq_lane_s32:
88; CHECK:       // %bb.0: // %entry
89; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
90; CHECK-NEXT:    mla v0.4s, v1.4s, v2.s[1]
91; CHECK-NEXT:    ret
92entry:
93  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
94  %mul = mul <4 x i32> %shuffle, %b
95  %add = add <4 x i32> %mul, %a
96  ret <4 x i32> %add
97}
98
99define <4 x i16> @test_vmla_laneq_s16(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) {
100; CHECK-LABEL: test_vmla_laneq_s16:
101; CHECK:       // %bb.0: // %entry
102; CHECK-NEXT:    mla v0.4h, v1.4h, v2.h[7]
103; CHECK-NEXT:    ret
104entry:
105  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
106  %mul = mul <4 x i16> %shuffle, %b
107  %add = add <4 x i16> %mul, %a
108  ret <4 x i16> %add
109}
110
111define <8 x i16> @test_vmlaq_laneq_s16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) {
112; CHECK-LABEL: test_vmlaq_laneq_s16:
113; CHECK:       // %bb.0: // %entry
114; CHECK-NEXT:    mla v0.8h, v1.8h, v2.h[7]
115; CHECK-NEXT:    ret
116entry:
117  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
118  %mul = mul <8 x i16> %shuffle, %b
119  %add = add <8 x i16> %mul, %a
120  ret <8 x i16> %add
121}
122
123define <2 x i32> @test_vmla_laneq_s32(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) {
124; CHECK-LABEL: test_vmla_laneq_s32:
125; CHECK:       // %bb.0: // %entry
126; CHECK-NEXT:    mla v0.2s, v1.2s, v2.s[3]
127; CHECK-NEXT:    ret
128entry:
129  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
130  %mul = mul <2 x i32> %shuffle, %b
131  %add = add <2 x i32> %mul, %a
132  ret <2 x i32> %add
133}
134
135define <4 x i32> @test_vmlaq_laneq_s32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) {
136; CHECK-LABEL: test_vmlaq_laneq_s32:
137; CHECK:       // %bb.0: // %entry
138; CHECK-NEXT:    mla v0.4s, v1.4s, v2.s[3]
139; CHECK-NEXT:    ret
140entry:
141  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
142  %mul = mul <4 x i32> %shuffle, %b
143  %add = add <4 x i32> %mul, %a
144  ret <4 x i32> %add
145}
146
147define <4 x i16> @test_vmls_lane_s16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) {
148; CHECK-LABEL: test_vmls_lane_s16:
149; CHECK:       // %bb.0: // %entry
150; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
151; CHECK-NEXT:    mls v0.4h, v1.4h, v2.h[3]
152; CHECK-NEXT:    ret
153entry:
154  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
155  %mul = mul <4 x i16> %shuffle, %b
156  %sub = sub <4 x i16> %a, %mul
157  ret <4 x i16> %sub
158}
159
160define <8 x i16> @test_vmlsq_lane_s16(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) {
161; CHECK-LABEL: test_vmlsq_lane_s16:
162; CHECK:       // %bb.0: // %entry
163; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
164; CHECK-NEXT:    mls v0.8h, v1.8h, v2.h[3]
165; CHECK-NEXT:    ret
166entry:
167  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
168  %mul = mul <8 x i16> %shuffle, %b
169  %sub = sub <8 x i16> %a, %mul
170  ret <8 x i16> %sub
171}
172
173define <2 x i32> @test_vmls_lane_s32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) {
174; CHECK-LABEL: test_vmls_lane_s32:
175; CHECK:       // %bb.0: // %entry
176; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
177; CHECK-NEXT:    mls v0.2s, v1.2s, v2.s[1]
178; CHECK-NEXT:    ret
179entry:
180  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
181  %mul = mul <2 x i32> %shuffle, %b
182  %sub = sub <2 x i32> %a, %mul
183  ret <2 x i32> %sub
184}
185
186define <4 x i32> @test_vmlsq_lane_s32(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) {
187; CHECK-LABEL: test_vmlsq_lane_s32:
188; CHECK:       // %bb.0: // %entry
189; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
190; CHECK-NEXT:    mls v0.4s, v1.4s, v2.s[1]
191; CHECK-NEXT:    ret
192entry:
193  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
194  %mul = mul <4 x i32> %shuffle, %b
195  %sub = sub <4 x i32> %a, %mul
196  ret <4 x i32> %sub
197}
198
199define <4 x i16> @test_vmls_laneq_s16(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) {
200; CHECK-LABEL: test_vmls_laneq_s16:
201; CHECK:       // %bb.0: // %entry
202; CHECK-NEXT:    mls v0.4h, v1.4h, v2.h[7]
203; CHECK-NEXT:    ret
204entry:
205  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
206  %mul = mul <4 x i16> %shuffle, %b
207  %sub = sub <4 x i16> %a, %mul
208  ret <4 x i16> %sub
209}
210
211define <8 x i16> @test_vmlsq_laneq_s16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) {
212; CHECK-LABEL: test_vmlsq_laneq_s16:
213; CHECK:       // %bb.0: // %entry
214; CHECK-NEXT:    mls v0.8h, v1.8h, v2.h[7]
215; CHECK-NEXT:    ret
216entry:
217  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
218  %mul = mul <8 x i16> %shuffle, %b
219  %sub = sub <8 x i16> %a, %mul
220  ret <8 x i16> %sub
221}
222
223define <2 x i32> @test_vmls_laneq_s32(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) {
224; CHECK-LABEL: test_vmls_laneq_s32:
225; CHECK:       // %bb.0: // %entry
226; CHECK-NEXT:    mls v0.2s, v1.2s, v2.s[3]
227; CHECK-NEXT:    ret
228entry:
229  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
230  %mul = mul <2 x i32> %shuffle, %b
231  %sub = sub <2 x i32> %a, %mul
232  ret <2 x i32> %sub
233}
234
235define <4 x i32> @test_vmlsq_laneq_s32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) {
236; CHECK-LABEL: test_vmlsq_laneq_s32:
237; CHECK:       // %bb.0: // %entry
238; CHECK-NEXT:    mls v0.4s, v1.4s, v2.s[3]
239; CHECK-NEXT:    ret
240entry:
241  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
242  %mul = mul <4 x i32> %shuffle, %b
243  %sub = sub <4 x i32> %a, %mul
244  ret <4 x i32> %sub
245}
246
247define <4 x i16> @test_vmul_lane_s16(<4 x i16> %a, <4 x i16> %v) {
248; CHECK-LABEL: test_vmul_lane_s16:
249; CHECK:       // %bb.0: // %entry
250; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
251; CHECK-NEXT:    mul v0.4h, v0.4h, v1.h[3]
252; CHECK-NEXT:    ret
253entry:
254  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
255  %mul = mul <4 x i16> %shuffle, %a
256  ret <4 x i16> %mul
257}
258
259define <8 x i16> @test_vmulq_lane_s16(<8 x i16> %a, <4 x i16> %v) {
260; CHECK-LABEL: test_vmulq_lane_s16:
261; CHECK:       // %bb.0: // %entry
262; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
263; CHECK-NEXT:    mul v0.8h, v0.8h, v1.h[3]
264; CHECK-NEXT:    ret
265entry:
266  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
267  %mul = mul <8 x i16> %shuffle, %a
268  ret <8 x i16> %mul
269}
270
271define <2 x i32> @test_vmul_lane_s32(<2 x i32> %a, <2 x i32> %v) {
272; CHECK-LABEL: test_vmul_lane_s32:
273; CHECK:       // %bb.0: // %entry
274; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
275; CHECK-NEXT:    mul v0.2s, v0.2s, v1.s[1]
276; CHECK-NEXT:    ret
277entry:
278  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
279  %mul = mul <2 x i32> %shuffle, %a
280  ret <2 x i32> %mul
281}
282
283define <4 x i32> @test_vmulq_lane_s32(<4 x i32> %a, <2 x i32> %v) {
284; CHECK-LABEL: test_vmulq_lane_s32:
285; CHECK:       // %bb.0: // %entry
286; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
287; CHECK-NEXT:    mul v0.4s, v0.4s, v1.s[1]
288; CHECK-NEXT:    ret
289entry:
290  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
291  %mul = mul <4 x i32> %shuffle, %a
292  ret <4 x i32> %mul
293}
294
295define <4 x i16> @test_vmul_lane_u16(<4 x i16> %a, <4 x i16> %v) {
296; CHECK-LABEL: test_vmul_lane_u16:
297; CHECK:       // %bb.0: // %entry
298; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
299; CHECK-NEXT:    mul v0.4h, v0.4h, v1.h[3]
300; CHECK-NEXT:    ret
301entry:
302  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
303  %mul = mul <4 x i16> %shuffle, %a
304  ret <4 x i16> %mul
305}
306
307define <8 x i16> @test_vmulq_lane_u16(<8 x i16> %a, <4 x i16> %v) {
308; CHECK-LABEL: test_vmulq_lane_u16:
309; CHECK:       // %bb.0: // %entry
310; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
311; CHECK-NEXT:    mul v0.8h, v0.8h, v1.h[3]
312; CHECK-NEXT:    ret
313entry:
314  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
315  %mul = mul <8 x i16> %shuffle, %a
316  ret <8 x i16> %mul
317}
318
319define <2 x i32> @test_vmul_lane_u32(<2 x i32> %a, <2 x i32> %v) {
320; CHECK-LABEL: test_vmul_lane_u32:
321; CHECK:       // %bb.0: // %entry
322; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
323; CHECK-NEXT:    mul v0.2s, v0.2s, v1.s[1]
324; CHECK-NEXT:    ret
325entry:
326  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
327  %mul = mul <2 x i32> %shuffle, %a
328  ret <2 x i32> %mul
329}
330
331define <4 x i32> @test_vmulq_lane_u32(<4 x i32> %a, <2 x i32> %v) {
332; CHECK-LABEL: test_vmulq_lane_u32:
333; CHECK:       // %bb.0: // %entry
334; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
335; CHECK-NEXT:    mul v0.4s, v0.4s, v1.s[1]
336; CHECK-NEXT:    ret
337entry:
338  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
339  %mul = mul <4 x i32> %shuffle, %a
340  ret <4 x i32> %mul
341}
342
343define <4 x i16> @test_vmul_laneq_s16(<4 x i16> %a, <8 x i16> %v) {
344; CHECK-LABEL: test_vmul_laneq_s16:
345; CHECK:       // %bb.0: // %entry
346; CHECK-NEXT:    mul v0.4h, v0.4h, v1.h[7]
347; CHECK-NEXT:    ret
348entry:
349  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
350  %mul = mul <4 x i16> %shuffle, %a
351  ret <4 x i16> %mul
352}
353
354define <8 x i16> @test_vmulq_laneq_s16(<8 x i16> %a, <8 x i16> %v) {
355; CHECK-LABEL: test_vmulq_laneq_s16:
356; CHECK:       // %bb.0: // %entry
357; CHECK-NEXT:    mul v0.8h, v0.8h, v1.h[7]
358; CHECK-NEXT:    ret
359entry:
360  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
361  %mul = mul <8 x i16> %shuffle, %a
362  ret <8 x i16> %mul
363}
364
365define <2 x i32> @test_vmul_laneq_s32(<2 x i32> %a, <4 x i32> %v) {
366; CHECK-LABEL: test_vmul_laneq_s32:
367; CHECK:       // %bb.0: // %entry
368; CHECK-NEXT:    mul v0.2s, v0.2s, v1.s[3]
369; CHECK-NEXT:    ret
370entry:
371  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
372  %mul = mul <2 x i32> %shuffle, %a
373  ret <2 x i32> %mul
374}
375
376define <4 x i32> @test_vmulq_laneq_s32(<4 x i32> %a, <4 x i32> %v) {
377; CHECK-LABEL: test_vmulq_laneq_s32:
378; CHECK:       // %bb.0: // %entry
379; CHECK-NEXT:    mul v0.4s, v0.4s, v1.s[3]
380; CHECK-NEXT:    ret
381entry:
382  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
383  %mul = mul <4 x i32> %shuffle, %a
384  ret <4 x i32> %mul
385}
386
387define <4 x i16> @test_vmul_laneq_u16(<4 x i16> %a, <8 x i16> %v) {
388; CHECK-LABEL: test_vmul_laneq_u16:
389; CHECK:       // %bb.0: // %entry
390; CHECK-NEXT:    mul v0.4h, v0.4h, v1.h[7]
391; CHECK-NEXT:    ret
392entry:
393  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
394  %mul = mul <4 x i16> %shuffle, %a
395  ret <4 x i16> %mul
396}
397
398define <8 x i16> @test_vmulq_laneq_u16(<8 x i16> %a, <8 x i16> %v) {
399; CHECK-LABEL: test_vmulq_laneq_u16:
400; CHECK:       // %bb.0: // %entry
401; CHECK-NEXT:    mul v0.8h, v0.8h, v1.h[7]
402; CHECK-NEXT:    ret
403entry:
404  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
405  %mul = mul <8 x i16> %shuffle, %a
406  ret <8 x i16> %mul
407}
408
409define <2 x i32> @test_vmul_laneq_u32(<2 x i32> %a, <4 x i32> %v) {
410; CHECK-LABEL: test_vmul_laneq_u32:
411; CHECK:       // %bb.0: // %entry
412; CHECK-NEXT:    mul v0.2s, v0.2s, v1.s[3]
413; CHECK-NEXT:    ret
414entry:
415  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
416  %mul = mul <2 x i32> %shuffle, %a
417  ret <2 x i32> %mul
418}
419
420define <4 x i32> @test_vmulq_laneq_u32(<4 x i32> %a, <4 x i32> %v) {
421; CHECK-LABEL: test_vmulq_laneq_u32:
422; CHECK:       // %bb.0: // %entry
423; CHECK-NEXT:    mul v0.4s, v0.4s, v1.s[3]
424; CHECK-NEXT:    ret
425entry:
426  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
427  %mul = mul <4 x i32> %shuffle, %a
428  ret <4 x i32> %mul
429}
430
431define <2 x float> @test_vfma_lane_f32(<2 x float> %a, <2 x float> %b, <2 x float> %v) {
432; CHECK-LABEL: test_vfma_lane_f32:
433; CHECK:       // %bb.0: // %entry
434; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
435; CHECK-NEXT:    fmla v0.2s, v1.2s, v2.s[1]
436; CHECK-NEXT:    ret
437entry:
438  %lane = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> <i32 1, i32 1>
439  %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a)
440  ret <2 x float> %0
441}
442
443declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>)
444
445define <4 x float> @test_vfmaq_lane_f32(<4 x float> %a, <4 x float> %b, <2 x float> %v) {
446; CHECK-LABEL: test_vfmaq_lane_f32:
447; CHECK:       // %bb.0: // %entry
448; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
449; CHECK-NEXT:    fmla v0.4s, v1.4s, v2.s[1]
450; CHECK-NEXT:    ret
451entry:
452  %lane = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
453  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane, <4 x float> %b, <4 x float> %a)
454  ret <4 x float> %0
455}
456
457declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>)
458
459define <2 x float> @test_vfma_laneq_f32(<2 x float> %a, <2 x float> %b, <4 x float> %v) {
460; CHECK-LABEL: test_vfma_laneq_f32:
461; CHECK:       // %bb.0: // %entry
462; CHECK-NEXT:    fmla v0.2s, v1.2s, v2.s[3]
463; CHECK-NEXT:    ret
464entry:
465  %lane = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> <i32 3, i32 3>
466  %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a)
467  ret <2 x float> %0
468}
469
470define <4 x float> @test_vfmaq_laneq_f32(<4 x float> %a, <4 x float> %b, <4 x float> %v) {
471; CHECK-LABEL: test_vfmaq_laneq_f32:
472; CHECK:       // %bb.0: // %entry
473; CHECK-NEXT:    fmla v0.4s, v1.4s, v2.s[3]
474; CHECK-NEXT:    ret
475entry:
476  %lane = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
477  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane, <4 x float> %b, <4 x float> %a)
478  ret <4 x float> %0
479}
480
481define <2 x float> @test_vfms_lane_f32(<2 x float> %a, <2 x float> %b, <2 x float> %v) {
482; CHECK-LABEL: test_vfms_lane_f32:
483; CHECK:       // %bb.0: // %entry
484; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
485; CHECK-NEXT:    fmls v0.2s, v1.2s, v2.s[1]
486; CHECK-NEXT:    ret
487entry:
488  %sub = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %v
489  %lane = shufflevector <2 x float> %sub, <2 x float> undef, <2 x i32> <i32 1, i32 1>
490  %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a)
491  ret <2 x float> %0
492}
493
494define <4 x float> @test_vfmsq_lane_f32(<4 x float> %a, <4 x float> %b, <2 x float> %v) {
495; CHECK-LABEL: test_vfmsq_lane_f32:
496; CHECK:       // %bb.0: // %entry
497; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
498; CHECK-NEXT:    fmls v0.4s, v1.4s, v2.s[1]
499; CHECK-NEXT:    ret
500entry:
501  %sub = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %v
502  %lane = shufflevector <2 x float> %sub, <2 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
503  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane, <4 x float> %b, <4 x float> %a)
504  ret <4 x float> %0
505}
506
507define <2 x float> @test_vfms_laneq_f32(<2 x float> %a, <2 x float> %b, <4 x float> %v) {
508; CHECK-LABEL: test_vfms_laneq_f32:
509; CHECK:       // %bb.0: // %entry
510; CHECK-NEXT:    fmls v0.2s, v1.2s, v2.s[3]
511; CHECK-NEXT:    ret
512entry:
513  %sub = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %v
514  %lane = shufflevector <4 x float> %sub, <4 x float> undef, <2 x i32> <i32 3, i32 3>
515  %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a)
516  ret <2 x float> %0
517}
518
519define <4 x float> @test_vfmsq_laneq_f32(<4 x float> %a, <4 x float> %b, <4 x float> %v) {
520; CHECK-LABEL: test_vfmsq_laneq_f32:
521; CHECK:       // %bb.0: // %entry
522; CHECK-NEXT:    fmls v0.4s, v1.4s, v2.s[3]
523; CHECK-NEXT:    ret
524entry:
525  %sub = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %v
526  %lane = shufflevector <4 x float> %sub, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
527  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane, <4 x float> %b, <4 x float> %a)
528  ret <4 x float> %0
529}
530
531define <2 x double> @test_vfmaq_lane_f64(<2 x double> %a, <2 x double> %b, <1 x double> %v) {
532; CHECK-LABEL: test_vfmaq_lane_f64:
533; CHECK:       // %bb.0: // %entry
534; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
535; CHECK-NEXT:    fmla v0.2d, v1.2d, v2.d[0]
536; CHECK-NEXT:    ret
537entry:
538  %lane = shufflevector <1 x double> %v, <1 x double> undef, <2 x i32> zeroinitializer
539  %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %lane, <2 x double> %b, <2 x double> %a)
540  ret <2 x double> %0
541}
542
543declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>)
544
545define <2 x double> @test_vfmaq_laneq_f64(<2 x double> %a, <2 x double> %b, <2 x double> %v) {
546; CHECK-LABEL: test_vfmaq_laneq_f64:
547; CHECK:       // %bb.0: // %entry
548; CHECK-NEXT:    fmla v0.2d, v1.2d, v2.d[1]
549; CHECK-NEXT:    ret
550entry:
551  %lane = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> <i32 1, i32 1>
552  %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %lane, <2 x double> %b, <2 x double> %a)
553  ret <2 x double> %0
554}
555
556define <2 x double> @test_vfmsq_lane_f64(<2 x double> %a, <2 x double> %b, <1 x double> %v) {
557; CHECK-LABEL: test_vfmsq_lane_f64:
558; CHECK:       // %bb.0: // %entry
559; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
560; CHECK-NEXT:    fmls v0.2d, v1.2d, v2.d[0]
561; CHECK-NEXT:    ret
562entry:
563  %sub = fsub <1 x double> <double -0.000000e+00>, %v
564  %lane = shufflevector <1 x double> %sub, <1 x double> undef, <2 x i32> zeroinitializer
565  %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %lane, <2 x double> %b, <2 x double> %a)
566  ret <2 x double> %0
567}
568
569define <2 x double> @test_vfmsq_laneq_f64(<2 x double> %a, <2 x double> %b, <2 x double> %v) {
570; CHECK-LABEL: test_vfmsq_laneq_f64:
571; CHECK:       // %bb.0: // %entry
572; CHECK-NEXT:    fmls v0.2d, v1.2d, v2.d[1]
573; CHECK-NEXT:    ret
574entry:
575  %sub = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %v
576  %lane = shufflevector <2 x double> %sub, <2 x double> undef, <2 x i32> <i32 1, i32 1>
577  %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %lane, <2 x double> %b, <2 x double> %a)
578  ret <2 x double> %0
579}
580
581define float @test_vfmas_laneq_f32(float %a, float %b, <4 x float> %v) {
582; CHECK-LABEL: test_vfmas_laneq_f32:
583; CHECK:       // %bb.0: // %entry
584; CHECK-NEXT:    fmla s0, s1, v2.s[3]
585; CHECK-NEXT:    ret
586entry:
587  %extract = extractelement <4 x float> %v, i32 3
588  %0 = tail call float @llvm.fma.f32(float %b, float %extract, float %a)
589  ret float %0
590}
591
592declare float @llvm.fma.f32(float, float, float)
593
594define double @test_vfmsd_lane_f64(double %a, double %b, <1 x double> %v) {
595; CHECK-LABEL: test_vfmsd_lane_f64:
596; CHECK:       // %bb.0: // %entry
597; CHECK-NEXT:    fmsub d0, d1, d2, d0
598; CHECK-NEXT:    ret
599entry:
600  %extract.rhs = extractelement <1 x double> %v, i32 0
601  %extract = fsub double -0.000000e+00, %extract.rhs
602  %0 = tail call double @llvm.fma.f64(double %b, double %extract, double %a)
603  ret double %0
604}
605
606declare double @llvm.fma.f64(double, double, double)
607
608define float @test_vfmss_lane_f32(float %a, float %b, <2 x float> %v) {
609; CHECK-LABEL: test_vfmss_lane_f32:
610; CHECK:       // %bb.0: // %entry
611; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
612; CHECK-NEXT:    fmls s0, s1, v2.s[1]
613; CHECK-NEXT:    ret
614entry:
615  %extract.rhs = extractelement <2 x float> %v, i32 1
616  %extract = fsub float -0.000000e+00, %extract.rhs
617  %0 = tail call float @llvm.fma.f32(float %b, float %extract, float %a)
618  ret float %0
619}
620
621define float @test_vfmss_laneq_f32(float %a, float %b, <4 x float> %v) {
622; CHECK-LABEL: test_vfmss_laneq_f32:
623; CHECK:       // %bb.0: // %entry
624; CHECK-NEXT:    fmls s0, s1, v2.s[3]
625; CHECK-NEXT:    ret
626entry:
627  %extract.rhs = extractelement <4 x float> %v, i32 3
628  %extract = fsub float -0.000000e+00, %extract.rhs
629  %0 = tail call float @llvm.fma.f32(float %b, float %extract, float %a)
630  ret float %0
631}
632
633define double @test_vfmsd_laneq_f64(double %a, double %b, <2 x double> %v) {
634; CHECK-LABEL: test_vfmsd_laneq_f64:
635; CHECK:       // %bb.0: // %entry
636; CHECK-NEXT:    fmls d0, d1, v2.d[1]
637; CHECK-NEXT:    ret
638entry:
639  %extract.rhs = extractelement <2 x double> %v, i32 1
640  %extract = fsub double -0.000000e+00, %extract.rhs
641  %0 = tail call double @llvm.fma.f64(double %b, double %extract, double %a)
642  ret double %0
643}
644
645define double @test_vfmsd_lane_f64_0(double %a, double %b, <1 x double> %v) {
646; CHECK-LABEL: test_vfmsd_lane_f64_0:
647; CHECK:       // %bb.0: // %entry
648; CHECK-NEXT:    fmsub d0, d1, d2, d0
649; CHECK-NEXT:    ret
650entry:
651  %tmp0 = fsub <1 x double> <double -0.000000e+00>, %v
652  %tmp1 = extractelement <1 x double> %tmp0, i32 0
653  %0 = tail call double @llvm.fma.f64(double %b, double %tmp1, double %a)
654  ret double %0
655}
656
657define float @test_vfmss_lane_f32_0(float %a, float %b, <2 x float> %v) {
658; CHECK-LABEL: test_vfmss_lane_f32_0:
659; CHECK:       // %bb.0: // %entry
660; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
661; CHECK-NEXT:    fmls s0, s1, v2.s[1]
662; CHECK-NEXT:    ret
663entry:
664  %tmp0 = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %v
665  %tmp1 = extractelement <2 x float> %tmp0, i32 1
666  %0 = tail call float @llvm.fma.f32(float %b, float %tmp1, float %a)
667  ret float %0
668}
669
670define float @test_vfmss_laneq_f32_0(float %a, float %b, <4 x float> %v) {
671; CHECK-LABEL: test_vfmss_laneq_f32_0:
672; CHECK:       // %bb.0: // %entry
673; CHECK-NEXT:    fmls s0, s1, v2.s[3]
674; CHECK-NEXT:    ret
675entry:
676  %tmp0 = fsub <4 x float><float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %v
677  %tmp1 = extractelement <4 x float> %tmp0, i32 3
678  %0 = tail call float @llvm.fma.f32(float %b, float %tmp1, float %a)
679  ret float %0
680}
681
682define double @test_vfmsd_laneq_f64_0(double %a, double %b, <2 x double> %v) {
683; CHECK-LABEL: test_vfmsd_laneq_f64_0:
684; CHECK:       // %bb.0: // %entry
685; CHECK-NEXT:    fmls d0, d1, v2.d[1]
686; CHECK-NEXT:    ret
687entry:
688  %tmp0 = fsub <2 x double><double -0.000000e+00, double -0.000000e+00>, %v
689  %tmp1 = extractelement <2 x double> %tmp0, i32 1
690  %0 = tail call double @llvm.fma.f64(double %b, double %tmp1, double %a)
691  ret double %0
692}
693
694define <4 x i32> @test_vmlal_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
695; CHECK-LABEL: test_vmlal_lane_s16:
696; CHECK:       // %bb.0: // %entry
697; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
698; CHECK-NEXT:    smlal v0.4s, v1.4h, v2.h[3]
699; CHECK-NEXT:    ret
700entry:
701  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
702  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
703  %add = add <4 x i32> %vmull2.i, %a
704  ret <4 x i32> %add
705}
706
707define <2 x i64> @test_vmlal_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
708; CHECK-LABEL: test_vmlal_lane_s32:
709; CHECK:       // %bb.0: // %entry
710; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
711; CHECK-NEXT:    smlal v0.2d, v1.2s, v2.s[1]
712; CHECK-NEXT:    ret
713entry:
714  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
715  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
716  %add = add <2 x i64> %vmull2.i, %a
717  ret <2 x i64> %add
718}
719
720define <4 x i32> @test_vmlal_laneq_s16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
721; CHECK-LABEL: test_vmlal_laneq_s16:
722; CHECK:       // %bb.0: // %entry
723; CHECK-NEXT:    smlal v0.4s, v1.4h, v2.h[7]
724; CHECK-NEXT:    ret
725entry:
726  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
727  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
728  %add = add <4 x i32> %vmull2.i, %a
729  ret <4 x i32> %add
730}
731
732define <2 x i64> @test_vmlal_laneq_s32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
733; CHECK-LABEL: test_vmlal_laneq_s32:
734; CHECK:       // %bb.0: // %entry
735; CHECK-NEXT:    smlal v0.2d, v1.2s, v2.s[3]
736; CHECK-NEXT:    ret
737entry:
738  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
739  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
740  %add = add <2 x i64> %vmull2.i, %a
741  ret <2 x i64> %add
742}
743
744define <4 x i32> @test_vmlal_high_lane_s16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
745; CHECK-LABEL: test_vmlal_high_lane_s16:
746; CHECK:       // %bb.0: // %entry
747; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
748; CHECK-NEXT:    smlal2 v0.4s, v1.8h, v2.h[3]
749; CHECK-NEXT:    ret
750entry:
751  %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
752  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
753  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
754  %add = add <4 x i32> %vmull2.i, %a
755  ret <4 x i32> %add
756}
757
758define <2 x i64> @test_vmlal_high_lane_s32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
759; CHECK-LABEL: test_vmlal_high_lane_s32:
760; CHECK:       // %bb.0: // %entry
761; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
762; CHECK-NEXT:    smlal2 v0.2d, v1.4s, v2.s[1]
763; CHECK-NEXT:    ret
764entry:
765  %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
766  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
767  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
768  %add = add <2 x i64> %vmull2.i, %a
769  ret <2 x i64> %add
770}
771
772define <4 x i32> @test_vmlal_high_laneq_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) {
773; CHECK-LABEL: test_vmlal_high_laneq_s16:
774; CHECK:       // %bb.0: // %entry
775; CHECK-NEXT:    smlal2 v0.4s, v1.8h, v2.h[7]
776; CHECK-NEXT:    ret
777entry:
778  %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
779  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
780  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
781  %add = add <4 x i32> %vmull2.i, %a
782  ret <4 x i32> %add
783}
784
785define <2 x i64> @test_vmlal_high_laneq_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) {
786; CHECK-LABEL: test_vmlal_high_laneq_s32:
787; CHECK:       // %bb.0: // %entry
788; CHECK-NEXT:    smlal2 v0.2d, v1.4s, v2.s[3]
789; CHECK-NEXT:    ret
790entry:
791  %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
792  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
793  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
794  %add = add <2 x i64> %vmull2.i, %a
795  ret <2 x i64> %add
796}
797
798define <4 x i32> @test_vmlsl_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
799; CHECK-LABEL: test_vmlsl_lane_s16:
800; CHECK:       // %bb.0: // %entry
801; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
802; CHECK-NEXT:    smlsl v0.4s, v1.4h, v2.h[3]
803; CHECK-NEXT:    ret
804entry:
805  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
806  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
807  %sub = sub <4 x i32> %a, %vmull2.i
808  ret <4 x i32> %sub
809}
810
811define <2 x i64> @test_vmlsl_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
812; CHECK-LABEL: test_vmlsl_lane_s32:
813; CHECK:       // %bb.0: // %entry
814; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
815; CHECK-NEXT:    smlsl v0.2d, v1.2s, v2.s[1]
816; CHECK-NEXT:    ret
817entry:
818  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
819  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
820  %sub = sub <2 x i64> %a, %vmull2.i
821  ret <2 x i64> %sub
822}
823
824define <4 x i32> @test_vmlsl_laneq_s16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
825; CHECK-LABEL: test_vmlsl_laneq_s16:
826; CHECK:       // %bb.0: // %entry
827; CHECK-NEXT:    smlsl v0.4s, v1.4h, v2.h[7]
828; CHECK-NEXT:    ret
829entry:
830  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
831  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
832  %sub = sub <4 x i32> %a, %vmull2.i
833  ret <4 x i32> %sub
834}
835
836define <2 x i64> @test_vmlsl_laneq_s32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
837; CHECK-LABEL: test_vmlsl_laneq_s32:
838; CHECK:       // %bb.0: // %entry
839; CHECK-NEXT:    smlsl v0.2d, v1.2s, v2.s[3]
840; CHECK-NEXT:    ret
841entry:
842  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
843  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
844  %sub = sub <2 x i64> %a, %vmull2.i
845  ret <2 x i64> %sub
846}
847
848define <4 x i32> @test_vmlsl_high_lane_s16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
849; CHECK-LABEL: test_vmlsl_high_lane_s16:
850; CHECK:       // %bb.0: // %entry
851; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
852; CHECK-NEXT:    smlsl2 v0.4s, v1.8h, v2.h[3]
853; CHECK-NEXT:    ret
854entry:
855  %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
856  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
857  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
858  %sub = sub <4 x i32> %a, %vmull2.i
859  ret <4 x i32> %sub
860}
861
862define <2 x i64> @test_vmlsl_high_lane_s32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
863; CHECK-LABEL: test_vmlsl_high_lane_s32:
864; CHECK:       // %bb.0: // %entry
865; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
866; CHECK-NEXT:    smlsl2 v0.2d, v1.4s, v2.s[1]
867; CHECK-NEXT:    ret
868entry:
869  %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
870  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
871  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
872  %sub = sub <2 x i64> %a, %vmull2.i
873  ret <2 x i64> %sub
874}
875
876define <4 x i32> @test_vmlsl_high_laneq_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) {
877; CHECK-LABEL: test_vmlsl_high_laneq_s16:
878; CHECK:       // %bb.0: // %entry
879; CHECK-NEXT:    smlsl2 v0.4s, v1.8h, v2.h[7]
880; CHECK-NEXT:    ret
881entry:
882  %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
883  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
884  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
885  %sub = sub <4 x i32> %a, %vmull2.i
886  ret <4 x i32> %sub
887}
888
889define <2 x i64> @test_vmlsl_high_laneq_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) {
890; CHECK-LABEL: test_vmlsl_high_laneq_s32:
891; CHECK:       // %bb.0: // %entry
892; CHECK-NEXT:    smlsl2 v0.2d, v1.4s, v2.s[3]
893; CHECK-NEXT:    ret
894entry:
895  %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
896  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
897  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
898  %sub = sub <2 x i64> %a, %vmull2.i
899  ret <2 x i64> %sub
900}
901
902define <4 x i32> @test_vmlal_lane_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
903; CHECK-LABEL: test_vmlal_lane_u16:
904; CHECK:       // %bb.0: // %entry
905; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
906; CHECK-NEXT:    umlal v0.4s, v1.4h, v2.h[3]
907; CHECK-NEXT:    ret
908entry:
909  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
910  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
911  %add = add <4 x i32> %vmull2.i, %a
912  ret <4 x i32> %add
913}
914
915define <2 x i64> @test_vmlal_lane_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
916; CHECK-LABEL: test_vmlal_lane_u32:
917; CHECK:       // %bb.0: // %entry
918; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
919; CHECK-NEXT:    umlal v0.2d, v1.2s, v2.s[1]
920; CHECK-NEXT:    ret
921entry:
922  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
923  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
924  %add = add <2 x i64> %vmull2.i, %a
925  ret <2 x i64> %add
926}
927
928define <4 x i32> @test_vmlal_laneq_u16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
929; CHECK-LABEL: test_vmlal_laneq_u16:
930; CHECK:       // %bb.0: // %entry
931; CHECK-NEXT:    umlal v0.4s, v1.4h, v2.h[7]
932; CHECK-NEXT:    ret
933entry:
934  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
935  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
936  %add = add <4 x i32> %vmull2.i, %a
937  ret <4 x i32> %add
938}
939
940define <2 x i64> @test_vmlal_laneq_u32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
941; CHECK-LABEL: test_vmlal_laneq_u32:
942; CHECK:       // %bb.0: // %entry
943; CHECK-NEXT:    umlal v0.2d, v1.2s, v2.s[3]
944; CHECK-NEXT:    ret
945entry:
946  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
947  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
948  %add = add <2 x i64> %vmull2.i, %a
949  ret <2 x i64> %add
950}
951
952define <4 x i32> @test_vmlal_high_lane_u16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
953; CHECK-LABEL: test_vmlal_high_lane_u16:
954; CHECK:       // %bb.0: // %entry
955; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
956; CHECK-NEXT:    umlal2 v0.4s, v1.8h, v2.h[3]
957; CHECK-NEXT:    ret
958entry:
959  %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
960  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
961  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
962  %add = add <4 x i32> %vmull2.i, %a
963  ret <4 x i32> %add
964}
965
966define <2 x i64> @test_vmlal_high_lane_u32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
967; CHECK-LABEL: test_vmlal_high_lane_u32:
968; CHECK:       // %bb.0: // %entry
969; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
970; CHECK-NEXT:    umlal2 v0.2d, v1.4s, v2.s[1]
971; CHECK-NEXT:    ret
972entry:
973  %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
974  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
975  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
976  %add = add <2 x i64> %vmull2.i, %a
977  ret <2 x i64> %add
978}
979
980define <4 x i32> @test_vmlal_high_laneq_u16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) {
981; CHECK-LABEL: test_vmlal_high_laneq_u16:
982; CHECK:       // %bb.0: // %entry
983; CHECK-NEXT:    umlal2 v0.4s, v1.8h, v2.h[7]
984; CHECK-NEXT:    ret
985entry:
986  %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
987  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
988  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
989  %add = add <4 x i32> %vmull2.i, %a
990  ret <4 x i32> %add
991}
992
993define <2 x i64> @test_vmlal_high_laneq_u32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) {
994; CHECK-LABEL: test_vmlal_high_laneq_u32:
995; CHECK:       // %bb.0: // %entry
996; CHECK-NEXT:    umlal2 v0.2d, v1.4s, v2.s[3]
997; CHECK-NEXT:    ret
998entry:
999  %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
1000  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
1001  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
1002  %add = add <2 x i64> %vmull2.i, %a
1003  ret <2 x i64> %add
1004}
1005
1006define <4 x i32> @test_vmlsl_lane_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
1007; CHECK-LABEL: test_vmlsl_lane_u16:
1008; CHECK:       // %bb.0: // %entry
1009; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
1010; CHECK-NEXT:    umlsl v0.4s, v1.4h, v2.h[3]
1011; CHECK-NEXT:    ret
1012entry:
1013  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1014  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
1015  %sub = sub <4 x i32> %a, %vmull2.i
1016  ret <4 x i32> %sub
1017}
1018
1019define <2 x i64> @test_vmlsl_lane_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
1020; CHECK-LABEL: test_vmlsl_lane_u32:
1021; CHECK:       // %bb.0: // %entry
1022; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
1023; CHECK-NEXT:    umlsl v0.2d, v1.2s, v2.s[1]
1024; CHECK-NEXT:    ret
1025entry:
1026  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
1027  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
1028  %sub = sub <2 x i64> %a, %vmull2.i
1029  ret <2 x i64> %sub
1030}
1031
1032define <4 x i32> @test_vmlsl_laneq_u16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
1033; CHECK-LABEL: test_vmlsl_laneq_u16:
1034; CHECK:       // %bb.0: // %entry
1035; CHECK-NEXT:    umlsl v0.4s, v1.4h, v2.h[7]
1036; CHECK-NEXT:    ret
1037entry:
1038  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
1039  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
1040  %sub = sub <4 x i32> %a, %vmull2.i
1041  ret <4 x i32> %sub
1042}
1043
1044define <2 x i64> @test_vmlsl_laneq_u32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
1045; CHECK-LABEL: test_vmlsl_laneq_u32:
1046; CHECK:       // %bb.0: // %entry
1047; CHECK-NEXT:    umlsl v0.2d, v1.2s, v2.s[3]
1048; CHECK-NEXT:    ret
1049entry:
1050  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
1051  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
1052  %sub = sub <2 x i64> %a, %vmull2.i
1053  ret <2 x i64> %sub
1054}
1055
1056define <4 x i32> @test_vmlsl_high_lane_u16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
1057; CHECK-LABEL: test_vmlsl_high_lane_u16:
1058; CHECK:       // %bb.0: // %entry
1059; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
1060; CHECK-NEXT:    umlsl2 v0.4s, v1.8h, v2.h[3]
1061; CHECK-NEXT:    ret
1062entry:
1063  %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1064  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1065  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
1066  %sub = sub <4 x i32> %a, %vmull2.i
1067  ret <4 x i32> %sub
1068}
1069
1070define <2 x i64> @test_vmlsl_high_lane_u32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
1071; CHECK-LABEL: test_vmlsl_high_lane_u32:
1072; CHECK:       // %bb.0: // %entry
1073; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
1074; CHECK-NEXT:    umlsl2 v0.2d, v1.4s, v2.s[1]
1075; CHECK-NEXT:    ret
1076entry:
1077  %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
1078  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
1079  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
1080  %sub = sub <2 x i64> %a, %vmull2.i
1081  ret <2 x i64> %sub
1082}
1083
1084define <4 x i32> @test_vmlsl_high_laneq_u16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) {
1085; CHECK-LABEL: test_vmlsl_high_laneq_u16:
1086; CHECK:       // %bb.0: // %entry
1087; CHECK-NEXT:    umlsl2 v0.4s, v1.8h, v2.h[7]
1088; CHECK-NEXT:    ret
1089entry:
1090  %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1091  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
1092  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
1093  %sub = sub <4 x i32> %a, %vmull2.i
1094  ret <4 x i32> %sub
1095}
1096
1097define <2 x i64> @test_vmlsl_high_laneq_u32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) {
1098; CHECK-LABEL: test_vmlsl_high_laneq_u32:
1099; CHECK:       // %bb.0: // %entry
1100; CHECK-NEXT:    umlsl2 v0.2d, v1.4s, v2.s[3]
1101; CHECK-NEXT:    ret
1102entry:
1103  %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
1104  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
1105  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
1106  %sub = sub <2 x i64> %a, %vmull2.i
1107  ret <2 x i64> %sub
1108}
1109
1110define <4 x i32> @test_vmull_lane_s16(<4 x i16> %a, <4 x i16> %v) {
1111; CHECK-LABEL: test_vmull_lane_s16:
1112; CHECK:       // %bb.0: // %entry
1113; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
1114; CHECK-NEXT:    smull v0.4s, v0.4h, v1.h[3]
1115; CHECK-NEXT:    ret
1116entry:
1117  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1118  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
1119  ret <4 x i32> %vmull2.i
1120}
1121
1122define <2 x i64> @test_vmull_lane_s32(<2 x i32> %a, <2 x i32> %v) {
1123; CHECK-LABEL: test_vmull_lane_s32:
1124; CHECK:       // %bb.0: // %entry
1125; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
1126; CHECK-NEXT:    smull v0.2d, v0.2s, v1.s[1]
1127; CHECK-NEXT:    ret
1128entry:
1129  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
1130  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
1131  ret <2 x i64> %vmull2.i
1132}
1133
1134define <4 x i32> @test_vmull_lane_u16(<4 x i16> %a, <4 x i16> %v) {
1135; CHECK-LABEL: test_vmull_lane_u16:
1136; CHECK:       // %bb.0: // %entry
1137; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
1138; CHECK-NEXT:    umull v0.4s, v0.4h, v1.h[3]
1139; CHECK-NEXT:    ret
1140entry:
1141  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1142  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
1143  ret <4 x i32> %vmull2.i
1144}
1145
1146define <2 x i64> @test_vmull_lane_u32(<2 x i32> %a, <2 x i32> %v) {
1147; CHECK-LABEL: test_vmull_lane_u32:
1148; CHECK:       // %bb.0: // %entry
1149; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
1150; CHECK-NEXT:    umull v0.2d, v0.2s, v1.s[1]
1151; CHECK-NEXT:    ret
1152entry:
1153  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
1154  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
1155  ret <2 x i64> %vmull2.i
1156}
1157
1158define <4 x i32> @test_vmull_high_lane_s16(<8 x i16> %a, <4 x i16> %v) {
1159; CHECK-LABEL: test_vmull_high_lane_s16:
1160; CHECK:       // %bb.0: // %entry
1161; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
1162; CHECK-NEXT:    smull2 v0.4s, v0.8h, v1.h[3]
1163; CHECK-NEXT:    ret
1164entry:
1165  %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1166  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1167  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
1168  ret <4 x i32> %vmull2.i
1169}
1170
1171define <2 x i64> @test_vmull_high_lane_s32(<4 x i32> %a, <2 x i32> %v) {
1172; CHECK-LABEL: test_vmull_high_lane_s32:
1173; CHECK:       // %bb.0: // %entry
1174; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
1175; CHECK-NEXT:    smull2 v0.2d, v0.4s, v1.s[1]
1176; CHECK-NEXT:    ret
1177entry:
1178  %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
1179  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
1180  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
1181  ret <2 x i64> %vmull2.i
1182}
1183
1184define <4 x i32> @test_vmull_high_lane_u16(<8 x i16> %a, <4 x i16> %v) {
1185; CHECK-LABEL: test_vmull_high_lane_u16:
1186; CHECK:       // %bb.0: // %entry
1187; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
1188; CHECK-NEXT:    umull2 v0.4s, v0.8h, v1.h[3]
1189; CHECK-NEXT:    ret
1190entry:
1191  %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1192  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1193  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
1194  ret <4 x i32> %vmull2.i
1195}
1196
1197define <2 x i64> @test_vmull_high_lane_u32(<4 x i32> %a, <2 x i32> %v) {
1198; CHECK-LABEL: test_vmull_high_lane_u32:
1199; CHECK:       // %bb.0: // %entry
1200; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
1201; CHECK-NEXT:    umull2 v0.2d, v0.4s, v1.s[1]
1202; CHECK-NEXT:    ret
1203entry:
1204  %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
1205  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
1206  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
1207  ret <2 x i64> %vmull2.i
1208}
1209
1210define <4 x i32> @test_vmull_laneq_s16(<4 x i16> %a, <8 x i16> %v) {
1211; CHECK-LABEL: test_vmull_laneq_s16:
1212; CHECK:       // %bb.0: // %entry
1213; CHECK-NEXT:    smull v0.4s, v0.4h, v1.h[7]
1214; CHECK-NEXT:    ret
1215entry:
1216  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
1217  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
1218  ret <4 x i32> %vmull2.i
1219}
1220
1221define <2 x i64> @test_vmull_laneq_s32(<2 x i32> %a, <4 x i32> %v) {
1222; CHECK-LABEL: test_vmull_laneq_s32:
1223; CHECK:       // %bb.0: // %entry
1224; CHECK-NEXT:    smull v0.2d, v0.2s, v1.s[3]
1225; CHECK-NEXT:    ret
1226entry:
1227  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
1228  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
1229  ret <2 x i64> %vmull2.i
1230}
1231
1232define <4 x i32> @test_vmull_laneq_u16(<4 x i16> %a, <8 x i16> %v) {
1233; CHECK-LABEL: test_vmull_laneq_u16:
1234; CHECK:       // %bb.0: // %entry
1235; CHECK-NEXT:    umull v0.4s, v0.4h, v1.h[7]
1236; CHECK-NEXT:    ret
1237entry:
1238  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
1239  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
1240  ret <4 x i32> %vmull2.i
1241}
1242
1243define <2 x i64> @test_vmull_laneq_u32(<2 x i32> %a, <4 x i32> %v) {
1244; CHECK-LABEL: test_vmull_laneq_u32:
1245; CHECK:       // %bb.0: // %entry
1246; CHECK-NEXT:    umull v0.2d, v0.2s, v1.s[3]
1247; CHECK-NEXT:    ret
1248entry:
1249  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
1250  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
1251  ret <2 x i64> %vmull2.i
1252}
1253
1254define <4 x i32> @test_vmull_high_laneq_s16(<8 x i16> %a, <8 x i16> %v) {
1255; CHECK-LABEL: test_vmull_high_laneq_s16:
1256; CHECK:       // %bb.0: // %entry
1257; CHECK-NEXT:    smull2 v0.4s, v0.8h, v1.h[7]
1258; CHECK-NEXT:    ret
1259entry:
1260  %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1261  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
1262  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
1263  ret <4 x i32> %vmull2.i
1264}
1265
1266define <2 x i64> @test_vmull_high_laneq_s32(<4 x i32> %a, <4 x i32> %v) {
1267; CHECK-LABEL: test_vmull_high_laneq_s32:
1268; CHECK:       // %bb.0: // %entry
1269; CHECK-NEXT:    smull2 v0.2d, v0.4s, v1.s[3]
1270; CHECK-NEXT:    ret
1271entry:
1272  %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
1273  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
1274  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
1275  ret <2 x i64> %vmull2.i
1276}
1277
1278define <4 x i32> @test_vmull_high_laneq_u16(<8 x i16> %a, <8 x i16> %v) {
1279; CHECK-LABEL: test_vmull_high_laneq_u16:
1280; CHECK:       // %bb.0: // %entry
1281; CHECK-NEXT:    umull2 v0.4s, v0.8h, v1.h[7]
1282; CHECK-NEXT:    ret
1283entry:
1284  %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1285  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
1286  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
1287  ret <4 x i32> %vmull2.i
1288}
1289
1290define <2 x i64> @test_vmull_high_laneq_u32(<4 x i32> %a, <4 x i32> %v) {
1291; CHECK-LABEL: test_vmull_high_laneq_u32:
1292; CHECK:       // %bb.0: // %entry
1293; CHECK-NEXT:    umull2 v0.2d, v0.4s, v1.s[3]
1294; CHECK-NEXT:    ret
1295entry:
1296  %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
1297  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
1298  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
1299  ret <2 x i64> %vmull2.i
1300}
1301
1302define <4 x i32> @test_vqdmlal_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
1303; CHECK-LABEL: test_vqdmlal_lane_s16:
1304; CHECK:       // %bb.0: // %entry
1305; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
1306; CHECK-NEXT:    sqdmlal v0.4s, v1.4h, v2.h[3]
1307; CHECK-NEXT:    ret
1308entry:
1309  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1310  %vqdmlal2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
1311  %vqdmlal4.i = tail call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal2.i)
1312  ret <4 x i32> %vqdmlal4.i
1313}
1314
1315define <2 x i64> @test_vqdmlal_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
1316; CHECK-LABEL: test_vqdmlal_lane_s32:
1317; CHECK:       // %bb.0: // %entry
1318; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
1319; CHECK-NEXT:    sqdmlal v0.2d, v1.2s, v2.s[1]
1320; CHECK-NEXT:    ret
1321entry:
1322  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
1323  %vqdmlal2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
1324  %vqdmlal4.i = tail call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> %vqdmlal2.i)
1325  ret <2 x i64> %vqdmlal4.i
1326}
1327
1328define <4 x i32> @test_vqdmlal_high_lane_s16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
1329; CHECK-LABEL: test_vqdmlal_high_lane_s16:
1330; CHECK:       // %bb.0: // %entry
1331; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
1332; CHECK-NEXT:    sqdmlal2 v0.4s, v1.8h, v2.h[3]
1333; CHECK-NEXT:    ret
1334entry:
1335  %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1336  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1337  %vqdmlal2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
1338  %vqdmlal4.i = tail call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal2.i)
1339  ret <4 x i32> %vqdmlal4.i
1340}
1341
1342define <2 x i64> @test_vqdmlal_high_lane_s32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
1343; CHECK-LABEL: test_vqdmlal_high_lane_s32:
1344; CHECK:       // %bb.0: // %entry
1345; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
1346; CHECK-NEXT:    sqdmlal2 v0.2d, v1.4s, v2.s[1]
1347; CHECK-NEXT:    ret
1348entry:
1349  %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
1350  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
1351  %vqdmlal2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
1352  %vqdmlal4.i = tail call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> %vqdmlal2.i)
1353  ret <2 x i64> %vqdmlal4.i
1354}
1355
1356define <4 x i32> @test_vqdmlsl_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
1357; CHECK-LABEL: test_vqdmlsl_lane_s16:
1358; CHECK:       // %bb.0: // %entry
1359; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
1360; CHECK-NEXT:    sqdmlsl v0.4s, v1.4h, v2.h[3]
1361; CHECK-NEXT:    ret
1362entry:
1363  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1364  %vqdmlsl2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
1365  %vqdmlsl4.i = tail call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> %vqdmlsl2.i)
1366  ret <4 x i32> %vqdmlsl4.i
1367}
1368
1369define <2 x i64> @test_vqdmlsl_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
1370; CHECK-LABEL: test_vqdmlsl_lane_s32:
1371; CHECK:       // %bb.0: // %entry
1372; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
1373; CHECK-NEXT:    sqdmlsl v0.2d, v1.2s, v2.s[1]
1374; CHECK-NEXT:    ret
1375entry:
1376  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
1377  %vqdmlsl2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
1378  %vqdmlsl4.i = tail call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> %vqdmlsl2.i)
1379  ret <2 x i64> %vqdmlsl4.i
1380}
1381
1382define <4 x i32> @test_vqdmlsl_high_lane_s16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
1383; CHECK-LABEL: test_vqdmlsl_high_lane_s16:
1384; CHECK:       // %bb.0: // %entry
1385; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
1386; CHECK-NEXT:    sqdmlsl2 v0.4s, v1.8h, v2.h[3]
1387; CHECK-NEXT:    ret
1388entry:
1389  %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1390  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1391  %vqdmlsl2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
1392  %vqdmlsl4.i = tail call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> %vqdmlsl2.i)
1393  ret <4 x i32> %vqdmlsl4.i
1394}
1395
1396define <2 x i64> @test_vqdmlsl_high_lane_s32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
1397; CHECK-LABEL: test_vqdmlsl_high_lane_s32:
1398; CHECK:       // %bb.0: // %entry
1399; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
1400; CHECK-NEXT:    sqdmlsl2 v0.2d, v1.4s, v2.s[1]
1401; CHECK-NEXT:    ret
1402entry:
1403  %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
1404  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
1405  %vqdmlsl2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
1406  %vqdmlsl4.i = tail call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> %vqdmlsl2.i)
1407  ret <2 x i64> %vqdmlsl4.i
1408}
1409
1410define <4 x i32> @test_vqdmull_lane_s16(<4 x i16> %a, <4 x i16> %v) {
1411; CHECK-LABEL: test_vqdmull_lane_s16:
1412; CHECK:       // %bb.0: // %entry
1413; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
1414; CHECK-NEXT:    sqdmull v0.4s, v0.4h, v1.h[3]
1415; CHECK-NEXT:    ret
1416entry:
1417  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1418  %vqdmull2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
1419  ret <4 x i32> %vqdmull2.i
1420}
1421
1422define <2 x i64> @test_vqdmull_lane_s32(<2 x i32> %a, <2 x i32> %v) {
1423; CHECK-LABEL: test_vqdmull_lane_s32:
1424; CHECK:       // %bb.0: // %entry
1425; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
1426; CHECK-NEXT:    sqdmull v0.2d, v0.2s, v1.s[1]
1427; CHECK-NEXT:    ret
1428entry:
1429  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
1430  %vqdmull2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
1431  ret <2 x i64> %vqdmull2.i
1432}
1433
1434define <4 x i32> @test_vqdmull_laneq_s16(<4 x i16> %a, <8 x i16> %v) {
1435; CHECK-LABEL: test_vqdmull_laneq_s16:
1436; CHECK:       // %bb.0: // %entry
1437; CHECK-NEXT:    sqdmull v0.4s, v0.4h, v1.h[3]
1438; CHECK-NEXT:    ret
1439entry:
1440  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1441  %vqdmull2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
1442  ret <4 x i32> %vqdmull2.i
1443}
1444
1445define <2 x i64> @test_vqdmull_laneq_s32(<2 x i32> %a, <4 x i32> %v) {
1446; CHECK-LABEL: test_vqdmull_laneq_s32:
1447; CHECK:       // %bb.0: // %entry
1448; CHECK-NEXT:    sqdmull v0.2d, v0.2s, v1.s[3]
1449; CHECK-NEXT:    ret
1450entry:
1451  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
1452  %vqdmull2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
1453  ret <2 x i64> %vqdmull2.i
1454}
1455
1456define <4 x i32> @test_vqdmull_high_lane_s16(<8 x i16> %a, <4 x i16> %v) {
1457; CHECK-LABEL: test_vqdmull_high_lane_s16:
1458; CHECK:       // %bb.0: // %entry
1459; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
1460; CHECK-NEXT:    sqdmull2 v0.4s, v0.8h, v1.h[3]
1461; CHECK-NEXT:    ret
1462entry:
1463  %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1464  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1465  %vqdmull2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
1466  ret <4 x i32> %vqdmull2.i
1467}
1468
1469define <2 x i64> @test_vqdmull_high_lane_s32(<4 x i32> %a, <2 x i32> %v) {
1470; CHECK-LABEL: test_vqdmull_high_lane_s32:
1471; CHECK:       // %bb.0: // %entry
1472; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
1473; CHECK-NEXT:    sqdmull2 v0.2d, v0.4s, v1.s[1]
1474; CHECK-NEXT:    ret
1475entry:
1476  %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
1477  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
1478  %vqdmull2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
1479  ret <2 x i64> %vqdmull2.i
1480}
1481
1482define <4 x i32> @test_vqdmull_high_laneq_s16(<8 x i16> %a, <8 x i16> %v) {
1483; CHECK-LABEL: test_vqdmull_high_laneq_s16:
1484; CHECK:       // %bb.0: // %entry
1485; CHECK-NEXT:    sqdmull2 v0.4s, v0.8h, v1.h[7]
1486; CHECK-NEXT:    ret
1487entry:
1488  %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1489  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
1490  %vqdmull2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
1491  ret <4 x i32> %vqdmull2.i
1492}
1493
1494define <2 x i64> @test_vqdmull_high_laneq_s32(<4 x i32> %a, <4 x i32> %v) {
1495; CHECK-LABEL: test_vqdmull_high_laneq_s32:
1496; CHECK:       // %bb.0: // %entry
1497; CHECK-NEXT:    sqdmull2 v0.2d, v0.4s, v1.s[3]
1498; CHECK-NEXT:    ret
1499entry:
1500  %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
1501  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
1502  %vqdmull2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
1503  ret <2 x i64> %vqdmull2.i
1504}
1505
1506define <4 x i16> @test_vqdmulh_lane_s16(<4 x i16> %a, <4 x i16> %v) {
1507; CHECK-LABEL: test_vqdmulh_lane_s16:
1508; CHECK:       // %bb.0: // %entry
1509; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
1510; CHECK-NEXT:    sqdmulh v0.4h, v0.4h, v1.h[3]
1511; CHECK-NEXT:    ret
1512entry:
1513  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1514  %vqdmulh2.i = tail call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> %a, <4 x i16> %shuffle)
1515  ret <4 x i16> %vqdmulh2.i
1516}
1517
1518define <8 x i16> @test_vqdmulhq_lane_s16(<8 x i16> %a, <4 x i16> %v) {
1519; CHECK-LABEL: test_vqdmulhq_lane_s16:
1520; CHECK:       // %bb.0: // %entry
1521; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
1522; CHECK-NEXT:    sqdmulh v0.8h, v0.8h, v1.h[3]
1523; CHECK-NEXT:    ret
1524entry:
1525  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
1526  %vqdmulh2.i = tail call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> %a, <8 x i16> %shuffle)
1527  ret <8 x i16> %vqdmulh2.i
1528}
1529
1530define <2 x i32> @test_vqdmulh_lane_s32(<2 x i32> %a, <2 x i32> %v) {
1531; CHECK-LABEL: test_vqdmulh_lane_s32:
1532; CHECK:       // %bb.0: // %entry
1533; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
1534; CHECK-NEXT:    sqdmulh v0.2s, v0.2s, v1.s[1]
1535; CHECK-NEXT:    ret
1536entry:
1537  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
1538  %vqdmulh2.i = tail call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> %a, <2 x i32> %shuffle)
1539  ret <2 x i32> %vqdmulh2.i
1540}
1541
1542define <4 x i32> @test_vqdmulhq_lane_s32(<4 x i32> %a, <2 x i32> %v) {
1543; CHECK-LABEL: test_vqdmulhq_lane_s32:
1544; CHECK:       // %bb.0: // %entry
1545; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
1546; CHECK-NEXT:    sqdmulh v0.4s, v0.4s, v1.s[1]
1547; CHECK-NEXT:    ret
1548entry:
1549  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1550  %vqdmulh2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> %a, <4 x i32> %shuffle)
1551  ret <4 x i32> %vqdmulh2.i
1552}
1553
1554define <4 x i16> @test_vqrdmulh_lane_s16(<4 x i16> %a, <4 x i16> %v) {
1555; CHECK-LABEL: test_vqrdmulh_lane_s16:
1556; CHECK:       // %bb.0: // %entry
1557; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
1558; CHECK-NEXT:    sqrdmulh v0.4h, v0.4h, v1.h[3]
1559; CHECK-NEXT:    ret
1560entry:
1561  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1562  %vqrdmulh2.i = tail call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %a, <4 x i16> %shuffle)
1563  ret <4 x i16> %vqrdmulh2.i
1564}
1565
1566define <8 x i16> @test_vqrdmulhq_lane_s16(<8 x i16> %a, <4 x i16> %v) {
1567; CHECK-LABEL: test_vqrdmulhq_lane_s16:
1568; CHECK:       // %bb.0: // %entry
1569; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
1570; CHECK-NEXT:    sqrdmulh v0.8h, v0.8h, v1.h[3]
1571; CHECK-NEXT:    ret
1572entry:
1573  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
1574  %vqrdmulh2.i = tail call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %a, <8 x i16> %shuffle)
1575  ret <8 x i16> %vqrdmulh2.i
1576}
1577
1578define <2 x i32> @test_vqrdmulh_lane_s32(<2 x i32> %a, <2 x i32> %v) {
1579; CHECK-LABEL: test_vqrdmulh_lane_s32:
1580; CHECK:       // %bb.0: // %entry
1581; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
1582; CHECK-NEXT:    sqrdmulh v0.2s, v0.2s, v1.s[1]
1583; CHECK-NEXT:    ret
1584entry:
1585  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
1586  %vqrdmulh2.i = tail call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %a, <2 x i32> %shuffle)
1587  ret <2 x i32> %vqrdmulh2.i
1588}
1589
1590define <4 x i32> @test_vqrdmulhq_lane_s32(<4 x i32> %a, <2 x i32> %v) {
1591; CHECK-LABEL: test_vqrdmulhq_lane_s32:
1592; CHECK:       // %bb.0: // %entry
1593; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
1594; CHECK-NEXT:    sqrdmulh v0.4s, v0.4s, v1.s[1]
1595; CHECK-NEXT:    ret
1596entry:
1597  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1598  %vqrdmulh2.i = tail call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %a, <4 x i32> %shuffle)
1599  ret <4 x i32> %vqrdmulh2.i
1600}
1601
1602define <2 x float> @test_vmul_lane_f32(<2 x float> %a, <2 x float> %v) {
1603; CHECK-LABEL: test_vmul_lane_f32:
1604; CHECK:       // %bb.0: // %entry
1605; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
1606; CHECK-NEXT:    fmul v0.2s, v0.2s, v1.s[1]
1607; CHECK-NEXT:    ret
1608entry:
1609  %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> <i32 1, i32 1>
1610  %mul = fmul <2 x float> %shuffle, %a
1611  ret <2 x float> %mul
1612}
1613
1614define <1 x double> @test_vmul_lane_f64(<1 x double> %a, <1 x double> %v) {
1615; CHECK-LABEL: test_vmul_lane_f64:
1616; CHECK:       // %bb.0: // %entry
1617; CHECK-NEXT:    fmul d0, d0, d1
1618; CHECK-NEXT:    ret
1619entry:
1620  %0 = bitcast <1 x double> %a to <8 x i8>
1621  %1 = bitcast <8 x i8> %0 to double
1622  %extract = extractelement <1 x double> %v, i32 0
1623  %2 = fmul double %1, %extract
1624  %3 = insertelement <1 x double> undef, double %2, i32 0
1625  ret <1 x double> %3
1626}
1627
1628define <4 x float> @test_vmulq_lane_f32(<4 x float> %a, <2 x float> %v) {
1629; CHECK-LABEL: test_vmulq_lane_f32:
1630; CHECK:       // %bb.0: // %entry
1631; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
1632; CHECK-NEXT:    fmul v0.4s, v0.4s, v1.s[1]
1633; CHECK-NEXT:    ret
1634entry:
1635  %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1636  %mul = fmul <4 x float> %shuffle, %a
1637  ret <4 x float> %mul
1638}
1639
1640define <2 x double> @test_vmulq_lane_f64(<2 x double> %a, <1 x double> %v) {
1641; CHECK-LABEL: test_vmulq_lane_f64:
1642; CHECK:       // %bb.0: // %entry
1643; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
1644; CHECK-NEXT:    fmul v0.2d, v0.2d, v1.d[0]
1645; CHECK-NEXT:    ret
1646entry:
1647  %shuffle = shufflevector <1 x double> %v, <1 x double> undef, <2 x i32> zeroinitializer
1648  %mul = fmul <2 x double> %shuffle, %a
1649  ret <2 x double> %mul
1650}
1651
1652define <2 x float> @test_vmul_laneq_f32(<2 x float> %a, <4 x float> %v) {
1653; CHECK-LABEL: test_vmul_laneq_f32:
1654; CHECK:       // %bb.0: // %entry
1655; CHECK-NEXT:    fmul v0.2s, v0.2s, v1.s[3]
1656; CHECK-NEXT:    ret
1657entry:
1658  %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> <i32 3, i32 3>
1659  %mul = fmul <2 x float> %shuffle, %a
1660  ret <2 x float> %mul
1661}
1662
1663define <2 x float> @test_vmul_laneq3_f32_bitcast(<2 x float> %a, <2 x double> %v) {
1664; CHECK-LABEL: test_vmul_laneq3_f32_bitcast:
1665; CHECK:       // %bb.0:
1666; CHECK-NEXT:    fmul v0.2s, v0.2s, v1.s[3]
1667; CHECK-NEXT:    ret
1668  %extract = shufflevector <2 x double> %v, <2 x double> undef, <1 x i32> <i32 1>
1669  %bc = bitcast <1 x double> %extract to <2 x float>
1670  %splat = shufflevector <2 x float> %bc, <2 x float> undef, <2 x i32> <i32 1, i32 1>
1671  %mul = fmul <2 x float> %splat, %a
1672  ret <2 x float> %mul
1673}
1674
1675define <2 x float> @test_vmul_laneq2_f32_bitcast(<2 x float> %a, <2 x double> %v) {
1676; CHECK-LABEL: test_vmul_laneq2_f32_bitcast:
1677; CHECK:       // %bb.0:
1678; CHECK-NEXT:    fmul v0.2s, v0.2s, v1.s[2]
1679; CHECK-NEXT:    ret
1680  %extract = shufflevector <2 x double> %v, <2 x double> undef, <1 x i32> <i32 1>
1681  %bc = bitcast <1 x double> %extract to <2 x float>
1682  %splat = shufflevector <2 x float> %bc, <2 x float> undef, <2 x i32> <i32 0, i32 0>
1683  %mul = fmul <2 x float> %splat, %a
1684  ret <2 x float> %mul
1685}
1686
1687define <4 x i16> @test_vadd_laneq5_i16_bitcast(<4 x i16> %a, <2 x double> %v) {
1688; CHECK-LABEL: test_vadd_laneq5_i16_bitcast:
1689; CHECK:       // %bb.0:
1690; CHECK-NEXT:    dup v1.4h, v1.h[5]
1691; CHECK-NEXT:    add v0.4h, v1.4h, v0.4h
1692; CHECK-NEXT:    ret
1693  %extract = shufflevector <2 x double> %v, <2 x double> undef, <1 x i32> <i32 1>
1694  %bc = bitcast <1 x double> %extract to <4 x i16>
1695  %splat = shufflevector <4 x i16> %bc, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1696  %r = add <4 x i16> %splat, %a
1697  ret <4 x i16> %r
1698}
1699
1700; TODO: The pattern in LowerVECTOR_SHUFFLE does not match what we are looking for.
1701
1702define <4 x i16> @test_vadd_lane2_i16_bitcast_bigger_aligned(<4 x i16> %a, <16 x i8> %v) {
1703; CHECK-LABEL: test_vadd_lane2_i16_bitcast_bigger_aligned:
1704; CHECK:       // %bb.0:
1705; CHECK-NEXT:    ext v1.8b, v1.8b, v0.8b, #2
1706; CHECK-NEXT:    dup v1.4h, v1.h[1]
1707; CHECK-NEXT:    add v0.4h, v1.4h, v0.4h
1708; CHECK-NEXT:    ret
1709  %extract = shufflevector <16 x i8> %v, <16 x i8> undef, <8 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
1710  %bc = bitcast <8 x i8> %extract to <4 x i16>
1711  %splat = shufflevector <4 x i16> %bc, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1712  %r = add <4 x i16> %splat, %a
1713  ret <4 x i16> %r
1714}
1715
1716define <4 x i16> @test_vadd_lane5_i16_bitcast_bigger_aligned(<4 x i16> %a, <16 x i8> %v) {
1717; CHECK-LABEL: test_vadd_lane5_i16_bitcast_bigger_aligned:
1718; CHECK:       // %bb.0:
1719; CHECK-NEXT:    dup v1.4h, v1.h[5]
1720; CHECK-NEXT:    add v0.4h, v1.4h, v0.4h
1721; CHECK-NEXT:    ret
1722  %extract = shufflevector <16 x i8> %v, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1723  %bc = bitcast <8 x i8> %extract to <4 x i16>
1724  %splat = shufflevector <4 x i16> %bc, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1725  %r = add <4 x i16> %splat, %a
1726  ret <4 x i16> %r
1727}
1728
1729; Negative test - can't dup bytes {3,4} of v8i16.
1730
1731define <4 x i16> @test_vadd_lane_i16_bitcast_bigger_unaligned(<4 x i16> %a, <16 x i8> %v) {
1732; CHECK-LABEL: test_vadd_lane_i16_bitcast_bigger_unaligned:
1733; CHECK:       // %bb.0:
1734; CHECK-NEXT:    ext v1.8b, v1.8b, v0.8b, #1
1735; CHECK-NEXT:    dup v1.4h, v1.h[1]
1736; CHECK-NEXT:    add v0.4h, v1.4h, v0.4h
1737; CHECK-NEXT:    ret
1738  %extract = shufflevector <16 x i8> %v, <16 x i8> undef, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
1739  %bc = bitcast <8 x i8> %extract to <4 x i16>
1740  %splat = shufflevector <4 x i16> %bc, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1741  %r = add <4 x i16> %splat, %a
1742  ret <4 x i16> %r
1743}
1744
1745define <1 x double> @test_vmul_laneq_f64(<1 x double> %a, <2 x double> %v) {
1746; CHECK-LABEL: test_vmul_laneq_f64:
1747; CHECK:       // %bb.0: // %entry
1748; CHECK-NEXT:    fmul d0, d0, v1.d[1]
1749; CHECK-NEXT:    ret
1750entry:
1751  %0 = bitcast <1 x double> %a to <8 x i8>
1752  %1 = bitcast <8 x i8> %0 to double
1753  %extract = extractelement <2 x double> %v, i32 1
1754  %2 = fmul double %1, %extract
1755  %3 = insertelement <1 x double> undef, double %2, i32 0
1756  ret <1 x double> %3
1757}
1758
1759define <4 x float> @test_vmulq_laneq_f32(<4 x float> %a, <4 x float> %v) {
1760; CHECK-LABEL: test_vmulq_laneq_f32:
1761; CHECK:       // %bb.0: // %entry
1762; CHECK-NEXT:    fmul v0.4s, v0.4s, v1.s[3]
1763; CHECK-NEXT:    ret
1764entry:
1765  %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1766  %mul = fmul <4 x float> %shuffle, %a
1767  ret <4 x float> %mul
1768}
1769
1770define <2 x double> @test_vmulq_laneq_f64(<2 x double> %a, <2 x double> %v) {
1771; CHECK-LABEL: test_vmulq_laneq_f64:
1772; CHECK:       // %bb.0: // %entry
1773; CHECK-NEXT:    fmul v0.2d, v0.2d, v1.d[1]
1774; CHECK-NEXT:    ret
1775entry:
1776  %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> <i32 1, i32 1>
1777  %mul = fmul <2 x double> %shuffle, %a
1778  ret <2 x double> %mul
1779}
1780
1781define <2 x float> @test_vmulx_lane_f32(<2 x float> %a, <2 x float> %v) {
1782; CHECK-LABEL: test_vmulx_lane_f32:
1783; CHECK:       // %bb.0: // %entry
1784; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
1785; CHECK-NEXT:    fmulx v0.2s, v0.2s, v1.s[1]
1786; CHECK-NEXT:    ret
1787entry:
1788  %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> <i32 1, i32 1>
1789  %vmulx2.i = tail call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %a, <2 x float> %shuffle)
1790  ret <2 x float> %vmulx2.i
1791}
1792
1793define <4 x float> @test_vmulxq_lane_f32(<4 x float> %a, <2 x float> %v) {
1794; CHECK-LABEL: test_vmulxq_lane_f32:
1795; CHECK:       // %bb.0: // %entry
1796; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
1797; CHECK-NEXT:    fmulx v0.4s, v0.4s, v1.s[1]
1798; CHECK-NEXT:    ret
1799entry:
1800  %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1801  %vmulx2.i = tail call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %a, <4 x float> %shuffle)
1802  ret <4 x float> %vmulx2.i
1803}
1804
1805define <2 x double> @test_vmulxq_lane_f64(<2 x double> %a, <1 x double> %v) {
1806; CHECK-LABEL: test_vmulxq_lane_f64:
1807; CHECK:       // %bb.0: // %entry
1808; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
1809; CHECK-NEXT:    fmulx v0.2d, v0.2d, v1.d[0]
1810; CHECK-NEXT:    ret
1811entry:
1812  %shuffle = shufflevector <1 x double> %v, <1 x double> undef, <2 x i32> zeroinitializer
1813  %vmulx2.i = tail call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %a, <2 x double> %shuffle)
1814  ret <2 x double> %vmulx2.i
1815}
1816
1817define <2 x float> @test_vmulx_laneq_f32(<2 x float> %a, <4 x float> %v) {
1818; CHECK-LABEL: test_vmulx_laneq_f32:
1819; CHECK:       // %bb.0: // %entry
1820; CHECK-NEXT:    fmulx v0.2s, v0.2s, v1.s[3]
1821; CHECK-NEXT:    ret
1822entry:
1823  %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> <i32 3, i32 3>
1824  %vmulx2.i = tail call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %a, <2 x float> %shuffle)
1825  ret <2 x float> %vmulx2.i
1826}
1827
1828define <4 x float> @test_vmulxq_laneq_f32(<4 x float> %a, <4 x float> %v) {
1829; CHECK-LABEL: test_vmulxq_laneq_f32:
1830; CHECK:       // %bb.0: // %entry
1831; CHECK-NEXT:    fmulx v0.4s, v0.4s, v1.s[3]
1832; CHECK-NEXT:    ret
1833entry:
1834  %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1835  %vmulx2.i = tail call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %a, <4 x float> %shuffle)
1836  ret <4 x float> %vmulx2.i
1837}
1838
1839define <2 x double> @test_vmulxq_laneq_f64(<2 x double> %a, <2 x double> %v) {
1840; CHECK-LABEL: test_vmulxq_laneq_f64:
1841; CHECK:       // %bb.0: // %entry
1842; CHECK-NEXT:    fmulx v0.2d, v0.2d, v1.d[1]
1843; CHECK-NEXT:    ret
1844entry:
1845  %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> <i32 1, i32 1>
1846  %vmulx2.i = tail call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %a, <2 x double> %shuffle)
1847  ret <2 x double> %vmulx2.i
1848}
1849
1850define <4 x i16> @test_vmla_lane_s16_0(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) {
1851; CHECK-LABEL: test_vmla_lane_s16_0:
1852; CHECK:       // %bb.0: // %entry
1853; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
1854; CHECK-NEXT:    mla v0.4h, v1.4h, v2.h[0]
1855; CHECK-NEXT:    ret
1856entry:
1857  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
1858  %mul = mul <4 x i16> %shuffle, %b
1859  %add = add <4 x i16> %mul, %a
1860  ret <4 x i16> %add
1861}
1862
1863define <8 x i16> @test_vmlaq_lane_s16_0(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) {
1864; CHECK-LABEL: test_vmlaq_lane_s16_0:
1865; CHECK:       // %bb.0: // %entry
1866; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
1867; CHECK-NEXT:    mla v0.8h, v1.8h, v2.h[0]
1868; CHECK-NEXT:    ret
1869entry:
1870  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> zeroinitializer
1871  %mul = mul <8 x i16> %shuffle, %b
1872  %add = add <8 x i16> %mul, %a
1873  ret <8 x i16> %add
1874}
1875
1876define <2 x i32> @test_vmla_lane_s32_0(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) {
1877; CHECK-LABEL: test_vmla_lane_s32_0:
1878; CHECK:       // %bb.0: // %entry
1879; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
1880; CHECK-NEXT:    mla v0.2s, v1.2s, v2.s[0]
1881; CHECK-NEXT:    ret
1882entry:
1883  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
1884  %mul = mul <2 x i32> %shuffle, %b
1885  %add = add <2 x i32> %mul, %a
1886  ret <2 x i32> %add
1887}
1888
1889define <4 x i32> @test_vmlaq_lane_s32_0(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) {
1890; CHECK-LABEL: test_vmlaq_lane_s32_0:
1891; CHECK:       // %bb.0: // %entry
1892; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
1893; CHECK-NEXT:    mla v0.4s, v1.4s, v2.s[0]
1894; CHECK-NEXT:    ret
1895entry:
1896  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> zeroinitializer
1897  %mul = mul <4 x i32> %shuffle, %b
1898  %add = add <4 x i32> %mul, %a
1899  ret <4 x i32> %add
1900}
1901
1902define <4 x i16> @test_vmla_laneq_s16_0(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) {
1903; CHECK-LABEL: test_vmla_laneq_s16_0:
1904; CHECK:       // %bb.0: // %entry
1905; CHECK-NEXT:    mla v0.4h, v1.4h, v2.h[0]
1906; CHECK-NEXT:    ret
1907entry:
1908  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
1909  %mul = mul <4 x i16> %shuffle, %b
1910  %add = add <4 x i16> %mul, %a
1911  ret <4 x i16> %add
1912}
1913
1914define <8 x i16> @test_vmlaq_laneq_s16_0(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) {
1915; CHECK-LABEL: test_vmlaq_laneq_s16_0:
1916; CHECK:       // %bb.0: // %entry
1917; CHECK-NEXT:    mla v0.8h, v1.8h, v2.h[0]
1918; CHECK-NEXT:    ret
1919entry:
1920  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> zeroinitializer
1921  %mul = mul <8 x i16> %shuffle, %b
1922  %add = add <8 x i16> %mul, %a
1923  ret <8 x i16> %add
1924}
1925
1926define <2 x i32> @test_vmla_laneq_s32_0(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) {
1927; CHECK-LABEL: test_vmla_laneq_s32_0:
1928; CHECK:       // %bb.0: // %entry
1929; CHECK-NEXT:    mla v0.2s, v1.2s, v2.s[0]
1930; CHECK-NEXT:    ret
1931entry:
1932  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
1933  %mul = mul <2 x i32> %shuffle, %b
1934  %add = add <2 x i32> %mul, %a
1935  ret <2 x i32> %add
1936}
1937
1938define <4 x i32> @test_vmlaq_laneq_s32_0(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) {
1939; CHECK-LABEL: test_vmlaq_laneq_s32_0:
1940; CHECK:       // %bb.0: // %entry
1941; CHECK-NEXT:    mla v0.4s, v1.4s, v2.s[0]
1942; CHECK-NEXT:    ret
1943entry:
1944  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
1945  %mul = mul <4 x i32> %shuffle, %b
1946  %add = add <4 x i32> %mul, %a
1947  ret <4 x i32> %add
1948}
1949
1950define <4 x i16> @test_vmls_lane_s16_0(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) {
1951; CHECK-LABEL: test_vmls_lane_s16_0:
1952; CHECK:       // %bb.0: // %entry
1953; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
1954; CHECK-NEXT:    mls v0.4h, v1.4h, v2.h[0]
1955; CHECK-NEXT:    ret
1956entry:
1957  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
1958  %mul = mul <4 x i16> %shuffle, %b
1959  %sub = sub <4 x i16> %a, %mul
1960  ret <4 x i16> %sub
1961}
1962
1963define <8 x i16> @test_vmlsq_lane_s16_0(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) {
1964; CHECK-LABEL: test_vmlsq_lane_s16_0:
1965; CHECK:       // %bb.0: // %entry
1966; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
1967; CHECK-NEXT:    mls v0.8h, v1.8h, v2.h[0]
1968; CHECK-NEXT:    ret
1969entry:
1970  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> zeroinitializer
1971  %mul = mul <8 x i16> %shuffle, %b
1972  %sub = sub <8 x i16> %a, %mul
1973  ret <8 x i16> %sub
1974}
1975
1976define <2 x i32> @test_vmls_lane_s32_0(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) {
1977; CHECK-LABEL: test_vmls_lane_s32_0:
1978; CHECK:       // %bb.0: // %entry
1979; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
1980; CHECK-NEXT:    mls v0.2s, v1.2s, v2.s[0]
1981; CHECK-NEXT:    ret
1982entry:
1983  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
1984  %mul = mul <2 x i32> %shuffle, %b
1985  %sub = sub <2 x i32> %a, %mul
1986  ret <2 x i32> %sub
1987}
1988
1989define <4 x i32> @test_vmlsq_lane_s32_0(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) {
1990; CHECK-LABEL: test_vmlsq_lane_s32_0:
1991; CHECK:       // %bb.0: // %entry
1992; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
1993; CHECK-NEXT:    mls v0.4s, v1.4s, v2.s[0]
1994; CHECK-NEXT:    ret
1995entry:
1996  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> zeroinitializer
1997  %mul = mul <4 x i32> %shuffle, %b
1998  %sub = sub <4 x i32> %a, %mul
1999  ret <4 x i32> %sub
2000}
2001
2002define <4 x i16> @test_vmls_laneq_s16_0(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) {
2003; CHECK-LABEL: test_vmls_laneq_s16_0:
2004; CHECK:       // %bb.0: // %entry
2005; CHECK-NEXT:    mls v0.4h, v1.4h, v2.h[0]
2006; CHECK-NEXT:    ret
2007entry:
2008  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
2009  %mul = mul <4 x i16> %shuffle, %b
2010  %sub = sub <4 x i16> %a, %mul
2011  ret <4 x i16> %sub
2012}
2013
2014define <8 x i16> @test_vmlsq_laneq_s16_0(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) {
2015; CHECK-LABEL: test_vmlsq_laneq_s16_0:
2016; CHECK:       // %bb.0: // %entry
2017; CHECK-NEXT:    mls v0.8h, v1.8h, v2.h[0]
2018; CHECK-NEXT:    ret
2019entry:
2020  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> zeroinitializer
2021  %mul = mul <8 x i16> %shuffle, %b
2022  %sub = sub <8 x i16> %a, %mul
2023  ret <8 x i16> %sub
2024}
2025
2026define <2 x i32> @test_vmls_laneq_s32_0(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) {
2027; CHECK-LABEL: test_vmls_laneq_s32_0:
2028; CHECK:       // %bb.0: // %entry
2029; CHECK-NEXT:    mls v0.2s, v1.2s, v2.s[0]
2030; CHECK-NEXT:    ret
2031entry:
2032  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
2033  %mul = mul <2 x i32> %shuffle, %b
2034  %sub = sub <2 x i32> %a, %mul
2035  ret <2 x i32> %sub
2036}
2037
2038define <4 x i32> @test_vmlsq_laneq_s32_0(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) {
2039; CHECK-LABEL: test_vmlsq_laneq_s32_0:
2040; CHECK:       // %bb.0: // %entry
2041; CHECK-NEXT:    mls v0.4s, v1.4s, v2.s[0]
2042; CHECK-NEXT:    ret
2043entry:
2044  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
2045  %mul = mul <4 x i32> %shuffle, %b
2046  %sub = sub <4 x i32> %a, %mul
2047  ret <4 x i32> %sub
2048}
2049
2050define <4 x i16> @test_vmul_lane_s16_0(<4 x i16> %a, <4 x i16> %v) {
2051; CHECK-LABEL: test_vmul_lane_s16_0:
2052; CHECK:       // %bb.0: // %entry
2053; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
2054; CHECK-NEXT:    mul v0.4h, v0.4h, v1.h[0]
2055; CHECK-NEXT:    ret
2056entry:
2057  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
2058  %mul = mul <4 x i16> %shuffle, %a
2059  ret <4 x i16> %mul
2060}
2061
2062define <8 x i16> @test_vmulq_lane_s16_0(<8 x i16> %a, <4 x i16> %v) {
2063; CHECK-LABEL: test_vmulq_lane_s16_0:
2064; CHECK:       // %bb.0: // %entry
2065; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
2066; CHECK-NEXT:    mul v0.8h, v0.8h, v1.h[0]
2067; CHECK-NEXT:    ret
2068entry:
2069  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> zeroinitializer
2070  %mul = mul <8 x i16> %shuffle, %a
2071  ret <8 x i16> %mul
2072}
2073
2074define <2 x i32> @test_vmul_lane_s32_0(<2 x i32> %a, <2 x i32> %v) {
2075; CHECK-LABEL: test_vmul_lane_s32_0:
2076; CHECK:       // %bb.0: // %entry
2077; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
2078; CHECK-NEXT:    mul v0.2s, v0.2s, v1.s[0]
2079; CHECK-NEXT:    ret
2080entry:
2081  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
2082  %mul = mul <2 x i32> %shuffle, %a
2083  ret <2 x i32> %mul
2084}
2085
2086define <4 x i32> @test_vmulq_lane_s32_0(<4 x i32> %a, <2 x i32> %v) {
2087; CHECK-LABEL: test_vmulq_lane_s32_0:
2088; CHECK:       // %bb.0: // %entry
2089; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
2090; CHECK-NEXT:    mul v0.4s, v0.4s, v1.s[0]
2091; CHECK-NEXT:    ret
2092entry:
2093  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> zeroinitializer
2094  %mul = mul <4 x i32> %shuffle, %a
2095  ret <4 x i32> %mul
2096}
2097
2098define <4 x i16> @test_vmul_lane_u16_0(<4 x i16> %a, <4 x i16> %v) {
2099; CHECK-LABEL: test_vmul_lane_u16_0:
2100; CHECK:       // %bb.0: // %entry
2101; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
2102; CHECK-NEXT:    mul v0.4h, v0.4h, v1.h[0]
2103; CHECK-NEXT:    ret
2104entry:
2105  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
2106  %mul = mul <4 x i16> %shuffle, %a
2107  ret <4 x i16> %mul
2108}
2109
2110define <8 x i16> @test_vmulq_lane_u16_0(<8 x i16> %a, <4 x i16> %v) {
2111; CHECK-LABEL: test_vmulq_lane_u16_0:
2112; CHECK:       // %bb.0: // %entry
2113; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
2114; CHECK-NEXT:    mul v0.8h, v0.8h, v1.h[0]
2115; CHECK-NEXT:    ret
2116entry:
2117  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> zeroinitializer
2118  %mul = mul <8 x i16> %shuffle, %a
2119  ret <8 x i16> %mul
2120}
2121
2122define <2 x i32> @test_vmul_lane_u32_0(<2 x i32> %a, <2 x i32> %v) {
2123; CHECK-LABEL: test_vmul_lane_u32_0:
2124; CHECK:       // %bb.0: // %entry
2125; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
2126; CHECK-NEXT:    mul v0.2s, v0.2s, v1.s[0]
2127; CHECK-NEXT:    ret
2128entry:
2129  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
2130  %mul = mul <2 x i32> %shuffle, %a
2131  ret <2 x i32> %mul
2132}
2133
2134define <4 x i32> @test_vmulq_lane_u32_0(<4 x i32> %a, <2 x i32> %v) {
2135; CHECK-LABEL: test_vmulq_lane_u32_0:
2136; CHECK:       // %bb.0: // %entry
2137; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
2138; CHECK-NEXT:    mul v0.4s, v0.4s, v1.s[0]
2139; CHECK-NEXT:    ret
2140entry:
2141  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> zeroinitializer
2142  %mul = mul <4 x i32> %shuffle, %a
2143  ret <4 x i32> %mul
2144}
2145
2146define <4 x i16> @test_vmul_laneq_s16_0(<4 x i16> %a, <8 x i16> %v) {
2147; CHECK-LABEL: test_vmul_laneq_s16_0:
2148; CHECK:       // %bb.0: // %entry
2149; CHECK-NEXT:    mul v0.4h, v0.4h, v1.h[0]
2150; CHECK-NEXT:    ret
2151entry:
2152  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
2153  %mul = mul <4 x i16> %shuffle, %a
2154  ret <4 x i16> %mul
2155}
2156
2157define <8 x i16> @test_vmulq_laneq_s16_0(<8 x i16> %a, <8 x i16> %v) {
2158; CHECK-LABEL: test_vmulq_laneq_s16_0:
2159; CHECK:       // %bb.0: // %entry
2160; CHECK-NEXT:    mul v0.8h, v0.8h, v1.h[0]
2161; CHECK-NEXT:    ret
2162entry:
2163  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> zeroinitializer
2164  %mul = mul <8 x i16> %shuffle, %a
2165  ret <8 x i16> %mul
2166}
2167
2168define <2 x i32> @test_vmul_laneq_s32_0(<2 x i32> %a, <4 x i32> %v) {
2169; CHECK-LABEL: test_vmul_laneq_s32_0:
2170; CHECK:       // %bb.0: // %entry
2171; CHECK-NEXT:    mul v0.2s, v0.2s, v1.s[0]
2172; CHECK-NEXT:    ret
2173entry:
2174  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
2175  %mul = mul <2 x i32> %shuffle, %a
2176  ret <2 x i32> %mul
2177}
2178
2179define <4 x i32> @test_vmulq_laneq_s32_0(<4 x i32> %a, <4 x i32> %v) {
2180; CHECK-LABEL: test_vmulq_laneq_s32_0:
2181; CHECK:       // %bb.0: // %entry
2182; CHECK-NEXT:    mul v0.4s, v0.4s, v1.s[0]
2183; CHECK-NEXT:    ret
2184entry:
2185  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
2186  %mul = mul <4 x i32> %shuffle, %a
2187  ret <4 x i32> %mul
2188}
2189
2190define <4 x i16> @test_vmul_laneq_u16_0(<4 x i16> %a, <8 x i16> %v) {
2191; CHECK-LABEL: test_vmul_laneq_u16_0:
2192; CHECK:       // %bb.0: // %entry
2193; CHECK-NEXT:    mul v0.4h, v0.4h, v1.h[0]
2194; CHECK-NEXT:    ret
2195entry:
2196  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
2197  %mul = mul <4 x i16> %shuffle, %a
2198  ret <4 x i16> %mul
2199}
2200
2201define <8 x i16> @test_vmulq_laneq_u16_0(<8 x i16> %a, <8 x i16> %v) {
2202; CHECK-LABEL: test_vmulq_laneq_u16_0:
2203; CHECK:       // %bb.0: // %entry
2204; CHECK-NEXT:    mul v0.8h, v0.8h, v1.h[0]
2205; CHECK-NEXT:    ret
2206entry:
2207  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> zeroinitializer
2208  %mul = mul <8 x i16> %shuffle, %a
2209  ret <8 x i16> %mul
2210}
2211
2212define <2 x i32> @test_vmul_laneq_u32_0(<2 x i32> %a, <4 x i32> %v) {
2213; CHECK-LABEL: test_vmul_laneq_u32_0:
2214; CHECK:       // %bb.0: // %entry
2215; CHECK-NEXT:    mul v0.2s, v0.2s, v1.s[0]
2216; CHECK-NEXT:    ret
2217entry:
2218  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
2219  %mul = mul <2 x i32> %shuffle, %a
2220  ret <2 x i32> %mul
2221}
2222
2223define <4 x i32> @test_vmulq_laneq_u32_0(<4 x i32> %a, <4 x i32> %v) {
2224; CHECK-LABEL: test_vmulq_laneq_u32_0:
2225; CHECK:       // %bb.0: // %entry
2226; CHECK-NEXT:    mul v0.4s, v0.4s, v1.s[0]
2227; CHECK-NEXT:    ret
2228entry:
2229  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
2230  %mul = mul <4 x i32> %shuffle, %a
2231  ret <4 x i32> %mul
2232}
2233
2234define <2 x float> @test_vfma_lane_f32_0(<2 x float> %a, <2 x float> %b, <2 x float> %v) {
2235; CHECK-LABEL: test_vfma_lane_f32_0:
2236; CHECK:       // %bb.0: // %entry
2237; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
2238; CHECK-NEXT:    fmla v0.2s, v1.2s, v2.s[0]
2239; CHECK-NEXT:    ret
2240entry:
2241  %lane = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> zeroinitializer
2242  %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a)
2243  ret <2 x float> %0
2244}
2245
2246define <4 x float> @test_vfmaq_lane_f32_0(<4 x float> %a, <4 x float> %b, <2 x float> %v) {
2247; CHECK-LABEL: test_vfmaq_lane_f32_0:
2248; CHECK:       // %bb.0: // %entry
2249; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
2250; CHECK-NEXT:    fmla v0.4s, v1.4s, v2.s[0]
2251; CHECK-NEXT:    ret
2252entry:
2253  %lane = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> zeroinitializer
2254  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane, <4 x float> %b, <4 x float> %a)
2255  ret <4 x float> %0
2256}
2257
2258define <2 x float> @test_vfma_laneq_f32_0(<2 x float> %a, <2 x float> %b, <4 x float> %v) {
2259; CHECK-LABEL: test_vfma_laneq_f32_0:
2260; CHECK:       // %bb.0: // %entry
2261; CHECK-NEXT:    fmla v0.2s, v1.2s, v2.s[0]
2262; CHECK-NEXT:    ret
2263entry:
2264  %lane = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> zeroinitializer
2265  %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a)
2266  ret <2 x float> %0
2267}
2268
2269define <4 x float> @test_vfmaq_laneq_f32_0(<4 x float> %a, <4 x float> %b, <4 x float> %v) {
2270; CHECK-LABEL: test_vfmaq_laneq_f32_0:
2271; CHECK:       // %bb.0: // %entry
2272; CHECK-NEXT:    fmla v0.4s, v1.4s, v2.s[0]
2273; CHECK-NEXT:    ret
2274entry:
2275  %lane = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> zeroinitializer
2276  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane, <4 x float> %b, <4 x float> %a)
2277  ret <4 x float> %0
2278}
2279
2280define <2 x float> @test_vfms_lane_f32_0(<2 x float> %a, <2 x float> %b, <2 x float> %v) {
2281; CHECK-LABEL: test_vfms_lane_f32_0:
2282; CHECK:       // %bb.0: // %entry
2283; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
2284; CHECK-NEXT:    fmls v0.2s, v1.2s, v2.s[0]
2285; CHECK-NEXT:    ret
2286entry:
2287  %sub = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %v
2288  %lane = shufflevector <2 x float> %sub, <2 x float> undef, <2 x i32> zeroinitializer
2289  %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a)
2290  ret <2 x float> %0
2291}
2292
2293define <4 x float> @test_vfmsq_lane_f32_0(<4 x float> %a, <4 x float> %b, <2 x float> %v) {
2294; CHECK-LABEL: test_vfmsq_lane_f32_0:
2295; CHECK:       // %bb.0: // %entry
2296; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
2297; CHECK-NEXT:    fmls v0.4s, v1.4s, v2.s[0]
2298; CHECK-NEXT:    ret
2299entry:
2300  %sub = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %v
2301  %lane = shufflevector <2 x float> %sub, <2 x float> undef, <4 x i32> zeroinitializer
2302  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane, <4 x float> %b, <4 x float> %a)
2303  ret <4 x float> %0
2304}
2305
2306define <2 x float> @test_vfms_laneq_f32_0(<2 x float> %a, <2 x float> %b, <4 x float> %v) {
2307; CHECK-LABEL: test_vfms_laneq_f32_0:
2308; CHECK:       // %bb.0: // %entry
2309; CHECK-NEXT:    fmls v0.2s, v1.2s, v2.s[0]
2310; CHECK-NEXT:    ret
2311entry:
2312  %sub = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %v
2313  %lane = shufflevector <4 x float> %sub, <4 x float> undef, <2 x i32> zeroinitializer
2314  %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a)
2315  ret <2 x float> %0
2316}
2317
2318define <4 x float> @test_vfmsq_laneq_f32_0(<4 x float> %a, <4 x float> %b, <4 x float> %v) {
2319; CHECK-LABEL: test_vfmsq_laneq_f32_0:
2320; CHECK:       // %bb.0: // %entry
2321; CHECK-NEXT:    fmls v0.4s, v1.4s, v2.s[0]
2322; CHECK-NEXT:    ret
2323entry:
2324  %sub = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %v
2325  %lane = shufflevector <4 x float> %sub, <4 x float> undef, <4 x i32> zeroinitializer
2326  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane, <4 x float> %b, <4 x float> %a)
2327  ret <4 x float> %0
2328}
2329
2330define <2 x double> @test_vfmaq_laneq_f64_0(<2 x double> %a, <2 x double> %b, <2 x double> %v) {
2331; CHECK-LABEL: test_vfmaq_laneq_f64_0:
2332; CHECK:       // %bb.0: // %entry
2333; CHECK-NEXT:    fmla v0.2d, v1.2d, v2.d[0]
2334; CHECK-NEXT:    ret
2335entry:
2336  %lane = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> zeroinitializer
2337  %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %lane, <2 x double> %b, <2 x double> %a)
2338  ret <2 x double> %0
2339}
2340
2341define <2 x double> @test_vfmsq_laneq_f64_0(<2 x double> %a, <2 x double> %b, <2 x double> %v) {
2342; CHECK-LABEL: test_vfmsq_laneq_f64_0:
2343; CHECK:       // %bb.0: // %entry
2344; CHECK-NEXT:    fmls v0.2d, v1.2d, v2.d[0]
2345; CHECK-NEXT:    ret
2346entry:
2347  %sub = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %v
2348  %lane = shufflevector <2 x double> %sub, <2 x double> undef, <2 x i32> zeroinitializer
2349  %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %lane, <2 x double> %b, <2 x double> %a)
2350  ret <2 x double> %0
2351}
2352
2353define <4 x i32> @test_vmlal_lane_s16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
2354; CHECK-LABEL: test_vmlal_lane_s16_0:
2355; CHECK:       // %bb.0: // %entry
2356; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
2357; CHECK-NEXT:    smlal v0.4s, v1.4h, v2.h[0]
2358; CHECK-NEXT:    ret
2359entry:
2360  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
2361  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
2362  %add = add <4 x i32> %vmull2.i, %a
2363  ret <4 x i32> %add
2364}
2365
2366define <2 x i64> @test_vmlal_lane_s32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
2367; CHECK-LABEL: test_vmlal_lane_s32_0:
2368; CHECK:       // %bb.0: // %entry
2369; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
2370; CHECK-NEXT:    smlal v0.2d, v1.2s, v2.s[0]
2371; CHECK-NEXT:    ret
2372entry:
2373  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
2374  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
2375  %add = add <2 x i64> %vmull2.i, %a
2376  ret <2 x i64> %add
2377}
2378
2379define <4 x i32> @test_vmlal_laneq_s16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
2380; CHECK-LABEL: test_vmlal_laneq_s16_0:
2381; CHECK:       // %bb.0: // %entry
2382; CHECK-NEXT:    smlal v0.4s, v1.4h, v2.h[0]
2383; CHECK-NEXT:    ret
2384entry:
2385  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
2386  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
2387  %add = add <4 x i32> %vmull2.i, %a
2388  ret <4 x i32> %add
2389}
2390
2391define <2 x i64> @test_vmlal_laneq_s32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
2392; CHECK-LABEL: test_vmlal_laneq_s32_0:
2393; CHECK:       // %bb.0: // %entry
2394; CHECK-NEXT:    smlal v0.2d, v1.2s, v2.s[0]
2395; CHECK-NEXT:    ret
2396entry:
2397  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
2398  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
2399  %add = add <2 x i64> %vmull2.i, %a
2400  ret <2 x i64> %add
2401}
2402
2403define <4 x i32> @test_vmlal_high_lane_s16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
2404; CHECK-LABEL: test_vmlal_high_lane_s16_0:
2405; CHECK:       // %bb.0: // %entry
2406; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
2407; CHECK-NEXT:    smlal2 v0.4s, v1.8h, v2.h[0]
2408; CHECK-NEXT:    ret
2409entry:
2410  %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2411  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
2412  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
2413  %add = add <4 x i32> %vmull2.i, %a
2414  ret <4 x i32> %add
2415}
2416
2417define <2 x i64> @test_vmlal_high_lane_s32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
2418; CHECK-LABEL: test_vmlal_high_lane_s32_0:
2419; CHECK:       // %bb.0: // %entry
2420; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
2421; CHECK-NEXT:    smlal2 v0.2d, v1.4s, v2.s[0]
2422; CHECK-NEXT:    ret
2423entry:
2424  %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
2425  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
2426  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
2427  %add = add <2 x i64> %vmull2.i, %a
2428  ret <2 x i64> %add
2429}
2430
2431define <4 x i32> @test_vmlal_high_laneq_s16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) {
2432; CHECK-LABEL: test_vmlal_high_laneq_s16_0:
2433; CHECK:       // %bb.0: // %entry
2434; CHECK-NEXT:    smlal2 v0.4s, v1.8h, v2.h[0]
2435; CHECK-NEXT:    ret
2436entry:
2437  %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2438  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
2439  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
2440  %add = add <4 x i32> %vmull2.i, %a
2441  ret <4 x i32> %add
2442}
2443
2444define <2 x i64> @test_vmlal_high_laneq_s32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) {
2445; CHECK-LABEL: test_vmlal_high_laneq_s32_0:
2446; CHECK:       // %bb.0: // %entry
2447; CHECK-NEXT:    smlal2 v0.2d, v1.4s, v2.s[0]
2448; CHECK-NEXT:    ret
2449entry:
2450  %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
2451  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
2452  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
2453  %add = add <2 x i64> %vmull2.i, %a
2454  ret <2 x i64> %add
2455}
2456
2457define <4 x i32> @test_vmlsl_lane_s16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
2458; CHECK-LABEL: test_vmlsl_lane_s16_0:
2459; CHECK:       // %bb.0: // %entry
2460; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
2461; CHECK-NEXT:    smlsl v0.4s, v1.4h, v2.h[0]
2462; CHECK-NEXT:    ret
2463entry:
2464  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
2465  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
2466  %sub = sub <4 x i32> %a, %vmull2.i
2467  ret <4 x i32> %sub
2468}
2469
2470define <2 x i64> @test_vmlsl_lane_s32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
2471; CHECK-LABEL: test_vmlsl_lane_s32_0:
2472; CHECK:       // %bb.0: // %entry
2473; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
2474; CHECK-NEXT:    smlsl v0.2d, v1.2s, v2.s[0]
2475; CHECK-NEXT:    ret
2476entry:
2477  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
2478  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
2479  %sub = sub <2 x i64> %a, %vmull2.i
2480  ret <2 x i64> %sub
2481}
2482
2483define <4 x i32> @test_vmlsl_laneq_s16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
2484; CHECK-LABEL: test_vmlsl_laneq_s16_0:
2485; CHECK:       // %bb.0: // %entry
2486; CHECK-NEXT:    smlsl v0.4s, v1.4h, v2.h[0]
2487; CHECK-NEXT:    ret
2488entry:
2489  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
2490  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
2491  %sub = sub <4 x i32> %a, %vmull2.i
2492  ret <4 x i32> %sub
2493}
2494
2495define <2 x i64> @test_vmlsl_laneq_s32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
2496; CHECK-LABEL: test_vmlsl_laneq_s32_0:
2497; CHECK:       // %bb.0: // %entry
2498; CHECK-NEXT:    smlsl v0.2d, v1.2s, v2.s[0]
2499; CHECK-NEXT:    ret
2500entry:
2501  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
2502  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
2503  %sub = sub <2 x i64> %a, %vmull2.i
2504  ret <2 x i64> %sub
2505}
2506
2507define <4 x i32> @test_vmlsl_high_lane_s16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
2508; CHECK-LABEL: test_vmlsl_high_lane_s16_0:
2509; CHECK:       // %bb.0: // %entry
2510; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
2511; CHECK-NEXT:    smlsl2 v0.4s, v1.8h, v2.h[0]
2512; CHECK-NEXT:    ret
2513entry:
2514  %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2515  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
2516  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
2517  %sub = sub <4 x i32> %a, %vmull2.i
2518  ret <4 x i32> %sub
2519}
2520
2521define <2 x i64> @test_vmlsl_high_lane_s32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
2522; CHECK-LABEL: test_vmlsl_high_lane_s32_0:
2523; CHECK:       // %bb.0: // %entry
2524; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
2525; CHECK-NEXT:    smlsl2 v0.2d, v1.4s, v2.s[0]
2526; CHECK-NEXT:    ret
2527entry:
2528  %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
2529  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
2530  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
2531  %sub = sub <2 x i64> %a, %vmull2.i
2532  ret <2 x i64> %sub
2533}
2534
2535define <4 x i32> @test_vmlsl_high_laneq_s16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) {
2536; CHECK-LABEL: test_vmlsl_high_laneq_s16_0:
2537; CHECK:       // %bb.0: // %entry
2538; CHECK-NEXT:    smlsl2 v0.4s, v1.8h, v2.h[0]
2539; CHECK-NEXT:    ret
2540entry:
2541  %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2542  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
2543  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
2544  %sub = sub <4 x i32> %a, %vmull2.i
2545  ret <4 x i32> %sub
2546}
2547
2548define <2 x i64> @test_vmlsl_high_laneq_s32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) {
2549; CHECK-LABEL: test_vmlsl_high_laneq_s32_0:
2550; CHECK:       // %bb.0: // %entry
2551; CHECK-NEXT:    smlsl2 v0.2d, v1.4s, v2.s[0]
2552; CHECK-NEXT:    ret
2553entry:
2554  %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
2555  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
2556  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
2557  %sub = sub <2 x i64> %a, %vmull2.i
2558  ret <2 x i64> %sub
2559}
2560
2561define <4 x i32> @test_vmlal_lane_u16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
2562; CHECK-LABEL: test_vmlal_lane_u16_0:
2563; CHECK:       // %bb.0: // %entry
2564; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
2565; CHECK-NEXT:    umlal v0.4s, v1.4h, v2.h[0]
2566; CHECK-NEXT:    ret
2567entry:
2568  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
2569  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
2570  %add = add <4 x i32> %vmull2.i, %a
2571  ret <4 x i32> %add
2572}
2573
2574define <2 x i64> @test_vmlal_lane_u32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
2575; CHECK-LABEL: test_vmlal_lane_u32_0:
2576; CHECK:       // %bb.0: // %entry
2577; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
2578; CHECK-NEXT:    umlal v0.2d, v1.2s, v2.s[0]
2579; CHECK-NEXT:    ret
2580entry:
2581  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
2582  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
2583  %add = add <2 x i64> %vmull2.i, %a
2584  ret <2 x i64> %add
2585}
2586
2587define <4 x i32> @test_vmlal_laneq_u16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
2588; CHECK-LABEL: test_vmlal_laneq_u16_0:
2589; CHECK:       // %bb.0: // %entry
2590; CHECK-NEXT:    umlal v0.4s, v1.4h, v2.h[0]
2591; CHECK-NEXT:    ret
2592entry:
2593  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
2594  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
2595  %add = add <4 x i32> %vmull2.i, %a
2596  ret <4 x i32> %add
2597}
2598
2599define <2 x i64> @test_vmlal_laneq_u32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
2600; CHECK-LABEL: test_vmlal_laneq_u32_0:
2601; CHECK:       // %bb.0: // %entry
2602; CHECK-NEXT:    umlal v0.2d, v1.2s, v2.s[0]
2603; CHECK-NEXT:    ret
2604entry:
2605  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
2606  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
2607  %add = add <2 x i64> %vmull2.i, %a
2608  ret <2 x i64> %add
2609}
2610
2611define <4 x i32> @test_vmlal_high_lane_u16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
2612; CHECK-LABEL: test_vmlal_high_lane_u16_0:
2613; CHECK:       // %bb.0: // %entry
2614; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
2615; CHECK-NEXT:    umlal2 v0.4s, v1.8h, v2.h[0]
2616; CHECK-NEXT:    ret
2617entry:
2618  %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2619  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
2620  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
2621  %add = add <4 x i32> %vmull2.i, %a
2622  ret <4 x i32> %add
2623}
2624
2625define <2 x i64> @test_vmlal_high_lane_u32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
2626; CHECK-LABEL: test_vmlal_high_lane_u32_0:
2627; CHECK:       // %bb.0: // %entry
2628; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
2629; CHECK-NEXT:    umlal2 v0.2d, v1.4s, v2.s[0]
2630; CHECK-NEXT:    ret
2631entry:
2632  %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
2633  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
2634  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
2635  %add = add <2 x i64> %vmull2.i, %a
2636  ret <2 x i64> %add
2637}
2638
2639define <4 x i32> @test_vmlal_high_laneq_u16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) {
2640; CHECK-LABEL: test_vmlal_high_laneq_u16_0:
2641; CHECK:       // %bb.0: // %entry
2642; CHECK-NEXT:    umlal2 v0.4s, v1.8h, v2.h[0]
2643; CHECK-NEXT:    ret
2644entry:
2645  %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2646  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
2647  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
2648  %add = add <4 x i32> %vmull2.i, %a
2649  ret <4 x i32> %add
2650}
2651
2652define <2 x i64> @test_vmlal_high_laneq_u32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) {
2653; CHECK-LABEL: test_vmlal_high_laneq_u32_0:
2654; CHECK:       // %bb.0: // %entry
2655; CHECK-NEXT:    umlal2 v0.2d, v1.4s, v2.s[0]
2656; CHECK-NEXT:    ret
2657entry:
2658  %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
2659  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
2660  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
2661  %add = add <2 x i64> %vmull2.i, %a
2662  ret <2 x i64> %add
2663}
2664
2665define <4 x i32> @test_vmlsl_lane_u16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
2666; CHECK-LABEL: test_vmlsl_lane_u16_0:
2667; CHECK:       // %bb.0: // %entry
2668; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
2669; CHECK-NEXT:    umlsl v0.4s, v1.4h, v2.h[0]
2670; CHECK-NEXT:    ret
2671entry:
2672  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
2673  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
2674  %sub = sub <4 x i32> %a, %vmull2.i
2675  ret <4 x i32> %sub
2676}
2677
2678define <2 x i64> @test_vmlsl_lane_u32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
2679; CHECK-LABEL: test_vmlsl_lane_u32_0:
2680; CHECK:       // %bb.0: // %entry
2681; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
2682; CHECK-NEXT:    umlsl v0.2d, v1.2s, v2.s[0]
2683; CHECK-NEXT:    ret
2684entry:
2685  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
2686  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
2687  %sub = sub <2 x i64> %a, %vmull2.i
2688  ret <2 x i64> %sub
2689}
2690
2691define <4 x i32> @test_vmlsl_laneq_u16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
2692; CHECK-LABEL: test_vmlsl_laneq_u16_0:
2693; CHECK:       // %bb.0: // %entry
2694; CHECK-NEXT:    umlsl v0.4s, v1.4h, v2.h[0]
2695; CHECK-NEXT:    ret
2696entry:
2697  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
2698  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
2699  %sub = sub <4 x i32> %a, %vmull2.i
2700  ret <4 x i32> %sub
2701}
2702
2703define <2 x i64> @test_vmlsl_laneq_u32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
2704; CHECK-LABEL: test_vmlsl_laneq_u32_0:
2705; CHECK:       // %bb.0: // %entry
2706; CHECK-NEXT:    umlsl v0.2d, v1.2s, v2.s[0]
2707; CHECK-NEXT:    ret
2708entry:
2709  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
2710  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
2711  %sub = sub <2 x i64> %a, %vmull2.i
2712  ret <2 x i64> %sub
2713}
2714
2715define <4 x i32> @test_vmlsl_high_lane_u16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
2716; CHECK-LABEL: test_vmlsl_high_lane_u16_0:
2717; CHECK:       // %bb.0: // %entry
2718; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
2719; CHECK-NEXT:    umlsl2 v0.4s, v1.8h, v2.h[0]
2720; CHECK-NEXT:    ret
2721entry:
2722  %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2723  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
2724  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
2725  %sub = sub <4 x i32> %a, %vmull2.i
2726  ret <4 x i32> %sub
2727}
2728
2729define <2 x i64> @test_vmlsl_high_lane_u32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
2730; CHECK-LABEL: test_vmlsl_high_lane_u32_0:
2731; CHECK:       // %bb.0: // %entry
2732; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
2733; CHECK-NEXT:    umlsl2 v0.2d, v1.4s, v2.s[0]
2734; CHECK-NEXT:    ret
2735entry:
2736  %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
2737  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
2738  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
2739  %sub = sub <2 x i64> %a, %vmull2.i
2740  ret <2 x i64> %sub
2741}
2742
2743define <4 x i32> @test_vmlsl_high_laneq_u16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) {
2744; CHECK-LABEL: test_vmlsl_high_laneq_u16_0:
2745; CHECK:       // %bb.0: // %entry
2746; CHECK-NEXT:    umlsl2 v0.4s, v1.8h, v2.h[0]
2747; CHECK-NEXT:    ret
2748entry:
2749  %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2750  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
2751  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
2752  %sub = sub <4 x i32> %a, %vmull2.i
2753  ret <4 x i32> %sub
2754}
2755
2756define <2 x i64> @test_vmlsl_high_laneq_u32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) {
2757; CHECK-LABEL: test_vmlsl_high_laneq_u32_0:
2758; CHECK:       // %bb.0: // %entry
2759; CHECK-NEXT:    umlsl2 v0.2d, v1.4s, v2.s[0]
2760; CHECK-NEXT:    ret
2761entry:
2762  %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
2763  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
2764  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
2765  %sub = sub <2 x i64> %a, %vmull2.i
2766  ret <2 x i64> %sub
2767}
2768
2769define <4 x i32> @test_vmull_lane_s16_0(<4 x i16> %a, <4 x i16> %v) {
2770; CHECK-LABEL: test_vmull_lane_s16_0:
2771; CHECK:       // %bb.0: // %entry
2772; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
2773; CHECK-NEXT:    smull v0.4s, v0.4h, v1.h[0]
2774; CHECK-NEXT:    ret
2775entry:
2776  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
2777  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
2778  ret <4 x i32> %vmull2.i
2779}
2780
2781define <2 x i64> @test_vmull_lane_s32_0(<2 x i32> %a, <2 x i32> %v) {
2782; CHECK-LABEL: test_vmull_lane_s32_0:
2783; CHECK:       // %bb.0: // %entry
2784; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
2785; CHECK-NEXT:    smull v0.2d, v0.2s, v1.s[0]
2786; CHECK-NEXT:    ret
2787entry:
2788  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
2789  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
2790  ret <2 x i64> %vmull2.i
2791}
2792
2793define <4 x i32> @test_vmull_lane_u16_0(<4 x i16> %a, <4 x i16> %v) {
2794; CHECK-LABEL: test_vmull_lane_u16_0:
2795; CHECK:       // %bb.0: // %entry
2796; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
2797; CHECK-NEXT:    umull v0.4s, v0.4h, v1.h[0]
2798; CHECK-NEXT:    ret
2799entry:
2800  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
2801  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
2802  ret <4 x i32> %vmull2.i
2803}
2804
2805define <2 x i64> @test_vmull_lane_u32_0(<2 x i32> %a, <2 x i32> %v) {
2806; CHECK-LABEL: test_vmull_lane_u32_0:
2807; CHECK:       // %bb.0: // %entry
2808; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
2809; CHECK-NEXT:    umull v0.2d, v0.2s, v1.s[0]
2810; CHECK-NEXT:    ret
2811entry:
2812  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
2813  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
2814  ret <2 x i64> %vmull2.i
2815}
2816
2817define <4 x i32> @test_vmull_high_lane_s16_0(<8 x i16> %a, <4 x i16> %v) {
2818; CHECK-LABEL: test_vmull_high_lane_s16_0:
2819; CHECK:       // %bb.0: // %entry
2820; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
2821; CHECK-NEXT:    smull2 v0.4s, v0.8h, v1.h[0]
2822; CHECK-NEXT:    ret
2823entry:
2824  %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2825  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
2826  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
2827  ret <4 x i32> %vmull2.i
2828}
2829
2830define <2 x i64> @test_vmull_high_lane_s32_0(<4 x i32> %a, <2 x i32> %v) {
2831; CHECK-LABEL: test_vmull_high_lane_s32_0:
2832; CHECK:       // %bb.0: // %entry
2833; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
2834; CHECK-NEXT:    smull2 v0.2d, v0.4s, v1.s[0]
2835; CHECK-NEXT:    ret
2836entry:
2837  %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
2838  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
2839  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
2840  ret <2 x i64> %vmull2.i
2841}
2842
2843define <4 x i32> @test_vmull_high_lane_u16_0(<8 x i16> %a, <4 x i16> %v) {
2844; CHECK-LABEL: test_vmull_high_lane_u16_0:
2845; CHECK:       // %bb.0: // %entry
2846; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
2847; CHECK-NEXT:    umull2 v0.4s, v0.8h, v1.h[0]
2848; CHECK-NEXT:    ret
2849entry:
2850  %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2851  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
2852  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
2853  ret <4 x i32> %vmull2.i
2854}
2855
2856define <2 x i64> @test_vmull_high_lane_u32_0(<4 x i32> %a, <2 x i32> %v) {
2857; CHECK-LABEL: test_vmull_high_lane_u32_0:
2858; CHECK:       // %bb.0: // %entry
2859; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
2860; CHECK-NEXT:    umull2 v0.2d, v0.4s, v1.s[0]
2861; CHECK-NEXT:    ret
2862entry:
2863  %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
2864  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
2865  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
2866  ret <2 x i64> %vmull2.i
2867}
2868
2869define <4 x i32> @test_vmull_laneq_s16_0(<4 x i16> %a, <8 x i16> %v) {
2870; CHECK-LABEL: test_vmull_laneq_s16_0:
2871; CHECK:       // %bb.0: // %entry
2872; CHECK-NEXT:    smull v0.4s, v0.4h, v1.h[0]
2873; CHECK-NEXT:    ret
2874entry:
2875  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
2876  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
2877  ret <4 x i32> %vmull2.i
2878}
2879
2880define <2 x i64> @test_vmull_laneq_s32_0(<2 x i32> %a, <4 x i32> %v) {
2881; CHECK-LABEL: test_vmull_laneq_s32_0:
2882; CHECK:       // %bb.0: // %entry
2883; CHECK-NEXT:    smull v0.2d, v0.2s, v1.s[0]
2884; CHECK-NEXT:    ret
2885entry:
2886  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
2887  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
2888  ret <2 x i64> %vmull2.i
2889}
2890
2891define <4 x i32> @test_vmull_laneq_u16_0(<4 x i16> %a, <8 x i16> %v) {
2892; CHECK-LABEL: test_vmull_laneq_u16_0:
2893; CHECK:       // %bb.0: // %entry
2894; CHECK-NEXT:    umull v0.4s, v0.4h, v1.h[0]
2895; CHECK-NEXT:    ret
2896entry:
2897  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
2898  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
2899  ret <4 x i32> %vmull2.i
2900}
2901
2902define <2 x i64> @test_vmull_laneq_u32_0(<2 x i32> %a, <4 x i32> %v) {
2903; CHECK-LABEL: test_vmull_laneq_u32_0:
2904; CHECK:       // %bb.0: // %entry
2905; CHECK-NEXT:    umull v0.2d, v0.2s, v1.s[0]
2906; CHECK-NEXT:    ret
2907entry:
2908  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
2909  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
2910  ret <2 x i64> %vmull2.i
2911}
2912
2913define <4 x i32> @test_vmull_high_laneq_s16_0(<8 x i16> %a, <8 x i16> %v) {
2914; CHECK-LABEL: test_vmull_high_laneq_s16_0:
2915; CHECK:       // %bb.0: // %entry
2916; CHECK-NEXT:    smull2 v0.4s, v0.8h, v1.h[0]
2917; CHECK-NEXT:    ret
2918entry:
2919  %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2920  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
2921  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
2922  ret <4 x i32> %vmull2.i
2923}
2924
2925define <2 x i64> @test_vmull_high_laneq_s32_0(<4 x i32> %a, <4 x i32> %v) {
2926; CHECK-LABEL: test_vmull_high_laneq_s32_0:
2927; CHECK:       // %bb.0: // %entry
2928; CHECK-NEXT:    smull2 v0.2d, v0.4s, v1.s[0]
2929; CHECK-NEXT:    ret
2930entry:
2931  %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
2932  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
2933  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
2934  ret <2 x i64> %vmull2.i
2935}
2936
2937define <4 x i32> @test_vmull_high_laneq_u16_0(<8 x i16> %a, <8 x i16> %v) {
2938; CHECK-LABEL: test_vmull_high_laneq_u16_0:
2939; CHECK:       // %bb.0: // %entry
2940; CHECK-NEXT:    umull2 v0.4s, v0.8h, v1.h[0]
2941; CHECK-NEXT:    ret
2942entry:
2943  %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2944  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
2945  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
2946  ret <4 x i32> %vmull2.i
2947}
2948
2949define <2 x i64> @test_vmull_high_laneq_u32_0(<4 x i32> %a, <4 x i32> %v) {
2950; CHECK-LABEL: test_vmull_high_laneq_u32_0:
2951; CHECK:       // %bb.0: // %entry
2952; CHECK-NEXT:    umull2 v0.2d, v0.4s, v1.s[0]
2953; CHECK-NEXT:    ret
2954entry:
2955  %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
2956  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
2957  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
2958  ret <2 x i64> %vmull2.i
2959}
2960
2961define <4 x i32> @test_vqdmlal_lane_s16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
2962; CHECK-LABEL: test_vqdmlal_lane_s16_0:
2963; CHECK:       // %bb.0: // %entry
2964; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
2965; CHECK-NEXT:    sqdmlal v0.4s, v1.4h, v2.h[0]
2966; CHECK-NEXT:    ret
2967entry:
2968  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
2969  %vqdmlal2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
2970  %vqdmlal4.i = tail call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal2.i)
2971  ret <4 x i32> %vqdmlal4.i
2972}
2973
2974define <2 x i64> @test_vqdmlal_lane_s32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
2975; CHECK-LABEL: test_vqdmlal_lane_s32_0:
2976; CHECK:       // %bb.0: // %entry
2977; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
2978; CHECK-NEXT:    sqdmlal v0.2d, v1.2s, v2.s[0]
2979; CHECK-NEXT:    ret
2980entry:
2981  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
2982  %vqdmlal2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
2983  %vqdmlal4.i = tail call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> %vqdmlal2.i)
2984  ret <2 x i64> %vqdmlal4.i
2985}
2986
2987define <4 x i32> @test_vqdmlal_high_lane_s16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
2988; CHECK-LABEL: test_vqdmlal_high_lane_s16_0:
2989; CHECK:       // %bb.0: // %entry
2990; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
2991; CHECK-NEXT:    sqdmlal2 v0.4s, v1.8h, v2.h[0]
2992; CHECK-NEXT:    ret
2993entry:
2994  %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2995  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
2996  %vqdmlal2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
2997  %vqdmlal4.i = tail call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal2.i)
2998  ret <4 x i32> %vqdmlal4.i
2999}
3000
3001define <2 x i64> @test_vqdmlal_high_lane_s32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
3002; CHECK-LABEL: test_vqdmlal_high_lane_s32_0:
3003; CHECK:       // %bb.0: // %entry
3004; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
3005; CHECK-NEXT:    sqdmlal2 v0.2d, v1.4s, v2.s[0]
3006; CHECK-NEXT:    ret
3007entry:
3008  %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
3009  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
3010  %vqdmlal2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
3011  %vqdmlal4.i = tail call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> %vqdmlal2.i)
3012  ret <2 x i64> %vqdmlal4.i
3013}
3014
3015define <4 x i32> @test_vqdmlsl_lane_s16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
3016; CHECK-LABEL: test_vqdmlsl_lane_s16_0:
3017; CHECK:       // %bb.0: // %entry
3018; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
3019; CHECK-NEXT:    sqdmlsl v0.4s, v1.4h, v2.h[0]
3020; CHECK-NEXT:    ret
3021entry:
3022  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
3023  %vqdmlsl2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
3024  %vqdmlsl4.i = tail call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> %vqdmlsl2.i)
3025  ret <4 x i32> %vqdmlsl4.i
3026}
3027
3028define <2 x i64> @test_vqdmlsl_lane_s32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
3029; CHECK-LABEL: test_vqdmlsl_lane_s32_0:
3030; CHECK:       // %bb.0: // %entry
3031; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
3032; CHECK-NEXT:    sqdmlsl v0.2d, v1.2s, v2.s[0]
3033; CHECK-NEXT:    ret
3034entry:
3035  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
3036  %vqdmlsl2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
3037  %vqdmlsl4.i = tail call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> %vqdmlsl2.i)
3038  ret <2 x i64> %vqdmlsl4.i
3039}
3040
3041define <4 x i32> @test_vqdmlsl_high_lane_s16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
3042; CHECK-LABEL: test_vqdmlsl_high_lane_s16_0:
3043; CHECK:       // %bb.0: // %entry
3044; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
3045; CHECK-NEXT:    sqdmlsl2 v0.4s, v1.8h, v2.h[0]
3046; CHECK-NEXT:    ret
3047entry:
3048  %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
3049  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
3050  %vqdmlsl2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
3051  %vqdmlsl4.i = tail call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> %vqdmlsl2.i)
3052  ret <4 x i32> %vqdmlsl4.i
3053}
3054
3055define <2 x i64> @test_vqdmlsl_high_lane_s32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
3056; CHECK-LABEL: test_vqdmlsl_high_lane_s32_0:
3057; CHECK:       // %bb.0: // %entry
3058; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
3059; CHECK-NEXT:    sqdmlsl2 v0.2d, v1.4s, v2.s[0]
3060; CHECK-NEXT:    ret
3061entry:
3062  %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
3063  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
3064  %vqdmlsl2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
3065  %vqdmlsl4.i = tail call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> %vqdmlsl2.i)
3066  ret <2 x i64> %vqdmlsl4.i
3067}
3068
3069define <4 x i32> @test_vqdmull_lane_s16_0(<4 x i16> %a, <4 x i16> %v) {
3070; CHECK-LABEL: test_vqdmull_lane_s16_0:
3071; CHECK:       // %bb.0: // %entry
3072; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
3073; CHECK-NEXT:    sqdmull v0.4s, v0.4h, v1.h[0]
3074; CHECK-NEXT:    ret
3075entry:
3076  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
3077  %vqdmull2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
3078  ret <4 x i32> %vqdmull2.i
3079}
3080
3081define <2 x i64> @test_vqdmull_lane_s32_0(<2 x i32> %a, <2 x i32> %v) {
3082; CHECK-LABEL: test_vqdmull_lane_s32_0:
3083; CHECK:       // %bb.0: // %entry
3084; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
3085; CHECK-NEXT:    sqdmull v0.2d, v0.2s, v1.s[0]
3086; CHECK-NEXT:    ret
3087entry:
3088  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
3089  %vqdmull2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
3090  ret <2 x i64> %vqdmull2.i
3091}
3092
3093define <4 x i32> @test_vqdmull_laneq_s16_0(<4 x i16> %a, <8 x i16> %v) {
3094; CHECK-LABEL: test_vqdmull_laneq_s16_0:
3095; CHECK:       // %bb.0: // %entry
3096; CHECK-NEXT:    sqdmull v0.4s, v0.4h, v1.h[0]
3097; CHECK-NEXT:    ret
3098entry:
3099  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
3100  %vqdmull2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
3101  ret <4 x i32> %vqdmull2.i
3102}
3103
3104define <2 x i64> @test_vqdmull_laneq_s32_0(<2 x i32> %a, <4 x i32> %v) {
3105; CHECK-LABEL: test_vqdmull_laneq_s32_0:
3106; CHECK:       // %bb.0: // %entry
3107; CHECK-NEXT:    sqdmull v0.2d, v0.2s, v1.s[0]
3108; CHECK-NEXT:    ret
3109entry:
3110  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
3111  %vqdmull2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
3112  ret <2 x i64> %vqdmull2.i
3113}
3114
3115define <4 x i32> @test_vqdmull_high_lane_s16_0(<8 x i16> %a, <4 x i16> %v) {
3116; CHECK-LABEL: test_vqdmull_high_lane_s16_0:
3117; CHECK:       // %bb.0: // %entry
3118; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
3119; CHECK-NEXT:    sqdmull2 v0.4s, v0.8h, v1.h[0]
3120; CHECK-NEXT:    ret
3121entry:
3122  %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
3123  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
3124  %vqdmull2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
3125  ret <4 x i32> %vqdmull2.i
3126}
3127
3128define <2 x i64> @test_vqdmull_high_lane_s32_0(<4 x i32> %a, <2 x i32> %v) {
3129; CHECK-LABEL: test_vqdmull_high_lane_s32_0:
3130; CHECK:       // %bb.0: // %entry
3131; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
3132; CHECK-NEXT:    sqdmull2 v0.2d, v0.4s, v1.s[0]
3133; CHECK-NEXT:    ret
3134entry:
3135  %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
3136  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
3137  %vqdmull2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
3138  ret <2 x i64> %vqdmull2.i
3139}
3140
3141define <4 x i32> @test_vqdmull_high_laneq_s16_0(<8 x i16> %a, <8 x i16> %v) {
3142; CHECK-LABEL: test_vqdmull_high_laneq_s16_0:
3143; CHECK:       // %bb.0: // %entry
3144; CHECK-NEXT:    sqdmull2 v0.4s, v0.8h, v1.h[0]
3145; CHECK-NEXT:    ret
3146entry:
3147  %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
3148  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
3149  %vqdmull2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
3150  ret <4 x i32> %vqdmull2.i
3151}
3152
3153define <2 x i64> @test_vqdmull_high_laneq_s32_0(<4 x i32> %a, <4 x i32> %v) {
3154; CHECK-LABEL: test_vqdmull_high_laneq_s32_0:
3155; CHECK:       // %bb.0: // %entry
3156; CHECK-NEXT:    sqdmull2 v0.2d, v0.4s, v1.s[0]
3157; CHECK-NEXT:    ret
3158entry:
3159  %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
3160  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
3161  %vqdmull2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
3162  ret <2 x i64> %vqdmull2.i
3163}
3164
3165define <4 x i16> @test_vqdmulh_lane_s16_0(<4 x i16> %a, <4 x i16> %v) {
3166; CHECK-LABEL: test_vqdmulh_lane_s16_0:
3167; CHECK:       // %bb.0: // %entry
3168; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
3169; CHECK-NEXT:    sqdmulh v0.4h, v0.4h, v1.h[0]
3170; CHECK-NEXT:    ret
3171entry:
3172  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
3173  %vqdmulh2.i = tail call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> %a, <4 x i16> %shuffle)
3174  ret <4 x i16> %vqdmulh2.i
3175}
3176
3177define <8 x i16> @test_vqdmulhq_lane_s16_0(<8 x i16> %a, <4 x i16> %v) {
3178; CHECK-LABEL: test_vqdmulhq_lane_s16_0:
3179; CHECK:       // %bb.0: // %entry
3180; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
3181; CHECK-NEXT:    sqdmulh v0.8h, v0.8h, v1.h[0]
3182; CHECK-NEXT:    ret
3183entry:
3184  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> zeroinitializer
3185  %vqdmulh2.i = tail call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> %a, <8 x i16> %shuffle)
3186  ret <8 x i16> %vqdmulh2.i
3187}
3188
3189define <2 x i32> @test_vqdmulh_lane_s32_0(<2 x i32> %a, <2 x i32> %v) {
3190; CHECK-LABEL: test_vqdmulh_lane_s32_0:
3191; CHECK:       // %bb.0: // %entry
3192; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
3193; CHECK-NEXT:    sqdmulh v0.2s, v0.2s, v1.s[0]
3194; CHECK-NEXT:    ret
3195entry:
3196  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
3197  %vqdmulh2.i = tail call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> %a, <2 x i32> %shuffle)
3198  ret <2 x i32> %vqdmulh2.i
3199}
3200
3201define <4 x i32> @test_vqdmulhq_lane_s32_0(<4 x i32> %a, <2 x i32> %v) {
3202; CHECK-LABEL: test_vqdmulhq_lane_s32_0:
3203; CHECK:       // %bb.0: // %entry
3204; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
3205; CHECK-NEXT:    sqdmulh v0.4s, v0.4s, v1.s[0]
3206; CHECK-NEXT:    ret
3207entry:
3208  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> zeroinitializer
3209  %vqdmulh2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> %a, <4 x i32> %shuffle)
3210  ret <4 x i32> %vqdmulh2.i
3211}
3212
3213define <4 x i16> @test_vqrdmulh_lane_s16_0(<4 x i16> %a, <4 x i16> %v) {
3214; CHECK-LABEL: test_vqrdmulh_lane_s16_0:
3215; CHECK:       // %bb.0: // %entry
3216; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
3217; CHECK-NEXT:    sqrdmulh v0.4h, v0.4h, v1.h[0]
3218; CHECK-NEXT:    ret
3219entry:
3220  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
3221  %vqrdmulh2.i = tail call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %a, <4 x i16> %shuffle)
3222  ret <4 x i16> %vqrdmulh2.i
3223}
3224
3225define <8 x i16> @test_vqrdmulhq_lane_s16_0(<8 x i16> %a, <4 x i16> %v) {
3226; CHECK-LABEL: test_vqrdmulhq_lane_s16_0:
3227; CHECK:       // %bb.0: // %entry
3228; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
3229; CHECK-NEXT:    sqrdmulh v0.8h, v0.8h, v1.h[0]
3230; CHECK-NEXT:    ret
3231entry:
3232  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> zeroinitializer
3233  %vqrdmulh2.i = tail call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %a, <8 x i16> %shuffle)
3234  ret <8 x i16> %vqrdmulh2.i
3235}
3236
3237define <2 x i32> @test_vqrdmulh_lane_s32_0(<2 x i32> %a, <2 x i32> %v) {
3238; CHECK-LABEL: test_vqrdmulh_lane_s32_0:
3239; CHECK:       // %bb.0: // %entry
3240; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
3241; CHECK-NEXT:    sqrdmulh v0.2s, v0.2s, v1.s[0]
3242; CHECK-NEXT:    ret
3243entry:
3244  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
3245  %vqrdmulh2.i = tail call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %a, <2 x i32> %shuffle)
3246  ret <2 x i32> %vqrdmulh2.i
3247}
3248
3249define <4 x i32> @test_vqrdmulhq_lane_s32_0(<4 x i32> %a, <2 x i32> %v) {
3250; CHECK-LABEL: test_vqrdmulhq_lane_s32_0:
3251; CHECK:       // %bb.0: // %entry
3252; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
3253; CHECK-NEXT:    sqrdmulh v0.4s, v0.4s, v1.s[0]
3254; CHECK-NEXT:    ret
3255entry:
3256  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> zeroinitializer
3257  %vqrdmulh2.i = tail call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %a, <4 x i32> %shuffle)
3258  ret <4 x i32> %vqrdmulh2.i
3259}
3260
3261define <2 x float> @test_vmul_lane_f32_0(<2 x float> %a, <2 x float> %v) {
3262; CHECK-LABEL: test_vmul_lane_f32_0:
3263; CHECK:       // %bb.0: // %entry
3264; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
3265; CHECK-NEXT:    fmul v0.2s, v0.2s, v1.s[0]
3266; CHECK-NEXT:    ret
3267entry:
3268  %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> zeroinitializer
3269  %mul = fmul <2 x float> %shuffle, %a
3270  ret <2 x float> %mul
3271}
3272
3273define <4 x float> @test_vmulq_lane_f32_0(<4 x float> %a, <2 x float> %v) {
3274; CHECK-LABEL: test_vmulq_lane_f32_0:
3275; CHECK:       // %bb.0: // %entry
3276; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
3277; CHECK-NEXT:    fmul v0.4s, v0.4s, v1.s[0]
3278; CHECK-NEXT:    ret
3279entry:
3280  %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> zeroinitializer
3281  %mul = fmul <4 x float> %shuffle, %a
3282  ret <4 x float> %mul
3283}
3284
3285define <2 x float> @test_vmul_laneq_f32_0(<2 x float> %a, <4 x float> %v) {
3286; CHECK-LABEL: test_vmul_laneq_f32_0:
3287; CHECK:       // %bb.0: // %entry
3288; CHECK-NEXT:    fmul v0.2s, v0.2s, v1.s[0]
3289; CHECK-NEXT:    ret
3290entry:
3291  %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> zeroinitializer
3292  %mul = fmul <2 x float> %shuffle, %a
3293  ret <2 x float> %mul
3294}
3295
3296define <1 x double> @test_vmul_laneq_f64_0(<1 x double> %a, <2 x double> %v) {
3297; CHECK-LABEL: test_vmul_laneq_f64_0:
3298; CHECK:       // %bb.0: // %entry
3299; CHECK-NEXT:    fmul d0, d0, v1.d[0]
3300; CHECK-NEXT:    ret
3301entry:
3302  %0 = bitcast <1 x double> %a to <8 x i8>
3303  %1 = bitcast <8 x i8> %0 to double
3304  %extract = extractelement <2 x double> %v, i32 0
3305  %2 = fmul double %1, %extract
3306  %3 = insertelement <1 x double> undef, double %2, i32 0
3307  ret <1 x double> %3
3308}
3309
3310define <4 x float> @test_vmulq_laneq_f32_0(<4 x float> %a, <4 x float> %v) {
3311; CHECK-LABEL: test_vmulq_laneq_f32_0:
3312; CHECK:       // %bb.0: // %entry
3313; CHECK-NEXT:    fmul v0.4s, v0.4s, v1.s[0]
3314; CHECK-NEXT:    ret
3315entry:
3316  %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> zeroinitializer
3317  %mul = fmul <4 x float> %shuffle, %a
3318  ret <4 x float> %mul
3319}
3320
3321define <2 x double> @test_vmulq_laneq_f64_0(<2 x double> %a, <2 x double> %v) {
3322; CHECK-LABEL: test_vmulq_laneq_f64_0:
3323; CHECK:       // %bb.0: // %entry
3324; CHECK-NEXT:    fmul v0.2d, v0.2d, v1.d[0]
3325; CHECK-NEXT:    ret
3326entry:
3327  %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> zeroinitializer
3328  %mul = fmul <2 x double> %shuffle, %a
3329  ret <2 x double> %mul
3330}
3331
3332define <2 x float> @test_vmulx_lane_f32_0(<2 x float> %a, <2 x float> %v) {
3333; CHECK-LABEL: test_vmulx_lane_f32_0:
3334; CHECK:       // %bb.0: // %entry
3335; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
3336; CHECK-NEXT:    fmulx v0.2s, v0.2s, v1.s[0]
3337; CHECK-NEXT:    ret
3338entry:
3339  %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> zeroinitializer
3340  %vmulx2.i = tail call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %a, <2 x float> %shuffle)
3341  ret <2 x float> %vmulx2.i
3342}
3343
3344define <4 x float> @test_vmulxq_lane_f32_0(<4 x float> %a, <2 x float> %v) {
3345; CHECK-LABEL: test_vmulxq_lane_f32_0:
3346; CHECK:       // %bb.0: // %entry
3347; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
3348; CHECK-NEXT:    fmulx v0.4s, v0.4s, v1.s[0]
3349; CHECK-NEXT:    ret
3350entry:
3351  %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> zeroinitializer
3352  %vmulx2.i = tail call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %a, <4 x float> %shuffle)
3353  ret <4 x float> %vmulx2.i
3354}
3355
3356define <2 x double> @test_vmulxq_lane_f64_0(<2 x double> %a, <1 x double> %v) {
3357; CHECK-LABEL: test_vmulxq_lane_f64_0:
3358; CHECK:       // %bb.0: // %entry
3359; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
3360; CHECK-NEXT:    fmulx v0.2d, v0.2d, v1.d[0]
3361; CHECK-NEXT:    ret
3362entry:
3363  %shuffle = shufflevector <1 x double> %v, <1 x double> undef, <2 x i32> zeroinitializer
3364  %vmulx2.i = tail call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %a, <2 x double> %shuffle)
3365  ret <2 x double> %vmulx2.i
3366}
3367
3368define <2 x float> @test_vmulx_laneq_f32_0(<2 x float> %a, <4 x float> %v) {
3369; CHECK-LABEL: test_vmulx_laneq_f32_0:
3370; CHECK:       // %bb.0: // %entry
3371; CHECK-NEXT:    fmulx v0.2s, v0.2s, v1.s[0]
3372; CHECK-NEXT:    ret
3373entry:
3374  %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> zeroinitializer
3375  %vmulx2.i = tail call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %a, <2 x float> %shuffle)
3376  ret <2 x float> %vmulx2.i
3377}
3378
3379define <4 x float> @test_vmulxq_laneq_f32_0(<4 x float> %a, <4 x float> %v) {
3380; CHECK-LABEL: test_vmulxq_laneq_f32_0:
3381; CHECK:       // %bb.0: // %entry
3382; CHECK-NEXT:    fmulx v0.4s, v0.4s, v1.s[0]
3383; CHECK-NEXT:    ret
3384entry:
3385  %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> zeroinitializer
3386  %vmulx2.i = tail call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %a, <4 x float> %shuffle)
3387  ret <4 x float> %vmulx2.i
3388}
3389
3390define <2 x double> @test_vmulxq_laneq_f64_0(<2 x double> %a, <2 x double> %v) {
3391; CHECK-LABEL: test_vmulxq_laneq_f64_0:
3392; CHECK:       // %bb.0: // %entry
3393; CHECK-NEXT:    fmulx v0.2d, v0.2d, v1.d[0]
3394; CHECK-NEXT:    ret
3395entry:
3396  %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> zeroinitializer
3397  %vmulx2.i = tail call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %a, <2 x double> %shuffle)
3398  ret <2 x double> %vmulx2.i
3399}
3400
3401define <4 x float> @optimize_dup(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %v) {
3402; CHECK-LABEL: optimize_dup:
3403; CHECK:       // %bb.0: // %entry
3404; CHECK-NEXT:    fmla v0.4s, v1.4s, v3.s[3]
3405; CHECK-NEXT:    fmls v0.4s, v2.4s, v3.s[3]
3406; CHECK-NEXT:    ret
3407entry:
3408  %lane1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
3409  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane1, <4 x float> %b, <4 x float> %a)
3410  %lane2 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
3411	%1 = fmul <4 x float> %lane2, %c
3412	%s = fsub <4 x float> %0, %1
3413  ret <4 x float> %s
3414}
3415
3416define <4 x float> @no_optimize_dup(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %v) {
3417; CHECK-LABEL: no_optimize_dup:
3418; CHECK:       // %bb.0: // %entry
3419; CHECK-NEXT:    fmla v0.4s, v1.4s, v3.s[3]
3420; CHECK-NEXT:    fmls v0.4s, v2.4s, v3.s[1]
3421; CHECK-NEXT:    ret
3422entry:
3423  %lane1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
3424  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane1, <4 x float> %b, <4 x float> %a)
3425  %lane2 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
3426	%1 = fmul <4 x float> %lane2, %c
3427	%s = fsub <4 x float> %0, %1
3428  ret <4 x float> %s
3429}
3430
3431define <2 x float> @test_vfma_lane_simdinstr_opt_pass_caching_a57(<2 x float> %a, <2 x float> %b, <2 x float> %v) "target-cpu"="cortex-a57" {
3432; CHECK-LABEL: test_vfma_lane_simdinstr_opt_pass_caching_a57:
3433; CHECK:       // %bb.0: // %entry
3434; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
3435; CHECK-NEXT:    fmla v0.2s, v1.2s, v2.s[1]
3436; CHECK-NEXT:    ret
3437entry:
3438  %lane = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> <i32 1, i32 1>
3439  %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a)
3440  ret <2 x float> %0
3441}
3442
3443define <2 x float> @test_vfma_lane_simdinstr_opt_pass_caching_m3(<2 x float> %a, <2 x float> %b, <2 x float> %v) "target-cpu"="exynos-m3" {
3444; CHECK-LABEL: test_vfma_lane_simdinstr_opt_pass_caching_m3:
3445; CHECK:       // %bb.0: // %entry
3446; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
3447; CHECK-NEXT:    fmla v0.2s, v1.2s, v2.s[1]
3448; CHECK-NEXT:    ret
3449entry:
3450  %lane = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> <i32 1, i32 1>
3451  %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a)
3452  ret <2 x float> %0
3453}
3454